diff --git a/.github/workflows/backward-compat.yml b/.github/workflows/backward-compat.yml
index ffc19c6d70..f454f99a00 100644
--- a/.github/workflows/backward-compat.yml
+++ b/.github/workflows/backward-compat.yml
@@ -98,9 +98,24 @@ jobs:
         env:
           GH_TOKEN: ${{ github.token }}
           PR_TITLE: ${{ github.event.pull_request.title }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           echo "Breaking changes detected. Checking for acknowledgment..."
 
+          # In merge_group context, github.event.pull_request is empty, so fall
+          # back to the PR number embedded in the merge queue branch name
+          # (format: gh-readonly-queue/main/pr-<NUMBER>-<SHA>). Best-effort only.
+          if [ -z "$PR_TITLE" ]; then
+            PR_NUM="$PR_NUMBER"
+            if [ -z "$PR_NUM" ]; then
+              PR_NUM=$(echo "${GITHUB_REF_NAME}" | sed -n 's|.*pr-\([0-9][0-9]*\)-.*|\1|p')
+            fi
+            if [ -n "$PR_NUM" ]; then
+              PR_TITLE=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json title --jq '.title' || echo "")
+              echo "Resolved PR title from PR #${PR_NUM}: ${PR_TITLE:-<lookup failed>}"
+            fi
+          fi
+
           # Check PR title for '!:' marker (conventional commits)
           if [[ "$PR_TITLE" =~ ^[a-z]+\!: ]]; then
             echo "✓ Breaking change acknowledged in PR title"
@@ -190,9 +205,24 @@ jobs:
         env:
           GH_TOKEN: ${{ github.token }}
           PR_TITLE: ${{ github.event.pull_request.title }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           echo "Integration tests failed. Checking for acknowledgment..."
 
+          # In merge_group context, github.event.pull_request is empty, so fall
+          # back to the PR number embedded in the merge queue branch name
+          # (format: gh-readonly-queue/main/pr-<NUMBER>-<SHA>). Best-effort only.
+          if [ -z "$PR_TITLE" ]; then
+            PR_NUM="$PR_NUMBER"
+            if [ -z "$PR_NUM" ]; then
+              PR_NUM=$(echo "${GITHUB_REF_NAME}" | sed -n 's|.*pr-\([0-9][0-9]*\)-.*|\1|p')
+            fi
+            if [ -n "$PR_NUM" ]; then
+              PR_TITLE=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json title --jq '.title' || echo "")
+              echo "Resolved PR title from PR #${PR_NUM}: ${PR_TITLE:-<lookup failed>}"
+            fi
+          fi
+
           # Check PR title for '!:' marker (conventional commits)
           if [[ "$PR_TITLE" =~ ^[a-z]+\!: ]]; then
             echo "✓ Breaking change acknowledged in PR title"
diff --git a/.github/workflows/file-processors-tests.yml b/.github/workflows/file-processors-tests.yml
index cdb8c4566c..863b9d38b5 100644
--- a/.github/workflows/file-processors-tests.yml
+++ b/.github/workflows/file-processors-tests.yml
@@ -51,6 +51,8 @@ jobs:
         run: uv pip install docling
 
       - name: Start Llama Stack server with docling
+        env:
+          LLAMA_STACK_DISABLE_VERSION_CHECK: "1"
         run: |
           uv run --no-sync llama stack run \
             --providers "file_processors=inline::docling,files=inline::localfs" \
diff --git a/.github/workflows/test-external-provider-module.yml b/.github/workflows/test-external-provider-module.yml
index b492e097cd..a3bfb96de7 100644
--- a/.github/workflows/test-external-provider-module.yml
+++ b/.github/workflows/test-external-provider-module.yml
@@ -2,18 +2,10 @@ name: Test External Providers Installed via Module
 
 run-name: Test External Provider installation via Python module
 
+# Manual-only (workflow_dispatch) until we find a suitable external provider for CI.
+# The previous lmeval provider depended on the eval API, which is being removed.
 on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'src/llama_stack/**'
-      - 'tests/integration/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - 'tests/external/*'
-      - '.github/workflows/test-external-provider-module.yml' # This workflow
+  workflow_dispatch: {}
 
 jobs:
   test-external-providers-from-module:
@@ -25,16 +17,22 @@ jobs:
       - name: Install dependencies
         uses: ./.github/actions/setup-runner
 
-      - name: Install lmeval provider
+      - name: Install weather external API and kaze provider
         run: |
-          uv pip install llama-stack-provider-lmeval
+          uv pip install tests/external/llama-stack-api-weather
+          uv pip install tests/external/llama-stack-provider-kaze
+
+      - name: Configure external API and provider
+        run: |
+          mkdir -p ~/.llama/apis.d ~/.llama/providers.d
+          cp tests/external/weather.yaml ~/.llama/apis.d/
+          cp tests/external/kaze.yaml ~/.llama/providers.d/
 
       - name: Start Llama Stack server in background
         env:
-          TRUSTYAI_LMEVAL_USE_K8S: "false"
           LLAMA_STACK_LOG_FILE: "server.log"
         run: |
-          nohup uv run llama stack run tests/external/llama-stack-provider-lmeval/config.yaml > server.log 2>&1 &
+          nohup uv run llama stack run tests/external/config.yaml > server.log 2>&1 &
 
       - name: Wait for Llama Stack server to be ready
         run: |
@@ -54,10 +52,10 @@ jobs:
         run: |
           response=$(curl -s http://localhost:8321/v1/providers)
           echo "$response" | python3 -m json.tool
-          if echo "$response" | grep -q "trustyai_lmeval"; then
-            echo "lmeval external provider loaded successfully"
+          if echo "$response" | grep -q "kaze"; then
+            echo "kaze external provider loaded successfully"
           else
-            echo "ERROR: lmeval provider not found in providers list"
+            echo "ERROR: kaze provider not found in providers list"
             exit 1
           fi
 
diff --git a/client-sdks/stainless/config.yml b/client-sdks/stainless/config.yml
index 70cfbeb1c8..06d0b9847a 100644
--- a/client-sdks/stainless/config.yml
+++ b/client-sdks/stainless/config.yml
@@ -43,26 +43,6 @@ client_settings:
 environments:
   production: http://any-hosted-llama-stack.com
 pagination:
-- name: datasets_iterrows
-  type: offset
-  request:
-    dataset_id:
-      type: string
-    start_index:
-      type: integer
-      x-stainless-pagination-property:
-        purpose: offset_count_param
-    limit:
-      type: integer
-  response:
-    data:
-      type: array
-      items:
-        type: object
-    next_index:
-      type: integer
-      x-stainless-pagination-property:
-        purpose: offset_count_start_field
 - name: openai_cursor_page
   type: cursor
   request:
@@ -105,39 +85,6 @@ settings:
     '
 openapi:
   transformations:
-  - command: mergeObject
-    reason: Better return_type using enum
-    args:
-      target:
-      - $.components.schemas
-      object:
-        ReturnType:
-          additionalProperties: false
-          properties:
-            type:
-              enum:
-              - string
-              - number
-              - boolean
-              - array
-              - object
-              - json
-              - union
-              - chat_completion_input
-              - completion_input
-              - agent_turn_input
-          required:
-          - type
-          type: object
-  - command: replaceProperties
-    reason: Replace return type properties with better model (see above)
-    args:
-      filter:
-        only:
-        - $.components.schemas.ScoringFn.properties.return_type
-        - $.components.schemas.RegisterScoringFunctionRequest.properties.return_type
-      value:
-        $ref: '#/components/schemas/ReturnType'
   - command: oneOfToAnyOf
     reason: Prism (mock server) doesn't like one of our requests as it technically
       matches multiple variants
@@ -163,7 +110,6 @@ resources:
       param_type: ParamType
       safety_violation: SafetyViolation
       sampling_params: SamplingParams
-      scoring_result: ScoringResult
       system_message: SystemMessage
       health_info: HealthInfo
       provider_info: ProviderInfo
@@ -365,22 +311,6 @@ resources:
         endpoint: get /v1/shields
       register: post /v1/shields
       delete: delete /v1/shields/{identifier}
-  scoring:
-    methods:
-      score: post /v1/scoring/score
-      score_batch: post /v1/scoring/score-batch
-  scoring_functions:
-    models:
-      scoring_fn: ScoringFn
-      scoring_fn_params: ScoringFnParams
-      list_scoring_functions_response: ListScoringFunctionsResponse
-    methods:
-      retrieve: get /v1/scoring-functions/{scoring_fn_id}
-      list:
-        paginated: false
-        endpoint: get /v1/scoring-functions
-      register: post /v1/scoring-functions
-      unregister: delete /v1/scoring-functions/{scoring_fn_id}
   files:
     models:
       file: OpenAIFileObject
@@ -400,33 +330,6 @@ resources:
       cancel: post /v1/batches/{batch_id}/cancel
   alpha:
     subresources:
-      benchmarks:
-        models:
-          benchmark: Benchmark
-          list_benchmarks_response: ListBenchmarksResponse
-        methods:
-          retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
-          list:
-            paginated: false
-            endpoint: get /v1alpha/eval/benchmarks
-          register: post /v1alpha/eval/benchmarks
-          unregister: delete /v1alpha/eval/benchmarks/{benchmark_id}
-      eval:
-        models:
-          evaluate_response: EvaluateResponse
-          benchmark_config: BenchmarkConfig
-          job: Job
-        methods:
-          evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
-          run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
-          evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
-          run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
-        subresources:
-          jobs:
-            methods:
-              cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
-              status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
-              retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
       admin:
         methods:
           list_providers: get /v1alpha/admin/providers
@@ -437,17 +340,3 @@ resources:
       inference:
         methods:
           rerank: post /v1alpha/inference/rerank
-  beta:
-    subresources:
-      datasets:
-        models:
-          list_datasets_response: ListDatasetsResponse
-        methods:
-          register: post /v1beta/datasets
-          retrieve: get /v1beta/datasets/{dataset_id}
-          list:
-            paginated: false
-            endpoint: get /v1beta/datasets
-          unregister: delete /v1beta/datasets/{dataset_id}
-          iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
-          appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index d903796d75..57c8f3d955 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -1948,190 +1948,6 @@ paths:
             schema:
               $ref: '#/components/schemas/RunShieldRequest'
         required: true
-  /v1/scoring-functions:
-    get:
-      responses:
-        '200':
-          description: A ListScoringFunctionsResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListScoringFunctionsResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring Functions
-      summary: List all scoring functions.
-      description: List all scoring functions.
-      operationId: list_scoring_functions_v1_scoring_functions_get
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The scoring function was successfully registered.
-      tags:
-      - Scoring Functions
-      summary: Register a scoring function.
-      description: Register a scoring function.
-      operationId: register_scoring_function_v1_scoring_functions_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterScoringFunctionRequest'
-        required: true
-      deprecated: true
-  /v1/scoring-functions/{scoring_fn_id}:
-    get:
-      responses:
-        '200':
-          description: A ScoringFn.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoringFn'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Scoring Functions
-      summary: Get a scoring function by its ID.
-      description: Get a scoring function by its ID.
-      operationId: get_scoring_function_v1_scoring_functions__scoring_fn_id__get
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to get.
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The scoring function was successfully unregistered.
-      tags:
-      - Scoring Functions
-      summary: Unregister a scoring function.
-      description: Unregister a scoring function.
-      operationId: unregister_scoring_function_v1_scoring_functions__scoring_fn_id__delete
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to unregister.
-      deprecated: true
-  /v1/scoring/score:
-    post:
-      responses:
-        '200':
-          description: A ScoreResponse object containing rows and aggregated results.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a list of rows.
-      description: Score a list of rows.
-      operationId: score_v1_scoring_score_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreRequest'
-        required: true
-  /v1/scoring/score-batch:
-    post:
-      responses:
-        '200':
-          description: A ScoreBatchResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreBatchResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a batch of rows.
-      description: Score a batch of rows.
-      operationId: score_batch_v1_scoring_score_batch_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreBatchRequest'
-        required: true
   /v1/shields:
     get:
       responses:
@@ -3380,116 +3196,15 @@ paths:
       description: Get the version of the service.
       operationId: version_v1_version_get
       x-public: true
-  /v1beta/datasetio/append-rows/{dataset_id}:
+  /v1alpha/inference/rerank:
     post:
-      responses:
-        '204':
-          description: Rows were successfully appended.
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Append rows to a dataset.
-      description: Append rows to a dataset.
-      operationId: append_rows_v1beta_datasetio_append_rows__dataset_id__post
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to append the rows to.
-          title: Dataset Id
-        description: The ID of the dataset to append the rows to.
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/AppendRowsRequest'
-  /v1beta/datasetio/iterrows/{dataset_id}:
-    get:
-      responses:
-        '200':
-          description: A PaginatedResponse containing the rows.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/PaginatedResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Get a paginated list of rows from a dataset.
-      description: |-
-        Get a paginated list of rows from a dataset.
-
-        Uses offset-based pagination where:
-        - start_index: The starting index (0-based). If None, starts from beginning.
-        - limit: Number of items to return. If None or -1, returns all items.
-
-        The response includes:
-        - data: List of items for the current page.
-        - has_more: Whether there are more items available after this set.
-      operationId: iterrows_v1beta_datasetio_iterrows__dataset_id__get
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-        description: The ID of the dataset to get the rows from.
-      - name: start_index
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          title: Start Index
-        description: Index into dataset for the first row to get. Get all rows if None.
-      - name: limit
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          title: Limit
-        description: The number of rows to get.
-  /v1beta/datasets:
-    get:
       responses:
         '200':
-          description: A list of dataset objects.
+          description: RerankResponse with indices sorted by relevance score (descending).
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/ListDatasetsResponse'
+                $ref: '#/components/schemas/RerankResponse'
         '400':
           description: Bad Request
           $ref: '#/components/responses/BadRequest400'
@@ -3503,18 +3218,25 @@ paths:
           description: Default Response
           $ref: '#/components/responses/DefaultError'
       tags:
-      - Datasets
-      summary: List all datasets.
-      description: List all datasets.
-      operationId: list_datasets_v1beta_datasets_get
-    post:
+      - Inference
+      summary: Rerank documents based on relevance to a query.
+      description: Rerank a list of documents based on their relevance to a query.
+      operationId: rerank_v1alpha_inference_rerank_post
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RerankRequest'
+        required: true
+  /v1alpha/admin/providers:
+    get:
       responses:
         '200':
-          description: The registered dataset object.
+          description: A list of provider information objects.
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/Dataset'
+                $ref: '#/components/schemas/ListProvidersResponse'
         '400':
           description: Bad Request
           $ref: '#/components/responses/BadRequest400'
@@ -3528,26 +3250,19 @@ paths:
           description: Default Response
           $ref: '#/components/responses/DefaultError'
       tags:
-      - Datasets
-      summary: Register a new dataset.
-      description: Register a new dataset.
-      operationId: register_dataset_v1beta_datasets_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterDatasetRequest'
-        required: true
-      deprecated: true
-  /v1beta/datasets/{dataset_id}:
+      - Admin
+      summary: List all available providers
+      description: List all available providers with their configuration and health status.
+      operationId: list_providers_v1alpha_admin_providers_get
+  /v1alpha/admin/providers/{provider_id}:
     get:
       responses:
         '200':
-          description: The dataset object.
+          description: The provider information object.
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/Dataset'
+                $ref: '#/components/schemas/ProviderInfo'
         '400':
           $ref: '#/components/responses/BadRequest400'
           description: Bad Request
@@ -3560,469 +3275,31 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
           description: Default Response
+        '404':
+          description: Provider not found.
       tags:
-      - Datasets
-      summary: Get a dataset by its ID.
-      description: Get a dataset by its ID.
-      operationId: get_dataset_v1beta_datasets__dataset_id__get
+      - Admin
+      summary: Get provider details
+      description: Get detailed information about a specific provider.
+      operationId: inspect_provider_v1alpha_admin_providers__provider_id__get
       parameters:
-      - name: dataset_id
+      - name: provider_id
         in: path
         required: true
         schema:
           type: string
-          description: The ID of the dataset to get.
-          title: Dataset Id
-        description: The ID of the dataset to get.
-    delete:
+          description: The ID of the provider to inspect.
+          title: Provider Id
+        description: The ID of the provider to inspect.
+  /v1alpha/admin/inspect/routes:
+    get:
       responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The dataset was successfully unregistered.
-      tags:
-      - Datasets
-      summary: Unregister a dataset by its ID.
-      description: Unregister a dataset by its ID.
-      operationId: unregister_dataset_v1beta_datasets__dataset_id__delete
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-        description: The ID of the dataset to unregister.
-      deprecated: true
-  /v1alpha/eval/benchmarks:
-    get:
-      responses:
-        '200':
-          description: A ListBenchmarksResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListBenchmarksResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Benchmarks
-      summary: List all benchmarks.
-      description: List all benchmarks.
-      operationId: list_benchmarks_v1alpha_eval_benchmarks_get
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The benchmark was successfully registered.
-      tags:
-      - Benchmarks
-      summary: Register a benchmark.
-      description: Register a benchmark.
-      operationId: register_benchmark_v1alpha_eval_benchmarks_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterBenchmarkRequest'
-        required: true
-      deprecated: true
-  /v1alpha/eval/benchmarks/{benchmark_id}:
-    get:
-      responses:
-        '200':
-          description: A Benchmark.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Benchmark'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Benchmarks
-      summary: Get a benchmark by its ID.
-      description: Get a benchmark by its ID.
-      operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-        description: The ID of the benchmark to get.
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The benchmark was successfully unregistered.
-      tags:
-      - Benchmarks
-      summary: Unregister a benchmark.
-      description: Unregister a benchmark.
-      operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-        description: The ID of the benchmark to unregister.
-      deprecated: true
-  /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
-    post:
-      responses:
-        '200':
-          description: EvaluateResponse object containing generations and scores.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Evaluate Rows
-      description: Evaluate a list of rows on a benchmark.
-      operationId: evaluate_rows_v1alpha_eval_benchmarks__benchmark_id__evaluations_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/EvaluateRowsBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs:
-    post:
-      responses:
-        '200':
-          description: The job that was created to run the evaluation.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Run Eval
-      description: Run an evaluation on a benchmark.
-      operationId: run_eval_v1alpha_eval_benchmarks__benchmark_id__jobs_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RunEvalBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
-    get:
-      responses:
-        '200':
-          description: The status of the evaluation job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Status
-      description: Get the status of a job.
-      operationId: job_status_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: Successful Response
-      tags:
-      - Eval
-      summary: Job Cancel
-      description: Cancel a job.
-      operationId: job_cancel_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
-    get:
-      responses:
-        '200':
-          description: The result of the job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Result
-      description: Get the result of a job.
-      operationId: job_result_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__result_get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-  /v1alpha/inference/rerank:
-    post:
-      responses:
-        '200':
-          description: RerankResponse with indices sorted by relevance score (descending).
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/RerankResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Inference
-      summary: Rerank documents based on relevance to a query.
-      description: Rerank a list of documents based on their relevance to a query.
-      operationId: rerank_v1alpha_inference_rerank_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RerankRequest'
-        required: true
-  /v1alpha/admin/providers:
-    get:
-      responses:
-        '200':
-          description: A list of provider information objects.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListProvidersResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Admin
-      summary: List all available providers
-      description: List all available providers with their configuration and health status.
-      operationId: list_providers_v1alpha_admin_providers_get
-  /v1alpha/admin/providers/{provider_id}:
-    get:
-      responses:
-        '200':
-          description: The provider information object.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ProviderInfo'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '404':
-          description: Provider not found.
-      tags:
-      - Admin
-      summary: Get provider details
-      description: Get detailed information about a specific provider.
-      operationId: inspect_provider_v1alpha_admin_providers__provider_id__get
-      parameters:
-      - name: provider_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the provider to inspect.
-          title: Provider Id
-        description: The ID of the provider to inspect.
-  /v1alpha/admin/inspect/routes:
-    get:
-      responses:
-        '200':
-          description: A list of route information objects.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListRoutesResponse'
+        '200':
+          description: A list of route information objects.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListRoutesResponse'
         '400':
           $ref: '#/components/responses/BadRequest400'
           description: Bad Request
@@ -9515,408 +8792,87 @@ components:
       - error
       title: ViolationLevel
       description: Severity level of a safety violation.
-    AggregationFunctionType:
-      type: string
-      enum:
-      - average
-      - weighted_average
-      - median
-      - categorical_count
-      - accuracy
-      title: AggregationFunctionType
-      description: Types of aggregation functions for scoring results.
     ArrayType:
+      description: Parameter type for array values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - array
       title: ArrayType
-      description: Parameter type for array values.
-    BasicScoringFnParams:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - basic
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: BasicScoringFnParams
-      description: Parameters for basic scoring function configuration.
     BooleanType:
+      description: Parameter type for boolean values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - boolean
       title: BooleanType
-      description: Parameter type for boolean values.
     ChatCompletionInputType:
+      description: Parameter type for chat completion input.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - chat_completion_input
       title: ChatCompletionInputType
-      description: Parameter type for chat completion input.
     CompletionInputType:
+      description: Parameter type for completion input.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - completion_input
       title: CompletionInputType
-      description: Parameter type for completion input.
     JsonType:
+      description: Parameter type for JSON values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - json
       title: JsonType
-      description: Parameter type for JSON values.
-    LLMAsJudgeScoringFnParams:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - llm_as_judge
-        judge_model:
-          type: string
-          title: Judge Model
-        prompt_template:
-          anyOf:
-          - type: string
-          - type: 'null'
-        judge_score_regexes:
-          items:
-            type: string
-          type: array
-          title: Judge Score Regexes
-          description: Regexes to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      required:
-      - judge_model
-      title: LLMAsJudgeScoringFnParams
-      description: Parameters for LLM-as-judge scoring function configuration.
     NumberType:
+      description: Parameter type for numeric values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - number
       title: NumberType
-      description: Parameter type for numeric values.
     ObjectType:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - object
-      title: ObjectType
       description: Parameter type for object values.
-    RegexParserScoringFnParams:
       properties:
         type:
-          type: string
           title: Type
-          enum:
-          - regex_parser
-        parsing_regexes:
-          items:
-            type: string
-          type: array
-          title: Parsing Regexes
-          description: Regex to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: RegexParserScoringFnParams
-      description: Parameters for regex parser scoring function configuration.
-    ScoringFn:
-      properties:
-        identifier:
           type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
           enum:
-          - scoring_function
-        description:
-          anyOf:
-          - type: string
-          - type: 'null'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this definition
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the deterministic function
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval
-      required:
-      - identifier
-      - provider_id
-      - return_type
-      title: ScoringFn
-      description: A scoring function resource for evaluating model outputs.
-    ScoringFnParams:
-      discriminator:
-        mapping:
-          basic: '#/components/schemas/BasicScoringFnParams'
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        title: LLMAsJudgeScoringFnParams
-      - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        title: RegexParserScoringFnParams
-      - $ref: '#/components/schemas/BasicScoringFnParams'
-        title: BasicScoringFnParams
-      title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-    ScoringFnParamsType:
-      description: Types of scoring function parameter configurations.
-      enum:
-      - llm_as_judge
-      - regex_parser
-      - basic
-      title: ScoringFnParamsType
-      type: string
+          - object
+      title: ObjectType
     StringType:
+      description: Parameter type for string values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - string
       title: StringType
-      description: Parameter type for string values.
     UnionType:
+      description: Parameter type for union values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - union
       title: UnionType
-      description: Parameter type for union values.
-    ListScoringFunctionsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/ScoringFn'
-          type: array
-          title: Data
-          description: List of scoring function objects.
-      required:
-      - data
-      title: ListScoringFunctionsResponse
-      description: Response containing a list of scoring function objects.
-    ScoreRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Input Rows
-          description: The rows to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-      required:
-      - input_rows
-      - scoring_functions
-      title: ScoreRequest
-      description: Request model for scoring a list of rows.
-    ScoreResponse:
-      properties:
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult.
-      required:
-      - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoringResult:
-      properties:
-        score_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Score Rows
-          description: The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          additionalProperties: true
-          type: object
-          title: Aggregated Results
-          description: Map of metric name to aggregated value
-      required:
-      - score_rows
-      - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
-    ScoreBatchRequest:
-      properties:
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-        save_results_dataset:
-          type: boolean
-          title: Save Results Dataset
-          description: Whether to save the results to a dataset.
-          default: false
-      required:
-      - dataset_id
-      - scoring_functions
-      title: ScoreBatchRequest
-      description: Request model for scoring a batch of rows from a dataset.
-    ScoreBatchResponse:
-      properties:
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: (Optional) The identifier of the dataset that was scored
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult
-      required:
-      - results
-      title: ScoreBatchResponse
-      description: Response from batch scoring operations on datasets.
     Shield:
       properties:
         identifier:
@@ -10969,264 +9925,48 @@ components:
       - version
       title: VersionInfo
       description: Version information for the service.
-    AppendRowsRequest:
-      properties:
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: The rows to append to the dataset.
-      required:
-      - rows
-      title: AppendRowsRequest
-      description: Request body for appending rows to a dataset.
     PaginatedResponse:
+      description: A generic paginated response that follows a simple format.
       properties:
         data:
           items:
             additionalProperties: true
             type: object
-          type: array
           title: Data
+          type: array
         has_more:
-          type: boolean
           title: Has More
+          type: boolean
         url:
           anyOf:
           - type: string
           - type: 'null'
+          nullable: true
       required:
       - data
       - has_more
       title: PaginatedResponse
-      description: A generic paginated response that follows a simple format.
-    Dataset:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          description: Type of resource, always 'dataset' for datasets
-          enum:
-          - dataset
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: Purpose of the dataset indicating its intended use
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: Data source configuration for the dataset
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this dataset
-      required:
-      - identifier
-      - provider_id
-      - purpose
-      - source
-      title: Dataset
-      description: Dataset resource for storing and accessing training or evaluation data.
-    RowsDataSource:
-      properties:
-        type:
-          type: string
-          title: Type
-          description: The type of data source.
-          enum:
-          - rows
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: 'The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]'
-      required:
-      - rows
-      title: RowsDataSource
-      description: A dataset stored in rows.
-    URIDataSource:
-      properties:
-        type:
-          type: string
-          title: Type
-          description: The type of data source.
-          enum:
-          - uri
-        uri:
-          type: string
-          title: Uri
-          description: The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"
-      required:
-      - uri
-      title: URIDataSource
-      description: A dataset that can be obtained from a URI.
-    ListDatasetsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Dataset'
-          type: array
-          title: Data
-          description: List of datasets
-      required:
-      - data
-      title: ListDatasetsResponse
-      description: Response from listing datasets.
-    Benchmark:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          description: The resource type, always benchmark.
-          enum:
-          - benchmark
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: Identifier of the dataset to use for the benchmark evaluation.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: List of scoring function identifiers to apply during evaluation.
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Metadata for this evaluation task.
-      required:
-      - identifier
-      - provider_id
-      - dataset_id
-      - scoring_functions
-      title: Benchmark
-      description: A benchmark resource for evaluating model performance.
-    ListBenchmarksResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Benchmark'
-          type: array
-          title: Data
-          description: List of benchmark objects.
-      required:
-      - data
-      title: ListBenchmarksResponse
-      description: Response containing a list of benchmark objects.
-    BenchmarkConfig:
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/ModelCandidate'
-          description: The candidate to evaluate
-        scoring_params:
-          additionalProperties:
-            oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          type: object
-          title: Scoring Params
-          description: Map between scoring function id and parameters for each scoring function you want to run
-        num_examples:
-          anyOf:
-          - type: integer
-            minimum: 1.0
-          - type: 'null'
-          description: Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated
-      required:
-      - eval_candidate
-      title: BenchmarkConfig
-      description: A benchmark configuration for evaluation.
     GreedySamplingStrategy:
+      description: Greedy sampling strategy that selects the highest probability token at each step.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'greedy' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - greedy
       title: GreedySamplingStrategy
-      description: Greedy sampling strategy that selects the highest probability token at each step.
-    ModelCandidate:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - model
-        model:
-          type: string
-          minLength: 1
-          title: Model
-          description: The model ID to evaluate
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model
-        system_message:
-          anyOf:
-          - $ref: '#/components/schemas/SystemMessage'
-            title: SystemMessage
-          - type: 'null'
-          description: The system message providing instructions or context to the model
-          title: SystemMessage
-      required:
-      - model
-      - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
     SamplingParams:
+      description: Sampling parameters for text generation.
       properties:
         strategy:
+          description: The sampling strategy to use.
+          discriminator:
+            mapping:
+              greedy: '#/components/schemas/GreedySamplingStrategy'
+              top_k: '#/components/schemas/TopKSamplingStrategy'
+              top_p: '#/components/schemas/TopPSamplingStrategy'
+            propertyName: type
           oneOf:
           - $ref: '#/components/schemas/GreedySamplingStrategy'
             title: GreedySamplingStrategy
@@ -11235,200 +9975,127 @@ components:
           - $ref: '#/components/schemas/TopKSamplingStrategy'
             title: TopKSamplingStrategy
           title: GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy
-          description: The sampling strategy to use.
-          discriminator:
-            propertyName: type
-            mapping:
-              greedy: '#/components/schemas/GreedySamplingStrategy'
-              top_k: '#/components/schemas/TopKSamplingStrategy'
-              top_p: '#/components/schemas/TopPSamplingStrategy'
         max_tokens:
           anyOf:
-          - type: integer
-            minimum: 1.0
+          - minimum: 1
+            type: integer
           - type: 'null'
           description: The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length.
+          nullable: true
         repetition_penalty:
           anyOf:
-          - type: number
-            maximum: 2.0
+          - maximum: 2.0
             minimum: -2.0
+            type: number
           - type: 'null'
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
           default: 1.0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
         stop:
           anyOf:
           - items:
               type: string
-            type: array
             maxItems: 4
+            type: array
           - type: 'null'
           description: Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+          nullable: true
       title: SamplingParams
-      description: Sampling parameters for text generation.
     SystemMessage:
+      description: A system message providing instructions or context to the model.
       properties:
         role:
-          type: string
-          title: Role
           description: Must be 'system' to identify this as a system message.
+          title: Role
+          type: string
           enum:
           - system
         content:
           anyOf:
           - type: string
-          - oneOf:
-            - $ref: '#/components/schemas/ImageContentItem-Input'
-              title: ImageContentItem-Input
-            - $ref: '#/components/schemas/TextContentItem'
-              title: TextContentItem
-            discriminator:
-              propertyName: type
+          - discriminator:
               mapping:
-                image: '#/components/schemas/ImageContentItem-Input'
+                image: '#/components/schemas/ImageContentItem'
                 text: '#/components/schemas/TextContentItem'
-            title: ImageContentItem-Input | TextContentItem
+              propertyName: type
+            oneOf:
+            - $ref: '#/components/schemas/ImageContentItem'
+              title: ImageContentItem
+            - $ref: '#/components/schemas/TextContentItem'
+              title: TextContentItem
+            title: ImageContentItem | TextContentItem
           - items:
-              oneOf:
-              - $ref: '#/components/schemas/ImageContentItem-Input'
-                title: ImageContentItem-Input
-              - $ref: '#/components/schemas/TextContentItem'
-                title: TextContentItem
               discriminator:
-                propertyName: type
                 mapping:
-                  image: '#/components/schemas/ImageContentItem-Input'
+                  image: '#/components/schemas/ImageContentItem'
                   text: '#/components/schemas/TextContentItem'
-              title: ImageContentItem-Input | TextContentItem
+                propertyName: type
+              oneOf:
+              - $ref: '#/components/schemas/ImageContentItem'
+                title: ImageContentItem
+              - $ref: '#/components/schemas/TextContentItem'
+                title: TextContentItem
+              title: ImageContentItem | TextContentItem
             type: array
-            title: list[ImageContentItem-Input | TextContentItem]
-          title: string | list[ImageContentItem-Input | TextContentItem]
+            title: list[ImageContentItem | TextContentItem]
           description: The content of the 'system prompt'. If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages.
+          title: string | list[ImageContentItem | TextContentItem]
       required:
       - content
       title: SystemMessage
-      description: A system message providing instructions or context to the model.
     TopKSamplingStrategy:
+      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'top_k' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - top_k
         top_k:
-          type: integer
-          minimum: 1.0
-          title: Top K
           description: Number of top tokens to consider for sampling. Must be at least 1.
+          minimum: 1
+          title: Top K
+          type: integer
       required:
       - top_k
       title: TopKSamplingStrategy
-      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
     TopPSamplingStrategy:
+      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'top_p' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - top_p
         temperature:
-          type: number
+          description: Controls randomness in sampling. Higher values increase randomness.
           maximum: 2.0
           title: Temperature
-          description: Controls randomness in sampling. Higher values increase randomness.
+          type: number
           minimum: 0.0
         top_p:
-          type: number
+          default: 0.95
+          description: Cumulative probability threshold for nucleus sampling.
           maximum: 1.0
           minimum: 0.0
           title: Top P
-          description: Cumulative probability threshold for nucleus sampling.
-          default: 0.95
+          type: number
       required:
       - temperature
       title: TopPSamplingStrategy
-      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
-    EvaluateRowsRequest:
-      description: Request model for evaluating a list of rows on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        input_rows:
-          description: The rows to evaluate
-          items:
-            additionalProperties: true
-            type: object
-          minItems: 1
-          title: Input Rows
-          type: array
-        scoring_functions:
-          description: The scoring functions to use for the evaluation
-          items:
-            type: string
-          minItems: 1
-          title: Scoring Functions
-          type: array
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      properties:
-        generations:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Generations
-          description: The generations from the evaluation
-        scores:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Scores
-          description: The scores from the evaluation. Each key in the dict is a scoring function name
-      required:
-      - generations
-      - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    RunEvalRequest:
-      description: Request model for running an evaluation on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - benchmark_config
-      title: RunEvalRequest
     Job:
+      description: A job execution instance with status tracking.
       properties:
         job_id:
-          type: string
           title: Job Id
+          type: string
         status:
           $ref: '#/components/schemas/JobStatus'
       required:
       - job_id
       - status
       title: Job
-      description: A job execution instance with status tracking.
     RerankRequest:
       properties:
         model:
@@ -11573,85 +10240,6 @@ components:
       - $ref: '#/components/schemas/CompletionInputType'
         title: CompletionInputType
       title: StringType | ... (9 variants)
-    RegisterScoringFunctionRequest:
-      properties:
-        scoring_fn_id:
-          type: string
-          title: Scoring Fn Id
-          description: The ID of the scoring function to register.
-        description:
-          type: string
-          title: Description
-          description: The description of the scoring function.
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the scoring function.
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        provider_scoring_fn_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider scoring function to use for the scoring function.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the scoring function.
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-      required:
-      - scoring_fn_id
-      - description
-      - return_type
-      title: RegisterScoringFunctionRequest
-      description: Request model for registering a scoring function.
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -11678,90 +10266,6 @@ components:
       - shield_id
       title: RegisterShieldRequest
       description: Request model for registering a shield.
-    DataSource:
-      discriminator:
-        mapping:
-          rows: '#/components/schemas/RowsDataSource'
-          uri: '#/components/schemas/URIDataSource'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/URIDataSource'
-        title: URIDataSource
-      - $ref: '#/components/schemas/RowsDataSource'
-        title: RowsDataSource
-      title: URIDataSource | RowsDataSource
-    RegisterDatasetRequest:
-      properties:
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: The purpose of the dataset.
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: The data source of the dataset.
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata for the dataset.
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the dataset. If not provided, an ID will be generated.
-      required:
-      - purpose
-      - source
-      title: RegisterDatasetRequest
-      description: Request model for registering a dataset.
-    RegisterBenchmarkRequest:
-      properties:
-        benchmark_id:
-          type: string
-          title: Benchmark Id
-          description: The ID of the benchmark to register.
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to use for the benchmark.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: The scoring functions to use for the benchmark.
-        provider_benchmark_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider benchmark to use for the benchmark.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the benchmark.
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata to use for the benchmark.
-      required:
-      - benchmark_id
-      - dataset_id
-      - scoring_functions
-      title: RegisterBenchmarkRequest
-      description: Request model for registering a benchmark.
     AllowedToolsFilter:
       properties:
         tool_names:
@@ -12698,13 +11202,6 @@ components:
       - model
       title: CreateResponseRequest
       description: Request model for creating a response.
-    DatasetPurpose:
-      type: string
-      enum:
-      - eval/question-answer
-      - eval/messages-answer
-      title: DatasetPurpose
-      description: Purpose of the dataset. Each purpose has a required input data schema.
     EmbeddedChunk-Input:
       properties:
         content:
@@ -12843,32 +11340,6 @@ components:
           - type: 'null'
       additionalProperties: true
       title: Errors
-    EvaluateRowsBodyRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          minItems: 1
-          title: Input Rows
-          description: The rows to evaluate
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          minItems: 1
-          title: Scoring Functions
-          description: The scoring functions to use for the evaluation
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsBodyRequest
-      description: Request body model for evaluating rows (without path parameter).
     HealthStatus:
       type: string
       enum:
@@ -12912,16 +11383,6 @@ components:
       required:
       - cached_tokens
       title: InputTokensDetails
-    JobStatus:
-      type: string
-      enum:
-      - completed
-      - in_progress
-      - failed
-      - scheduled
-      - cancelled
-      title: JobStatus
-      description: Status of a job execution.
     ListConnectorsResponse:
       properties:
         data:
@@ -13896,15 +12357,6 @@ components:
       - disabled
       title: ResponseTruncation
       description: Controls how the service truncates input when it exceeds the model context window.
-    RunEvalBodyRequest:
-      properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_config
-      title: RunEvalBodyRequest
-      description: Request body model for running an evaluation (without path parameter).
     SearchRankingOptions:
       properties:
         ranker:
@@ -14292,50 +12744,6 @@ components:
       - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
         title: OpenAIResponseContentPartReasoningText
       title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
-    ListBenchmarksRequest:
-      description: Request model for listing benchmarks.
-      properties: {}
-      title: ListBenchmarksRequest
-    GetBenchmarkRequest:
-      description: Request model for getting a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: GetBenchmarkRequest
-    UnregisterBenchmarkRequest:
-      description: Request model for unregistering a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: UnregisterBenchmarkRequest
-    GetDatasetRequest:
-      description: Request model for getting a dataset by ID.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: GetDatasetRequest
-    UnregisterDatasetRequest:
-      description: Request model for unregistering a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: UnregisterDatasetRequest
     ListModelsResponse:
       description: Response containing a list of model objects.
       properties:
@@ -14368,39 +12776,6 @@ components:
       required:
       - model_id
       title: UnregisterModelRequest
-    DialogType:
-      description: Parameter type for dialog data with semantic output labels.
-      properties:
-        type:
-          title: Type
-          type: string
-          enum:
-          - dialog
-      title: DialogType
-    ListScoringFunctionsRequest:
-      description: Request model for listing scoring functions.
-      properties: {}
-      title: ListScoringFunctionsRequest
-    GetScoringFunctionRequest:
-      description: Request model for getting a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: GetScoringFunctionRequest
-    UnregisterScoringFunctionRequest:
-      description: Request model for unregistering a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: UnregisterScoringFunctionRequest
     GetShieldRequest:
       description: Request model for getting a shield by identifier.
       properties:
@@ -14499,16 +12874,10 @@ components:
       - responses
       - batches
       - vector_io
-      - datasetio
-      - scoring
-      - eval
       - tool_runtime
       - models
       - shields
       - vector_stores
-      - datasets
-      - scoring_functions
-      - benchmarks
       - tool_groups
       - files
       - file_processors
@@ -15309,6 +13678,25 @@ components:
       required:
       - batch_id
       title: CancelBatchRequest
+    JobStatus:
+      description: Status of a job execution.
+      enum:
+      - completed
+      - in_progress
+      - failed
+      - scheduled
+      - cancelled
+      title: JobStatus
+      type: string
+    DialogType:
+      description: Parameter type for dialog data with semantic output labels.
+      properties:
+        type:
+          title: Type
+          type: string
+          enum:
+          - dialog
+      title: DialogType
     ConnectorInput:
       description: Input for creating a connector
       properties:
@@ -15542,90 +13930,6 @@ components:
       - conversation_id
       - item_id
       title: DeleteItemRequest
-    IterRowsRequest:
-      description: Request model for iterating over rows in a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-          type: string
-        start_index:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          nullable: true
-        limit:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          nullable: true
-      required:
-      - dataset_id
-      title: IterRowsRequest
-    BenchmarkIdRequest:
-      description: Request model containing benchmark_id path parameter.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark
-          minLength: 1
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: BenchmarkIdRequest
-    JobStatusRequest:
-      description: Request model for getting the status of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the status of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobStatusRequest
-    JobCancelRequest:
-      description: Request model for canceling a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to cancel
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobCancelRequest
-    JobResultRequest:
-      description: Request model for getting the result of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the result of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobResultRequest
     ProcessFileRequest:
       description: |-
         Request model for file processing operation.
diff --git a/docs/docs/advanced_apis/evaluation.md b/docs/docs/advanced_apis/evaluation.md
deleted file mode 100644
index 085916da34..0000000000
--- a/docs/docs/advanced_apis/evaluation.md
+++ /dev/null
@@ -1,170 +0,0 @@
-# Evaluation
-
-## Evaluation Concepts
-
-The Llama Stack Evaluation flow allows you to run evaluations on your GenAI application datasets or pre-registered benchmarks.
-
-We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications:
-
-- `/datasetio` + `/datasets` API
-- `/scoring` + `/scoring_functions` API
-- `/eval` + `/benchmarks` API
-
-This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
-
-The Evaluation APIs are associated with a set of Resources. Please visit the Resources section in our [Core Concepts](/docs/concepts/) guide for better high-level understanding.
-
-- **DatasetIO**: defines interface with datasets and data loaders.
-  - Associated with `Dataset` resource.
-- **Scoring**: evaluate outputs of the system.
-  - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics.
-- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
-  - Associated with `Benchmark` resource.
-
-## Evaluation Providers
-
-Llama Stack provides multiple evaluation providers:
-
-- **Builtin** (`inline::builtin`) - Meta's reference implementation with multi-language support
-- **NVIDIA** (`remote::nvidia`) - NVIDIA's evaluation platform integration
-
-### Builtin
-
-Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.
-
-#### Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `kvstore` | `RedisKVStoreConfig \| SqliteKVStoreConfig \| PostgresKVStoreConfig \| MongoDBKVStoreConfig` | No | sqlite | Key-value store configuration |
-
-#### Sample Configuration
-
-```yaml
-kvstore:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/builtin_eval.db
-```
-
-#### Features
-
-- Multi-language evaluation support
-- Comprehensive evaluation metrics
-- Integration with various key-value stores (SQLite, Redis, PostgreSQL, MongoDB)
-- Built-in support for popular benchmarks
-
-### NVIDIA
-
-NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
-
-#### Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `evaluator_url` | `str` | No | <http://0.0.0.0:7331> | The url for accessing the evaluator service |
-
-#### Sample Configuration
-
-```yaml
-evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
-```
-
-#### Features
-
-- Integration with NVIDIA's evaluation platform
-- Remote evaluation capabilities
-- Scalable evaluation processing
-
-## Open-benchmark Eval
-
-### List of open-benchmarks Llama Stack support
-
-Llama stack pre-registers several popular open-benchmarks to easily evaluate model performance via CLI.
-
-The list of open-benchmarks we currently support:
-
-- [MMLU-COT](https://arxiv.org/abs/2009.03300) (Measuring Massive Multitask Language Understanding): Benchmark designed to comprehensively evaluate the breadth and depth of a model's academic and professional understanding
-- [GPQA-COT](https://arxiv.org/abs/2311.12022) (A Graduate-Level Google-Proof Q&A Benchmark): A challenging benchmark of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry.
-- [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to access models to answer short, fact-seeking questions.
-- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models.
-
-You can follow this [contributing guide](/docs/references/evals_reference/#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack
-
-### Run evaluation on open-benchmarks via CLI
-
-We have built-in functionality to run the supported open-benchmarks using llama-stack-client CLI
-
-#### Spin up Llama Stack server
-
-Spin up llama stack server with 'open-benchmark' template
-
-```bash
-llama stack run llama_stack/distributions/open-benchmark/config.yaml
-```
-
-#### Run eval CLI
-
-There are 3 necessary inputs to run a benchmark eval
-
-- `list of benchmark_ids`: The list of benchmark ids to run evaluation on
-- `model-id`: The model id to evaluate on
-- `output_dir`: Path to store the evaluate results
-
-```bash
-llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
---model_id <model id to evaluate on> \
---output_dir <directory to store the evaluate results>
-```
-
-You can run
-
-```bash
-llama-stack-client eval run-benchmark help
-```
-
-to see the description of all the flags that eval run-benchmark has
-
-In the output log, you can find the file path that has your evaluation results. Open that file and you can see you aggregate evaluation results over there.
-
-## Usage Example
-
-Here's a basic example of using the evaluation API:
-
-```python
-from llama_stack_client import LlamaStackClient
-
-client = LlamaStackClient(base_url="http://localhost:8321")
-
-# Register a dataset for evaluation
-client.datasets.register(
-    purpose="evaluation",
-    source={
-        "type": "uri",
-        "uri": "huggingface://datasets/llamastack/evaluation_dataset",
-    },
-    dataset_id="my_eval_dataset",
-)
-
-# Run evaluation
-eval_result = client.eval.run_evaluation(
-    dataset_id="my_eval_dataset",
-    scoring_functions=["accuracy", "bleu"],
-    model_id="my_model",
-)
-
-print(f"Evaluation completed: {eval_result}")
-```
-
-## Best Practices
-
-- **Choose appropriate providers**: Use Builtin for comprehensive evaluation, NVIDIA for platform-specific needs
-- **Configure storage properly**: Ensure your key-value store configuration matches your performance requirements
-- **Monitor evaluation progress**: Large evaluations can take time - implement proper monitoring
-- **Use appropriate scoring functions**: Select scoring metrics that align with your evaluation goals
-
-## What's Next?
-
-- Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP).
-- Check out our [Building Applications - Evaluation](../building_applications/evals) guide for more details on how to use the Evaluation APIs to evaluate your applications.
-- Check out our [Evaluation Reference](/docs/references/evals_reference/) for more details on the APIs.
-- Explore the [Scoring](./scoring) documentation for available scoring functions.
diff --git a/docs/docs/advanced_apis/scoring.md b/docs/docs/advanced_apis/scoring.md
deleted file mode 100644
index df1e2460d4..0000000000
--- a/docs/docs/advanced_apis/scoring.md
+++ /dev/null
@@ -1,197 +0,0 @@
-# Scoring
-
-The Scoring API in Llama Stack allows you to evaluate outputs of your GenAI system using various scoring functions and metrics. This section covers all available scoring providers and their configuration.
-
-## Overview
-
-Llama Stack provides multiple scoring providers:
-
-- **Basic** (`inline::basic`) - Simple evaluation metrics and scoring functions
-- **Braintrust** (`inline::braintrust`) - Advanced evaluation using the Braintrust platform
-- **LLM-as-Judge** (`inline::llm-as-judge`) - Uses language models to evaluate responses
-
-The Scoring API is associated with `ScoringFunction` resources and provides a suite of out-of-the-box scoring functions. You can also add custom evaluators to meet specific evaluation needs.
-
-## Basic Scoring
-
-Basic scoring provider for simple evaluation metrics and scoring functions. This provider offers fundamental scoring capabilities without external dependencies.
-
-### Configuration
-
-No configuration required - this provider works out of the box.
-
-```yaml
-{}
-```
-
-### Features
-
-- Simple evaluation metrics (accuracy, precision, recall, F1-score)
-- String matching and similarity metrics
-- Basic statistical scoring functions
-- No external dependencies required
-- Fast execution for standard metrics
-
-### Use Cases
-
-- Quick evaluation of basic accuracy metrics
-- String similarity comparisons
-- Statistical analysis of model outputs
-- Development and testing scenarios
-
-## Braintrust
-
-Braintrust scoring provider for evaluation and scoring using the [Braintrust platform](https://braintrustdata.com/). Braintrust provides advanced evaluation capabilities and experiment tracking.
-
-### Configuration
-
-| Field | Type | Required | Default | Description |
-|-------|------|----------|---------|-------------|
-| `openai_api_key` | `str \| None` | No |  | The OpenAI API Key for LLM-powered evaluations |
-
-### Sample Configuration
-
-```yaml
-openai_api_key: ${env.OPENAI_API_KEY:=}
-```
-
-### Features
-
-- Advanced evaluation metrics
-- Experiment tracking and comparison
-- LLM-powered evaluation functions
-- Integration with Braintrust's evaluation suite
-- Detailed scoring analytics and insights
-
-### Use Cases
-
-- Production evaluation pipelines
-- A/B testing of model versions
-- Advanced scoring with custom metrics
-- Detailed evaluation reporting and analysis
-
-## LLM-as-Judge
-
-LLM-as-judge scoring provider that uses language models to evaluate and score responses. This approach leverages the reasoning capabilities of large language models to assess quality, relevance, and other subjective metrics.
-
-### Configuration
-
-No configuration required - this provider works out of the box.
-
-```yaml
-{}
-```
-
-### Features
-
-- Subjective quality evaluation using LLMs
-- Flexible evaluation criteria definition
-- Natural language evaluation explanations
-- Support for complex evaluation scenarios
-- Contextual understanding of responses
-
-### Use Cases
-
-- Evaluating response quality and relevance
-- Assessing creativity and coherence
-- Subjective metric evaluation
-- Human-like judgment for complex tasks
-
-## Usage Examples
-
-### Basic Scoring Example
-
-```python
-from llama_stack_client import LlamaStackClient
-
-client = LlamaStackClient(base_url="http://localhost:8321")
-
-# Register a basic accuracy scoring function
-client.scoring_functions.register(
-    scoring_function_id="basic_accuracy",
-    provider_id="basic",
-    provider_scoring_function_id="accuracy",
-)
-
-# Use the scoring function
-result = client.scoring.score(
-    input_rows=[
-        {"expected": "Paris", "actual": "Paris"},
-        {"expected": "London", "actual": "Paris"},
-    ],
-    scoring_function_id="basic_accuracy",
-)
-print(f"Accuracy: {result.results[0].score}")
-```
-
-### LLM-as-Judge Example
-
-```python
-# Register an LLM-as-judge scoring function
-client.scoring_functions.register(
-    scoring_function_id="quality_judge",
-    provider_id="llm_judge",
-    provider_scoring_function_id="response_quality",
-    params={
-        "criteria": "Evaluate response quality, relevance, and helpfulness",
-        "scale": "1-10",
-    },
-)
-
-# Score responses using LLM judgment
-result = client.scoring.score(
-    input_rows=[
-        {
-            "query": "What is machine learning?",
-            "response": "Machine learning is a subset of AI that enables computers to learn patterns from data...",
-        }
-    ],
-    scoring_function_id="quality_judge",
-)
-```
-
-### Braintrust Integration Example
-
-```python
-# Register a Braintrust scoring function
-client.scoring_functions.register(
-    scoring_function_id="braintrust_eval",
-    provider_id="braintrust",
-    provider_scoring_function_id="semantic_similarity",
-)
-
-# Run evaluation with Braintrust
-result = client.scoring.score(
-    input_rows=[
-        {
-            "reference": "The capital of France is Paris",
-            "candidate": "Paris is the capital city of France",
-        }
-    ],
-    scoring_function_id="braintrust_eval",
-)
-```
-
-## Best Practices
-
-- **Choose appropriate providers**: Use Basic for simple metrics, Braintrust for advanced analytics, LLM-as-Judge for subjective evaluation
-- **Define clear criteria**: When using LLM-as-Judge, provide specific evaluation criteria and scales
-- **Validate scoring functions**: Test your scoring functions with known examples before production use
-- **Monitor performance**: Track scoring performance and adjust thresholds based on results
-- **Combine multiple metrics**: Use different scoring providers together for comprehensive evaluation
-
-## Integration with Evaluation
-
-The Scoring API works closely with the [Evaluation](./evaluation) API to provide comprehensive evaluation workflows:
-
-1. **Datasets** are loaded via the DatasetIO API
-2. **Evaluation** generates model outputs using the Eval API
-3. **Scoring** evaluates the quality of outputs using various scoring functions
-4. **Results** are aggregated and reported for analysis
-
-## Next Steps
-
-- Check out the [Evaluation](./evaluation) guide for running complete evaluations
-- See the [Building Applications - Evaluation](../building_applications/evals) guide for application examples
-- Review the [Evaluation Reference](../references/evals_reference/) for comprehensive scoring function usage
-- Explore the [Evaluation Concepts](../concepts/evaluation_concepts) for detailed conceptual information
diff --git a/docs/docs/distributions/remote_hosted_distro/oci.md b/docs/docs/distributions/remote_hosted_distro/oci.md
index 93c1e35e2d..02cb69bab1 100644
--- a/docs/docs/distributions/remote_hosted_distro/oci.md
+++ b/docs/docs/distributions/remote_hosted_distro/oci.md
@@ -8,13 +8,10 @@ The `llamastack/distribution-oci` distribution consists of the following provide
 
 | API | Provider(s) |
 |-----|-------------|
-| datasetio | `remote::huggingface`, `inline::localfs` |
-| eval | `inline::builtin` |
 | files | `inline::localfs` |
 | inference | `remote::oci` |
 | responses | `inline::builtin` |
 | safety | `inline::llama-guard` |
-| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::file-search`, `remote::model-context-protocol` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
diff --git a/docs/docs/distributions/self_hosted_distro/nvidia.md b/docs/docs/distributions/self_hosted_distro/nvidia.md
index 3372b067d1..ae4cb8d5aa 100644
--- a/docs/docs/distributions/self_hosted_distro/nvidia.md
+++ b/docs/docs/distributions/self_hosted_distro/nvidia.md
@@ -8,13 +8,10 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 
 | API | Provider(s) |
 |-----|-------------|
-| datasetio | `inline::localfs`, `remote::nvidia` |
-| eval | `remote::nvidia` |
 | files | `inline::localfs` |
 | inference | `remote::nvidia` |
 | responses | `inline::builtin` |
 | safety | `remote::nvidia` |
-| scoring | `inline::basic` |
 | tool_runtime | `inline::file-search` |
 | vector_io | `inline::faiss` |
 
@@ -26,16 +23,10 @@ The following environment variables can be configured:
 
 - `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`)
 
-- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
-
-- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
-
 - `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
 
 - `NVIDIA_GUARDRAILS_CONFIG_ID`: NVIDIA Guardrail Configuration ID (default: `self-check`)
 
-- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
-
 - `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
 
 - `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
diff --git a/docs/docusaurus.config.ts b/docs/docusaurus.config.ts
index 249b7614fd..7e5e4e97ac 100644
--- a/docs/docusaurus.config.ts
+++ b/docs/docusaurus.config.ts
@@ -155,23 +155,44 @@ const config: Config = {
       ],
     },
     footer: {
-      style: 'dark',
+      style: 'light',
       links: [
         {
-          title: 'Docs',
+          title: 'Getting Started',
           items: [
             {
-              label: 'Getting Started',
+              label: 'Quickstart',
               to: '/docs/getting_started/quickstart',
             },
             {
               label: 'Concepts',
               to: '/docs/concepts',
             },
+            {
+              label: 'Distributions',
+              to: '/docs/distributions/building_distro',
+            },
+            {
+              label: 'Providers',
+              to: '/docs/providers',
+            },
+          ],
+        },
+        {
+          title: 'API',
+          items: [
             {
               label: 'API Reference',
               to: '/docs/api-overview',
             },
+            {
+              label: 'OpenAI Compatibility',
+              to: '/docs/api-openai',
+            },
+            {
+              label: 'Blog',
+              to: '/blog',
+            },
           ],
         },
         {
@@ -181,14 +202,22 @@ const config: Config = {
               label: 'Discord',
               href: 'https://discord.gg/llama-stack',
             },
+            {
+              label: 'GitHub Discussions',
+              href: 'https://github.com/llamastack/llama-stack/discussions',
+            },
             {
               label: 'Issues',
               href: 'https://github.com/llamastack/llama-stack/issues',
             },
+            {
+              label: 'Contributing',
+              to: '/docs/contributing',
+            },
           ],
         },
         {
-          title: 'More',
+          title: 'Resources',
           items: [
             {
               label: 'GitHub',
@@ -198,10 +227,14 @@ const config: Config = {
               label: 'PyPI',
               href: 'https://pypi.org/project/llama-stack/',
             },
+            {
+              label: 'Releases',
+              href: 'https://github.com/llamastack/llama-stack/releases',
+            },
           ],
         },
       ],
-      copyright: `Copyright © ${new Date().getFullYear()} Meta Platforms, Inc. Built with Docusaurus.`,
+      copyright: `Copyright © ${new Date().getFullYear()} Meta Platforms, Inc.`,
     },
     colorMode: {
       defaultMode: 'dark',
diff --git a/docs/sidebars.ts b/docs/sidebars.ts
index 8e623cf57b..8ccbcde8e9 100644
--- a/docs/sidebars.ts
+++ b/docs/sidebars.ts
@@ -197,10 +197,6 @@ const sidebars: SidebarsConfig = {
           label: 'Scoring',
           collapsed: false,
           items: [
-            'providers/scoring/index',
-            'providers/scoring/inline_basic',
-            'providers/scoring/inline_braintrust',
-            'providers/scoring/inline_llm-as-judge'
           ],
         },
         {
@@ -276,8 +272,6 @@ const sidebars: SidebarsConfig = {
       label: 'Advanced APIs',
       collapsed: false,
       items: [
-        'advanced_apis/evaluation',
-        'advanced_apis/scoring',
       ],
     },
     {
diff --git a/docs/src/css/custom.css b/docs/src/css/custom.css
index 4bd5d61084..e92731bb21 100644
--- a/docs/src/css/custom.css
+++ b/docs/src/css/custom.css
@@ -93,6 +93,7 @@ html {
   font-feature-settings: 'cv02', 'cv03', 'cv04', 'cv11';
   -webkit-font-smoothing: antialiased;
   -moz-osx-font-smoothing: grayscale;
+  text-rendering: optimizeLegibility;
 }
 
 h1, h2, h3, h4, h5, h6 {
@@ -103,10 +104,19 @@ h1, h2, h3, h4, h5, h6 {
 
 h1 {
   font-size: 2.2rem;
+  line-height: 1.2;
 }
 
 h2 {
   font-size: 1.6rem;
+  line-height: 1.3;
+  margin-top: 2.5rem;
+}
+
+h3 {
+  font-size: 1.25rem;
+  line-height: 1.35;
+  margin-top: 2rem;
 }
 
 .navbar__title {
@@ -115,23 +125,150 @@ h2 {
 
 /* ========== NAVBAR ========== */
 .navbar {
-  backdrop-filter: blur(12px);
-  -webkit-backdrop-filter: blur(12px);
-  border-bottom: 1px solid rgba(0, 0, 0, 0.05);
+  backdrop-filter: blur(16px);
+  -webkit-backdrop-filter: blur(16px);
+  border-bottom: 1px solid rgba(0, 0, 0, 0.06);
+  padding: 0.15rem 0;
+  transition: all 0.3s;
+}
+
+/* Gradient accent line at top of page */
+.navbar::before {
+  content: '';
+  position: absolute;
+  top: 0;
+  left: 0;
+  right: 0;
+  height: 2px;
+  background: linear-gradient(90deg, #6d28d9 0%, #2dd4bf 50%, #60a5fa 100%);
+  opacity: 0.8;
 }
 
 [data-theme='dark'] .navbar {
-  border-bottom: 1px solid rgba(255, 255, 255, 0.05);
+  border-bottom: 1px solid rgba(255, 255, 255, 0.06);
+}
+
+/* Logo styling */
+.navbar__logo img {
+  border-radius: 6px;
+  transition: transform 0.2s;
+}
+
+.navbar__brand:hover .navbar__logo img {
+  transform: scale(1.08);
+}
+
+.navbar__brand {
+  margin-left: 0.75rem;
 }
 
 .navbar__title {
   font-weight: 700;
   font-size: 1.1rem;
+  background: linear-gradient(135deg, var(--ifm-font-color-base) 60%, var(--ifm-color-primary) 100%);
+  -webkit-background-clip: text;
+  -webkit-text-fill-color: transparent;
+  background-clip: text;
 }
 
 .navbar__link {
   font-weight: 500;
-  font-size: 0.9rem;
+  font-size: 0.88rem;
+  border-radius: 0.375rem;
+  padding: 0.4rem 0.75rem !important;
+  transition: all 0.15s;
+  position: relative;
+}
+
+.navbar__link:hover {
+  background: rgba(109, 40, 217, 0.06);
+}
+
+[data-theme='dark'] .navbar__link:hover {
+  background: rgba(167, 139, 250, 0.08);
+}
+
+.navbar__link--active {
+  font-weight: 600;
+}
+
+.navbar__link--active::after {
+  content: '';
+  position: absolute;
+  bottom: -2px;
+  left: 50%;
+  transform: translateX(-50%);
+  width: 16px;
+  height: 2px;
+  border-radius: 1px;
+  background: var(--ifm-color-primary);
+}
+
+/* GitHub link icon styling */
+.navbar__items--right .navbar__link[href*="github"] {
+  display: flex;
+  align-items: center;
+  gap: 0.4rem;
+  padding: 0.35rem 0.85rem !important;
+  border: 1px solid rgba(0, 0, 0, 0.1);
+  border-radius: 100px;
+  font-size: 0.82rem;
+  transition: all 0.2s;
+}
+
+[data-theme='dark'] .navbar__items--right .navbar__link[href*="github"] {
+  border-color: rgba(255, 255, 255, 0.1);
+}
+
+.navbar__items--right .navbar__link[href*="github"]:hover {
+  background: rgba(0, 0, 0, 0.06);
+  border-color: rgba(0, 0, 0, 0.2);
+  transform: translateY(-1px);
+}
+
+[data-theme='dark'] .navbar__items--right .navbar__link[href*="github"]:hover {
+  background: rgba(255, 255, 255, 0.08);
+  border-color: rgba(255, 255, 255, 0.2);
+}
+
+/* Dropdown menu styling */
+.dropdown__menu {
+  border-radius: 0.75rem;
+  border: 1px solid rgba(0, 0, 0, 0.08);
+  box-shadow: 0 12px 40px rgba(0, 0, 0, 0.1), 0 0 0 1px rgba(0, 0, 0, 0.04);
+  padding: 0.5rem;
+  min-width: 180px;
+}
+
+[data-theme='dark'] .dropdown__menu {
+  border-color: rgba(255, 255, 255, 0.08);
+  box-shadow: 0 12px 40px rgba(0, 0, 0, 0.4), 0 0 0 1px rgba(255, 255, 255, 0.04);
+  background: #1a1a2e;
+}
+
+.dropdown__link {
+  border-radius: 0.375rem;
+  padding: 0.5rem 0.75rem;
+  font-size: 0.85rem;
+  transition: all 0.15s;
+}
+
+.dropdown__link:hover {
+  background: rgba(109, 40, 217, 0.06);
+}
+
+[data-theme='dark'] .dropdown__link:hover {
+  background: rgba(167, 139, 250, 0.1);
+}
+
+.dropdown__link--active {
+  background: rgba(109, 40, 217, 0.08);
+  font-weight: 600;
+  color: var(--ifm-color-primary);
+}
+
+[data-theme='dark'] .dropdown__link--active {
+  background: rgba(167, 139, 250, 0.12);
 }
 
 /* ========== SIDEBAR ========== */
@@ -334,12 +471,94 @@ div[class*='expandButton'] {
 /* ========== CONTENT ========== */
 .markdown {
   line-height: 1.75;
+  font-size: 0.95rem;
+  color: var(--ifm-font-color-base);
 }
 
 .markdown > p {
   margin-bottom: 1.25rem;
 }
 
+/* Stronger visual hierarchy for headings in docs */
+.markdown h1 {
+  padding-bottom: 0.5rem;
+  border-bottom: 1px solid rgba(0, 0, 0, 0.06);
+  margin-bottom: 1.5rem;
+}
+
+[data-theme='dark'] .markdown h1 {
+  border-bottom-color: rgba(255, 255, 255, 0.06);
+}
+
+.markdown h2 {
+  padding-bottom: 0.35rem;
+  border-bottom: 1px solid rgba(0, 0, 0, 0.04);
+  margin-bottom: 1rem;
+}
+
+[data-theme='dark'] .markdown h2 {
+  border-bottom-color: rgba(255, 255, 255, 0.04);
+}
+
+/* Lists */
+.markdown ul, .markdown ol {
+}
+
+.markdown li {
+  margin-bottom: 0.35rem;
+}
+
+.markdown li > p {
+  margin-bottom: 0.5rem;
+}
+
+/* Inline code in docs */
+.markdown code {
+  background: rgba(109, 40, 217, 0.06);
+  color: var(--ifm-color-primary-dark);
+  border: 1px solid rgba(109, 40, 217, 0.08);
+}
+
+[data-theme='dark'] .markdown code {
+  background: rgba(167, 139, 250, 0.1);
+  color: var(--ifm-color-primary-light);
+  border-color: rgba(167, 139, 250, 0.12);
+}
+
+/* Don't style code inside code blocks */
+.markdown pre code {
+  background: none;
+  color: inherit;
+  border: none;
+  padding: 0;
+}
+
+/* Blockquotes */
+.markdown blockquote {
+  border-left: 3px solid var(--ifm-color-primary-lighter);
+  background: rgba(109, 40, 217, 0.03);
+  border-radius: 0 0.5rem 0.5rem 0;
+  padding: 0.75rem 1.25rem;
+  margin: 1.5rem 0;
+}
+
+[data-theme='dark'] .markdown blockquote {
+  background: rgba(167, 139, 250, 0.04);
+  border-left-color: rgba(167, 139, 250, 0.3);
+}
+
+/* Horizontal rules */
+.markdown hr {
+  border: none;
+  height: 1px;
+  background: linear-gradient(90deg, transparent 0%, rgba(109, 40, 217, 0.15) 50%, transparent 100%);
+  margin: 2.5rem 0;
+}
+
+[data-theme='dark'] .markdown hr {
+  background: linear-gradient(90deg, transparent 0%, rgba(167, 139, 250, 0.15) 50%, transparent 100%);
+}
+
 /* Better table styling */
 table {
   display: table;
@@ -415,21 +634,417 @@ code {
   font-weight: 500;
 }
 
-pre code {
-  font-size: 0.85rem;
-  line-height: 1.6;
-}
-
+/* Flatten code block backgrounds to a single layer */
 .theme-code-block {
   border: 1px solid rgba(0, 0, 0, 0.06);
-  border-radius: 1rem;
+  border-radius: 0.75rem;
   overflow: hidden;
-  box-shadow: 0 4px 24px rgba(0, 0, 0, 0.06);
+  box-shadow: 0 2px 12px rgba(0, 0, 0, 0.06);
+  background: var(--ifm-pre-background) !important;
 }
 
 [data-theme='dark'] .theme-code-block {
   border-color: rgba(255, 255, 255, 0.06);
-  box-shadow: 0 4px 24px rgba(0, 0, 0, 0.3);
+  box-shadow: 0 2px 12px rgba(0, 0, 0, 0.3);
+}
+
+/* Title bar (filename label) */
+.theme-code-block div[class*='codeBlockTitle'] {
+  background: transparent !important;
+  border-bottom: 1px solid rgba(255, 255, 255, 0.06);
+  padding: 0.5rem 1.25rem;
+  font-size: 0.8rem;
+}
+
+/* The pre inside code blocks */
+.theme-code-block pre {
+  background: transparent !important;
+  padding: 1.25rem 1.5rem !important;
+  margin: 0;
+  border-radius: 0;
+}
+
+.theme-code-block pre code {
+  font-size: 0.85rem;
+  line-height: 1.6;
+  padding: 0 !important;
+  background: transparent !important;
+  border: none;
+}
+
+/* Copy button */
+.theme-code-block button[class*='copyButton'] {
+  background: rgba(0, 0, 0, 0.04);
+  border: 1px solid rgba(0, 0, 0, 0.08);
+  border-radius: 0.375rem;
+  color: rgba(0, 0, 0, 0.4);
+  transition: all 0.15s;
+}
+
+.theme-code-block button[class*='copyButton']:hover {
+  background: rgba(0, 0, 0, 0.08);
+  color: rgba(0, 0, 0, 0.7);
+}
+
+[data-theme='dark'] .theme-code-block button[class*='copyButton'] {
+  background: rgba(255, 255, 255, 0.05);
+  border-color: rgba(255, 255, 255, 0.08);
+  color: rgba(255, 255, 255, 0.5);
+}
+
+[data-theme='dark'] .theme-code-block button[class*='copyButton']:hover {
+  background: rgba(255, 255, 255, 0.1);
+  color: rgba(255, 255, 255, 0.8);
+}
+
+.theme-code-block div[class*='codeBlockTitle'] {
+  color: #64748b;
+  border-bottom-color: rgba(0, 0, 0, 0.06); /* a translucent-white divider is invisible on the light block background */
+}
+[data-theme='dark'] .theme-code-block div[class*='codeBlockTitle'] {
+  color: rgba(255, 255, 255, 0.5);
+  border-bottom-color: rgba(255, 255, 255, 0.06); /* keep the light divider on the dark background */
+}
+
+/* ========== LIGHT CODE BLOCKS (global) ========== */
+/* Light mode: all code blocks get light background */
+.theme-code-block {
+  background: #f5f5f7 !important;
+  border-color: rgba(0, 0, 0, 0.08) !important;
+}
+
+.theme-code-block pre {
+  background: transparent !important;
+}
+
+/* Also catch pre elements used by OpenAPI plugin directly */
+pre {
+  background: #f5f5f7 !important;
+  color: #383a42 !important;
+}
+
+/* Override Prism token colors for light background */
+.token.comment,
+.token.prolog { color: #6a737d !important; }
+.token.keyword { color: #a626a4 !important; }
+.token.string,
+.token.attr-value { color: #50a14f !important; }
+.token.function { color: #4078f2 !important; }
+.token.class-name { color: #c18401 !important; }
+.token.number { color: #986801 !important; }
+.token.operator { color: #0184bc !important; }
+.token.punctuation { color: #383a42 !important; }
+.token.property { color: #e45649 !important; }
+.token.builtin { color: #c18401 !important; }
+.token.boolean { color: #986801 !important; }
+.token.plain { color: #383a42 !important; }
+code { color: #383a42; }
+
+/* Dark mode: revert everything to oneDark defaults */
+[data-theme='dark'] .theme-code-block {
+  background: var(--ifm-pre-background) !important;
+  border-color: rgba(255, 255, 255, 0.06) !important;
+}
+
+[data-theme='dark'] pre {
+  background: var(--ifm-pre-background) !important;
+  color: var(--ifm-pre-color) !important;
+}
+
+[data-theme='dark'] code {
+  color: #cdd6f4;
+}
+
+/* Prism theme colors are applied as inline styles; `unset !important` would beat them */
+/* and flatten highlighting, so restate the One Dark palette (same values as the OpenAPI rules). */
+[data-theme='dark'] .token.comment,
+[data-theme='dark'] .token.prolog { color: #5c6370 !important; }
+[data-theme='dark'] .token.keyword { color: #c678dd !important; }
+[data-theme='dark'] .token.string,
+[data-theme='dark'] .token.attr-value { color: #98c379 !important; }
+[data-theme='dark'] .token.function { color: #61afef !important; }
+[data-theme='dark'] .token.class-name { color: #e5c07b !important; }
+[data-theme='dark'] .token.number { color: #d19a66 !important; }
+[data-theme='dark'] .token.operator { color: #56b6c2 !important; }
+[data-theme='dark'] .token.punctuation { color: #abb2bf !important; }
+[data-theme='dark'] .token.property { color: #e06c75 !important; }
+[data-theme='dark'] .token.builtin { color: #e5c07b !important; }
+[data-theme='dark'] .token.boolean { color: #d19a66 !important; }
+[data-theme='dark'] .token.plain { color: inherit !important; }
+
+/* ========== OPENAPI DOC PAGES ========== */
+
+/* Code samples container (Python/curl tabs + code) */
+.openapi-code__code-samples-container {
+  background: #f5f5f7 !important;
+  border: 1px solid rgba(0, 0, 0, 0.08) !important;
+  border-radius: 0.75rem !important;
+}
+
+[data-theme='dark'] .openapi-code__code-samples-container {
+  background: var(--ifm-pre-background) !important;
+  border-color: rgba(255, 255, 255, 0.06) !important;
+}
+
+/* Code block wrapper inside code samples and snippets */
+.openapi-explorer__code-block {
+  background: #f5f5f7 !important;
+}
+
+[data-theme='dark'] .openapi-explorer__code-block {
+  background: var(--ifm-pre-background) !important;
+}
+
+.openapi-explorer__code-block-content {
+  background: transparent !important;
+}
+
+.openapi-explorer__code-block-title {
+  color: #64748b !important;
+  background: transparent !important;
+  border-bottom: 1px solid rgba(0, 0, 0, 0.06) !important;
+}
+
+[data-theme='dark'] .openapi-explorer__code-block-title {
+  color: #94a3b8 !important;
+  border-bottom-color: rgba(255, 255, 255, 0.06) !important;
+}
+
+/* Hide line numbers in API code samples */
+.openapi-explorer__code-block-code-line-number {
+  display: none !important;
+}
+
+.openapi-explorer__code-block-code-line-content {
+  color: #383a42 !important;
+}
+
+.openapi-explorer__code-block-code-line-content .token.keyword { color: #a626a4 !important; }
+.openapi-explorer__code-block-code-line-content .token.string { color: #50a14f !important; }
+.openapi-explorer__code-block-code-line-content .token.function { color: #4078f2 !important; }
+.openapi-explorer__code-block-code-line-content .token.class-name { color: #c18401 !important; }
+.openapi-explorer__code-block-code-line-content .token.operator { color: #0184bc !important; }
+.openapi-explorer__code-block-code-line-content .token.punctuation { color: #383a42 !important; }
+.openapi-explorer__code-block-code-line-content .token.builtin { color: #c18401 !important; }
+.openapi-explorer__code-block-code-line-content .token.comment { color: #6a737d !important; }
+.openapi-explorer__code-block-code-line-content .token.number { color: #986801 !important; }
+.openapi-explorer__code-block-code-line-content .token.property { color: #e45649 !important; }
+
+[data-theme='dark'] .openapi-explorer__code-block-code-line-content {
+  color: #cdd6f4 !important;
+}
+
+[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.keyword { color: #c678dd !important; }
+[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.string { color: #98c379 !important; }
+[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.function { color: #61afef !important; }
+[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.class-name { color: #e5c07b !important; }
+[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.operator { color: #56b6c2 !important; }
+[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.punctuation { color: #abb2bf !important; }
+[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.builtin { color: #e5c07b !important; }
+[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.comment { color: #5c6370 !important; }
+[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.number { color: #d19a66 !important; }
+[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.property { color: #e06c75 !important; }
+
+/* Outermost code snippets wrapper */
+.openapi-tabs__code-container,
+.openapi-tabs__code-container-inner,
+.openapi-tabs__code-content,
+.openapi-tabs__code-list-container {
+  background: #f5f5f7 !important;
+  color: #383a42 !important;
+}
+
+[data-theme='dark'] .openapi-tabs__code-container,
+[data-theme='dark'] .openapi-tabs__code-container-inner,
+[data-theme='dark'] .openapi-tabs__code-content,
+[data-theme='dark'] .openapi-tabs__code-list-container {
+  background: var(--ifm-pre-background) !important;
+  color: #cdd6f4 !important;
+}
+
+/* Tab items (PYTHON, CURL labels) */
+.openapi-tabs__code-item--python,
+.openapi-tabs__code-item--curl,
+[class*='openapi-tabs__code-item'] {
+  color: #64748b !important;
+  border-color: rgba(0, 0, 0, 0.1) !important;
+}
+
+[class*='openapi-tabs__code-item'][aria-selected='true'] {
+  border-color: var(--ifm-color-primary) !important;
+  color: #1e293b !important;
+}
+
+[data-theme='dark'] [class*='openapi-tabs__code-item'] {
+  color: #94a3b8 !important;
+  border-color: rgba(255, 255, 255, 0.1) !important;
+}
+
+[data-theme='dark'] [class*='openapi-tabs__code-item'][aria-selected='true'] {
+  color: #e2e8f0 !important;
+  border-color: var(--ifm-color-primary) !important;
+}
+
+/* Variant tabs (OPENAI label) */
+[class*='openapi-tabs__code-item--variant'],
+[class*='openapi-tabs__code-item--sample'] {
+  color: #64748b !important;
+  border-color: rgba(0, 0, 0, 0.1) !important;
+}
+
+[data-theme='dark'] [class*='openapi-tabs__code-item--variant'],
+[data-theme='dark'] [class*='openapi-tabs__code-item--sample'] {
+  color: #94a3b8 !important;
+  border-color: rgba(255, 255, 255, 0.1) !important;
+}
+
+/* Request form panel */
+.openapi-explorer__request-form {
+  background: #f5f5f7 !important;
+  border: 1px solid rgba(0, 0, 0, 0.08) !important;
+  border-radius: 0.75rem !important;
+  color: #1e293b !important;
+}
+
+[data-theme='dark'] .openapi-explorer__request-form {
+  background: var(--ifm-pre-background) !important;
+  border-color: rgba(255, 255, 255, 0.06) !important;
+  color: #e2e8f0 !important;
+}
+
+/* Request/response headers */
+.openapi-explorer__request-title,
+.openapi-explorer__response-title {
+  color: #1e293b !important;
+}
+
+[data-theme='dark'] .openapi-explorer__request-title,
+[data-theme='dark'] .openapi-explorer__response-title {
+  color: #e2e8f0 !important;
+}
+
+.openapi-explorer__request-header-container,
+.openapi-explorer__response-title-container {
+  background: rgba(0, 0, 0, 0.03) !important;
+  border-bottom: 1px solid rgba(0, 0, 0, 0.06) !important;
+}
+
+[data-theme='dark'] .openapi-explorer__request-header-container,
+[data-theme='dark'] .openapi-explorer__response-title-container {
+  background: rgba(255, 255, 255, 0.03) !important;
+  border-bottom-color: rgba(255, 255, 255, 0.06) !important;
+}
+
+/* Response container */
+.openapi-explorer__response-container {
+  background: #f5f5f7 !important;
+  border: 1px solid rgba(0, 0, 0, 0.08) !important;
+  border-radius: 0.75rem !important;
+  color: #1e293b !important;
+}
+
+[data-theme='dark'] .openapi-explorer__response-container {
+  background: var(--ifm-pre-background) !important;
+  border-color: rgba(255, 255, 255, 0.06) !important;
+  color: #e2e8f0 !important;
+}
+
+/* Response placeholder text */
+.openapi-explorer__response-placeholder-message {
+  color: #64748b !important;
+}
+
+[data-theme='dark'] .openapi-explorer__response-placeholder-message {
+  color: #94a3b8 !important;
+}
+
+/* Server URL display */
+.openapi-explorer__server-url {
+  color: #6d28d9 !important;
+}
+
+[data-theme='dark'] .openapi-explorer__server-url {
+  color: #a78bfa !important;
+}
+
+/* Details/collapsible sections */
+.openapi-explorer__details-summary {
+  color: #1e293b !important;
+}
+
+[data-theme='dark'] .openapi-explorer__details-summary {
+  color: #e2e8f0 !important;
+}
+
+/* Form labels and inputs */
+.openapi-explorer__form-item-label {
+  color: #1e293b !important;
+}
+
+[data-theme='dark'] .openapi-explorer__form-item-label {
+  color: #e2e8f0 !important;
+}
+
+/* Send API request button */
+.openapi-explorer__request-btn {
+  border-radius: 0.5rem !important;
+}
+
+/* Response clear button */
+.openapi-explorer__response-clear-btn {
+  color: #64748b !important;
+}
+
+[data-theme='dark'] .openapi-explorer__response-clear-btn {
+  color: #94a3b8 !important;
+}
+
+/* Tabs in API pages (operation tabs, schema tabs) */
+.openapi-tabs__container,
+.openapi-tabs__operation-container,
+.openapi-tabs__response-container,
+.openapi-tabs__schema-container {
+  background: transparent !important;
+}
+
+/* Method + URL box (GET http://...) */
+.openapi__method-endpoint {
+  background: #f5f5f7 !important;
+  border: 1px solid rgba(0, 0, 0, 0.08) !important;
+  border-radius: 0.5rem !important;
+  display: flex !important;
+  align-items: center !important;
+  gap: 0.75rem !important;
+  padding: 0.6rem 1rem !important;
+  margin: 0 0 1rem !important;
+  overflow-x: auto;
+}
+
+[data-theme='dark'] .openapi__method-endpoint {
+  background: var(--ifm-pre-background) !important;
+  border-color: rgba(255, 255, 255, 0.06) !important;
+}
+
+.openapi__method-endpoint .badge {
+  flex-shrink: 0;
+  font-size: 0.7rem !important;
+  padding: 0.2rem 0.5rem !important;
+  line-height: 1.2 !important;
+}
+
+.openapi__method-endpoint-path {
+  color: #383a42 !important;
+  font-size: 0.85rem !important;
+  font-weight: 500 !important;
+  margin: 0 !important;
+  padding: 0 !important;
+  border: none !important;
+  line-height: 1.4 !important;
+  white-space: nowrap;
+}
+
+[data-theme='dark'] .openapi__method-endpoint-path {
+  color: #cdd6f4 !important;
 }
 
 /* ========== ADMONITIONS ========== */
@@ -446,27 +1061,107 @@ pre code {
 }
 
 /* ========== FOOTER ========== */
-.footer--dark {
-  background: #0c0c14;
-  --ifm-footer-link-color: #94a3b8;
-  --ifm-footer-title-color: #e2e8f0;
+.footer {
+  position: relative;
+  padding-top: 4rem !important;
+  padding-bottom: 2rem !important;
+  background: #f8f8fa;
 }
 
-.footer--dark .footer__link-item {
-  color: #94a3b8;
-  font-size: 0.875rem;
+[data-theme='dark'] .footer {
+  background: #0a0a12;
 }
 
-.footer--dark .footer__link-item:hover {
-  color: #e2e8f0;
+/* Gradient separator line at top of footer */
+.footer::before {
+  content: '';
+  position: absolute;
+  top: 0;
+  left: 0;
+  right: 0;
+  height: 1px;
+  background: linear-gradient(90deg, transparent 0%, rgba(109, 40, 217, 0.3) 20%, rgba(45, 212, 191, 0.2) 50%, rgba(96, 165, 250, 0.3) 80%, transparent 100%);
+}
+
+/* Subtle glow behind footer top */
+.footer::after {
+  content: '';
+  position: absolute;
+  top: 0;
+  left: 50%;
+  transform: translateX(-50%);
+  width: 60%;
+  height: 120px;
+  background: radial-gradient(ellipse at center top, rgba(109, 40, 217, 0.04) 0%, transparent 70%);
+  pointer-events: none;
+}
+
+[data-theme='dark'] .footer::after {
+  background: radial-gradient(ellipse at center top, rgba(109, 40, 217, 0.06) 0%, transparent 70%);
+}
+
+.footer .footer__link-item {
+  color: #64748b;
+  font-size: 0.85rem;
+  transition: all 0.15s;
+  padding: 0.2rem 0;
+  display: inline-block;
 }
 
-.footer--dark .footer__title {
+.footer .footer__link-item:hover {
+  color: #1e293b;
+  transform: translateX(2px);
+}
+
+[data-theme='dark'] .footer .footer__link-item:hover {
   color: #e2e8f0;
-  font-weight: 600;
-  font-size: 0.8rem;
+}
+
+.footer .footer__title {
+  color: #1e293b;
+  font-weight: 700;
+  font-size: 0.78rem;
   text-transform: uppercase;
-  letter-spacing: 0.05em;
+  letter-spacing: 0.08em;
+  margin-bottom: 1rem;
+  position: relative;
+  padding-bottom: 0.6rem;
+}
+
+[data-theme='dark'] .footer .footer__title {
+  color: #f1f5f9;
+}
+
+.footer .footer__title::after {
+  content: '';
+  position: absolute;
+  bottom: 0;
+  left: 0;
+  width: 20px;
+  height: 2px;
+  border-radius: 1px;
+  background: linear-gradient(90deg, #6d28d9, #2dd4bf);
+}
+
+.footer .footer__col {
+  padding: 0 1rem;
+}
+
+/* Footer copyright area */
+.footer .footer__bottom {
+  margin-top: 3rem;
+  padding-top: 1.5rem;
+  border-top: 1px solid rgba(0, 0, 0, 0.06);
+}
+
+[data-theme='dark'] .footer .footer__bottom {
+  border-top-color: rgba(255, 255, 255, 0.06);
+}
+
+.footer .footer__copyright {
+  font-size: 0.8rem;
+  color: #94a3b8;
+  text-align: center;
 }
 
 /* ========== PAGINATION ========== */
@@ -744,4 +1439,8 @@ footer.row .col {
   border-bottom-color: rgba(255, 255, 255, 0.06);
 }
 
-/* Emoji icons on doc cards are removed via swizzled DocCard component */
+/* ========== DOC CARDS (API listing pages) ========== */
+/* Reduce excessive internal padding */
+.card.padding--lg {
+  padding: 0 0.75rem !important;
+}
diff --git a/docs/src/pages/index.js b/docs/src/pages/index.js
index f6ff05ff08..195e8e3bb0 100644
--- a/docs/src/pages/index.js
+++ b/docs/src/pages/index.js
@@ -26,14 +26,81 @@ const Icons = {
       <path d="M12 20h9"/><path d="M16.5 3.5a2.121 2.121 0 013 3L7 19l-4 1 1-4L16.5 3.5z"/>
     </svg>
   ),
+  chat: (
+    <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
+      <path d="M21 15a2 2 0 01-2 2H7l-4 4V5a2 2 0 012-2h14a2 2 0 012 2z"/>
+    </svg>
+  ),
+  zap: (
+    <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
+      <polygon points="13 2 3 14 12 14 11 22 21 10 12 10 13 2"/>
+    </svg>
+  ),
+  layers: (
+    <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
+      <polygon points="12 2 2 7 12 12 22 7 12 2"/><polyline points="2 17 12 22 22 17"/><polyline points="2 12 12 17 22 12"/>
+    </svg>
+  ),
+  database: (
+    <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
+      <ellipse cx="12" cy="5" rx="9" ry="3"/><path d="M21 12c0 1.66-4 3-9 3s-9-1.34-9-3"/><path d="M3 5v14c0 1.66 4 3 9 3s9-1.34 9-3V5"/>
+    </svg>
+  ),
+  file: (
+    <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
+      <path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><polyline points="14 2 14 8 20 8"/>
+    </svg>
+  ),
+  cpu: (
+    <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
+      <rect x="4" y="4" width="16" height="16" rx="2" ry="2"/><rect x="9" y="9" width="6" height="6"/><line x1="9" y1="1" x2="9" y2="4"/><line x1="15" y1="1" x2="15" y2="4"/><line x1="9" y1="20" x2="9" y2="23"/><line x1="15" y1="20" x2="15" y2="23"/><line x1="20" y1="9" x2="23" y2="9"/><line x1="20" y1="14" x2="23" y2="14"/><line x1="1" y1="9" x2="4" y2="9"/><line x1="1" y1="14" x2="4" y2="14"/>
+    </svg>
+  ),
+  shield: (
+    <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
+      <path d="M12 22s8-4 8-10V5l-8-3-8 3v7c0 6 8 10 8 10z"/>
+    </svg>
+  ),
+  message: (
+    <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
+      <path d="M4 4h16c1.1 0 2 .9 2 2v12c0 1.1-.9 2-2 2H4c-1.1 0-2-.9-2-2V6c0-1.1.9-2 2-2z"/><polyline points="22,6 12,13 2,6"/>
+    </svg>
+  ),
+  conversation: (
+    <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
+      <path d="M21 11.5a8.38 8.38 0 01-.9 3.8 8.5 8.5 0 01-7.6 4.7 8.38 8.38 0 01-3.8-.9L3 21l1.9-5.7a8.38 8.38 0 01-.9-3.8 8.5 8.5 0 014.7-7.6 8.38 8.38 0 013.8-.9h.5a8.48 8.48 0 018 8v.5z"/>
+    </svg>
+  ),
+  plug: (
+    <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
+      <path d="M12 22v-5"/><path d="M9 8V2"/><path d="M15 8V2"/><path d="M18 8v5a6 6 0 01-12 0V8z"/>
+    </svg>
+  ),
+  stack: (
+    <svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
+      <rect x="4" y="2" width="16" height="6" rx="1"/><rect x="4" y="10" width="16" height="6" rx="1"/><rect x="4" y="18" width="16" height="4" rx="1"/>
+    </svg>
+  ),
+  arrow: (
+    <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round">
+      <line x1="5" y1="12" x2="19" y2="12"/><polyline points="12 5 19 12 12 19"/>
+    </svg>
+  ),
 };
 
+const FEATURE_ICONS = [Icons.chat, Icons.zap, Icons.layers, Icons.database, Icons.shield, Icons.message, Icons.conversation, Icons.plug, Icons.file, Icons.stack, Icons.cpu];
+
 const FEATURES = [
   { label: 'Chat Completions', path: '/v1/chat/completions', desc: 'Standard OpenAI-compatible chat and completion endpoints' },
   { label: 'Responses API', path: '/v1/responses', desc: 'Server-side agentic orchestration with tool calling and MCP' },
   { label: 'Embeddings', path: '/v1/embeddings', desc: 'Text embeddings from any provider' },
   { label: 'Vector Stores', path: '/v1/vector_stores', desc: 'Managed document storage and semantic search' },
-  { label: 'Files & Batches', path: '/v1/files', desc: 'File upload, processing, and batch operations' },
+  { label: 'Moderations', path: '/v1/moderations', desc: 'Content moderation and safety with configurable shields' },
+  { label: 'Messages API', path: '/v1/messages', desc: 'Native Anthropic Messages API support' },
+  { label: 'Conversations', path: '/v1/conversations', desc: 'Multi-turn conversation state management and history' },
+  { label: 'Connectors', path: '/v1/connectors', desc: 'External connectors like MCP servers and tool integrations' },
+  { label: 'Files', path: '/v1/files', desc: 'File upload, processing, and content extraction' },
+  { label: 'Batches', path: '/v1/batches', desc: 'Async batch processing for large-scale workloads' },
   { label: 'Models', path: '/v1/models', desc: 'Model discovery and management' },
 ];
 
@@ -120,6 +187,9 @@ function CodeTabs() {
   const [active, setActive] = useState(0);
   return (
     <div className={styles.codeBlock}>
+      <div className={styles.codeWindowDots}>
+        <span /><span /><span />
+      </div>
       <div className={styles.codeTabs}>
         {CODE_EXAMPLES.map((ex, i) => (
           <button
@@ -139,10 +209,14 @@ function CodeTabs() {
 function Hero() {
   return (
     <section className={styles.hero}>
-      <div className={styles.heroGlow} />
+      <div className={styles.heroMesh} />
+      <div className={styles.heroGrid} />
       <div className="container">
         <div className={styles.heroInner}>
-          <div className={styles.badge}>OpenAI-Compatible API Server</div>
+          <div className={styles.badge}>
+            <span className={styles.badgeDot} />
+            OpenAI-Compatible API Server
+          </div>
           <h1 className={styles.title}>
             Build AI apps with<br />
             <span className={styles.gradient}>any model, anywhere</span>
@@ -153,7 +227,9 @@ function Hero() {
           </p>
           <InstallBlock />
           <div className={styles.actions}>
-            <Link className={styles.primaryBtn} to="/docs/getting_started/quickstart">Get Started</Link>
+            <Link className={styles.primaryBtn} to="/docs/getting_started/quickstart">
+              Get Started <span className={styles.btnArrow}>{Icons.arrow}</span>
+            </Link>
             <Link className={styles.secondaryBtn} to="/docs/api-openai">API Reference</Link>
             <a className={styles.ghostBtn} href="https://github.com/llamastack/llama-stack" target="_blank" rel="noopener noreferrer">{Icons.github} GitHub</a>
           </div>
@@ -171,15 +247,44 @@ const PROVIDER_NAMES = [
 ];
 
 function ProviderStrip() {
+  const doubled = [...PROVIDER_NAMES, ...PROVIDER_NAMES]; // 2nd copy only fills the marquee loop; it is aria-hidden below
   return (
     <section className={styles.providerStrip}>
       <div className="container">
         <p className={styles.stripLabel}>Works with</p>
-        <div className={styles.stripLogos}>
-          {PROVIDER_NAMES.map(name => (
-            <span key={name} className={styles.stripItem}>{name}</span>
+      </div>
+      <div className={styles.marqueeWrap}>
+        <div className={styles.marqueeTrack}>
+          {doubled.map((name, i) => (
+            <span key={`${name}-${i}`} className={styles.stripItem} aria-hidden={i >= PROVIDER_NAMES.length ? 'true' : undefined}>{name}</span>
+          ))}
+        </div>
+      </div>
+      <div className={styles.stripMoreWrap}>
+        <Link to="/docs/providers" className={styles.stripMore}>See all providers</Link>
+      </div>
+    </section>
+  );
+}
+
+const STATS = [
+  { value: '20+', label: 'Inference Providers' },
+  { value: '11+', label: 'API Endpoints' },
+  { value: '4', label: 'Client Languages' },
+  { value: '100%', label: 'OpenAI Compatible' },
+];
+
+function StatsRibbon() {
+  return (
+    <section className={styles.stats}>
+      <div className="container">
+        <div className={styles.statsGrid}>
+          {STATS.map(s => (
+            <div key={s.label} className={styles.statItem}>
+              <span className={styles.statValue}>{s.value}</span>
+              <span className={styles.statLabel}>{s.label}</span>
+            </div>
           ))}
-          <Link to="/docs/providers" className={styles.stripMore}>and more</Link>
         </div>
       </div>
     </section>
@@ -190,11 +295,18 @@ function Endpoints() {
   return (
     <section className={styles.endpoints}>
       <div className="container">
-        <div className={styles.sectionHead}><h2>OpenAI-compatible endpoints</h2><p>Use any OpenAI client library. Zero code changes.</p></div>
+        <div className={styles.sectionHead}>
+          <span className={styles.sectionTag}>API Surface</span>
+          <h2>OpenAI-compatible endpoints</h2>
+          <p>Use any OpenAI client library. Zero code changes.</p>
+        </div>
         <div className={styles.grid}>
-          {FEATURES.map(f => (
+          {FEATURES.map((f, i) => (
             <div key={f.path} className={styles.card}>
-              <code className={styles.path}>{f.path}</code>
+              <div className={styles.cardTop}>
+                <div className={styles.cardIcon}>{FEATURE_ICONS[i]}</div>
+                <code className={styles.path}>{f.path}</code>
+              </div>
               <h3>{f.label}</h3>
               <p>{f.desc}</p>
             </div>
@@ -209,8 +321,15 @@ function Architecture() {
   return (
     <section className={styles.arch}>
       <div className="container">
-        <div className={styles.sectionHead}><h2>How it works</h2><p>One API surface, pluggable providers, deploy anywhere</p></div>
-        <div className={styles.archImg}><img src="/img/architecture-animated.svg" alt="Llama Stack Architecture" loading="lazy" /></div>
+        <div className={styles.sectionHead}>
+          <span className={styles.sectionTag}>Architecture</span>
+          <h2>How it works</h2>
+          <p>One API surface, pluggable providers, deploy anywhere</p>
+        </div>
+        <div className={styles.archImg}>
+          <div className={styles.archGlow} />
+          <img src="/img/architecture-animated.svg" alt="Llama Stack Architecture" loading="lazy" />
+        </div>
       </div>
     </section>
   );
@@ -220,7 +339,11 @@ function ProviderSection() {
   return (
     <section className={styles.providers}>
       <div className="container">
-        <div className={styles.sectionHead}><h2>Plug in any provider</h2><p>Develop locally with Ollama, deploy to production with vLLM or a managed service</p></div>
+        <div className={styles.sectionHead}>
+          <span className={styles.sectionTag}>Providers</span>
+          <h2>Plug in any provider</h2>
+          <p>Develop locally with Ollama, deploy to production with vLLM or a managed service</p>
+        </div>
         <div className={styles.providerCols}>
           {Object.entries(PROVIDERS).map(([cat, items]) => (
             <div key={cat} className={styles.providerCol}>
@@ -240,6 +363,7 @@ function Community() {
     <section className={styles.community}>
       <div className="container">
         <div className={styles.communityInner}>
+          <span className={styles.sectionTag}>Community</span>
           <h2>Open source. Community driven.</h2>
           <p>Join thousands of developers building with Llama Stack</p>
           <div className={styles.links}>
@@ -260,6 +384,7 @@ export default function Home() {
       <main>
         <Hero />
         <ProviderStrip />
+        <StatsRibbon />
         <Endpoints />
         <Architecture />
         <ProviderSection />
diff --git a/docs/src/pages/index.module.css b/docs/src/pages/index.module.css
index 7bc42f5ad0..45f784d47d 100644
--- a/docs/src/pages/index.module.css
+++ b/docs/src/pages/index.module.css
@@ -12,27 +12,51 @@
   color: #e2e8f0;
 }
 
-.heroGlow {
+/* Animated mesh gradient background */
+.heroMesh {
   position: absolute;
-  inset: 0;
+  inset: -50%;
   background:
-    radial-gradient(ellipse 60% 50% at 20% 40%, rgba(109, 40, 217, 0.1) 0%, transparent 70%),
-    radial-gradient(ellipse 50% 40% at 80% 30%, rgba(14, 165, 233, 0.08) 0%, transparent 70%),
-    radial-gradient(ellipse 40% 50% at 60% 80%, rgba(20, 184, 166, 0.06) 0%, transparent 70%);
+    radial-gradient(ellipse 80% 60% at 20% 40%, rgba(109, 40, 217, 0.12) 0%, transparent 60%),
+    radial-gradient(ellipse 60% 50% at 80% 20%, rgba(14, 165, 233, 0.1) 0%, transparent 60%),
+    radial-gradient(ellipse 70% 60% at 50% 80%, rgba(20, 184, 166, 0.08) 0%, transparent 60%),
+    radial-gradient(ellipse 50% 40% at 70% 60%, rgba(245, 158, 11, 0.05) 0%, transparent 60%);
   pointer-events: none;
-  animation: glowShift 8s ease-in-out infinite alternate;
+  animation: meshRotate 20s ease-in-out infinite;
 }
 
-[data-theme='dark'] .heroGlow {
+[data-theme='dark'] .heroMesh {
   background:
-    radial-gradient(ellipse 60% 50% at 20% 40%, rgba(109, 40, 217, 0.2) 0%, transparent 70%),
-    radial-gradient(ellipse 50% 40% at 80% 30%, rgba(14, 165, 233, 0.15) 0%, transparent 70%),
-    radial-gradient(ellipse 40% 50% at 60% 80%, rgba(20, 184, 166, 0.1) 0%, transparent 70%);
+    radial-gradient(ellipse 80% 60% at 20% 40%, rgba(109, 40, 217, 0.25) 0%, transparent 60%),
+    radial-gradient(ellipse 60% 50% at 80% 20%, rgba(14, 165, 233, 0.2) 0%, transparent 60%),
+    radial-gradient(ellipse 70% 60% at 50% 80%, rgba(20, 184, 166, 0.15) 0%, transparent 60%),
+    radial-gradient(ellipse 50% 40% at 70% 60%, rgba(245, 158, 11, 0.08) 0%, transparent 60%);
 }
 
-@keyframes glowShift {
-  0% { opacity: 0.7; transform: scale(1); }
-  100% { opacity: 1; transform: scale(1.05); }
+@keyframes meshRotate {
+  0% { transform: rotate(0deg) scale(1); }
+  33% { transform: rotate(3deg) scale(1.02); }
+  66% { transform: rotate(-2deg) scale(0.98); }
+  100% { transform: rotate(0deg) scale(1); }
+}
+
+/* Subtle grid overlay */
+.heroGrid {
+  position: absolute;
+  inset: 0;
+  background-image:
+    linear-gradient(rgba(109, 40, 217, 0.03) 1px, transparent 1px),
+    linear-gradient(90deg, rgba(109, 40, 217, 0.03) 1px, transparent 1px);
+  background-size: 60px 60px;
+  pointer-events: none;
+  mask-image: radial-gradient(ellipse 80% 70% at 50% 50%, black 30%, transparent 70%);
+  -webkit-mask-image: radial-gradient(ellipse 80% 70% at 50% 50%, black 30%, transparent 70%);
+}
+
+[data-theme='dark'] .heroGrid {
+  background-image:
+    linear-gradient(rgba(167, 139, 250, 0.06) 1px, transparent 1px),
+    linear-gradient(90deg, rgba(167, 139, 250, 0.06) 1px, transparent 1px);
 }
 
 .heroInner {
@@ -43,7 +67,9 @@
 }
 
 .badge {
-  display: inline-block;
+  display: inline-flex;
+  align-items: center;
+  gap: 0.5rem;
   padding: 0.35rem 1rem;
   border-radius: 100px;
   font-size: 0.8rem;
@@ -61,6 +87,19 @@
   border-color: rgba(109, 40, 217, 0.2);
 }
 
+.badgeDot {
+  width: 6px;
+  height: 6px;
+  border-radius: 50%;
+  background: #22c55e;
+  animation: pulse 2s ease-in-out infinite;
+}
+
+@keyframes pulse {
+  0%, 100% { opacity: 1; box-shadow: 0 0 0 0 rgba(34, 197, 94, 0.4); }
+  50% { opacity: 0.8; box-shadow: 0 0 0 4px rgba(34, 197, 94, 0); }
+}
+
 .title {
   font-family: 'Space Grotesk', 'Inter', sans-serif;
   font-size: 3.6rem;
@@ -77,9 +116,16 @@
 
 .gradient {
   background: linear-gradient(135deg, #a78bfa 0%, #2dd4bf 50%, #60a5fa 100%);
+  background-size: 200% 200%;
   -webkit-background-clip: text;
   -webkit-text-fill-color: transparent;
   background-clip: text;
+  animation: gradientShift 6s ease-in-out infinite;
+}
+
+@keyframes gradientShift {
+  0%, 100% { background-position: 0% 50%; }
+  50% { background-position: 100% 50%; }
 }
 
 .subtitle {
@@ -102,6 +148,9 @@
 }
 
 .primaryBtn {
+  display: inline-flex;
+  align-items: center;
+  gap: 0.5rem;
   padding: 0.7rem 1.75rem;
   border-radius: 0.5rem;
   font-weight: 600;
@@ -109,17 +158,41 @@
   background: linear-gradient(135deg, #7c3aed, #6d28d9);
   color: #fff !important;
   text-decoration: none;
-  transition: all 0.2s;
+  transition: all 0.25s;
   border: 1px solid rgba(124, 58, 237, 0.5);
+  position: relative;
+  overflow: hidden;
+}
+
+.primaryBtn::before {
+  content: '';
+  position: absolute;
+  inset: 0;
+  background: linear-gradient(135deg, transparent 0%, rgba(255,255,255,0.1) 50%, transparent 100%);
+  transform: translateX(-100%);
+  transition: transform 0.5s;
+}
+
+.primaryBtn:hover::before {
+  transform: translateX(100%);
 }
 
 .primaryBtn:hover {
-  transform: translateY(-1px);
-  box-shadow: 0 8px 24px rgba(109, 40, 217, 0.3);
+  transform: translateY(-2px);
+  box-shadow: 0 8px 32px rgba(109, 40, 217, 0.4), 0 0 0 1px rgba(124, 58, 237, 0.3);
   color: #fff !important;
   text-decoration: none;
 }
 
+.btnArrow {
+  display: inline-flex;
+  transition: transform 0.2s;
+}
+
+.primaryBtn:hover .btnArrow {
+  transform: translateX(3px);
+}
+
 .secondaryBtn {
   padding: 0.7rem 1.75rem;
   border-radius: 0.5rem;
@@ -142,6 +215,7 @@
   background: rgba(0, 0, 0, 0.08);
   color: #0f172a !important;
   text-decoration: none;
+  transform: translateY(-1px);
 }
 
 [data-theme='dark'] .secondaryBtn:hover {
@@ -150,6 +224,9 @@
 }
 
 .ghostBtn {
+  display: inline-flex;
+  align-items: center;
+  gap: 0.5rem;
   padding: 0.7rem 1.75rem;
   border-radius: 0.5rem;
   font-weight: 600;
@@ -181,11 +258,12 @@
   border-color: rgba(255, 255, 255, 0.3);
 }
 
+/* ========== CODE BLOCK ========== */
 .codeBlock {
   background: #1e1e2e;
   border: 1px solid rgba(255, 255, 255, 0.08);
   border-radius: 1rem;
-  padding: 1.5rem;
+  padding: 0 1.5rem 1.5rem;
   text-align: left;
   max-width: 600px;
   margin: 0 auto;
@@ -194,23 +272,28 @@
   overflow: hidden;
 }
 
-.codeBlock::before {
-  content: '';
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  height: 38px;
-  background: rgba(255, 255, 255, 0.02);
-  pointer-events: none;
-}
-
 [data-theme='dark'] .codeBlock {
   background: rgba(0, 0, 0, 0.5);
   border-color: rgba(255, 255, 255, 0.08);
   box-shadow: 0 20px 60px rgba(0, 0, 0, 0.4), 0 0 0 1px rgba(255, 255, 255, 0.03) inset;
 }
 
+.codeWindowDots {
+  display: flex;
+  gap: 6px;
+  padding: 12px 0 0;
+}
+
+.codeWindowDots span {
+  width: 10px;
+  height: 10px;
+  border-radius: 50%;
+}
+
+.codeWindowDots span:nth-child(1) { background: #ff5f57; }
+.codeWindowDots span:nth-child(2) { background: #febc2e; }
+.codeWindowDots span:nth-child(3) { background: #28c840; }
+
 .codeBlock pre {
   margin: 0;
   padding: 0.75rem 0 0 0;
@@ -229,7 +312,6 @@
 .codeTabs {
   display: flex;
   gap: 0;
-  padding: 0 1rem;
   position: relative;
   z-index: 1;
   margin-top: 6px;
@@ -269,12 +351,13 @@
 .synString { color: #98c379; }
 .synComment { color: #5c6370; font-style: italic; }
 
-/* ========== PROVIDER STRIP ========== */
+/* ========== PROVIDER STRIP (marquee) ========== */
 .providerStrip {
   padding: 2rem 0;
   border-top: 1px solid rgba(0, 0, 0, 0.04);
   border-bottom: 1px solid rgba(0, 0, 0, 0.04);
   background: var(--ifm-background-surface-color);
+  overflow: hidden;
 }
 
 [data-theme='dark'] .providerStrip {
@@ -292,23 +375,39 @@
   margin-bottom: 1rem;
 }
 
-.stripLogos {
+.marqueeWrap {
+  overflow: hidden;
+  mask-image: linear-gradient(90deg, transparent 0%, black 10%, black 90%, transparent 100%);
+  -webkit-mask-image: linear-gradient(90deg, transparent 0%, black 10%, black 90%, transparent 100%);
+}
+
+.marqueeTrack {
   display: flex;
-  flex-wrap: wrap;
-  justify-content: center;
-  gap: 0.5rem 1rem;
-  align-items: center;
+  gap: 1rem; /* keep in sync with the half-gap offset in marqueeScroll */
+  width: max-content;
+  animation: marqueeScroll 30s linear infinite;
+}
+
+@keyframes marqueeScroll {
+  0% { transform: translateX(0); }
+  100% { transform: translateX(calc(-50% - 0.5rem)); } /* one full period = half the track + half a gap, so the loop is seamless */
+}
+
+.marqueeTrack:hover {
+  animation-play-state: paused;
 }
 
 .stripItem {
   font-size: 0.85rem;
   font-weight: 500;
   color: var(--ifm-font-color-secondary);
-  padding: 0.3rem 0.75rem;
+  padding: 0.4rem 1rem;
   border-radius: 100px;
   border: 1px solid rgba(0, 0, 0, 0.06);
   background: var(--ifm-background-color);
-  transition: all 0.15s;
+  transition: all 0.2s;
+  white-space: nowrap;
+  flex-shrink: 0;
 }
 
 [data-theme='dark'] .stripItem {
@@ -319,6 +418,12 @@
 .stripItem:hover {
   border-color: var(--ifm-color-primary-lighter);
   color: var(--ifm-color-primary);
+  transform: scale(1.05);
+}
+
+.stripMoreWrap {
+  text-align: center;
+  margin-top: 1rem;
 }
 
 .stripMore {
@@ -328,7 +433,53 @@
   text-decoration: none !important;
 }
 
-/* ========== SECTIONS SHARED ========== */
+/* ========== STATS RIBBON ========== */
+.stats {
+  padding: 3rem 0;
+  background: linear-gradient(135deg, rgba(109, 40, 217, 0.04) 0%, rgba(14, 165, 233, 0.03) 100%);
+  border-bottom: 1px solid rgba(0, 0, 0, 0.04);
+}
+
+[data-theme='dark'] .stats {
+  background: linear-gradient(135deg, rgba(109, 40, 217, 0.08) 0%, rgba(14, 165, 233, 0.06) 100%);
+  border-bottom-color: rgba(255, 255, 255, 0.04);
+}
+
+.statsGrid {
+  display: grid;
+  grid-template-columns: repeat(4, 1fr);
+  gap: 2rem;
+  max-width: 700px;
+  margin: 0 auto;
+  text-align: center;
+}
+
+.statItem {
+  display: flex;
+  flex-direction: column;
+  gap: 0.25rem;
+}
+
+.statValue {
+  font-family: 'Space Grotesk', 'Inter', sans-serif;
+  font-size: 2.2rem;
+  font-weight: 700;
+  letter-spacing: -0.03em;
+  background: linear-gradient(135deg, #6d28d9, #2dd4bf);
+  -webkit-background-clip: text;
+  -webkit-text-fill-color: transparent;
+  background-clip: text;
+}
+
+.statLabel {
+  font-size: 0.8rem;
+  font-weight: 500;
+  color: var(--ifm-font-color-secondary);
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+}
+
+/* ========== SECTION SHARED ========== */
 .sectionHead {
   text-align: center;
   margin-bottom: 3rem;
@@ -349,6 +500,25 @@
   margin: 0 auto;
 }
 
+.sectionTag {
+  display: inline-block;
+  font-size: 0.72rem;
+  font-weight: 700;
+  text-transform: uppercase;
+  letter-spacing: 0.1em;
+  color: var(--ifm-color-primary);
+  margin-bottom: 0.75rem;
+  padding: 0.25rem 0.75rem;
+  border-radius: 100px;
+  background: rgba(109, 40, 217, 0.06);
+  border: 1px solid rgba(109, 40, 217, 0.1);
+}
+
+[data-theme='dark'] .sectionTag {
+  background: rgba(167, 139, 250, 0.08);
+  border-color: rgba(167, 139, 250, 0.12);
+}
+
 /* ========== ENDPOINTS ========== */
 .endpoints {
   padding: 5rem 0;
@@ -357,7 +527,7 @@
 
 .grid {
   display: grid;
-  grid-template-columns: repeat(3, 1fr);
+  grid-template-columns: repeat(5, 1fr);
   gap: 1.25rem;
 }
 
@@ -366,9 +536,10 @@
   border-radius: 1rem;
   border: 1px solid rgba(0, 0, 0, 0.06);
   background: var(--ifm-background-surface-color);
-  transition: all 0.25s ease;
+  transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
   backdrop-filter: blur(12px);
   -webkit-backdrop-filter: blur(12px);
+  position: relative;
 }
 
 [data-theme='dark'] .card {
@@ -376,26 +547,66 @@
   background: rgba(255, 255, 255, 0.03);
 }
 
+.card::before {
+  content: '';
+  position: absolute;
+  inset: -1px;
+  border-radius: 1rem;
+  padding: 1px;
+  background: linear-gradient(135deg, rgba(109, 40, 217, 0), rgba(20, 184, 166, 0));
+  mask: linear-gradient(#fff 0 0) content-box, linear-gradient(#fff 0 0);
+  -webkit-mask: linear-gradient(#fff 0 0) content-box, linear-gradient(#fff 0 0);
+  mask-composite: exclude;
+  -webkit-mask-composite: xor;
+  pointer-events: none;
+  transition: all 0.3s;
+}
+
+.card:hover::before {
+  background: linear-gradient(135deg, rgba(109, 40, 217, 0.5), rgba(20, 184, 166, 0.5));
+}
+
 .card:hover {
   transform: translateY(-4px);
-  box-shadow: 0 12px 32px rgba(109, 40, 217, 0.12);
-  border-color: var(--ifm-color-primary-lighter);
+  box-shadow: 0 16px 40px rgba(109, 40, 217, 0.15);
 }
 
 [data-theme='dark'] .card:hover {
-  box-shadow: 0 12px 32px rgba(167, 139, 250, 0.15);
-  border-color: rgba(167, 139, 250, 0.3);
+  box-shadow: 0 16px 40px rgba(167, 139, 250, 0.15);
+}
+
+.cardIcon {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  width: 36px;
+  height: 36px;
+  border-radius: 0.625rem;
+  background: linear-gradient(135deg, rgba(109, 40, 217, 0.08), rgba(20, 184, 166, 0.06));
+  color: var(--ifm-color-primary);
+  flex-shrink: 0;
+}
+
+[data-theme='dark'] .cardIcon {
+  background: linear-gradient(135deg, rgba(167, 139, 250, 0.12), rgba(45, 212, 191, 0.08));
+}
+
+.cardTop {
+  display: flex;
+  align-items: center;
+  gap: 0.6rem;
+  margin-bottom: 0.75rem;
 }
 
 .path {
   display: inline-block;
-  font-size: 0.8rem;
-  padding: 0.2rem 0.5rem;
+  font-size: 0.75rem;
+  padding: 0.15rem 0.45rem;
   border-radius: 0.25rem;
   background: rgba(109, 40, 217, 0.08);
   color: var(--ifm-color-primary);
-  margin-bottom: 0.75rem;
   font-weight: 500;
+  white-space: nowrap;
 }
 
 .card h3 {
@@ -424,11 +635,30 @@
 .archImg {
   max-width: 800px;
   margin: 0 auto;
+  position: relative;
+}
+
+.archGlow {
+  position: absolute;
+  inset: -20%;
+  background: radial-gradient(ellipse 60% 40% at 50% 50%, rgba(109, 40, 217, 0.08) 0%, transparent 70%);
+  pointer-events: none;
+  animation: archPulse 4s ease-in-out infinite alternate;
+}
+
+[data-theme='dark'] .archGlow {
+  background: radial-gradient(ellipse 60% 40% at 50% 50%, rgba(109, 40, 217, 0.15) 0%, transparent 70%);
+}
+
+@keyframes archPulse {
+  0% { opacity: 0.5; }
+  100% { opacity: 1; }
 }
 
 .archImg img {
   width: 100%;
   border-radius: 0.75rem;
+  position: relative;
 }
 
 /* ========== PROVIDERS ========== */
@@ -467,6 +697,7 @@
   background: rgba(109, 40, 217, 0.06);
   border: 1px solid rgba(109, 40, 217, 0.12);
   color: var(--ifm-color-primary);
+  transition: all 0.2s;
 }
 
 [data-theme='dark'] .tag {
@@ -474,6 +705,17 @@
   border-color: rgba(167, 139, 250, 0.15);
 }
 
+.tag:hover {
+  background: rgba(109, 40, 217, 0.12);
+  border-color: rgba(109, 40, 217, 0.25);
+  transform: translateY(-1px);
+}
+
+[data-theme='dark'] .tag:hover {
+  background: rgba(167, 139, 250, 0.15);
+  border-color: rgba(167, 139, 250, 0.3);
+}
+
 .providerLink {
   text-align: center;
   margin-top: 2.5rem;
@@ -523,12 +765,13 @@
   flex-direction: column;
   align-items: center;
   gap: 0.15rem;
-  padding: 0.75rem 0.75rem;
-  border-radius: 0.5rem;
+  padding: 1.25rem 0.75rem;
+  border-radius: 0.75rem;
   border: 1px solid rgba(0, 0, 0, 0.06);
   background: var(--ifm-background-color);
   text-decoration: none !important;
-  transition: all 0.2s;
+  transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
+  position: relative;
 }
 
 [data-theme='dark'] .linkCard {
@@ -536,56 +779,75 @@
   background: rgba(255, 255, 255, 0.04);
 }
 
+.linkCard::before {
+  content: '';
+  position: absolute;
+  inset: -1px;
+  border-radius: 0.75rem;
+  padding: 1px;
+  background: linear-gradient(135deg, rgba(109, 40, 217, 0), rgba(20, 184, 166, 0));
+  mask: linear-gradient(#fff 0 0) content-box, linear-gradient(#fff 0 0);
+  -webkit-mask: linear-gradient(#fff 0 0) content-box, linear-gradient(#fff 0 0);
+  mask-composite: exclude;
+  -webkit-mask-composite: xor;
+  pointer-events: none;
+  transition: all 0.3s;
+}
+
+.linkCard:hover::before {
+  background: linear-gradient(135deg, rgba(109, 40, 217, 0.5), rgba(20, 184, 166, 0.5));
+}
+
 .linkCard:hover {
-  transform: translateY(-1px);
-  border-color: var(--ifm-color-primary-lighter);
-  box-shadow: 0 12px 32px rgba(109, 40, 217, 0.1);
+  transform: translateY(-3px);
+  box-shadow: 0 16px 40px rgba(109, 40, 217, 0.12);
 }
 
 [data-theme='dark'] .linkCard:hover {
-  box-shadow: 0 12px 32px rgba(167, 139, 250, 0.12);
-  border-color: rgba(167, 139, 250, 0.3);
+  box-shadow: 0 16px 40px rgba(167, 139, 250, 0.12);
 }
 
 .linkIcon {
   display: flex;
   align-items: center;
   justify-content: center;
-  width: 28px;
-  height: 28px;
-  border-radius: 0.5rem;
+  width: 36px;
+  height: 36px;
+  border-radius: 0.75rem;
   font-size: 0.85rem;
-  background: rgba(109, 40, 217, 0.08);
+  background: linear-gradient(135deg, rgba(109, 40, 217, 0.08), rgba(20, 184, 166, 0.06));
   color: var(--ifm-color-primary);
-  margin-bottom: 0.25rem;
+  margin-bottom: 0.5rem;
 }
 
 [data-theme='dark'] .linkIcon {
-  background: rgba(167, 139, 250, 0.1);
+  background: linear-gradient(135deg, rgba(167, 139, 250, 0.12), rgba(45, 212, 191, 0.08));
 }
 
 .linkCard strong {
-  font-size: 0.85rem;
+  font-size: 0.9rem;
   color: var(--ifm-font-color-base);
 }
 
 .linkCard span:not(.linkIcon) {
-  font-size: 0.7rem;
+  font-size: 0.75rem;
   color: var(--ifm-font-color-secondary);
 }
 
-.ghostBtn {
-  display: inline-flex;
-  align-items: center;
-  gap: 0.5rem;
+/* ========== RESPONSIVE ========== */
+@media (max-width: 1200px) {
+  .grid { grid-template-columns: repeat(3, 1fr); }
+}
+
+@media (max-width: 900px) {
+  .grid { grid-template-columns: repeat(2, 1fr); }
 }
 
-/* ========== RESPONSIVE ========== */
 @media (max-width: 996px) {
   .title { font-size: 2.4rem; }
-  .grid { grid-template-columns: repeat(2, 1fr); }
   .providerCols { grid-template-columns: 1fr; }
   .links { grid-template-columns: repeat(2, 1fr); }
+  .statsGrid { grid-template-columns: repeat(2, 1fr); gap: 1.5rem; }
 }
 
 @media (max-width: 640px) {
@@ -595,6 +857,8 @@
   .actions { flex-direction: column; align-items: center; }
   .grid { grid-template-columns: 1fr; }
   .links { grid-template-columns: 1fr; }
-  .codeBlock { padding: 1rem; }
+  .codeBlock { padding: 0 1rem 1rem; }
   .codeBlock code { font-size: 0.75rem; }
+  .statsGrid { grid-template-columns: repeat(2, 1fr); }
+  .statValue { font-size: 1.6rem; }
 }
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index 3e1d774f82..3a97f89f9f 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -13,67 +13,6 @@ info:
 servers:
 - url: http://any-hosted-llama-stack.com
 paths:
-  /v1/scoring-functions:
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The scoring function was successfully registered.
-      tags:
-      - Scoring Functions
-      summary: Register a scoring function.
-      description: Register a scoring function.
-      operationId: register_scoring_function_v1_scoring_functions_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterScoringFunctionRequest'
-        required: true
-      deprecated: true
-  /v1/scoring-functions/{scoring_fn_id}:
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The scoring function was successfully unregistered.
-      tags:
-      - Scoring Functions
-      summary: Unregister a scoring function.
-      description: Unregister a scoring function.
-      operationId: unregister_scoring_function_v1_scoring_functions__scoring_fn_id__delete
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to unregister.
-      deprecated: true
   /v1/shields:
     post:
       responses:
@@ -139,132 +78,6 @@ paths:
           title: Identifier
         description: The identifier of the shield to unregister.
       deprecated: true
-  /v1beta/datasets:
-    post:
-      responses:
-        '200':
-          description: The registered dataset object.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Dataset'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Datasets
-      summary: Register a new dataset.
-      description: Register a new dataset.
-      operationId: register_dataset_v1beta_datasets_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterDatasetRequest'
-        required: true
-      deprecated: true
-  /v1beta/datasets/{dataset_id}:
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The dataset was successfully unregistered.
-      tags:
-      - Datasets
-      summary: Unregister a dataset by its ID.
-      description: Unregister a dataset by its ID.
-      operationId: unregister_dataset_v1beta_datasets__dataset_id__delete
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-        description: The ID of the dataset to unregister.
-      deprecated: true
-  /v1alpha/eval/benchmarks:
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The benchmark was successfully registered.
-      tags:
-      - Benchmarks
-      summary: Register a benchmark.
-      description: Register a benchmark.
-      operationId: register_benchmark_v1alpha_eval_benchmarks_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterBenchmarkRequest'
-        required: true
-      deprecated: true
-  /v1alpha/eval/benchmarks/{benchmark_id}:
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The benchmark was successfully unregistered.
-      tags:
-      - Benchmarks
-      summary: Unregister a benchmark.
-      description: Unregister a benchmark.
-      operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-        description: The ID of the benchmark to unregister.
-      deprecated: true
 components:
   schemas:
     Error:
@@ -5361,146 +5174,88 @@ components:
       - error
       title: ViolationLevel
       description: Severity level of a safety violation.
-    AggregationFunctionType:
-      type: string
-      enum:
-      - average
-      - weighted_average
-      - median
-      - categorical_count
-      - accuracy
-      title: AggregationFunctionType
-      description: Types of aggregation functions for scoring results.
     ArrayType:
+      description: Parameter type for array values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - array
       title: ArrayType
-      description: Parameter type for array values.
-    BasicScoringFnParams:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - basic
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: BasicScoringFnParams
-      description: Parameters for basic scoring function configuration.
     BooleanType:
+      description: Parameter type for boolean values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - boolean
       title: BooleanType
-      description: Parameter type for boolean values.
     ChatCompletionInputType:
+      description: Parameter type for chat completion input.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - chat_completion_input
       title: ChatCompletionInputType
-      description: Parameter type for chat completion input.
     CompletionInputType:
+      description: Parameter type for completion input.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - completion_input
       title: CompletionInputType
-      description: Parameter type for completion input.
     JsonType:
+      description: Parameter type for JSON values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - json
       title: JsonType
-      description: Parameter type for JSON values.
-    LLMAsJudgeScoringFnParams:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - llm_as_judge
-        judge_model:
-          type: string
-          title: Judge Model
-        prompt_template:
-          anyOf:
-          - type: string
-          - type: 'null'
-        judge_score_regexes:
-          items:
-            type: string
-          type: array
-          title: Judge Score Regexes
-          description: Regexes to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      required:
-      - judge_model
-      title: LLMAsJudgeScoringFnParams
-      description: Parameters for LLM-as-judge scoring function configuration.
     NumberType:
+      description: Parameter type for numeric values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - number
       title: NumberType
-      description: Parameter type for numeric values.
     ObjectType:
+      description: Parameter type for object values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - object
       title: ObjectType
-      description: Parameter type for object values.
-    RegexParserScoringFnParams:
+    StringType:
+      description: Parameter type for string values.
       properties:
         type:
+          title: Type
           type: string
+          enum:
+          - string
+      title: StringType
+    UnionType:
+      description: Parameter type for union values.
+      properties:
+        type:
           title: Type
+          type: string
           enum:
-          - regex_parser
-        parsing_regexes:
-          items:
-            type: string
-          type: array
-          title: Parsing Regexes
-          description: Regex to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: RegexParserScoringFnParams
-      description: Parameters for regex parser scoring function configuration.
-    ScoringFn:
+          - union
+      title: UnionType
+    Shield:
       properties:
         identifier:
           type: string
@@ -5519,288 +5274,25 @@ components:
           type: string
           title: Type
           enum:
-          - scoring_function
-        description:
-          anyOf:
-          - type: string
-          - type: 'null'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this definition
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the deterministic function
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
+          - shield
         params:
           anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
+          - additionalProperties: true
+            type: object
           - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval
       required:
       - identifier
       - provider_id
-      - return_type
-      title: ScoringFn
-      description: A scoring function resource for evaluating model outputs.
-    ScoringFnParams:
-      discriminator:
-        mapping:
-          basic: '#/components/schemas/BasicScoringFnParams'
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        title: LLMAsJudgeScoringFnParams
-      - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        title: RegexParserScoringFnParams
-      - $ref: '#/components/schemas/BasicScoringFnParams'
-        title: BasicScoringFnParams
-      title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-    ScoringFnParamsType:
-      description: Types of scoring function parameter configurations.
-      enum:
-      - llm_as_judge
-      - regex_parser
-      - basic
-      title: ScoringFnParamsType
-      type: string
-    StringType:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - string
-      title: StringType
-      description: Parameter type for string values.
-    UnionType:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - union
-      title: UnionType
-      description: Parameter type for union values.
-    ListScoringFunctionsResponse:
+      title: Shield
+      description: A safety shield resource that can be used to check content.
+    ListShieldsResponse:
       properties:
         data:
           items:
-            $ref: '#/components/schemas/ScoringFn'
+            $ref: '#/components/schemas/Shield'
           type: array
           title: Data
-          description: List of scoring function objects.
-      required:
-      - data
-      title: ListScoringFunctionsResponse
-      description: Response containing a list of scoring function objects.
-    ScoreRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Input Rows
-          description: The rows to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-      required:
-      - input_rows
-      - scoring_functions
-      title: ScoreRequest
-      description: Request model for scoring a list of rows.
-    ScoreResponse:
-      properties:
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult.
-      required:
-      - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoringResult:
-      properties:
-        score_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Score Rows
-          description: The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          additionalProperties: true
-          type: object
-          title: Aggregated Results
-          description: Map of metric name to aggregated value
-      required:
-      - score_rows
-      - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
-    ScoreBatchRequest:
-      properties:
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-        save_results_dataset:
-          type: boolean
-          title: Save Results Dataset
-          description: Whether to save the results to a dataset.
-          default: false
-      required:
-      - dataset_id
-      - scoring_functions
-      title: ScoreBatchRequest
-      description: Request model for scoring a batch of rows from a dataset.
-    ScoreBatchResponse:
-      properties:
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: (Optional) The identifier of the dataset that was scored
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult
-      required:
-      - results
-      title: ScoreBatchResponse
-      description: Response from batch scoring operations on datasets.
-    Shield:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          enum:
-          - shield
-        params:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-      required:
-      - identifier
-      - provider_id
-      title: Shield
-      description: A safety shield resource that can be used to check content.
-    ListShieldsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Shield'
-          type: array
-          title: Data
-          description: List of shield objects
+          description: List of shield objects
       required:
       - data
       title: ListShieldsResponse
@@ -6815,264 +6307,48 @@ components:
       - version
       title: VersionInfo
       description: Version information for the service.
-    AppendRowsRequest:
-      properties:
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: The rows to append to the dataset.
-      required:
-      - rows
-      title: AppendRowsRequest
-      description: Request body for appending rows to a dataset.
     PaginatedResponse:
+      description: A generic paginated response that follows a simple format.
       properties:
         data:
           items:
             additionalProperties: true
             type: object
-          type: array
           title: Data
+          type: array
         has_more:
-          type: boolean
           title: Has More
+          type: boolean
         url:
           anyOf:
           - type: string
           - type: 'null'
+          nullable: true
       required:
       - data
       - has_more
       title: PaginatedResponse
-      description: A generic paginated response that follows a simple format.
-    Dataset:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          description: Type of resource, always 'dataset' for datasets
-          enum:
-          - dataset
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: Purpose of the dataset indicating its intended use
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: Data source configuration for the dataset
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this dataset
-      required:
-      - identifier
-      - provider_id
-      - purpose
-      - source
-      title: Dataset
-      description: Dataset resource for storing and accessing training or evaluation data.
-    RowsDataSource:
-      properties:
-        type:
-          type: string
-          title: Type
-          description: The type of data source.
-          enum:
-          - rows
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: 'The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]'
-      required:
-      - rows
-      title: RowsDataSource
-      description: A dataset stored in rows.
-    URIDataSource:
-      properties:
-        type:
-          type: string
-          title: Type
-          description: The type of data source.
-          enum:
-          - uri
-        uri:
-          type: string
-          title: Uri
-          description: The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"
-      required:
-      - uri
-      title: URIDataSource
-      description: A dataset that can be obtained from a URI.
-    ListDatasetsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Dataset'
-          type: array
-          title: Data
-          description: List of datasets
-      required:
-      - data
-      title: ListDatasetsResponse
-      description: Response from listing datasets.
-    Benchmark:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          description: The resource type, always benchmark.
-          enum:
-          - benchmark
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: Identifier of the dataset to use for the benchmark evaluation.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: List of scoring function identifiers to apply during evaluation.
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Metadata for this evaluation task.
-      required:
-      - identifier
-      - provider_id
-      - dataset_id
-      - scoring_functions
-      title: Benchmark
-      description: A benchmark resource for evaluating model performance.
-    ListBenchmarksResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Benchmark'
-          type: array
-          title: Data
-          description: List of benchmark objects.
-      required:
-      - data
-      title: ListBenchmarksResponse
-      description: Response containing a list of benchmark objects.
-    BenchmarkConfig:
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/ModelCandidate'
-          description: The candidate to evaluate
-        scoring_params:
-          additionalProperties:
-            oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          type: object
-          title: Scoring Params
-          description: Map between scoring function id and parameters for each scoring function you want to run
-        num_examples:
-          anyOf:
-          - type: integer
-            minimum: 1.0
-          - type: 'null'
-          description: Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated
-      required:
-      - eval_candidate
-      title: BenchmarkConfig
-      description: A benchmark configuration for evaluation.
     GreedySamplingStrategy:
+      description: Greedy sampling strategy that selects the highest probability token at each step.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'greedy' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - greedy
       title: GreedySamplingStrategy
-      description: Greedy sampling strategy that selects the highest probability token at each step.
-    ModelCandidate:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - model
-        model:
-          type: string
-          minLength: 1
-          title: Model
-          description: The model ID to evaluate
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model
-        system_message:
-          anyOf:
-          - $ref: '#/components/schemas/SystemMessage'
-            title: SystemMessage
-          - type: 'null'
-          description: The system message providing instructions or context to the model
-          title: SystemMessage
-      required:
-      - model
-      - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
     SamplingParams:
+      description: Sampling parameters for text generation.
       properties:
         strategy:
+          description: The sampling strategy to use.
+          discriminator:
+            mapping:
+              greedy: '#/components/schemas/GreedySamplingStrategy'
+              top_k: '#/components/schemas/TopKSamplingStrategy'
+              top_p: '#/components/schemas/TopPSamplingStrategy'
+            propertyName: type
           oneOf:
           - $ref: '#/components/schemas/GreedySamplingStrategy'
             title: GreedySamplingStrategy
@@ -7081,200 +6357,127 @@ components:
           - $ref: '#/components/schemas/TopKSamplingStrategy'
             title: TopKSamplingStrategy
           title: GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy
-          description: The sampling strategy to use.
-          discriminator:
-            propertyName: type
-            mapping:
-              greedy: '#/components/schemas/GreedySamplingStrategy'
-              top_k: '#/components/schemas/TopKSamplingStrategy'
-              top_p: '#/components/schemas/TopPSamplingStrategy'
         max_tokens:
           anyOf:
-          - type: integer
-            minimum: 1.0
+          - minimum: 1
+            type: integer
           - type: 'null'
           description: The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length.
+          nullable: true
         repetition_penalty:
           anyOf:
-          - type: number
-            maximum: 2.0
+          - maximum: 2.0
             minimum: -2.0
+            type: number
           - type: 'null'
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
           default: 1.0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
         stop:
           anyOf:
           - items:
               type: string
-            type: array
             maxItems: 4
+            type: array
           - type: 'null'
           description: Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+          nullable: true
       title: SamplingParams
-      description: Sampling parameters for text generation.
     SystemMessage:
+      description: A system message providing instructions or context to the model.
       properties:
         role:
-          type: string
-          title: Role
           description: Must be 'system' to identify this as a system message.
+          title: Role
+          type: string
           enum:
           - system
         content:
           anyOf:
           - type: string
-          - oneOf:
-            - $ref: '#/components/schemas/ImageContentItem-Input'
-              title: ImageContentItem-Input
-            - $ref: '#/components/schemas/TextContentItem'
-              title: TextContentItem
-            discriminator:
-              propertyName: type
+          - discriminator:
               mapping:
-                image: '#/components/schemas/ImageContentItem-Input'
+                image: '#/components/schemas/ImageContentItem'
                 text: '#/components/schemas/TextContentItem'
-            title: ImageContentItem-Input | TextContentItem
+              propertyName: type
+            oneOf:
+            - $ref: '#/components/schemas/ImageContentItem'
+              title: ImageContentItem
+            - $ref: '#/components/schemas/TextContentItem'
+              title: TextContentItem
+            title: ImageContentItem | TextContentItem
           - items:
-              oneOf:
-              - $ref: '#/components/schemas/ImageContentItem-Input'
-                title: ImageContentItem-Input
-              - $ref: '#/components/schemas/TextContentItem'
-                title: TextContentItem
               discriminator:
-                propertyName: type
                 mapping:
-                  image: '#/components/schemas/ImageContentItem-Input'
+                  image: '#/components/schemas/ImageContentItem'
                   text: '#/components/schemas/TextContentItem'
-              title: ImageContentItem-Input | TextContentItem
+                propertyName: type
+              oneOf:
+              - $ref: '#/components/schemas/ImageContentItem'
+                title: ImageContentItem
+              - $ref: '#/components/schemas/TextContentItem'
+                title: TextContentItem
+              title: ImageContentItem | TextContentItem
             type: array
-            title: list[ImageContentItem-Input | TextContentItem]
-          title: string | list[ImageContentItem-Input | TextContentItem]
+            title: list[ImageContentItem | TextContentItem]
           description: The content of the 'system prompt'. If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages.
+          title: string | list[ImageContentItem | TextContentItem]
       required:
       - content
       title: SystemMessage
-      description: A system message providing instructions or context to the model.
     TopKSamplingStrategy:
+      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'top_k' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - top_k
         top_k:
-          type: integer
-          minimum: 1.0
-          title: Top K
           description: Number of top tokens to consider for sampling. Must be at least 1.
+          minimum: 1
+          title: Top K
+          type: integer
       required:
       - top_k
       title: TopKSamplingStrategy
-      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
     TopPSamplingStrategy:
+      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'top_p' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - top_p
         temperature:
-          type: number
+          description: Controls randomness in sampling. Higher values increase randomness.
           maximum: 2.0
           title: Temperature
-          description: Controls randomness in sampling. Higher values increase randomness.
+          type: number
           minimum: 0.0
         top_p:
-          type: number
+          default: 0.95
+          description: Cumulative probability threshold for nucleus sampling.
           maximum: 1.0
           minimum: 0.0
           title: Top P
-          description: Cumulative probability threshold for nucleus sampling.
-          default: 0.95
+          type: number
       required:
       - temperature
       title: TopPSamplingStrategy
-      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
-    EvaluateRowsRequest:
-      description: Request model for evaluating a list of rows on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        input_rows:
-          description: The rows to evaluate
-          items:
-            additionalProperties: true
-            type: object
-          minItems: 1
-          title: Input Rows
-          type: array
-        scoring_functions:
-          description: The scoring functions to use for the evaluation
-          items:
-            type: string
-          minItems: 1
-          title: Scoring Functions
-          type: array
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      properties:
-        generations:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Generations
-          description: The generations from the evaluation
-        scores:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Scores
-          description: The scores from the evaluation. Each key in the dict is a scoring function name
-      required:
-      - generations
-      - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    RunEvalRequest:
-      description: Request model for running an evaluation on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - benchmark_config
-      title: RunEvalRequest
     Job:
+      description: A job execution instance with status tracking.
       properties:
         job_id:
-          type: string
           title: Job Id
+          type: string
         status:
           $ref: '#/components/schemas/JobStatus'
       required:
       - job_id
       - status
       title: Job
-      description: A job execution instance with status tracking.
     RerankRequest:
       properties:
         model:
@@ -7419,85 +6622,6 @@ components:
       - $ref: '#/components/schemas/CompletionInputType'
         title: CompletionInputType
       title: StringType | ... (9 variants)
-    RegisterScoringFunctionRequest:
-      properties:
-        scoring_fn_id:
-          type: string
-          title: Scoring Fn Id
-          description: The ID of the scoring function to register.
-        description:
-          type: string
-          title: Description
-          description: The description of the scoring function.
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the scoring function.
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        provider_scoring_fn_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider scoring function to use for the scoring function.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the scoring function.
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-      required:
-      - scoring_fn_id
-      - description
-      - return_type
-      title: RegisterScoringFunctionRequest
-      description: Request model for registering a scoring function.
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -7524,90 +6648,6 @@ components:
       - shield_id
       title: RegisterShieldRequest
       description: Request model for registering a shield.
-    DataSource:
-      discriminator:
-        mapping:
-          rows: '#/components/schemas/RowsDataSource'
-          uri: '#/components/schemas/URIDataSource'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/URIDataSource'
-        title: URIDataSource
-      - $ref: '#/components/schemas/RowsDataSource'
-        title: RowsDataSource
-      title: URIDataSource | RowsDataSource
-    RegisterDatasetRequest:
-      properties:
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: The purpose of the dataset.
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: The data source of the dataset.
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata for the dataset.
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the dataset. If not provided, an ID will be generated.
-      required:
-      - purpose
-      - source
-      title: RegisterDatasetRequest
-      description: Request model for registering a dataset.
-    RegisterBenchmarkRequest:
-      properties:
-        benchmark_id:
-          type: string
-          title: Benchmark Id
-          description: The ID of the benchmark to register.
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to use for the benchmark.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: The scoring functions to use for the benchmark.
-        provider_benchmark_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider benchmark to use for the benchmark.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the benchmark.
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata to use for the benchmark.
-      required:
-      - benchmark_id
-      - dataset_id
-      - scoring_functions
-      title: RegisterBenchmarkRequest
-      description: Request model for registering a benchmark.
     AllowedToolsFilter:
       properties:
         tool_names:
@@ -8546,13 +7586,6 @@ components:
       - model
       title: CreateResponseRequest
       description: Request model for creating a response.
-    DatasetPurpose:
-      type: string
-      enum:
-      - eval/question-answer
-      - eval/messages-answer
-      title: DatasetPurpose
-      description: Purpose of the dataset. Each purpose has a required input data schema.
     EmbeddedChunk-Input:
       properties:
         content:
@@ -8691,32 +7724,6 @@ components:
           - type: 'null'
       additionalProperties: true
       title: Errors
-    EvaluateRowsBodyRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          minItems: 1
-          title: Input Rows
-          description: The rows to evaluate
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          minItems: 1
-          title: Scoring Functions
-          description: The scoring functions to use for the evaluation
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsBodyRequest
-      description: Request body model for evaluating rows (without path parameter).
     HealthStatus:
       type: string
       enum:
@@ -8760,16 +7767,6 @@ components:
       required:
       - cached_tokens
       title: InputTokensDetails
-    JobStatus:
-      type: string
-      enum:
-      - completed
-      - in_progress
-      - failed
-      - scheduled
-      - cancelled
-      title: JobStatus
-      description: Status of a job execution.
     ListConnectorsResponse:
       properties:
         data:
@@ -9744,15 +8741,6 @@ components:
       - disabled
       title: ResponseTruncation
       description: Controls how the service truncates input when it exceeds the model context window.
-    RunEvalBodyRequest:
-      properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_config
-      title: RunEvalBodyRequest
-      description: Request body model for running an evaluation (without path parameter).
     SearchRankingOptions:
       properties:
         ranker:
@@ -10140,50 +9128,6 @@ components:
       - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
         title: OpenAIResponseContentPartReasoningText
       title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
-    ListBenchmarksRequest:
-      description: Request model for listing benchmarks.
-      properties: {}
-      title: ListBenchmarksRequest
-    GetBenchmarkRequest:
-      description: Request model for getting a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: GetBenchmarkRequest
-    UnregisterBenchmarkRequest:
-      description: Request model for unregistering a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: UnregisterBenchmarkRequest
-    GetDatasetRequest:
-      description: Request model for getting a dataset by ID.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: GetDatasetRequest
-    UnregisterDatasetRequest:
-      description: Request model for unregistering a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: UnregisterDatasetRequest
     ListModelsResponse:
       description: Response containing a list of model objects.
       properties:
@@ -10216,39 +9160,6 @@ components:
       required:
       - model_id
       title: UnregisterModelRequest
-    DialogType:
-      description: Parameter type for dialog data with semantic output labels.
-      properties:
-        type:
-          title: Type
-          type: string
-          enum:
-          - dialog
-      title: DialogType
-    ListScoringFunctionsRequest:
-      description: Request model for listing scoring functions.
-      properties: {}
-      title: ListScoringFunctionsRequest
-    GetScoringFunctionRequest:
-      description: Request model for getting a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: GetScoringFunctionRequest
-    UnregisterScoringFunctionRequest:
-      description: Request model for unregistering a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: UnregisterScoringFunctionRequest
     GetShieldRequest:
       description: Request model for getting a shield by identifier.
       properties:
@@ -10347,16 +9258,10 @@ components:
       - responses
       - batches
       - vector_io
-      - datasetio
-      - scoring
-      - eval
       - tool_runtime
       - models
       - shields
       - vector_stores
-      - datasets
-      - scoring_functions
-      - benchmarks
       - tool_groups
       - files
       - file_processors
@@ -11157,6 +10062,25 @@ components:
       required:
       - batch_id
       title: CancelBatchRequest
+    JobStatus:
+      description: Status of a job execution.
+      enum:
+      - completed
+      - in_progress
+      - failed
+      - scheduled
+      - cancelled
+      title: JobStatus
+      type: string
+    DialogType:
+      description: Parameter type for dialog data with semantic output labels.
+      properties:
+        type:
+          title: Type
+          type: string
+          enum:
+          - dialog
+      title: DialogType
     ConnectorInput:
       description: Input for creating a connector
       properties:
@@ -11390,90 +10314,6 @@ components:
       - conversation_id
       - item_id
       title: DeleteItemRequest
-    IterRowsRequest:
-      description: Request model for iterating over rows in a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-          type: string
-        start_index:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          nullable: true
-        limit:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          nullable: true
-      required:
-      - dataset_id
-      title: IterRowsRequest
-    BenchmarkIdRequest:
-      description: Request model containing benchmark_id path parameter.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark
-          minLength: 1
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: BenchmarkIdRequest
-    JobStatusRequest:
-      description: Request model for getting the status of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the status of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobStatusRequest
-    JobCancelRequest:
-      description: Request model for canceling a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to cancel
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobCancelRequest
-    JobResultRequest:
-      description: Request model for getting the result of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the result of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobResultRequest
     ProcessFileRequest:
       description: |-
         Request model for file processing operation.
diff --git a/docs/static/experimental-llama-stack-spec.yaml b/docs/static/experimental-llama-stack-spec.yaml
index 0d3b517e83..eb56f2fc1c 100644
--- a/docs/static/experimental-llama-stack-spec.yaml
+++ b/docs/static/experimental-llama-stack-spec.yaml
@@ -13,423 +13,6 @@ info:
 servers:
 - url: http://any-hosted-llama-stack.com
 paths:
-  /v1beta/datasetio/append-rows/{dataset_id}:
-    post:
-      responses:
-        '204':
-          description: Rows were successfully appended.
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Append rows to a dataset.
-      description: Append rows to a dataset.
-      operationId: append_rows_v1beta_datasetio_append_rows__dataset_id__post
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to append the rows to.
-          title: Dataset Id
-        description: The ID of the dataset to append the rows to.
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/AppendRowsRequest'
-  /v1beta/datasetio/iterrows/{dataset_id}:
-    get:
-      responses:
-        '200':
-          description: A PaginatedResponse containing the rows.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/PaginatedResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Get a paginated list of rows from a dataset.
-      description: |-
-        Get a paginated list of rows from a dataset.
-
-        Uses offset-based pagination where:
-        - start_index: The starting index (0-based). If None, starts from beginning.
-        - limit: Number of items to return. If None or -1, returns all items.
-
-        The response includes:
-        - data: List of items for the current page.
-        - has_more: Whether there are more items available after this set.
-      operationId: iterrows_v1beta_datasetio_iterrows__dataset_id__get
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-        description: The ID of the dataset to get the rows from.
-      - name: start_index
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          title: Start Index
-        description: Index into dataset for the first row to get. Get all rows if None.
-      - name: limit
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          title: Limit
-        description: The number of rows to get.
-  /v1beta/datasets:
-    get:
-      responses:
-        '200':
-          description: A list of dataset objects.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListDatasetsResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Datasets
-      summary: List all datasets.
-      description: List all datasets.
-      operationId: list_datasets_v1beta_datasets_get
-  /v1beta/datasets/{dataset_id}:
-    get:
-      responses:
-        '200':
-          description: The dataset object.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Dataset'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Datasets
-      summary: Get a dataset by its ID.
-      description: Get a dataset by its ID.
-      operationId: get_dataset_v1beta_datasets__dataset_id__get
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to get.
-          title: Dataset Id
-        description: The ID of the dataset to get.
-  /v1alpha/eval/benchmarks:
-    get:
-      responses:
-        '200':
-          description: A ListBenchmarksResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListBenchmarksResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Benchmarks
-      summary: List all benchmarks.
-      description: List all benchmarks.
-      operationId: list_benchmarks_v1alpha_eval_benchmarks_get
-  /v1alpha/eval/benchmarks/{benchmark_id}:
-    get:
-      responses:
-        '200':
-          description: A Benchmark.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Benchmark'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Benchmarks
-      summary: Get a benchmark by its ID.
-      description: Get a benchmark by its ID.
-      operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-        description: The ID of the benchmark to get.
-  /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
-    post:
-      responses:
-        '200':
-          description: EvaluateResponse object containing generations and scores.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Evaluate Rows
-      description: Evaluate a list of rows on a benchmark.
-      operationId: evaluate_rows_v1alpha_eval_benchmarks__benchmark_id__evaluations_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/EvaluateRowsBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs:
-    post:
-      responses:
-        '200':
-          description: The job that was created to run the evaluation.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Run Eval
-      description: Run an evaluation on a benchmark.
-      operationId: run_eval_v1alpha_eval_benchmarks__benchmark_id__jobs_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RunEvalBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
-    get:
-      responses:
-        '200':
-          description: The status of the evaluation job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Status
-      description: Get the status of a job.
-      operationId: job_status_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: Successful Response
-      tags:
-      - Eval
-      summary: Job Cancel
-      description: Cancel a job.
-      operationId: job_cancel_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
-    get:
-      responses:
-        '200':
-          description: The result of the job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Result
-      description: Get the result of a job.
-      operationId: job_result_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__result_get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
   /v1alpha/inference/rerank:
     post:
       responses:
@@ -5902,408 +5485,87 @@ components:
       - error
       title: ViolationLevel
       description: Severity level of a safety violation.
-    AggregationFunctionType:
-      type: string
-      enum:
-      - average
-      - weighted_average
-      - median
-      - categorical_count
-      - accuracy
-      title: AggregationFunctionType
-      description: Types of aggregation functions for scoring results.
     ArrayType:
+      description: Parameter type for array values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - array
       title: ArrayType
-      description: Parameter type for array values.
-    BasicScoringFnParams:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - basic
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: BasicScoringFnParams
-      description: Parameters for basic scoring function configuration.
     BooleanType:
+      description: Parameter type for boolean values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - boolean
       title: BooleanType
-      description: Parameter type for boolean values.
     ChatCompletionInputType:
+      description: Parameter type for chat completion input.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - chat_completion_input
       title: ChatCompletionInputType
-      description: Parameter type for chat completion input.
     CompletionInputType:
+      description: Parameter type for completion input.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - completion_input
       title: CompletionInputType
-      description: Parameter type for completion input.
     JsonType:
+      description: Parameter type for JSON values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - json
       title: JsonType
-      description: Parameter type for JSON values.
-    LLMAsJudgeScoringFnParams:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - llm_as_judge
-        judge_model:
-          type: string
-          title: Judge Model
-        prompt_template:
-          anyOf:
-          - type: string
-          - type: 'null'
-        judge_score_regexes:
-          items:
-            type: string
-          type: array
-          title: Judge Score Regexes
-          description: Regexes to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      required:
-      - judge_model
-      title: LLMAsJudgeScoringFnParams
-      description: Parameters for LLM-as-judge scoring function configuration.
     NumberType:
+      description: Parameter type for numeric values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - number
       title: NumberType
-      description: Parameter type for numeric values.
     ObjectType:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - object
-      title: ObjectType
       description: Parameter type for object values.
-    RegexParserScoringFnParams:
       properties:
         type:
-          type: string
           title: Type
-          enum:
-          - regex_parser
-        parsing_regexes:
-          items:
-            type: string
-          type: array
-          title: Parsing Regexes
-          description: Regex to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: RegexParserScoringFnParams
-      description: Parameters for regex parser scoring function configuration.
-    ScoringFn:
-      properties:
-        identifier:
           type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          enum:
-          - scoring_function
-        description:
-          anyOf:
-          - type: string
-          - type: 'null'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this definition
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the deterministic function
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval
-      required:
-      - identifier
-      - provider_id
-      - return_type
-      title: ScoringFn
-      description: A scoring function resource for evaluating model outputs.
-    ScoringFnParams:
-      discriminator:
-        mapping:
-          basic: '#/components/schemas/BasicScoringFnParams'
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        title: LLMAsJudgeScoringFnParams
-      - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        title: RegexParserScoringFnParams
-      - $ref: '#/components/schemas/BasicScoringFnParams'
-        title: BasicScoringFnParams
-      title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-    ScoringFnParamsType:
-      description: Types of scoring function parameter configurations.
-      enum:
-      - llm_as_judge
-      - regex_parser
-      - basic
-      title: ScoringFnParamsType
-      type: string
+          enum:
+          - object
+      title: ObjectType
     StringType:
+      description: Parameter type for string values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - string
       title: StringType
-      description: Parameter type for string values.
     UnionType:
+      description: Parameter type for union values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - union
       title: UnionType
-      description: Parameter type for union values.
-    ListScoringFunctionsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/ScoringFn'
-          type: array
-          title: Data
-          description: List of scoring function objects.
-      required:
-      - data
-      title: ListScoringFunctionsResponse
-      description: Response containing a list of scoring function objects.
-    ScoreRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Input Rows
-          description: The rows to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-      required:
-      - input_rows
-      - scoring_functions
-      title: ScoreRequest
-      description: Request model for scoring a list of rows.
-    ScoreResponse:
-      properties:
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult.
-      required:
-      - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoringResult:
-      properties:
-        score_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Score Rows
-          description: The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          additionalProperties: true
-          type: object
-          title: Aggregated Results
-          description: Map of metric name to aggregated value
-      required:
-      - score_rows
-      - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
-    ScoreBatchRequest:
-      properties:
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-        save_results_dataset:
-          type: boolean
-          title: Save Results Dataset
-          description: Whether to save the results to a dataset.
-          default: false
-      required:
-      - dataset_id
-      - scoring_functions
-      title: ScoreBatchRequest
-      description: Request model for scoring a batch of rows from a dataset.
-    ScoreBatchResponse:
-      properties:
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: (Optional) The identifier of the dataset that was scored
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult
-      required:
-      - results
-      title: ScoreBatchResponse
-      description: Response from batch scoring operations on datasets.
     Shield:
       properties:
         identifier:
@@ -7356,264 +6618,48 @@ components:
       - version
       title: VersionInfo
       description: Version information for the service.
-    AppendRowsRequest:
-      properties:
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: The rows to append to the dataset.
-      required:
-      - rows
-      title: AppendRowsRequest
-      description: Request body for appending rows to a dataset.
     PaginatedResponse:
+      description: A generic paginated response that follows a simple format.
       properties:
         data:
           items:
             additionalProperties: true
             type: object
-          type: array
           title: Data
+          type: array
         has_more:
-          type: boolean
           title: Has More
+          type: boolean
         url:
           anyOf:
           - type: string
           - type: 'null'
+          nullable: true
       required:
       - data
       - has_more
       title: PaginatedResponse
-      description: A generic paginated response that follows a simple format.
-    Dataset:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          description: Type of resource, always 'dataset' for datasets
-          enum:
-          - dataset
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: Purpose of the dataset indicating its intended use
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: Data source configuration for the dataset
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this dataset
-      required:
-      - identifier
-      - provider_id
-      - purpose
-      - source
-      title: Dataset
-      description: Dataset resource for storing and accessing training or evaluation data.
-    RowsDataSource:
-      properties:
-        type:
-          type: string
-          title: Type
-          description: The type of data source.
-          enum:
-          - rows
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: 'The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]'
-      required:
-      - rows
-      title: RowsDataSource
-      description: A dataset stored in rows.
-    URIDataSource:
-      properties:
-        type:
-          type: string
-          title: Type
-          description: The type of data source.
-          enum:
-          - uri
-        uri:
-          type: string
-          title: Uri
-          description: The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"
-      required:
-      - uri
-      title: URIDataSource
-      description: A dataset that can be obtained from a URI.
-    ListDatasetsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Dataset'
-          type: array
-          title: Data
-          description: List of datasets
-      required:
-      - data
-      title: ListDatasetsResponse
-      description: Response from listing datasets.
-    Benchmark:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          description: The resource type, always benchmark.
-          enum:
-          - benchmark
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: Identifier of the dataset to use for the benchmark evaluation.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: List of scoring function identifiers to apply during evaluation.
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Metadata for this evaluation task.
-      required:
-      - identifier
-      - provider_id
-      - dataset_id
-      - scoring_functions
-      title: Benchmark
-      description: A benchmark resource for evaluating model performance.
-    ListBenchmarksResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Benchmark'
-          type: array
-          title: Data
-          description: List of benchmark objects.
-      required:
-      - data
-      title: ListBenchmarksResponse
-      description: Response containing a list of benchmark objects.
-    BenchmarkConfig:
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/ModelCandidate'
-          description: The candidate to evaluate
-        scoring_params:
-          additionalProperties:
-            oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          type: object
-          title: Scoring Params
-          description: Map between scoring function id and parameters for each scoring function you want to run
-        num_examples:
-          anyOf:
-          - type: integer
-            minimum: 1.0
-          - type: 'null'
-          description: Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated
-      required:
-      - eval_candidate
-      title: BenchmarkConfig
-      description: A benchmark configuration for evaluation.
     GreedySamplingStrategy:
+      description: Greedy sampling strategy that selects the highest probability token at each step.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'greedy' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - greedy
       title: GreedySamplingStrategy
-      description: Greedy sampling strategy that selects the highest probability token at each step.
-    ModelCandidate:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - model
-        model:
-          type: string
-          minLength: 1
-          title: Model
-          description: The model ID to evaluate
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model
-        system_message:
-          anyOf:
-          - $ref: '#/components/schemas/SystemMessage'
-            title: SystemMessage
-          - type: 'null'
-          description: The system message providing instructions or context to the model
-          title: SystemMessage
-      required:
-      - model
-      - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
     SamplingParams:
+      description: Sampling parameters for text generation.
       properties:
         strategy:
+          description: The sampling strategy to use.
+          discriminator:
+            mapping:
+              greedy: '#/components/schemas/GreedySamplingStrategy'
+              top_k: '#/components/schemas/TopKSamplingStrategy'
+              top_p: '#/components/schemas/TopPSamplingStrategy'
+            propertyName: type
           oneOf:
           - $ref: '#/components/schemas/GreedySamplingStrategy'
             title: GreedySamplingStrategy
@@ -7622,200 +6668,127 @@ components:
           - $ref: '#/components/schemas/TopKSamplingStrategy'
             title: TopKSamplingStrategy
           title: GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy
-          description: The sampling strategy to use.
-          discriminator:
-            propertyName: type
-            mapping:
-              greedy: '#/components/schemas/GreedySamplingStrategy'
-              top_k: '#/components/schemas/TopKSamplingStrategy'
-              top_p: '#/components/schemas/TopPSamplingStrategy'
         max_tokens:
           anyOf:
-          - type: integer
-            minimum: 1.0
+          - minimum: 1
+            type: integer
           - type: 'null'
           description: The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length.
+          nullable: true
         repetition_penalty:
           anyOf:
-          - type: number
-            maximum: 2.0
+          - maximum: 2.0
             minimum: -2.0
+            type: number
           - type: 'null'
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
           default: 1.0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
         stop:
           anyOf:
           - items:
               type: string
-            type: array
             maxItems: 4
+            type: array
           - type: 'null'
           description: Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+          nullable: true
       title: SamplingParams
-      description: Sampling parameters for text generation.
     SystemMessage:
+      description: A system message providing instructions or context to the model.
       properties:
         role:
-          type: string
-          title: Role
           description: Must be 'system' to identify this as a system message.
+          title: Role
+          type: string
           enum:
           - system
         content:
           anyOf:
           - type: string
-          - oneOf:
-            - $ref: '#/components/schemas/ImageContentItem-Input'
-              title: ImageContentItem-Input
-            - $ref: '#/components/schemas/TextContentItem'
-              title: TextContentItem
-            discriminator:
-              propertyName: type
+          - discriminator:
               mapping:
-                image: '#/components/schemas/ImageContentItem-Input'
+                image: '#/components/schemas/ImageContentItem'
                 text: '#/components/schemas/TextContentItem'
-            title: ImageContentItem-Input | TextContentItem
+              propertyName: type
+            oneOf:
+            - $ref: '#/components/schemas/ImageContentItem'
+              title: ImageContentItem
+            - $ref: '#/components/schemas/TextContentItem'
+              title: TextContentItem
+            title: ImageContentItem | TextContentItem
           - items:
-              oneOf:
-              - $ref: '#/components/schemas/ImageContentItem-Input'
-                title: ImageContentItem-Input
-              - $ref: '#/components/schemas/TextContentItem'
-                title: TextContentItem
               discriminator:
-                propertyName: type
                 mapping:
-                  image: '#/components/schemas/ImageContentItem-Input'
+                  image: '#/components/schemas/ImageContentItem'
                   text: '#/components/schemas/TextContentItem'
-              title: ImageContentItem-Input | TextContentItem
+                propertyName: type
+              oneOf:
+              - $ref: '#/components/schemas/ImageContentItem'
+                title: ImageContentItem
+              - $ref: '#/components/schemas/TextContentItem'
+                title: TextContentItem
+              title: ImageContentItem | TextContentItem
             type: array
-            title: list[ImageContentItem-Input | TextContentItem]
-          title: string | list[ImageContentItem-Input | TextContentItem]
+            title: list[ImageContentItem | TextContentItem]
           description: The content of the 'system prompt'. If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages.
+          title: string | list[ImageContentItem | TextContentItem]
       required:
       - content
       title: SystemMessage
-      description: A system message providing instructions or context to the model.
     TopKSamplingStrategy:
+      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'top_k' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - top_k
         top_k:
-          type: integer
-          minimum: 1.0
-          title: Top K
           description: Number of top tokens to consider for sampling. Must be at least 1.
+          minimum: 1
+          title: Top K
+          type: integer
       required:
       - top_k
       title: TopKSamplingStrategy
-      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
     TopPSamplingStrategy:
+      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'top_p' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - top_p
         temperature:
-          type: number
+          description: Controls randomness in sampling. Higher values increase randomness.
           maximum: 2.0
           title: Temperature
-          description: Controls randomness in sampling. Higher values increase randomness.
+          type: number
           minimum: 0.0
         top_p:
-          type: number
+          default: 0.95
+          description: Cumulative probability threshold for nucleus sampling.
           maximum: 1.0
           minimum: 0.0
           title: Top P
-          description: Cumulative probability threshold for nucleus sampling.
-          default: 0.95
+          type: number
       required:
       - temperature
       title: TopPSamplingStrategy
-      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
-    EvaluateRowsRequest:
-      description: Request model for evaluating a list of rows on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        input_rows:
-          description: The rows to evaluate
-          items:
-            additionalProperties: true
-            type: object
-          minItems: 1
-          title: Input Rows
-          type: array
-        scoring_functions:
-          description: The scoring functions to use for the evaluation
-          items:
-            type: string
-          minItems: 1
-          title: Scoring Functions
-          type: array
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      properties:
-        generations:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Generations
-          description: The generations from the evaluation
-        scores:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Scores
-          description: The scores from the evaluation. Each key in the dict is a scoring function name
-      required:
-      - generations
-      - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    RunEvalRequest:
-      description: Request model for running an evaluation on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - benchmark_config
-      title: RunEvalRequest
     Job:
+      description: A job execution instance with status tracking.
       properties:
         job_id:
-          type: string
           title: Job Id
+          type: string
         status:
           $ref: '#/components/schemas/JobStatus'
       required:
       - job_id
       - status
       title: Job
-      description: A job execution instance with status tracking.
     RerankRequest:
       properties:
         model:
@@ -7960,85 +6933,6 @@ components:
       - $ref: '#/components/schemas/CompletionInputType'
         title: CompletionInputType
       title: StringType | ... (9 variants)
-    RegisterScoringFunctionRequest:
-      properties:
-        scoring_fn_id:
-          type: string
-          title: Scoring Fn Id
-          description: The ID of the scoring function to register.
-        description:
-          type: string
-          title: Description
-          description: The description of the scoring function.
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the scoring function.
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        provider_scoring_fn_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider scoring function to use for the scoring function.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the scoring function.
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-      required:
-      - scoring_fn_id
-      - description
-      - return_type
-      title: RegisterScoringFunctionRequest
-      description: Request model for registering a scoring function.
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -8065,90 +6959,6 @@ components:
       - shield_id
       title: RegisterShieldRequest
       description: Request model for registering a shield.
-    DataSource:
-      discriminator:
-        mapping:
-          rows: '#/components/schemas/RowsDataSource'
-          uri: '#/components/schemas/URIDataSource'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/URIDataSource'
-        title: URIDataSource
-      - $ref: '#/components/schemas/RowsDataSource'
-        title: RowsDataSource
-      title: URIDataSource | RowsDataSource
-    RegisterDatasetRequest:
-      properties:
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: The purpose of the dataset.
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: The data source of the dataset.
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata for the dataset.
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the dataset. If not provided, an ID will be generated.
-      required:
-      - purpose
-      - source
-      title: RegisterDatasetRequest
-      description: Request model for registering a dataset.
-    RegisterBenchmarkRequest:
-      properties:
-        benchmark_id:
-          type: string
-          title: Benchmark Id
-          description: The ID of the benchmark to register.
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to use for the benchmark.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: The scoring functions to use for the benchmark.
-        provider_benchmark_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider benchmark to use for the benchmark.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the benchmark.
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata to use for the benchmark.
-      required:
-      - benchmark_id
-      - dataset_id
-      - scoring_functions
-      title: RegisterBenchmarkRequest
-      description: Request model for registering a benchmark.
     AllowedToolsFilter:
       properties:
         tool_names:
@@ -8316,13 +7126,6 @@ components:
       - reasoning.encrypted_content
       title: ConversationItemInclude
       description: Specify additional output data to include in the model response.
-    DatasetPurpose:
-      type: string
-      enum:
-      - eval/question-answer
-      - eval/messages-answer
-      title: DatasetPurpose
-      description: Purpose of the dataset. Each purpose has a required input data schema.
     EmbeddedChunk-Input:
       properties:
         content:
@@ -8461,32 +7264,6 @@ components:
           - type: 'null'
       additionalProperties: true
       title: Errors
-    EvaluateRowsBodyRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          minItems: 1
-          title: Input Rows
-          description: The rows to evaluate
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          minItems: 1
-          title: Scoring Functions
-          description: The scoring functions to use for the evaluation
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsBodyRequest
-      description: Request body model for evaluating rows (without path parameter).
     HealthStatus:
       type: string
       enum:
@@ -8530,16 +7307,6 @@ components:
       required:
       - cached_tokens
       title: InputTokensDetails
-    JobStatus:
-      type: string
-      enum:
-      - completed
-      - in_progress
-      - failed
-      - scheduled
-      - cancelled
-      title: JobStatus
-      description: Status of a job execution.
     ListConnectorsResponse:
       properties:
         data:
@@ -9485,15 +8252,6 @@ components:
 
         Returns a list of chunks ready for storage in vector databases.
         Each chunk contains the content and metadata.
-    RunEvalBodyRequest:
-      properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_config
-      title: RunEvalBodyRequest
-      description: Request body model for running an evaluation (without path parameter).
     SearchRankingOptions:
       properties:
         ranker:
@@ -9881,50 +8639,6 @@ components:
       - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
         title: OpenAIResponseContentPartReasoningText
       title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
-    ListBenchmarksRequest:
-      description: Request model for listing benchmarks.
-      properties: {}
-      title: ListBenchmarksRequest
-    GetBenchmarkRequest:
-      description: Request model for getting a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: GetBenchmarkRequest
-    UnregisterBenchmarkRequest:
-      description: Request model for unregistering a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: UnregisterBenchmarkRequest
-    GetDatasetRequest:
-      description: Request model for getting a dataset by ID.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: GetDatasetRequest
-    UnregisterDatasetRequest:
-      description: Request model for unregistering a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: UnregisterDatasetRequest
     ListModelsResponse:
       description: Response containing a list of model objects.
       properties:
@@ -9957,39 +8671,6 @@ components:
       required:
       - model_id
       title: UnregisterModelRequest
-    DialogType:
-      description: Parameter type for dialog data with semantic output labels.
-      properties:
-        type:
-          title: Type
-          type: string
-          enum:
-          - dialog
-      title: DialogType
-    ListScoringFunctionsRequest:
-      description: Request model for listing scoring functions.
-      properties: {}
-      title: ListScoringFunctionsRequest
-    GetScoringFunctionRequest:
-      description: Request model for getting a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: GetScoringFunctionRequest
-    UnregisterScoringFunctionRequest:
-      description: Request model for unregistering a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: UnregisterScoringFunctionRequest
     GetShieldRequest:
       description: Request model for getting a shield by identifier.
       properties:
@@ -10088,16 +8769,10 @@ components:
       - responses
       - batches
       - vector_io
-      - datasetio
-      - scoring
-      - eval
       - tool_runtime
       - models
       - shields
       - vector_stores
-      - datasets
-      - scoring_functions
-      - benchmarks
       - tool_groups
       - files
       - file_processors
@@ -10898,6 +9573,25 @@ components:
       required:
       - batch_id
       title: CancelBatchRequest
+    JobStatus:
+      description: Status of a job execution.
+      enum:
+      - completed
+      - in_progress
+      - failed
+      - scheduled
+      - cancelled
+      title: JobStatus
+      type: string
+    DialogType:
+      description: Parameter type for dialog data with semantic output labels.
+      properties:
+        type:
+          title: Type
+          type: string
+          enum:
+          - dialog
+      title: DialogType
     ConnectorInput:
       description: Input for creating a connector
       properties:
@@ -11131,90 +9825,6 @@ components:
       - conversation_id
       - item_id
       title: DeleteItemRequest
-    IterRowsRequest:
-      description: Request model for iterating over rows in a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-          type: string
-        start_index:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          nullable: true
-        limit:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          nullable: true
-      required:
-      - dataset_id
-      title: IterRowsRequest
-    BenchmarkIdRequest:
-      description: Request model containing benchmark_id path parameter.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark
-          minLength: 1
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: BenchmarkIdRequest
-    JobStatusRequest:
-      description: Request model for getting the status of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the status of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobStatusRequest
-    JobCancelRequest:
-      description: Request model for canceling a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to cancel
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobCancelRequest
-    JobResultRequest:
-      description: Request model for getting the result of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the result of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobResultRequest
     ProcessFileRequest:
       description: |-
         Request model for file processing operation.
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 249488661f..e8dd3fa96f 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -1946,131 +1946,6 @@ paths:
             schema:
               $ref: '#/components/schemas/RunShieldRequest'
         required: true
-  /v1/scoring-functions:
-    get:
-      responses:
-        '200':
-          description: A ListScoringFunctionsResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListScoringFunctionsResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring Functions
-      summary: List all scoring functions.
-      description: List all scoring functions.
-      operationId: list_scoring_functions_v1_scoring_functions_get
-  /v1/scoring-functions/{scoring_fn_id}:
-    get:
-      responses:
-        '200':
-          description: A ScoringFn.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoringFn'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Scoring Functions
-      summary: Get a scoring function by its ID.
-      description: Get a scoring function by its ID.
-      operationId: get_scoring_function_v1_scoring_functions__scoring_fn_id__get
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to get.
-  /v1/scoring/score:
-    post:
-      responses:
-        '200':
-          description: A ScoreResponse object containing rows and aggregated results.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a list of rows.
-      description: Score a list of rows.
-      operationId: score_v1_scoring_score_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreRequest'
-        required: true
-  /v1/scoring/score-batch:
-    post:
-      responses:
-        '200':
-          description: A ScoreBatchResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreBatchResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a batch of rows.
-      description: Score a batch of rows.
-      operationId: score_batch_v1_scoring_score_batch_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreBatchRequest'
-        required: true
   /v1/shields:
     get:
       responses:
@@ -8463,146 +8338,88 @@ components:
       - error
       title: ViolationLevel
       description: Severity level of a safety violation.
-    AggregationFunctionType:
-      type: string
-      enum:
-      - average
-      - weighted_average
-      - median
-      - categorical_count
-      - accuracy
-      title: AggregationFunctionType
-      description: Types of aggregation functions for scoring results.
     ArrayType:
+      description: Parameter type for array values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - array
       title: ArrayType
-      description: Parameter type for array values.
-    BasicScoringFnParams:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - basic
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: BasicScoringFnParams
-      description: Parameters for basic scoring function configuration.
     BooleanType:
+      description: Parameter type for boolean values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - boolean
       title: BooleanType
-      description: Parameter type for boolean values.
     ChatCompletionInputType:
+      description: Parameter type for chat completion input.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - chat_completion_input
       title: ChatCompletionInputType
-      description: Parameter type for chat completion input.
     CompletionInputType:
+      description: Parameter type for completion input.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - completion_input
       title: CompletionInputType
-      description: Parameter type for completion input.
     JsonType:
+      description: Parameter type for JSON values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - json
       title: JsonType
-      description: Parameter type for JSON values.
-    LLMAsJudgeScoringFnParams:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - llm_as_judge
-        judge_model:
-          type: string
-          title: Judge Model
-        prompt_template:
-          anyOf:
-          - type: string
-          - type: 'null'
-        judge_score_regexes:
-          items:
-            type: string
-          type: array
-          title: Judge Score Regexes
-          description: Regexes to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      required:
-      - judge_model
-      title: LLMAsJudgeScoringFnParams
-      description: Parameters for LLM-as-judge scoring function configuration.
     NumberType:
+      description: Parameter type for numeric values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - number
       title: NumberType
-      description: Parameter type for numeric values.
     ObjectType:
+      description: Parameter type for object values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - object
       title: ObjectType
-      description: Parameter type for object values.
-    RegexParserScoringFnParams:
+    StringType:
+      description: Parameter type for string values.
       properties:
         type:
+          title: Type
           type: string
+          enum:
+          - string
+      title: StringType
+    UnionType:
+      description: Parameter type for union values.
+      properties:
+        type:
           title: Type
+          type: string
           enum:
-          - regex_parser
-        parsing_regexes:
-          items:
-            type: string
-          type: array
-          title: Parsing Regexes
-          description: Regex to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: RegexParserScoringFnParams
-      description: Parameters for regex parser scoring function configuration.
-    ScoringFn:
+          - union
+      title: UnionType
+    Shield:
       properties:
         identifier:
           type: string
@@ -8621,297 +8438,34 @@ components:
           type: string
           title: Type
           enum:
-          - scoring_function
-        description:
-          anyOf:
-          - type: string
-          - type: 'null'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this definition
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the deterministic function
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
+          - shield
         params:
           anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
+          - additionalProperties: true
+            type: object
           - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval
       required:
       - identifier
       - provider_id
-      - return_type
-      title: ScoringFn
-      description: A scoring function resource for evaluating model outputs.
-    ScoringFnParams:
-      discriminator:
-        mapping:
-          basic: '#/components/schemas/BasicScoringFnParams'
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        title: LLMAsJudgeScoringFnParams
-      - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        title: RegexParserScoringFnParams
-      - $ref: '#/components/schemas/BasicScoringFnParams'
-        title: BasicScoringFnParams
-      title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-    ScoringFnParamsType:
-      description: Types of scoring function parameter configurations.
-      enum:
-      - llm_as_judge
-      - regex_parser
-      - basic
-      title: ScoringFnParamsType
-      type: string
-    StringType:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - string
-      title: StringType
-      description: Parameter type for string values.
-    UnionType:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - union
-      title: UnionType
-      description: Parameter type for union values.
-    ListScoringFunctionsResponse:
+      title: Shield
+      description: A safety shield resource that can be used to check content.
+    ListShieldsResponse:
       properties:
         data:
           items:
-            $ref: '#/components/schemas/ScoringFn'
+            $ref: '#/components/schemas/Shield'
           type: array
           title: Data
-          description: List of scoring function objects.
+          description: List of shield objects
       required:
       - data
-      title: ListScoringFunctionsResponse
-      description: Response containing a list of scoring function objects.
-    ScoreRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Input Rows
-          description: The rows to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-      required:
-      - input_rows
-      - scoring_functions
-      title: ScoreRequest
-      description: Request model for scoring a list of rows.
-    ScoreResponse:
-      properties:
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult.
-      required:
-      - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoringResult:
-      properties:
-        score_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Score Rows
-          description: The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          additionalProperties: true
-          type: object
-          title: Aggregated Results
-          description: Map of metric name to aggregated value
-      required:
-      - score_rows
-      - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
-    ScoreBatchRequest:
+      title: ListShieldsResponse
+      description: Response containing a list of all shields.
+    ImageContentItem:
+      description: A image content item
       properties:
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-        save_results_dataset:
-          type: boolean
-          title: Save Results Dataset
-          description: Whether to save the results to a dataset.
-          default: false
-      required:
-      - dataset_id
-      - scoring_functions
-      title: ScoreBatchRequest
-      description: Request model for scoring a batch of rows from a dataset.
-    ScoreBatchResponse:
-      properties:
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: (Optional) The identifier of the dataset that was scored
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult
-      required:
-      - results
-      title: ScoreBatchResponse
-      description: Response from batch scoring operations on datasets.
-    Shield:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          enum:
-          - shield
-        params:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-      required:
-      - identifier
-      - provider_id
-      title: Shield
-      description: A safety shield resource that can be used to check content.
-    ListShieldsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Shield'
-          type: array
-          title: Data
-          description: List of shield objects
-      required:
-      - data
-      title: ListShieldsResponse
-      description: Response containing a list of all shields.
-    ImageContentItem:
-      description: A image content item
-      properties:
-        type:
-          title: Type
+        type:
+          title: Type
           type: string
           enum:
           - image
@@ -9917,264 +9471,48 @@ components:
       - version
       title: VersionInfo
       description: Version information for the service.
-    AppendRowsRequest:
-      properties:
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: The rows to append to the dataset.
-      required:
-      - rows
-      title: AppendRowsRequest
-      description: Request body for appending rows to a dataset.
     PaginatedResponse:
+      description: A generic paginated response that follows a simple format.
       properties:
         data:
           items:
             additionalProperties: true
             type: object
-          type: array
           title: Data
+          type: array
         has_more:
-          type: boolean
           title: Has More
+          type: boolean
         url:
           anyOf:
           - type: string
           - type: 'null'
+          nullable: true
       required:
       - data
       - has_more
       title: PaginatedResponse
-      description: A generic paginated response that follows a simple format.
-    Dataset:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          description: Type of resource, always 'dataset' for datasets
-          enum:
-          - dataset
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: Purpose of the dataset indicating its intended use
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: Data source configuration for the dataset
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this dataset
-      required:
-      - identifier
-      - provider_id
-      - purpose
-      - source
-      title: Dataset
-      description: Dataset resource for storing and accessing training or evaluation data.
-    RowsDataSource:
-      properties:
-        type:
-          type: string
-          title: Type
-          description: The type of data source.
-          enum:
-          - rows
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: 'The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]'
-      required:
-      - rows
-      title: RowsDataSource
-      description: A dataset stored in rows.
-    URIDataSource:
-      properties:
-        type:
-          type: string
-          title: Type
-          description: The type of data source.
-          enum:
-          - uri
-        uri:
-          type: string
-          title: Uri
-          description: The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"
-      required:
-      - uri
-      title: URIDataSource
-      description: A dataset that can be obtained from a URI.
-    ListDatasetsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Dataset'
-          type: array
-          title: Data
-          description: List of datasets
-      required:
-      - data
-      title: ListDatasetsResponse
-      description: Response from listing datasets.
-    Benchmark:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          description: The resource type, always benchmark.
-          enum:
-          - benchmark
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: Identifier of the dataset to use for the benchmark evaluation.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: List of scoring function identifiers to apply during evaluation.
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Metadata for this evaluation task.
-      required:
-      - identifier
-      - provider_id
-      - dataset_id
-      - scoring_functions
-      title: Benchmark
-      description: A benchmark resource for evaluating model performance.
-    ListBenchmarksResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Benchmark'
-          type: array
-          title: Data
-          description: List of benchmark objects.
-      required:
-      - data
-      title: ListBenchmarksResponse
-      description: Response containing a list of benchmark objects.
-    BenchmarkConfig:
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/ModelCandidate'
-          description: The candidate to evaluate
-        scoring_params:
-          additionalProperties:
-            oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          type: object
-          title: Scoring Params
-          description: Map between scoring function id and parameters for each scoring function you want to run
-        num_examples:
-          anyOf:
-          - type: integer
-            minimum: 1.0
-          - type: 'null'
-          description: Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated
-      required:
-      - eval_candidate
-      title: BenchmarkConfig
-      description: A benchmark configuration for evaluation.
     GreedySamplingStrategy:
+      description: Greedy sampling strategy that selects the highest probability token at each step.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'greedy' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - greedy
       title: GreedySamplingStrategy
-      description: Greedy sampling strategy that selects the highest probability token at each step.
-    ModelCandidate:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - model
-        model:
-          type: string
-          minLength: 1
-          title: Model
-          description: The model ID to evaluate
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model
-        system_message:
-          anyOf:
-          - $ref: '#/components/schemas/SystemMessage'
-            title: SystemMessage
-          - type: 'null'
-          description: The system message providing instructions or context to the model
-          title: SystemMessage
-      required:
-      - model
-      - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
     SamplingParams:
+      description: Sampling parameters for text generation.
       properties:
         strategy:
+          description: The sampling strategy to use.
+          discriminator:
+            mapping:
+              greedy: '#/components/schemas/GreedySamplingStrategy'
+              top_k: '#/components/schemas/TopKSamplingStrategy'
+              top_p: '#/components/schemas/TopPSamplingStrategy'
+            propertyName: type
           oneOf:
           - $ref: '#/components/schemas/GreedySamplingStrategy'
             title: GreedySamplingStrategy
@@ -10183,200 +9521,127 @@ components:
           - $ref: '#/components/schemas/TopKSamplingStrategy'
             title: TopKSamplingStrategy
           title: GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy
-          description: The sampling strategy to use.
-          discriminator:
-            propertyName: type
-            mapping:
-              greedy: '#/components/schemas/GreedySamplingStrategy'
-              top_k: '#/components/schemas/TopKSamplingStrategy'
-              top_p: '#/components/schemas/TopPSamplingStrategy'
         max_tokens:
           anyOf:
-          - type: integer
-            minimum: 1.0
+          - minimum: 1
+            type: integer
           - type: 'null'
           description: The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length.
+          nullable: true
         repetition_penalty:
           anyOf:
-          - type: number
-            maximum: 2.0
+          - maximum: 2.0
             minimum: -2.0
+            type: number
           - type: 'null'
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
           default: 1.0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
         stop:
           anyOf:
           - items:
               type: string
-            type: array
             maxItems: 4
+            type: array
           - type: 'null'
           description: Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+          nullable: true
       title: SamplingParams
-      description: Sampling parameters for text generation.
     SystemMessage:
+      description: A system message providing instructions or context to the model.
       properties:
         role:
-          type: string
-          title: Role
           description: Must be 'system' to identify this as a system message.
+          title: Role
+          type: string
           enum:
           - system
         content:
           anyOf:
           - type: string
-          - oneOf:
-            - $ref: '#/components/schemas/ImageContentItem-Input'
-              title: ImageContentItem-Input
-            - $ref: '#/components/schemas/TextContentItem'
-              title: TextContentItem
-            discriminator:
-              propertyName: type
+          - discriminator:
               mapping:
-                image: '#/components/schemas/ImageContentItem-Input'
+                image: '#/components/schemas/ImageContentItem'
                 text: '#/components/schemas/TextContentItem'
-            title: ImageContentItem-Input | TextContentItem
+              propertyName: type
+            oneOf:
+            - $ref: '#/components/schemas/ImageContentItem'
+              title: ImageContentItem
+            - $ref: '#/components/schemas/TextContentItem'
+              title: TextContentItem
+            title: ImageContentItem | TextContentItem
           - items:
-              oneOf:
-              - $ref: '#/components/schemas/ImageContentItem-Input'
-                title: ImageContentItem-Input
-              - $ref: '#/components/schemas/TextContentItem'
-                title: TextContentItem
               discriminator:
-                propertyName: type
                 mapping:
-                  image: '#/components/schemas/ImageContentItem-Input'
+                  image: '#/components/schemas/ImageContentItem'
                   text: '#/components/schemas/TextContentItem'
-              title: ImageContentItem-Input | TextContentItem
+                propertyName: type
+              oneOf:
+              - $ref: '#/components/schemas/ImageContentItem'
+                title: ImageContentItem
+              - $ref: '#/components/schemas/TextContentItem'
+                title: TextContentItem
+              title: ImageContentItem | TextContentItem
             type: array
-            title: list[ImageContentItem-Input | TextContentItem]
-          title: string | list[ImageContentItem-Input | TextContentItem]
+            title: list[ImageContentItem | TextContentItem]
           description: The content of the 'system prompt'. If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages.
+          title: string | list[ImageContentItem | TextContentItem]
       required:
       - content
       title: SystemMessage
-      description: A system message providing instructions or context to the model.
     TopKSamplingStrategy:
+      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'top_k' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - top_k
         top_k:
-          type: integer
-          minimum: 1.0
-          title: Top K
           description: Number of top tokens to consider for sampling. Must be at least 1.
+          minimum: 1
+          title: Top K
+          type: integer
       required:
       - top_k
       title: TopKSamplingStrategy
-      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
     TopPSamplingStrategy:
+      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'top_p' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - top_p
         temperature:
-          type: number
+          description: Controls randomness in sampling. Higher values increase randomness.
           maximum: 2.0
           title: Temperature
-          description: Controls randomness in sampling. Higher values increase randomness.
+          type: number
           minimum: 0.0
         top_p:
-          type: number
+          default: 0.95
+          description: Cumulative probability threshold for nucleus sampling.
           maximum: 1.0
           minimum: 0.0
           title: Top P
-          description: Cumulative probability threshold for nucleus sampling.
-          default: 0.95
+          type: number
       required:
       - temperature
       title: TopPSamplingStrategy
-      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
-    EvaluateRowsRequest:
-      description: Request model for evaluating a list of rows on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        input_rows:
-          description: The rows to evaluate
-          items:
-            additionalProperties: true
-            type: object
-          minItems: 1
-          title: Input Rows
-          type: array
-        scoring_functions:
-          description: The scoring functions to use for the evaluation
-          items:
-            type: string
-          minItems: 1
-          title: Scoring Functions
-          type: array
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      properties:
-        generations:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Generations
-          description: The generations from the evaluation
-        scores:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Scores
-          description: The scores from the evaluation. Each key in the dict is a scoring function name
-      required:
-      - generations
-      - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    RunEvalRequest:
-      description: Request model for running an evaluation on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - benchmark_config
-      title: RunEvalRequest
     Job:
+      description: A job execution instance with status tracking.
       properties:
         job_id:
-          type: string
           title: Job Id
+          type: string
         status:
           $ref: '#/components/schemas/JobStatus'
       required:
       - job_id
       - status
       title: Job
-      description: A job execution instance with status tracking.
     RerankRequest:
       properties:
         model:
@@ -10521,85 +9786,6 @@ components:
       - $ref: '#/components/schemas/CompletionInputType'
         title: CompletionInputType
       title: StringType | ... (9 variants)
-    RegisterScoringFunctionRequest:
-      properties:
-        scoring_fn_id:
-          type: string
-          title: Scoring Fn Id
-          description: The ID of the scoring function to register.
-        description:
-          type: string
-          title: Description
-          description: The description of the scoring function.
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the scoring function.
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        provider_scoring_fn_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider scoring function to use for the scoring function.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the scoring function.
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-      required:
-      - scoring_fn_id
-      - description
-      - return_type
-      title: RegisterScoringFunctionRequest
-      description: Request model for registering a scoring function.
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -10626,90 +9812,6 @@ components:
       - shield_id
       title: RegisterShieldRequest
       description: Request model for registering a shield.
-    DataSource:
-      discriminator:
-        mapping:
-          rows: '#/components/schemas/RowsDataSource'
-          uri: '#/components/schemas/URIDataSource'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/URIDataSource'
-        title: URIDataSource
-      - $ref: '#/components/schemas/RowsDataSource'
-        title: RowsDataSource
-      title: URIDataSource | RowsDataSource
-    RegisterDatasetRequest:
-      properties:
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: The purpose of the dataset.
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: The data source of the dataset.
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata for the dataset.
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the dataset. If not provided, an ID will be generated.
-      required:
-      - purpose
-      - source
-      title: RegisterDatasetRequest
-      description: Request model for registering a dataset.
-    RegisterBenchmarkRequest:
-      properties:
-        benchmark_id:
-          type: string
-          title: Benchmark Id
-          description: The ID of the benchmark to register.
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to use for the benchmark.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: The scoring functions to use for the benchmark.
-        provider_benchmark_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider benchmark to use for the benchmark.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the benchmark.
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata to use for the benchmark.
-      required:
-      - benchmark_id
-      - dataset_id
-      - scoring_functions
-      title: RegisterBenchmarkRequest
-      description: Request model for registering a benchmark.
     AllowedToolsFilter:
       properties:
         tool_names:
@@ -11623,13 +10725,6 @@ components:
       - model
       title: CreateResponseRequest
       description: Request model for creating a response.
-    DatasetPurpose:
-      type: string
-      enum:
-      - eval/question-answer
-      - eval/messages-answer
-      title: DatasetPurpose
-      description: Purpose of the dataset. Each purpose has a required input data schema.
     EmbeddedChunk-Input:
       properties:
         content:
@@ -11768,32 +10863,6 @@ components:
           - type: 'null'
       additionalProperties: true
       title: Errors
-    EvaluateRowsBodyRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          minItems: 1
-          title: Input Rows
-          description: The rows to evaluate
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          minItems: 1
-          title: Scoring Functions
-          description: The scoring functions to use for the evaluation
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsBodyRequest
-      description: Request body model for evaluating rows (without path parameter).
     HealthStatus:
       type: string
       enum:
@@ -11837,16 +10906,6 @@ components:
       required:
       - cached_tokens
       title: InputTokensDetails
-    JobStatus:
-      type: string
-      enum:
-      - completed
-      - in_progress
-      - failed
-      - scheduled
-      - cancelled
-      title: JobStatus
-      description: Status of a job execution.
     ListConnectorsResponse:
       properties:
         data:
@@ -12821,15 +11880,6 @@ components:
       - disabled
       title: ResponseTruncation
       description: Controls how the service truncates input when it exceeds the model context window.
-    RunEvalBodyRequest:
-      properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_config
-      title: RunEvalBodyRequest
-      description: Request body model for running an evaluation (without path parameter).
     SearchRankingOptions:
       properties:
         ranker:
@@ -13217,50 +12267,6 @@ components:
       - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
         title: OpenAIResponseContentPartReasoningText
       title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
-    ListBenchmarksRequest:
-      description: Request model for listing benchmarks.
-      properties: {}
-      title: ListBenchmarksRequest
-    GetBenchmarkRequest:
-      description: Request model for getting a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: GetBenchmarkRequest
-    UnregisterBenchmarkRequest:
-      description: Request model for unregistering a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: UnregisterBenchmarkRequest
-    GetDatasetRequest:
-      description: Request model for getting a dataset by ID.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: GetDatasetRequest
-    UnregisterDatasetRequest:
-      description: Request model for unregistering a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: UnregisterDatasetRequest
     ListModelsResponse:
       description: Response containing a list of model objects.
       properties:
@@ -13293,39 +12299,6 @@ components:
       required:
       - model_id
       title: UnregisterModelRequest
-    DialogType:
-      description: Parameter type for dialog data with semantic output labels.
-      properties:
-        type:
-          title: Type
-          type: string
-          enum:
-          - dialog
-      title: DialogType
-    ListScoringFunctionsRequest:
-      description: Request model for listing scoring functions.
-      properties: {}
-      title: ListScoringFunctionsRequest
-    GetScoringFunctionRequest:
-      description: Request model for getting a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: GetScoringFunctionRequest
-    UnregisterScoringFunctionRequest:
-      description: Request model for unregistering a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: UnregisterScoringFunctionRequest
     GetShieldRequest:
       description: Request model for getting a shield by identifier.
       properties:
@@ -13424,16 +12397,10 @@ components:
       - responses
       - batches
       - vector_io
-      - datasetio
-      - scoring
-      - eval
       - tool_runtime
       - models
       - shields
       - vector_stores
-      - datasets
-      - scoring_functions
-      - benchmarks
       - tool_groups
       - files
       - file_processors
@@ -14234,6 +13201,25 @@ components:
       required:
       - batch_id
       title: CancelBatchRequest
+    JobStatus:
+      description: Status of a job execution.
+      enum:
+      - completed
+      - in_progress
+      - failed
+      - scheduled
+      - cancelled
+      title: JobStatus
+      type: string
+    DialogType:
+      description: Parameter type for dialog data with semantic output labels.
+      properties:
+        type:
+          title: Type
+          type: string
+          enum:
+          - dialog
+      title: DialogType
     ConnectorInput:
       description: Input for creating a connector
       properties:
@@ -14467,90 +13453,6 @@ components:
       - conversation_id
       - item_id
       title: DeleteItemRequest
-    IterRowsRequest:
-      description: Request model for iterating over rows in a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-          type: string
-        start_index:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          nullable: true
-        limit:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          nullable: true
-      required:
-      - dataset_id
-      title: IterRowsRequest
-    BenchmarkIdRequest:
-      description: Request model containing benchmark_id path parameter.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark
-          minLength: 1
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: BenchmarkIdRequest
-    JobStatusRequest:
-      description: Request model for getting the status of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the status of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobStatusRequest
-    JobCancelRequest:
-      description: Request model for canceling a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to cancel
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobCancelRequest
-    JobResultRequest:
-      description: Request model for getting the result of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the result of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobResultRequest
     ProcessFileRequest:
       description: |-
         Request model for file processing operation.
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index d903796d75..57c8f3d955 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -1948,190 +1948,6 @@ paths:
             schema:
               $ref: '#/components/schemas/RunShieldRequest'
         required: true
-  /v1/scoring-functions:
-    get:
-      responses:
-        '200':
-          description: A ListScoringFunctionsResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListScoringFunctionsResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring Functions
-      summary: List all scoring functions.
-      description: List all scoring functions.
-      operationId: list_scoring_functions_v1_scoring_functions_get
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The scoring function was successfully registered.
-      tags:
-      - Scoring Functions
-      summary: Register a scoring function.
-      description: Register a scoring function.
-      operationId: register_scoring_function_v1_scoring_functions_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterScoringFunctionRequest'
-        required: true
-      deprecated: true
-  /v1/scoring-functions/{scoring_fn_id}:
-    get:
-      responses:
-        '200':
-          description: A ScoringFn.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoringFn'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Scoring Functions
-      summary: Get a scoring function by its ID.
-      description: Get a scoring function by its ID.
-      operationId: get_scoring_function_v1_scoring_functions__scoring_fn_id__get
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to get.
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The scoring function was successfully unregistered.
-      tags:
-      - Scoring Functions
-      summary: Unregister a scoring function.
-      description: Unregister a scoring function.
-      operationId: unregister_scoring_function_v1_scoring_functions__scoring_fn_id__delete
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to unregister.
-      deprecated: true
-  /v1/scoring/score:
-    post:
-      responses:
-        '200':
-          description: A ScoreResponse object containing rows and aggregated results.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a list of rows.
-      description: Score a list of rows.
-      operationId: score_v1_scoring_score_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreRequest'
-        required: true
-  /v1/scoring/score-batch:
-    post:
-      responses:
-        '200':
-          description: A ScoreBatchResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreBatchResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a batch of rows.
-      description: Score a batch of rows.
-      operationId: score_batch_v1_scoring_score_batch_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreBatchRequest'
-        required: true
   /v1/shields:
     get:
       responses:
@@ -3380,116 +3196,15 @@ paths:
       description: Get the version of the service.
       operationId: version_v1_version_get
       x-public: true
-  /v1beta/datasetio/append-rows/{dataset_id}:
+  /v1alpha/inference/rerank:
     post:
-      responses:
-        '204':
-          description: Rows were successfully appended.
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Append rows to a dataset.
-      description: Append rows to a dataset.
-      operationId: append_rows_v1beta_datasetio_append_rows__dataset_id__post
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to append the rows to.
-          title: Dataset Id
-        description: The ID of the dataset to append the rows to.
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/AppendRowsRequest'
-  /v1beta/datasetio/iterrows/{dataset_id}:
-    get:
-      responses:
-        '200':
-          description: A PaginatedResponse containing the rows.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/PaginatedResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Get a paginated list of rows from a dataset.
-      description: |-
-        Get a paginated list of rows from a dataset.
-
-        Uses offset-based pagination where:
-        - start_index: The starting index (0-based). If None, starts from beginning.
-        - limit: Number of items to return. If None or -1, returns all items.
-
-        The response includes:
-        - data: List of items for the current page.
-        - has_more: Whether there are more items available after this set.
-      operationId: iterrows_v1beta_datasetio_iterrows__dataset_id__get
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-        description: The ID of the dataset to get the rows from.
-      - name: start_index
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          title: Start Index
-        description: Index into dataset for the first row to get. Get all rows if None.
-      - name: limit
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          title: Limit
-        description: The number of rows to get.
-  /v1beta/datasets:
-    get:
       responses:
         '200':
-          description: A list of dataset objects.
+          description: RerankResponse with indices sorted by relevance score (descending).
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/ListDatasetsResponse'
+                $ref: '#/components/schemas/RerankResponse'
         '400':
           description: Bad Request
           $ref: '#/components/responses/BadRequest400'
@@ -3503,18 +3218,25 @@ paths:
           description: Default Response
           $ref: '#/components/responses/DefaultError'
       tags:
-      - Datasets
-      summary: List all datasets.
-      description: List all datasets.
-      operationId: list_datasets_v1beta_datasets_get
-    post:
+      - Inference
+      summary: Rerank documents based on relevance to a query.
+      description: Rerank a list of documents based on their relevance to a query.
+      operationId: rerank_v1alpha_inference_rerank_post
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RerankRequest'
+        required: true
+  /v1alpha/admin/providers:
+    get:
       responses:
         '200':
-          description: The registered dataset object.
+          description: A list of provider information objects.
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/Dataset'
+                $ref: '#/components/schemas/ListProvidersResponse'
         '400':
           description: Bad Request
           $ref: '#/components/responses/BadRequest400'
@@ -3528,26 +3250,19 @@ paths:
           description: Default Response
           $ref: '#/components/responses/DefaultError'
       tags:
-      - Datasets
-      summary: Register a new dataset.
-      description: Register a new dataset.
-      operationId: register_dataset_v1beta_datasets_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterDatasetRequest'
-        required: true
-      deprecated: true
-  /v1beta/datasets/{dataset_id}:
+      - Admin
+      summary: List all available providers
+      description: List all available providers with their configuration and health status.
+      operationId: list_providers_v1alpha_admin_providers_get
+  /v1alpha/admin/providers/{provider_id}:
     get:
       responses:
         '200':
-          description: The dataset object.
+          description: The provider information object.
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/Dataset'
+                $ref: '#/components/schemas/ProviderInfo'
         '400':
           $ref: '#/components/responses/BadRequest400'
           description: Bad Request
@@ -3560,469 +3275,31 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
           description: Default Response
+        '404':
+          description: Provider not found.
       tags:
-      - Datasets
-      summary: Get a dataset by its ID.
-      description: Get a dataset by its ID.
-      operationId: get_dataset_v1beta_datasets__dataset_id__get
+      - Admin
+      summary: Get provider details
+      description: Get detailed information about a specific provider.
+      operationId: inspect_provider_v1alpha_admin_providers__provider_id__get
       parameters:
-      - name: dataset_id
+      - name: provider_id
         in: path
         required: true
         schema:
           type: string
-          description: The ID of the dataset to get.
-          title: Dataset Id
-        description: The ID of the dataset to get.
-    delete:
+          description: The ID of the provider to inspect.
+          title: Provider Id
+        description: The ID of the provider to inspect.
+  /v1alpha/admin/inspect/routes:
+    get:
       responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The dataset was successfully unregistered.
-      tags:
-      - Datasets
-      summary: Unregister a dataset by its ID.
-      description: Unregister a dataset by its ID.
-      operationId: unregister_dataset_v1beta_datasets__dataset_id__delete
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-        description: The ID of the dataset to unregister.
-      deprecated: true
-  /v1alpha/eval/benchmarks:
-    get:
-      responses:
-        '200':
-          description: A ListBenchmarksResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListBenchmarksResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Benchmarks
-      summary: List all benchmarks.
-      description: List all benchmarks.
-      operationId: list_benchmarks_v1alpha_eval_benchmarks_get
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The benchmark was successfully registered.
-      tags:
-      - Benchmarks
-      summary: Register a benchmark.
-      description: Register a benchmark.
-      operationId: register_benchmark_v1alpha_eval_benchmarks_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterBenchmarkRequest'
-        required: true
-      deprecated: true
-  /v1alpha/eval/benchmarks/{benchmark_id}:
-    get:
-      responses:
-        '200':
-          description: A Benchmark.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Benchmark'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Benchmarks
-      summary: Get a benchmark by its ID.
-      description: Get a benchmark by its ID.
-      operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-        description: The ID of the benchmark to get.
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The benchmark was successfully unregistered.
-      tags:
-      - Benchmarks
-      summary: Unregister a benchmark.
-      description: Unregister a benchmark.
-      operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-        description: The ID of the benchmark to unregister.
-      deprecated: true
-  /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
-    post:
-      responses:
-        '200':
-          description: EvaluateResponse object containing generations and scores.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Evaluate Rows
-      description: Evaluate a list of rows on a benchmark.
-      operationId: evaluate_rows_v1alpha_eval_benchmarks__benchmark_id__evaluations_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/EvaluateRowsBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs:
-    post:
-      responses:
-        '200':
-          description: The job that was created to run the evaluation.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Run Eval
-      description: Run an evaluation on a benchmark.
-      operationId: run_eval_v1alpha_eval_benchmarks__benchmark_id__jobs_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RunEvalBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
-    get:
-      responses:
-        '200':
-          description: The status of the evaluation job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Status
-      description: Get the status of a job.
-      operationId: job_status_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: Successful Response
-      tags:
-      - Eval
-      summary: Job Cancel
-      description: Cancel a job.
-      operationId: job_cancel_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
-    get:
-      responses:
-        '200':
-          description: The result of the job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Result
-      description: Get the result of a job.
-      operationId: job_result_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__result_get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-  /v1alpha/inference/rerank:
-    post:
-      responses:
-        '200':
-          description: RerankResponse with indices sorted by relevance score (descending).
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/RerankResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Inference
-      summary: Rerank documents based on relevance to a query.
-      description: Rerank a list of documents based on their relevance to a query.
-      operationId: rerank_v1alpha_inference_rerank_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RerankRequest'
-        required: true
-  /v1alpha/admin/providers:
-    get:
-      responses:
-        '200':
-          description: A list of provider information objects.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListProvidersResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Admin
-      summary: List all available providers
-      description: List all available providers with their configuration and health status.
-      operationId: list_providers_v1alpha_admin_providers_get
-  /v1alpha/admin/providers/{provider_id}:
-    get:
-      responses:
-        '200':
-          description: The provider information object.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ProviderInfo'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '404':
-          description: Provider not found.
-      tags:
-      - Admin
-      summary: Get provider details
-      description: Get detailed information about a specific provider.
-      operationId: inspect_provider_v1alpha_admin_providers__provider_id__get
-      parameters:
-      - name: provider_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the provider to inspect.
-          title: Provider Id
-        description: The ID of the provider to inspect.
-  /v1alpha/admin/inspect/routes:
-    get:
-      responses:
-        '200':
-          description: A list of route information objects.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListRoutesResponse'
+        '200':
+          description: A list of route information objects.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListRoutesResponse'
         '400':
           $ref: '#/components/responses/BadRequest400'
           description: Bad Request
@@ -9515,408 +8792,87 @@ components:
       - error
       title: ViolationLevel
       description: Severity level of a safety violation.
-    AggregationFunctionType:
-      type: string
-      enum:
-      - average
-      - weighted_average
-      - median
-      - categorical_count
-      - accuracy
-      title: AggregationFunctionType
-      description: Types of aggregation functions for scoring results.
     ArrayType:
+      description: Parameter type for array values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - array
       title: ArrayType
-      description: Parameter type for array values.
-    BasicScoringFnParams:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - basic
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: BasicScoringFnParams
-      description: Parameters for basic scoring function configuration.
     BooleanType:
+      description: Parameter type for boolean values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - boolean
       title: BooleanType
-      description: Parameter type for boolean values.
     ChatCompletionInputType:
+      description: Parameter type for chat completion input.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - chat_completion_input
       title: ChatCompletionInputType
-      description: Parameter type for chat completion input.
     CompletionInputType:
+      description: Parameter type for completion input.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - completion_input
       title: CompletionInputType
-      description: Parameter type for completion input.
     JsonType:
+      description: Parameter type for JSON values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - json
       title: JsonType
-      description: Parameter type for JSON values.
-    LLMAsJudgeScoringFnParams:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - llm_as_judge
-        judge_model:
-          type: string
-          title: Judge Model
-        prompt_template:
-          anyOf:
-          - type: string
-          - type: 'null'
-        judge_score_regexes:
-          items:
-            type: string
-          type: array
-          title: Judge Score Regexes
-          description: Regexes to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      required:
-      - judge_model
-      title: LLMAsJudgeScoringFnParams
-      description: Parameters for LLM-as-judge scoring function configuration.
     NumberType:
+      description: Parameter type for numeric values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - number
       title: NumberType
-      description: Parameter type for numeric values.
     ObjectType:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - object
-      title: ObjectType
       description: Parameter type for object values.
-    RegexParserScoringFnParams:
       properties:
         type:
-          type: string
           title: Type
-          enum:
-          - regex_parser
-        parsing_regexes:
-          items:
-            type: string
-          type: array
-          title: Parsing Regexes
-          description: Regex to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: RegexParserScoringFnParams
-      description: Parameters for regex parser scoring function configuration.
-    ScoringFn:
-      properties:
-        identifier:
           type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
           enum:
-          - scoring_function
-        description:
-          anyOf:
-          - type: string
-          - type: 'null'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this definition
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the deterministic function
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval
-      required:
-      - identifier
-      - provider_id
-      - return_type
-      title: ScoringFn
-      description: A scoring function resource for evaluating model outputs.
-    ScoringFnParams:
-      discriminator:
-        mapping:
-          basic: '#/components/schemas/BasicScoringFnParams'
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        title: LLMAsJudgeScoringFnParams
-      - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        title: RegexParserScoringFnParams
-      - $ref: '#/components/schemas/BasicScoringFnParams'
-        title: BasicScoringFnParams
-      title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-    ScoringFnParamsType:
-      description: Types of scoring function parameter configurations.
-      enum:
-      - llm_as_judge
-      - regex_parser
-      - basic
-      title: ScoringFnParamsType
-      type: string
+          - object
+      title: ObjectType
     StringType:
+      description: Parameter type for string values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - string
       title: StringType
-      description: Parameter type for string values.
     UnionType:
+      description: Parameter type for union values.
       properties:
         type:
-          type: string
           title: Type
+          type: string
           enum:
           - union
       title: UnionType
-      description: Parameter type for union values.
-    ListScoringFunctionsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/ScoringFn'
-          type: array
-          title: Data
-          description: List of scoring function objects.
-      required:
-      - data
-      title: ListScoringFunctionsResponse
-      description: Response containing a list of scoring function objects.
-    ScoreRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Input Rows
-          description: The rows to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-      required:
-      - input_rows
-      - scoring_functions
-      title: ScoreRequest
-      description: Request model for scoring a list of rows.
-    ScoreResponse:
-      properties:
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult.
-      required:
-      - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoringResult:
-      properties:
-        score_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Score Rows
-          description: The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          additionalProperties: true
-          type: object
-          title: Aggregated Results
-          description: Map of metric name to aggregated value
-      required:
-      - score_rows
-      - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
-    ScoreBatchRequest:
-      properties:
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-        save_results_dataset:
-          type: boolean
-          title: Save Results Dataset
-          description: Whether to save the results to a dataset.
-          default: false
-      required:
-      - dataset_id
-      - scoring_functions
-      title: ScoreBatchRequest
-      description: Request model for scoring a batch of rows from a dataset.
-    ScoreBatchResponse:
-      properties:
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: (Optional) The identifier of the dataset that was scored
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult
-      required:
-      - results
-      title: ScoreBatchResponse
-      description: Response from batch scoring operations on datasets.
     Shield:
       properties:
         identifier:
@@ -10969,264 +9925,48 @@ components:
       - version
       title: VersionInfo
       description: Version information for the service.
-    AppendRowsRequest:
-      properties:
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: The rows to append to the dataset.
-      required:
-      - rows
-      title: AppendRowsRequest
-      description: Request body for appending rows to a dataset.
     PaginatedResponse:
+      description: A generic paginated response that follows a simple format.
       properties:
         data:
           items:
             additionalProperties: true
             type: object
-          type: array
           title: Data
+          type: array
         has_more:
-          type: boolean
           title: Has More
+          type: boolean
         url:
           anyOf:
           - type: string
           - type: 'null'
+          nullable: true
       required:
       - data
       - has_more
       title: PaginatedResponse
-      description: A generic paginated response that follows a simple format.
-    Dataset:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          description: Type of resource, always 'dataset' for datasets
-          enum:
-          - dataset
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: Purpose of the dataset indicating its intended use
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: Data source configuration for the dataset
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this dataset
-      required:
-      - identifier
-      - provider_id
-      - purpose
-      - source
-      title: Dataset
-      description: Dataset resource for storing and accessing training or evaluation data.
-    RowsDataSource:
-      properties:
-        type:
-          type: string
-          title: Type
-          description: The type of data source.
-          enum:
-          - rows
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: 'The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]'
-      required:
-      - rows
-      title: RowsDataSource
-      description: A dataset stored in rows.
-    URIDataSource:
-      properties:
-        type:
-          type: string
-          title: Type
-          description: The type of data source.
-          enum:
-          - uri
-        uri:
-          type: string
-          title: Uri
-          description: The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"
-      required:
-      - uri
-      title: URIDataSource
-      description: A dataset that can be obtained from a URI.
-    ListDatasetsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Dataset'
-          type: array
-          title: Data
-          description: List of datasets
-      required:
-      - data
-      title: ListDatasetsResponse
-      description: Response from listing datasets.
-    Benchmark:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          title: Type
-          description: The resource type, always benchmark.
-          enum:
-          - benchmark
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: Identifier of the dataset to use for the benchmark evaluation.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: List of scoring function identifiers to apply during evaluation.
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Metadata for this evaluation task.
-      required:
-      - identifier
-      - provider_id
-      - dataset_id
-      - scoring_functions
-      title: Benchmark
-      description: A benchmark resource for evaluating model performance.
-    ListBenchmarksResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Benchmark'
-          type: array
-          title: Data
-          description: List of benchmark objects.
-      required:
-      - data
-      title: ListBenchmarksResponse
-      description: Response containing a list of benchmark objects.
-    BenchmarkConfig:
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/ModelCandidate'
-          description: The candidate to evaluate
-        scoring_params:
-          additionalProperties:
-            oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          type: object
-          title: Scoring Params
-          description: Map between scoring function id and parameters for each scoring function you want to run
-        num_examples:
-          anyOf:
-          - type: integer
-            minimum: 1.0
-          - type: 'null'
-          description: Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated
-      required:
-      - eval_candidate
-      title: BenchmarkConfig
-      description: A benchmark configuration for evaluation.
     GreedySamplingStrategy:
+      description: Greedy sampling strategy that selects the highest probability token at each step.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'greedy' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - greedy
       title: GreedySamplingStrategy
-      description: Greedy sampling strategy that selects the highest probability token at each step.
-    ModelCandidate:
-      properties:
-        type:
-          type: string
-          title: Type
-          enum:
-          - model
-        model:
-          type: string
-          minLength: 1
-          title: Model
-          description: The model ID to evaluate
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model
-        system_message:
-          anyOf:
-          - $ref: '#/components/schemas/SystemMessage'
-            title: SystemMessage
-          - type: 'null'
-          description: The system message providing instructions or context to the model
-          title: SystemMessage
-      required:
-      - model
-      - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
     SamplingParams:
+      description: Sampling parameters for text generation.
       properties:
         strategy:
+          description: The sampling strategy to use.
+          discriminator:
+            mapping:
+              greedy: '#/components/schemas/GreedySamplingStrategy'
+              top_k: '#/components/schemas/TopKSamplingStrategy'
+              top_p: '#/components/schemas/TopPSamplingStrategy'
+            propertyName: type
           oneOf:
           - $ref: '#/components/schemas/GreedySamplingStrategy'
             title: GreedySamplingStrategy
@@ -11235,200 +9975,127 @@ components:
           - $ref: '#/components/schemas/TopKSamplingStrategy'
             title: TopKSamplingStrategy
           title: GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy
-          description: The sampling strategy to use.
-          discriminator:
-            propertyName: type
-            mapping:
-              greedy: '#/components/schemas/GreedySamplingStrategy'
-              top_k: '#/components/schemas/TopKSamplingStrategy'
-              top_p: '#/components/schemas/TopPSamplingStrategy'
         max_tokens:
           anyOf:
-          - type: integer
-            minimum: 1.0
+          - minimum: 1
+            type: integer
           - type: 'null'
           description: The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length.
+          nullable: true
         repetition_penalty:
           anyOf:
-          - type: number
-            maximum: 2.0
+          - maximum: 2.0
             minimum: -2.0
+            type: number
           - type: 'null'
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
           default: 1.0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
         stop:
           anyOf:
           - items:
               type: string
-            type: array
             maxItems: 4
+            type: array
           - type: 'null'
           description: Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+          nullable: true
       title: SamplingParams
-      description: Sampling parameters for text generation.
     SystemMessage:
+      description: A system message providing instructions or context to the model.
       properties:
         role:
-          type: string
-          title: Role
           description: Must be 'system' to identify this as a system message.
+          title: Role
+          type: string
           enum:
           - system
         content:
           anyOf:
           - type: string
-          - oneOf:
-            - $ref: '#/components/schemas/ImageContentItem-Input'
-              title: ImageContentItem-Input
-            - $ref: '#/components/schemas/TextContentItem'
-              title: TextContentItem
-            discriminator:
-              propertyName: type
+          - discriminator:
               mapping:
-                image: '#/components/schemas/ImageContentItem-Input'
+                image: '#/components/schemas/ImageContentItem'
                 text: '#/components/schemas/TextContentItem'
-            title: ImageContentItem-Input | TextContentItem
+              propertyName: type
+            oneOf:
+            - $ref: '#/components/schemas/ImageContentItem'
+              title: ImageContentItem
+            - $ref: '#/components/schemas/TextContentItem'
+              title: TextContentItem
+            title: ImageContentItem | TextContentItem
           - items:
-              oneOf:
-              - $ref: '#/components/schemas/ImageContentItem-Input'
-                title: ImageContentItem-Input
-              - $ref: '#/components/schemas/TextContentItem'
-                title: TextContentItem
               discriminator:
-                propertyName: type
                 mapping:
-                  image: '#/components/schemas/ImageContentItem-Input'
+                  image: '#/components/schemas/ImageContentItem'
                   text: '#/components/schemas/TextContentItem'
-              title: ImageContentItem-Input | TextContentItem
+                propertyName: type
+              oneOf:
+              - $ref: '#/components/schemas/ImageContentItem'
+                title: ImageContentItem
+              - $ref: '#/components/schemas/TextContentItem'
+                title: TextContentItem
+              title: ImageContentItem | TextContentItem
             type: array
-            title: list[ImageContentItem-Input | TextContentItem]
-          title: string | list[ImageContentItem-Input | TextContentItem]
+            title: list[ImageContentItem | TextContentItem]
           description: The content of the 'system prompt'. If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages.
+          title: string | list[ImageContentItem | TextContentItem]
       required:
       - content
       title: SystemMessage
-      description: A system message providing instructions or context to the model.
     TopKSamplingStrategy:
+      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'top_k' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - top_k
         top_k:
-          type: integer
-          minimum: 1.0
-          title: Top K
           description: Number of top tokens to consider for sampling. Must be at least 1.
+          minimum: 1
+          title: Top K
+          type: integer
       required:
       - top_k
       title: TopKSamplingStrategy
-      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
     TopPSamplingStrategy:
+      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
       properties:
         type:
-          type: string
-          title: Type
           description: Must be 'top_p' to identify this sampling strategy.
+          title: Type
+          type: string
           enum:
           - top_p
         temperature:
-          type: number
+          description: Controls randomness in sampling. Higher values increase randomness.
           maximum: 2.0
           title: Temperature
-          description: Controls randomness in sampling. Higher values increase randomness.
+          type: number
           minimum: 0.0
         top_p:
-          type: number
+          default: 0.95
+          description: Cumulative probability threshold for nucleus sampling.
           maximum: 1.0
           minimum: 0.0
           title: Top P
-          description: Cumulative probability threshold for nucleus sampling.
-          default: 0.95
+          type: number
       required:
       - temperature
       title: TopPSamplingStrategy
-      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
-    EvaluateRowsRequest:
-      description: Request model for evaluating a list of rows on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        input_rows:
-          description: The rows to evaluate
-          items:
-            additionalProperties: true
-            type: object
-          minItems: 1
-          title: Input Rows
-          type: array
-        scoring_functions:
-          description: The scoring functions to use for the evaluation
-          items:
-            type: string
-          minItems: 1
-          title: Scoring Functions
-          type: array
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      properties:
-        generations:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Generations
-          description: The generations from the evaluation
-        scores:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Scores
-          description: The scores from the evaluation. Each key in the dict is a scoring function name
-      required:
-      - generations
-      - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    RunEvalRequest:
-      description: Request model for running an evaluation on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - benchmark_config
-      title: RunEvalRequest
     Job:
+      description: A job execution instance with status tracking.
       properties:
         job_id:
-          type: string
           title: Job Id
+          type: string
         status:
           $ref: '#/components/schemas/JobStatus'
       required:
       - job_id
       - status
       title: Job
-      description: A job execution instance with status tracking.
     RerankRequest:
       properties:
         model:
@@ -11573,85 +10240,6 @@ components:
       - $ref: '#/components/schemas/CompletionInputType'
         title: CompletionInputType
       title: StringType | ... (9 variants)
-    RegisterScoringFunctionRequest:
-      properties:
-        scoring_fn_id:
-          type: string
-          title: Scoring Fn Id
-          description: The ID of the scoring function to register.
-        description:
-          type: string
-          title: Description
-          description: The description of the scoring function.
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the scoring function.
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        provider_scoring_fn_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider scoring function to use for the scoring function.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the scoring function.
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-      required:
-      - scoring_fn_id
-      - description
-      - return_type
-      title: RegisterScoringFunctionRequest
-      description: Request model for registering a scoring function.
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -11678,90 +10266,6 @@ components:
       - shield_id
       title: RegisterShieldRequest
       description: Request model for registering a shield.
-    DataSource:
-      discriminator:
-        mapping:
-          rows: '#/components/schemas/RowsDataSource'
-          uri: '#/components/schemas/URIDataSource'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/URIDataSource'
-        title: URIDataSource
-      - $ref: '#/components/schemas/RowsDataSource'
-        title: RowsDataSource
-      title: URIDataSource | RowsDataSource
-    RegisterDatasetRequest:
-      properties:
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: The purpose of the dataset.
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: The data source of the dataset.
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata for the dataset.
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the dataset. If not provided, an ID will be generated.
-      required:
-      - purpose
-      - source
-      title: RegisterDatasetRequest
-      description: Request model for registering a dataset.
-    RegisterBenchmarkRequest:
-      properties:
-        benchmark_id:
-          type: string
-          title: Benchmark Id
-          description: The ID of the benchmark to register.
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to use for the benchmark.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: The scoring functions to use for the benchmark.
-        provider_benchmark_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider benchmark to use for the benchmark.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the benchmark.
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata to use for the benchmark.
-      required:
-      - benchmark_id
-      - dataset_id
-      - scoring_functions
-      title: RegisterBenchmarkRequest
-      description: Request model for registering a benchmark.
     AllowedToolsFilter:
       properties:
         tool_names:
@@ -12698,13 +11202,6 @@ components:
       - model
       title: CreateResponseRequest
       description: Request model for creating a response.
-    DatasetPurpose:
-      type: string
-      enum:
-      - eval/question-answer
-      - eval/messages-answer
-      title: DatasetPurpose
-      description: Purpose of the dataset. Each purpose has a required input data schema.
     EmbeddedChunk-Input:
       properties:
         content:
@@ -12843,32 +11340,6 @@ components:
           - type: 'null'
       additionalProperties: true
       title: Errors
-    EvaluateRowsBodyRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          minItems: 1
-          title: Input Rows
-          description: The rows to evaluate
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          minItems: 1
-          title: Scoring Functions
-          description: The scoring functions to use for the evaluation
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsBodyRequest
-      description: Request body model for evaluating rows (without path parameter).
     HealthStatus:
       type: string
       enum:
@@ -12912,16 +11383,6 @@ components:
       required:
       - cached_tokens
       title: InputTokensDetails
-    JobStatus:
-      type: string
-      enum:
-      - completed
-      - in_progress
-      - failed
-      - scheduled
-      - cancelled
-      title: JobStatus
-      description: Status of a job execution.
     ListConnectorsResponse:
       properties:
         data:
@@ -13896,15 +12357,6 @@ components:
       - disabled
       title: ResponseTruncation
       description: Controls how the service truncates input when it exceeds the model context window.
-    RunEvalBodyRequest:
-      properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_config
-      title: RunEvalBodyRequest
-      description: Request body model for running an evaluation (without path parameter).
     SearchRankingOptions:
       properties:
         ranker:
@@ -14292,50 +12744,6 @@ components:
       - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
         title: OpenAIResponseContentPartReasoningText
       title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
-    ListBenchmarksRequest:
-      description: Request model for listing benchmarks.
-      properties: {}
-      title: ListBenchmarksRequest
-    GetBenchmarkRequest:
-      description: Request model for getting a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: GetBenchmarkRequest
-    UnregisterBenchmarkRequest:
-      description: Request model for unregistering a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: UnregisterBenchmarkRequest
-    GetDatasetRequest:
-      description: Request model for getting a dataset by ID.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: GetDatasetRequest
-    UnregisterDatasetRequest:
-      description: Request model for unregistering a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: UnregisterDatasetRequest
     ListModelsResponse:
       description: Response containing a list of model objects.
       properties:
@@ -14368,39 +12776,6 @@ components:
       required:
       - model_id
       title: UnregisterModelRequest
-    DialogType:
-      description: Parameter type for dialog data with semantic output labels.
-      properties:
-        type:
-          title: Type
-          type: string
-          enum:
-          - dialog
-      title: DialogType
-    ListScoringFunctionsRequest:
-      description: Request model for listing scoring functions.
-      properties: {}
-      title: ListScoringFunctionsRequest
-    GetScoringFunctionRequest:
-      description: Request model for getting a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: GetScoringFunctionRequest
-    UnregisterScoringFunctionRequest:
-      description: Request model for unregistering a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: UnregisterScoringFunctionRequest
     GetShieldRequest:
       description: Request model for getting a shield by identifier.
       properties:
@@ -14499,16 +12874,10 @@ components:
       - responses
       - batches
       - vector_io
-      - datasetio
-      - scoring
-      - eval
       - tool_runtime
       - models
       - shields
       - vector_stores
-      - datasets
-      - scoring_functions
-      - benchmarks
       - tool_groups
       - files
       - file_processors
@@ -15309,6 +13678,25 @@ components:
       required:
       - batch_id
       title: CancelBatchRequest
+    JobStatus:
+      description: Status of a job execution.
+      enum:
+      - completed
+      - in_progress
+      - failed
+      - scheduled
+      - cancelled
+      title: JobStatus
+      type: string
+    DialogType:
+      description: Parameter type for dialog data with semantic output labels.
+      properties:
+        type:
+          title: Type
+          type: string
+          enum:
+          - dialog
+      title: DialogType
     ConnectorInput:
       description: Input for creating a connector
       properties:
@@ -15542,90 +13930,6 @@ components:
       - conversation_id
       - item_id
       title: DeleteItemRequest
-    IterRowsRequest:
-      description: Request model for iterating over rows in a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-          type: string
-        start_index:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          nullable: true
-        limit:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          nullable: true
-      required:
-      - dataset_id
-      title: IterRowsRequest
-    BenchmarkIdRequest:
-      description: Request model containing benchmark_id path parameter.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark
-          minLength: 1
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: BenchmarkIdRequest
-    JobStatusRequest:
-      description: Request model for getting the status of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the status of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobStatusRequest
-    JobCancelRequest:
-      description: Request model for canceling a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to cancel
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobCancelRequest
-    JobResultRequest:
-      description: Request model for getting the result of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the result of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobResultRequest
     ProcessFileRequest:
       description: |-
         Request model for file processing operation.
diff --git a/scripts/openapi_generator/stainless_config/generate_config.py b/scripts/openapi_generator/stainless_config/generate_config.py
index 32e774c174..91d2ca0742 100644
--- a/scripts/openapi_generator/stainless_config/generate_config.py
+++ b/scripts/openapi_generator/stainless_config/generate_config.py
@@ -75,25 +75,6 @@
 ENVIRONMENTS = {"production": "http://any-hosted-llama-stack.com"}
 
 PAGINATION = [
-    {
-        "name": "datasets_iterrows",
-        "type": "offset",
-        "request": {
-            "dataset_id": {"type": "string"},
-            "start_index": {
-                "type": "integer",
-                "x-stainless-pagination-property": {"purpose": "offset_count_param"},
-            },
-            "limit": {"type": "integer"},
-        },
-        "response": {
-            "data": {"type": "array", "items": {"type": "object"}},
-            "next_index": {
-                "type": "integer",
-                "x-stainless-pagination-property": {"purpose": "offset_count_start_field"},
-            },
-        },
-    },
     {
         "name": "openai_cursor_page",
         "type": "cursor",
@@ -135,49 +116,6 @@
 
 OPENAPI = {
     "transformations": [
-        {
-            "command": "mergeObject",
-            "reason": "Better return_type using enum",
-            "args": {
-                "target": ["$.components.schemas"],
-                "object": {
-                    "ReturnType": {
-                        "additionalProperties": False,
-                        "properties": {
-                            "type": {
-                                "enum": [
-                                    "string",
-                                    "number",
-                                    "boolean",
-                                    "array",
-                                    "object",
-                                    "json",
-                                    "union",
-                                    "chat_completion_input",
-                                    "completion_input",
-                                    "agent_turn_input",
-                                ]
-                            }
-                        },
-                        "required": ["type"],
-                        "type": "object",
-                    }
-                },
-            },
-        },
-        {
-            "command": "replaceProperties",
-            "reason": "Replace return type properties with better model (see above)",
-            "args": {
-                "filter": {
-                    "only": [
-                        "$.components.schemas.ScoringFn.properties.return_type",
-                        "$.components.schemas.RegisterScoringFunctionRequest.properties.return_type",
-                    ]
-                },
-                "value": {"$ref": "#/components/schemas/ReturnType"},
-            },
-        },
         {
             "command": "oneOfToAnyOf",
             "reason": "Prism (mock server) doesn't like one of our "
@@ -211,7 +149,6 @@
             "param_type": "ParamType",
             "safety_violation": "SafetyViolation",
             "sampling_params": "SamplingParams",
-            "scoring_result": "ScoringResult",
             "system_message": "SystemMessage",
             "health_info": "HealthInfo",
             "provider_info": "ProviderInfo",
@@ -439,25 +376,6 @@
             "delete": "delete /v1/shields/{identifier}",
         },
     },
-    "scoring": {
-        "methods": {
-            "score": "post /v1/scoring/score",
-            "score_batch": "post /v1/scoring/score-batch",
-        }
-    },
-    "scoring_functions": {
-        "models": {
-            "scoring_fn": "ScoringFn",
-            "scoring_fn_params": "ScoringFnParams",
-            "list_scoring_functions_response": "ListScoringFunctionsResponse",
-        },
-        "methods": {
-            "retrieve": "get /v1/scoring-functions/{scoring_fn_id}",
-            "list": {"paginated": False, "endpoint": "get /v1/scoring-functions"},
-            "register": "post /v1/scoring-functions",
-            "unregister": "delete /v1/scoring-functions/{scoring_fn_id}",
-        },
-    },
     "files": {
         "models": {
             "file": "OpenAIFileObject",
@@ -482,43 +400,6 @@
     },
     "alpha": {
         "subresources": {
-            "benchmarks": {
-                "models": {
-                    "benchmark": "Benchmark",
-                    "list_benchmarks_response": "ListBenchmarksResponse",
-                },
-                "methods": {
-                    "retrieve": "get /v1alpha/eval/benchmarks/{benchmark_id}",
-                    "list": {
-                        "paginated": False,
-                        "endpoint": "get /v1alpha/eval/benchmarks",
-                    },
-                    "register": "post /v1alpha/eval/benchmarks",
-                    "unregister": "delete /v1alpha/eval/benchmarks/{benchmark_id}",
-                },
-            },
-            "eval": {
-                "models": {
-                    "evaluate_response": "EvaluateResponse",
-                    "benchmark_config": "BenchmarkConfig",
-                    "job": "Job",
-                },
-                "methods": {
-                    "evaluate_rows": "post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations",
-                    "run_eval": "post /v1alpha/eval/benchmarks/{benchmark_id}/jobs",
-                    "evaluate_rows_alpha": "post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations",
-                    "run_eval_alpha": "post /v1alpha/eval/benchmarks/{benchmark_id}/jobs",
-                },
-                "subresources": {
-                    "jobs": {
-                        "methods": {
-                            "cancel": "delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}",
-                            "status": "get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}",
-                            "retrieve": "get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result",
-                        }
-                    }
-                },
-            },
             "admin": {
                 "methods": {
                     "list_providers": "get /v1alpha/admin/providers",
@@ -535,21 +416,6 @@
             },
         }
     },
-    "beta": {
-        "subresources": {
-            "datasets": {
-                "models": {"list_datasets_response": "ListDatasetsResponse"},
-                "methods": {
-                    "register": "post /v1beta/datasets",
-                    "retrieve": "get /v1beta/datasets/{dataset_id}",
-                    "list": {"paginated": False, "endpoint": "get /v1beta/datasets"},
-                    "unregister": "delete /v1beta/datasets/{dataset_id}",
-                    "iterrows": "get /v1beta/datasetio/iterrows/{dataset_id}",
-                    "appendrows": "post /v1beta/datasetio/append-rows/{dataset_id}",
-                },
-            }
-        }
-    },
 }
 
 
diff --git a/src/llama_stack/core/datatypes.py b/src/llama_stack/core/datatypes.py
index 984506bd6d..873140fbfa 100644
--- a/src/llama_stack/core/datatypes.py
+++ b/src/llama_stack/core/datatypes.py
@@ -21,22 +21,13 @@
 from llama_stack.log import LoggingConfig
 from llama_stack_api import (
     Api,
-    Benchmark,
-    BenchmarkInput,
     ConnectorInput,
-    Dataset,
-    DatasetInput,
-    DatasetIO,
-    Eval,
     Inference,
     Model,
     ModelInput,
     ProviderSpec,
     Resource,
     Safety,
-    Scoring,
-    ScoringFn,
-    ScoringFnInput,
     Shield,
     ShieldInput,
     ToolGroup,
@@ -99,44 +90,20 @@ class VectorStoreWithOwner(VectorStore, ResourceWithOwner):
     pass
 
 
-class DatasetWithOwner(Dataset, ResourceWithOwner):
-    """A Dataset resource extended with ownership information for access control."""
-
-    pass
-
-
-class ScoringFnWithOwner(ScoringFn, ResourceWithOwner):
-    """A ScoringFn resource extended with ownership information for access control."""
-
-    pass
-
-
-class BenchmarkWithOwner(Benchmark, ResourceWithOwner):
-    """A Benchmark resource extended with ownership information for access control."""
-
-    pass
-
-
 class ToolGroupWithOwner(ToolGroup, ResourceWithOwner):
     """A ToolGroup resource extended with ownership information for access control."""
 
     pass
 
 
-RoutableObject = Model | Shield | VectorStore | Dataset | ScoringFn | Benchmark | ToolGroup
+RoutableObject = Model | Shield | VectorStore | ToolGroup
 
 RoutableObjectWithProvider = Annotated[
-    ModelWithOwner
-    | ShieldWithOwner
-    | VectorStoreWithOwner
-    | DatasetWithOwner
-    | ScoringFnWithOwner
-    | BenchmarkWithOwner
-    | ToolGroupWithOwner,
+    ModelWithOwner | ShieldWithOwner | VectorStoreWithOwner | ToolGroupWithOwner,
     Field(discriminator="type"),
 ]
 
-RoutedProtocol = Inference | Safety | VectorIO | DatasetIO | Scoring | Eval | ToolRuntime
+RoutedProtocol = Inference | Safety | VectorIO | ToolRuntime
 
 
 # Example: /inference, /safety
@@ -749,9 +716,6 @@ class RegisteredResources(BaseModel):
     models: list[ModelInput] = Field(default_factory=list)
     shields: list[ShieldInput] = Field(default_factory=list)
     vector_stores: list[VectorStoreInput] = Field(default_factory=list)
-    datasets: list[DatasetInput] = Field(default_factory=list)
-    scoring_fns: list[ScoringFnInput] = Field(default_factory=list)
-    benchmarks: list[BenchmarkInput] = Field(default_factory=list)
     tool_groups: list[ToolGroupInput] = Field(default_factory=list, deprecated=True)
 
     @model_validator(mode="after")
diff --git a/src/llama_stack/core/distribution.py b/src/llama_stack/core/distribution.py
index fecfa14447..0bd66f33d5 100644
--- a/src/llama_stack/core/distribution.py
+++ b/src/llama_stack/core/distribution.py
@@ -59,18 +59,6 @@ def builtin_automatically_routed_apis() -> list[AutoRoutedApiInfo]:
             routing_table_api=Api.shields,
             router_api=Api.safety,
         ),
-        AutoRoutedApiInfo(
-            routing_table_api=Api.datasets,
-            router_api=Api.datasetio,
-        ),
-        AutoRoutedApiInfo(
-            routing_table_api=Api.scoring_functions,
-            router_api=Api.scoring,
-        ),
-        AutoRoutedApiInfo(
-            routing_table_api=Api.benchmarks,
-            router_api=Api.eval,
-        ),
         AutoRoutedApiInfo(
             routing_table_api=Api.tool_groups,
             router_api=Api.tool_runtime,
diff --git a/src/llama_stack/core/resolver.py b/src/llama_stack/core/resolver.py
index 5b96b57730..8b1d8af4ad 100644
--- a/src/llama_stack/core/resolver.py
+++ b/src/llama_stack/core/resolver.py
@@ -27,14 +27,8 @@
     Admin,
     Api,
     Batches,
-    Benchmarks,
-    BenchmarksProtocolPrivate,
     Connectors,
     Conversations,
-    DatasetIO,
-    Datasets,
-    DatasetsProtocolPrivate,
-    Eval,
     ExternalApiSpec,
     FileProcessors,
     Files,
@@ -50,9 +44,6 @@
     RemoteProviderSpec,
     Responses,
     Safety,
-    Scoring,
-    ScoringFunctions,
-    ScoringFunctionsProtocolPrivate,
     Shields,
     ShieldsProtocolPrivate,
     ToolGroups,
@@ -95,12 +86,6 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.models: Models,
         Api.safety: Safety,
         Api.shields: Shields,
-        Api.datasetio: DatasetIO,
-        Api.datasets: Datasets,
-        Api.scoring: Scoring,
-        Api.scoring_functions: ScoringFunctions,
-        Api.eval: Eval,
-        Api.benchmarks: Benchmarks,
         Api.tool_groups: ToolGroups,
         Api.tool_runtime: ToolRuntime,
         Api.files: Files,
@@ -150,13 +135,6 @@ def additional_protocols_map() -> dict[Api, Any]:
         Api.inference: (ModelsProtocolPrivate, Models, Api.models),
         Api.tool_groups: (ToolGroupsProtocolPrivate, ToolGroups, Api.tool_groups),
         Api.safety: (ShieldsProtocolPrivate, Shields, Api.shields),
-        Api.datasetio: (DatasetsProtocolPrivate, Datasets, Api.datasets),
-        Api.scoring: (
-            ScoringFunctionsProtocolPrivate,
-            ScoringFunctions,
-            Api.scoring_functions,
-        ),
-        Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks),
     }
 
 
diff --git a/src/llama_stack/core/routers/__init__.py b/src/llama_stack/core/routers/__init__.py
index 8cef9e3514..5fe983603b 100644
--- a/src/llama_stack/core/routers/__init__.py
+++ b/src/llama_stack/core/routers/__init__.py
@@ -23,10 +23,7 @@ async def get_routing_table_impl(
     dist_registry: DistributionRegistry,
     policy: list[AccessRule],
 ) -> Any:
-    from ..routing_tables.benchmarks import BenchmarksRoutingTable
-    from ..routing_tables.datasets import DatasetsRoutingTable
     from ..routing_tables.models import ModelsRoutingTable
-    from ..routing_tables.scoring_functions import ScoringFunctionsRoutingTable
     from ..routing_tables.shields import ShieldsRoutingTable
     from ..routing_tables.toolgroups import ToolGroupsRoutingTable
     from ..routing_tables.vector_stores import VectorStoresRoutingTable
@@ -34,9 +31,6 @@ async def get_routing_table_impl(
     api_to_tables = {
         "models": ModelsRoutingTable,
         "shields": ShieldsRoutingTable,
-        "datasets": DatasetsRoutingTable,
-        "scoring_functions": ScoringFunctionsRoutingTable,
-        "benchmarks": BenchmarksRoutingTable,
         "tool_groups": ToolGroupsRoutingTable,
         "vector_stores": VectorStoresRoutingTable,
     }
@@ -53,8 +47,6 @@ async def get_routing_table_impl(
 async def get_auto_router_impl(
     api: Api, routing_table: RoutingTable, deps: dict[str, Any], run_config: StackConfig, policy: list[AccessRule]
 ) -> Any:
-    from .datasets import DatasetIORouter
-    from .eval_scoring import EvalRouter, ScoringRouter
     from .inference import InferenceRouter
     from .safety import SafetyRouter
     from .tool_runtime import ToolRuntimeRouter
@@ -64,9 +56,6 @@ async def get_auto_router_impl(
         "vector_io": VectorIORouter,
         "inference": InferenceRouter,
         "safety": SafetyRouter,
-        "datasetio": DatasetIORouter,
-        "scoring": ScoringRouter,
-        "eval": EvalRouter,
         "tool_runtime": ToolRuntimeRouter,
     }
     if api.value not in api_to_routers:
diff --git a/src/llama_stack/core/routers/datasets.py b/src/llama_stack/core/routers/datasets.py
deleted file mode 100644
index 6f4a59b757..0000000000
--- a/src/llama_stack/core/routers/datasets.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from llama_stack.log import get_logger
-from llama_stack_api import (
-    AppendRowsParams,
-    DatasetIO,
-    DatasetPurpose,
-    DataSource,
-    IterRowsRequest,
-    PaginatedResponse,
-    RoutingTable,
-)
-
-logger = get_logger(name=__name__, category="core::routers")
-
-
-class DatasetIORouter(DatasetIO):
-    """Router that delegates DatasetIO operations to the appropriate provider via a routing table."""
-
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing DatasetIORouter")
-        self.routing_table = routing_table
-
-    async def initialize(self) -> None:
-        logger.debug("DatasetIORouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("DatasetIORouter.shutdown")
-        pass
-
-    async def register_dataset(
-        self,
-        purpose: DatasetPurpose,
-        source: DataSource,
-        metadata: dict[str, Any] | None = None,
-        dataset_id: str | None = None,
-    ) -> None:
-        logger.debug(
-            "DatasetIORouter.register_dataset",
-            purpose=purpose,
-            source=source,
-            metadata=metadata,
-            dataset_id=dataset_id,
-        )
-        await self.routing_table.register_dataset(
-            purpose=purpose,
-            source=source,
-            metadata=metadata,
-            dataset_id=dataset_id,
-        )
-
-    async def iterrows(self, request: IterRowsRequest) -> PaginatedResponse:
-        logger.debug(
-            "DatasetIORouter.iterrows: , start_index= limit",
-            dataset_id=request.dataset_id,
-            start_index=request.start_index,
-            limit=request.limit,
-        )
-        provider = await self.routing_table.get_provider_impl(request.dataset_id)
-        return await provider.iterrows(
-            dataset_id=request.dataset_id,
-            start_index=request.start_index,
-            limit=request.limit,
-        )
-
-    async def append_rows(self, params: AppendRowsParams) -> None:
-        logger.debug("DatasetIORouter.append_rows", dataset_id=params.dataset_id, rows_count=len(params.rows))
-        provider = await self.routing_table.get_provider_impl(params.dataset_id)
-        return await provider.append_rows(
-            dataset_id=params.dataset_id,
-            rows=params.rows,
-        )
diff --git a/src/llama_stack/core/routers/eval_scoring.py b/src/llama_stack/core/routers/eval_scoring.py
deleted file mode 100644
index a3e16b10f8..0000000000
--- a/src/llama_stack/core/routers/eval_scoring.py
+++ /dev/null
@@ -1,264 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.log import get_logger
-from llama_stack_api import (
-    BenchmarkConfig,
-    Eval,
-    EvaluateResponse,
-    EvaluateRowsRequest,
-    Job,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    RoutingTable,
-    RunEvalRequest,
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    Scoring,
-    resolve_evaluate_rows_request,
-    resolve_job_cancel_request,
-    resolve_job_result_request,
-    resolve_job_status_request,
-    resolve_run_eval_request,
-)
-
-logger = get_logger(name=__name__, category="core::routers")
-
-
-class ScoringRouter(Scoring):
-    """Router that delegates scoring operations to the appropriate provider via a routing table."""
-
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing ScoringRouter")
-        self.routing_table = routing_table
-
-    async def initialize(self) -> None:
-        logger.debug("ScoringRouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("ScoringRouter.shutdown")
-        pass
-
-    async def score_batch(
-        self,
-        request: ScoreBatchRequest,
-    ) -> ScoreBatchResponse:
-        logger.debug("ScoringRouter.score_batch", dataset_id=request.dataset_id)
-        res = {}
-        for fn_identifier in request.scoring_functions.keys():
-            provider = await self.routing_table.get_provider_impl(fn_identifier)
-            # Create a request for this specific scoring function
-            single_fn_request = ScoreBatchRequest(
-                dataset_id=request.dataset_id,
-                scoring_functions={fn_identifier: request.scoring_functions[fn_identifier]},
-                save_results_dataset=request.save_results_dataset,
-            )
-            score_response = await provider.score_batch(single_fn_request)
-            res.update(score_response.results)
-
-        if request.save_results_dataset:
-            raise NotImplementedError("Save results dataset not implemented yet")
-
-        return ScoreBatchResponse(
-            results=res,
-        )
-
-    async def score(
-        self,
-        request: ScoreRequest,
-    ) -> ScoreResponse:
-        logger.debug(
-            "ScoringRouter.score: rows, functions",
-            input_rows_count=len(request.input_rows),
-            scoring_functions_count=len(request.scoring_functions),
-        )
-        res = {}
-        # look up and map each scoring function to its provider impl
-        for fn_identifier in request.scoring_functions.keys():
-            provider = await self.routing_table.get_provider_impl(fn_identifier)
-            # Create a request for this specific scoring function
-            single_fn_request = ScoreRequest(
-                input_rows=request.input_rows,
-                scoring_functions={fn_identifier: request.scoring_functions[fn_identifier]},
-            )
-            score_response = await provider.score(single_fn_request)
-            res.update(score_response.results)
-
-        return ScoreResponse(results=res)
-
-
-class EvalRouter(Eval):
-    """Router that delegates evaluation operations to the appropriate provider via a routing table."""
-
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing EvalRouter")
-        self.routing_table = routing_table
-
-    async def initialize(self) -> None:
-        logger.debug("EvalRouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("EvalRouter.shutdown")
-        pass
-
-    async def run_eval(
-        self,
-        request: RunEvalRequest | None = None,
-        *,
-        benchmark_id: str | None = None,
-        benchmark_config: BenchmarkConfig | None = None,
-    ) -> Job:
-        """Run an evaluation on a benchmark.
-
-        Supports both new-style (request object) and old-style (individual parameters).
-        Old-style usage is deprecated and will emit a DeprecationWarning.
-
-        Args:
-            request: The new-style request object (preferred)
-            benchmark_id: (Deprecated) The benchmark ID
-            benchmark_config: (Deprecated) The benchmark configuration
-
-        Returns:
-            Job object representing the evaluation job
-        """
-        resolved_request = resolve_run_eval_request(
-            request, benchmark_id=benchmark_id, benchmark_config=benchmark_config
-        )
-        logger.debug("EvalRouter.run_eval", benchmark_id=resolved_request.benchmark_id)
-        provider = await self.routing_table.get_provider_impl(resolved_request.benchmark_id)
-        return await provider.run_eval(resolved_request)
-
-    async def evaluate_rows(
-        self,
-        request: EvaluateRowsRequest | None = None,
-        *,
-        benchmark_id: str | None = None,
-        input_rows: list[dict[str, Any]] | None = None,
-        scoring_functions: list[str] | None = None,
-        benchmark_config: BenchmarkConfig | None = None,
-    ) -> EvaluateResponse:
-        """Evaluate a list of rows on a benchmark.
-
-        Supports both new-style (request object) and old-style (individual parameters).
-        Old-style usage is deprecated and will emit a DeprecationWarning.
-
-        Args:
-            request: The new-style request object (preferred)
-            benchmark_id: (Deprecated) The benchmark ID
-            input_rows: (Deprecated) The rows to evaluate
-            scoring_functions: (Deprecated) The scoring functions to use
-            benchmark_config: (Deprecated) The benchmark configuration
-
-        Returns:
-            EvaluateResponse object containing generations and scores
-        """
-        resolved_request = resolve_evaluate_rows_request(
-            request,
-            benchmark_id=benchmark_id,
-            input_rows=input_rows,
-            scoring_functions=scoring_functions,
-            benchmark_config=benchmark_config,
-        )
-        logger.debug(
-            "EvalRouter.evaluate_rows: , rows",
-            benchmark_id=resolved_request.benchmark_id,
-            input_rows_count=len(resolved_request.input_rows),
-        )
-        provider = await self.routing_table.get_provider_impl(resolved_request.benchmark_id)
-        return await provider.evaluate_rows(resolved_request)
-
-    async def job_status(
-        self,
-        request: JobStatusRequest | None = None,
-        *,
-        benchmark_id: str | None = None,
-        job_id: str | None = None,
-    ) -> Job:
-        """Get the status of a job.
-
-        Supports both new-style (request object) and old-style (individual parameters).
-        Old-style usage is deprecated and will emit a DeprecationWarning.
-
-        Args:
-            request: The new-style request object (preferred)
-            benchmark_id: (Deprecated) The benchmark ID
-            job_id: (Deprecated) The job ID
-
-        Returns:
-            Job object with the current status
-        """
-        resolved_request = resolve_job_status_request(request, benchmark_id=benchmark_id, job_id=job_id)
-        logger.debug(
-            "EvalRouter.job_status", benchmark_id=resolved_request.benchmark_id, job_id=resolved_request.job_id
-        )
-        provider = await self.routing_table.get_provider_impl(resolved_request.benchmark_id)
-        return await provider.job_status(resolved_request)
-
-    async def job_cancel(
-        self,
-        request: JobCancelRequest | None = None,
-        *,
-        benchmark_id: str | None = None,
-        job_id: str | None = None,
-    ) -> None:
-        """Cancel a job.
-
-        Supports both new-style (request object) and old-style (individual parameters).
-        Old-style usage is deprecated and will emit a DeprecationWarning.
-
-        Args:
-            request: The new-style request object (preferred)
-            benchmark_id: (Deprecated) The benchmark ID
-            job_id: (Deprecated) The job ID
-
-        Returns:
-            None
-        """
-        resolved_request = resolve_job_cancel_request(request, benchmark_id=benchmark_id, job_id=job_id)
-        logger.debug(
-            "EvalRouter.job_cancel", benchmark_id=resolved_request.benchmark_id, job_id=resolved_request.job_id
-        )
-        provider = await self.routing_table.get_provider_impl(resolved_request.benchmark_id)
-        await provider.job_cancel(resolved_request)
-
-    async def job_result(
-        self,
-        request: JobResultRequest | None = None,
-        *,
-        benchmark_id: str | None = None,
-        job_id: str | None = None,
-    ) -> EvaluateResponse:
-        """Get the result of a job.
-
-        Supports both new-style (request object) and old-style (individual parameters).
-        Old-style usage is deprecated and will emit a DeprecationWarning.
-
-        Args:
-            request: The new-style request object (preferred)
-            benchmark_id: (Deprecated) The benchmark ID
-            job_id: (Deprecated) The job ID
-
-        Returns:
-            EvaluateResponse object with the job results
-        """
-        resolved_request = resolve_job_result_request(request, benchmark_id=benchmark_id, job_id=job_id)
-        logger.debug(
-            "EvalRouter.job_result", benchmark_id=resolved_request.benchmark_id, job_id=resolved_request.job_id
-        )
-        provider = await self.routing_table.get_provider_impl(resolved_request.benchmark_id)
-        return await provider.job_result(resolved_request)
diff --git a/src/llama_stack/core/routing_tables/README.md b/src/llama_stack/core/routing_tables/README.md
index b0e6d45a32..bad74c95f2 100644
--- a/src/llama_stack/core/routing_tables/README.md
+++ b/src/llama_stack/core/routing_tables/README.md
@@ -10,9 +10,6 @@ routing_tables/
   common.py              # CommonRoutingTableImpl base class
   models.py              # ModelsRoutingTable (models -> inference providers)
   shields.py             # ShieldsRoutingTable (shields -> safety providers)
-  datasets.py            # DatasetsRoutingTable (datasets -> datasetio providers)
-  scoring_functions.py   # ScoringFunctionsRoutingTable
-  benchmarks.py          # BenchmarksRoutingTable
   toolgroups.py          # ToolGroupsRoutingTable
   vector_stores.py       # VectorStoresRoutingTable
 ```
diff --git a/src/llama_stack/core/routing_tables/benchmarks.py b/src/llama_stack/core/routing_tables/benchmarks.py
deleted file mode 100644
index c4e7ad9698..0000000000
--- a/src/llama_stack/core/routing_tables/benchmarks.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack.core.datatypes import (
-    BenchmarkWithOwner,
-)
-from llama_stack.log import get_logger
-from llama_stack_api import (
-    Benchmark,
-    Benchmarks,
-    GetBenchmarkRequest,
-    ListBenchmarksRequest,
-    ListBenchmarksResponse,
-    RegisterBenchmarkRequest,
-    UnregisterBenchmarkRequest,
-)
-
-from .common import CommonRoutingTableImpl
-
-logger = get_logger(name=__name__, category="core::routing_tables")
-
-
-class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
-    """Routing table for managing benchmark registrations and provider lookups."""
-
-    async def list_benchmarks(self, request: ListBenchmarksRequest) -> ListBenchmarksResponse:
-        return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))
-
-    async def get_benchmark(self, request: GetBenchmarkRequest) -> Benchmark:
-        benchmark = await self.get_object_by_identifier("benchmark", request.benchmark_id)
-        if benchmark is None:
-            raise ValueError(f"Benchmark '{request.benchmark_id}' not found")
-        return benchmark
-
-    async def register_benchmark(
-        self,
-        request: RegisterBenchmarkRequest,
-    ) -> None:
-        metadata = request.metadata if request.metadata is not None else {}
-        provider_id = request.provider_id
-        if provider_id is None:
-            if len(self.impls_by_provider_id) == 1:
-                provider_id = list(self.impls_by_provider_id.keys())[0]
-            else:
-                raise ValueError(
-                    "No provider specified and multiple providers available. Please specify a provider_id."
-                )
-        provider_benchmark_id = request.provider_benchmark_id
-        if provider_benchmark_id is None:
-            provider_benchmark_id = request.benchmark_id
-        benchmark = BenchmarkWithOwner(
-            identifier=request.benchmark_id,
-            dataset_id=request.dataset_id,
-            scoring_functions=request.scoring_functions,
-            metadata=metadata,
-            provider_id=provider_id,
-            provider_resource_id=provider_benchmark_id,
-        )
-        await self.register_object(benchmark)
-
-    async def unregister_benchmark(self, request: UnregisterBenchmarkRequest) -> None:
-        get_request = GetBenchmarkRequest(benchmark_id=request.benchmark_id)
-        existing_benchmark = await self.get_benchmark(get_request)
-        await self.unregister_object(existing_benchmark)
diff --git a/src/llama_stack/core/routing_tables/common.py b/src/llama_stack/core/routing_tables/common.py
index 828cdf4060..128ac27bef 100644
--- a/src/llama_stack/core/routing_tables/common.py
+++ b/src/llama_stack/core/routing_tables/common.py
@@ -13,7 +13,6 @@
     RoutableObject,
     RoutableObjectWithProvider,
     RoutedProtocol,
-    ScoringFnWithOwner,
 )
 from llama_stack.core.request_headers import get_authenticated_user
 from llama_stack.core.store import DistributionRegistry
@@ -58,12 +57,6 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
         return await p.register_shield(obj)
     elif api == Api.vector_io:
         return await p.register_vector_store(obj)
-    elif api == Api.datasetio:
-        return await p.register_dataset(obj)
-    elif api == Api.scoring:
-        return await p.register_scoring_function(obj)
-    elif api == Api.eval:
-        return await p.register_benchmark(obj)
     elif api == Api.tool_runtime:
         return await p.register_toolgroup(obj)
     else:
@@ -87,12 +80,6 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
         return await p.unregister_model(obj.identifier)
     elif api == Api.safety:
         return await p.unregister_shield(obj.identifier)
-    elif api == Api.datasetio:
-        return await p.unregister_dataset(obj.identifier)
-    elif api == Api.eval:
-        return await p.unregister_benchmark(obj.identifier)
-    elif api == Api.scoring:
-        return await p.unregister_scoring_function(obj.identifier)
     elif api == Api.tool_runtime:
         return await p.unregister_toolgroup(obj.identifier)
     else:
@@ -128,7 +115,7 @@ async def add_objects(objs: list[RoutableObjectWithProvider], provider_id: str,
                 await self.dist_registry.register(obj)
 
         # Register all objects from providers
-        for pid, p in self.impls_by_provider_id.items():
+        for _pid, p in self.impls_by_provider_id.items():
             api = get_impl_api(p)
             if api == Api.inference:
                 p.model_store = self
@@ -136,14 +123,6 @@ async def add_objects(objs: list[RoutableObjectWithProvider], provider_id: str,
                 p.shield_store = self
             elif api == Api.vector_io:
                 p.vector_store_store = self
-            elif api == Api.datasetio:
-                p.dataset_store = self
-            elif api == Api.scoring:
-                p.scoring_function_store = self
-                scoring_functions = await p.list_scoring_functions()
-                await add_objects(scoring_functions, pid, ScoringFnWithOwner)
-            elif api == Api.eval:
-                p.benchmark_store = self
             elif api == Api.tool_runtime:
                 p.tool_store = self
 
@@ -155,10 +134,7 @@ async def refresh(self) -> None:
         pass
 
     async def get_provider_impl(self, routing_key: str, provider_id: str | None = None) -> Any:
-        from .benchmarks import BenchmarksRoutingTable
-        from .datasets import DatasetsRoutingTable
         from .models import ModelsRoutingTable
-        from .scoring_functions import ScoringFunctionsRoutingTable
         from .shields import ShieldsRoutingTable
         from .toolgroups import ToolGroupsRoutingTable
         from .vector_stores import VectorStoresRoutingTable
@@ -170,12 +146,6 @@ def apiname_object():
                 return ("Safety", "shield")
             elif isinstance(self, VectorStoresRoutingTable):
                 return ("VectorIO", "vector_store")
-            elif isinstance(self, DatasetsRoutingTable):
-                return ("DatasetIO", "dataset")
-            elif isinstance(self, ScoringFunctionsRoutingTable):
-                return ("Scoring", "scoring_function")
-            elif isinstance(self, BenchmarksRoutingTable):
-                return ("Eval", "benchmark")
             elif isinstance(self, ToolGroupsRoutingTable):
                 return ("ToolGroups", "tool_group")
             else:
diff --git a/src/llama_stack/core/routing_tables/datasets.py b/src/llama_stack/core/routing_tables/datasets.py
deleted file mode 100644
index 9e3f3eec14..0000000000
--- a/src/llama_stack/core/routing_tables/datasets.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import uuid
-
-from llama_stack.core.datatypes import (
-    DatasetWithOwner,
-)
-from llama_stack.log import get_logger
-from llama_stack_api import (
-    Dataset,
-    DatasetNotFoundError,
-    DatasetType,
-    ListDatasetsResponse,
-    ResourceType,
-    RowsDataSource,
-    URIDataSource,
-)
-from llama_stack_api.datasets.api import (
-    Datasets,
-    GetDatasetRequest,
-    RegisterDatasetRequest,
-    UnregisterDatasetRequest,
-)
-
-from .common import CommonRoutingTableImpl
-
-logger = get_logger(name=__name__, category="core::routing_tables")
-
-
-class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
-    """Routing table for managing dataset registrations and provider lookups."""
-
-    async def list_datasets(self) -> ListDatasetsResponse:
-        return ListDatasetsResponse(data=await self.get_all_with_type(ResourceType.dataset.value))
-
-    async def get_dataset(self, request: GetDatasetRequest) -> Dataset:
-        dataset = await self.get_object_by_identifier("dataset", request.dataset_id)
-        if dataset is None:
-            raise DatasetNotFoundError(request.dataset_id)
-        return dataset
-
-    async def register_dataset(self, request: RegisterDatasetRequest) -> Dataset:
-        purpose = request.purpose
-        source = request.source
-        metadata = request.metadata
-        dataset_id = request.dataset_id
-        if isinstance(source, dict):
-            if source["type"] == "uri":
-                source = URIDataSource.parse_obj(source)
-            elif source["type"] == "rows":
-                source = RowsDataSource.parse_obj(source)
-
-        if not dataset_id:
-            dataset_id = f"dataset-{str(uuid.uuid4())}"
-
-        provider_dataset_id = dataset_id
-
-        # infer provider from source
-        if metadata and metadata.get("provider_id"):
-            provider_id = metadata.get("provider_id")  # pass through from nvidia datasetio
-        elif source.type == DatasetType.rows.value:
-            provider_id = "localfs"
-        elif source.type == DatasetType.uri.value:
-            # infer provider from uri
-            if source.uri.startswith("huggingface"):
-                provider_id = "huggingface"
-            else:
-                provider_id = "localfs"
-        else:
-            raise ValueError(f"Unknown data source type: {source.type}")
-
-        if metadata is None:
-            metadata = {}
-
-        dataset = DatasetWithOwner(
-            identifier=dataset_id,
-            provider_resource_id=provider_dataset_id,
-            provider_id=provider_id,
-            purpose=purpose,
-            source=source,
-            metadata=metadata,
-        )
-
-        await self.register_object(dataset)
-        return dataset
-
-    async def unregister_dataset(self, request: UnregisterDatasetRequest) -> None:
-        dataset = await self.get_dataset(GetDatasetRequest(dataset_id=request.dataset_id))
-        await self.unregister_object(dataset)
diff --git a/src/llama_stack/core/routing_tables/scoring_functions.py b/src/llama_stack/core/routing_tables/scoring_functions.py
deleted file mode 100644
index a9b1bc4e69..0000000000
--- a/src/llama_stack/core/routing_tables/scoring_functions.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.core.datatypes import (
-    ScoringFnWithOwner,
-)
-from llama_stack.log import get_logger
-from llama_stack_api import (
-    GetScoringFunctionRequest,
-    ListScoringFunctionsRequest,
-    ListScoringFunctionsResponse,
-    RegisterScoringFunctionRequest,
-    ResourceType,
-    ScoringFn,
-    ScoringFunctions,
-    UnregisterScoringFunctionRequest,
-)
-
-from .common import CommonRoutingTableImpl
-
-logger = get_logger(name=__name__, category="core::routing_tables")
-
-
-class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
-    """Routing table for managing scoring function registrations and provider lookups."""
-
-    async def list_scoring_functions(self, request: ListScoringFunctionsRequest) -> ListScoringFunctionsResponse:
-        return ListScoringFunctionsResponse(data=await self.get_all_with_type(ResourceType.scoring_function.value))
-
-    async def get_scoring_function(self, request: GetScoringFunctionRequest) -> ScoringFn:
-        scoring_fn = await self.get_object_by_identifier("scoring_function", request.scoring_fn_id)
-        if scoring_fn is None:
-            raise ValueError(f"Scoring function '{request.scoring_fn_id}' not found")
-        return scoring_fn
-
-    async def register_scoring_function(
-        self,
-        request: RegisterScoringFunctionRequest,
-    ) -> None:
-        provider_scoring_fn_id = request.provider_scoring_fn_id
-        if provider_scoring_fn_id is None:
-            provider_scoring_fn_id = request.scoring_fn_id
-        provider_id = request.provider_id
-        if provider_id is None:
-            if len(self.impls_by_provider_id) == 1:
-                provider_id = list(self.impls_by_provider_id.keys())[0]
-            else:
-                raise ValueError(
-                    "No provider specified and multiple providers available. Please specify a provider_id."
-                )
-        scoring_fn = ScoringFnWithOwner(
-            identifier=request.scoring_fn_id,
-            description=request.description,
-            return_type=request.return_type,
-            provider_resource_id=provider_scoring_fn_id,
-            provider_id=provider_id,
-            params=request.params,
-        )
-        scoring_fn.provider_id = provider_id
-        await self.register_object(scoring_fn)
-
-    async def unregister_scoring_function(self, request: UnregisterScoringFunctionRequest) -> None:
-        get_request = GetScoringFunctionRequest(scoring_fn_id=request.scoring_fn_id)
-        existing_scoring_fn = await self.get_scoring_function(get_request)
-        await self.unregister_object(existing_scoring_fn)
diff --git a/src/llama_stack/core/stack.py b/src/llama_stack/core/stack.py
index 6ce27851d5..61d7da32da 100644
--- a/src/llama_stack/core/stack.py
+++ b/src/llama_stack/core/stack.py
@@ -49,12 +49,8 @@
 from llama_stack_api import (
     Api,
     Batches,
-    Benchmarks,
     Connectors,
     Conversations,
-    DatasetIO,
-    Datasets,
-    Eval,
     Files,
     Inference,
     Inspect,
@@ -62,19 +58,14 @@
     ModelType,
     Prompts,
     Providers,
-    RegisterBenchmarkRequest,
     RegisterModelRequest,
-    RegisterScoringFunctionRequest,
     RegisterShieldRequest,
     Responses,
     Safety,
-    Scoring,
-    ScoringFunctions,
     Shields,
     ToolGroupNotFoundError,
     VectorIO,
 )
-from llama_stack_api.datasets import RegisterDatasetRequest
 
 logger = get_logger(name=__name__, category="core")
 
@@ -85,13 +76,7 @@ class LlamaStack(
     Responses,
     Batches,
     Safety,
-    Datasets,
     VectorIO,
-    Eval,
-    Benchmarks,
-    Scoring,
-    ScoringFunctions,
-    DatasetIO,
     Models,
     Shields,
     Inspect,
@@ -110,15 +95,6 @@ class LlamaStack(
 RESOURCES = [
     ("models", Api.models, "register_model", "list_models", RegisterModelRequest),
     ("shields", Api.shields, "register_shield", "list_shields", RegisterShieldRequest),
-    ("datasets", Api.datasets, "register_dataset", "list_datasets", RegisterDatasetRequest),
-    (
-        "scoring_fns",
-        Api.scoring_functions,
-        "register_scoring_function",
-        "list_scoring_functions",
-        RegisterScoringFunctionRequest,
-    ),
-    ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks", RegisterBenchmarkRequest),
     ("vector_stores", Api.vector_stores, "register_vector_store", "list_vector_stores", None),
 ]
 
@@ -133,9 +109,6 @@ class LlamaStack(
     "vector_store_id",
     "model_id",
     "shield_id",
-    "dataset_id",
-    "scoring_fn_id",
-    "benchmark_id",
 ]
 
 
diff --git a/src/llama_stack/distributions/ci-tests/build.yaml b/src/llama_stack/distributions/ci-tests/build.yaml
index 4ac635bae8..7ee40405e5 100644
--- a/src/llama_stack/distributions/ci-tests/build.yaml
+++ b/src/llama_stack/distributions/ci-tests/build.yaml
@@ -37,15 +37,6 @@ distribution_spec:
     - provider_type: inline::code-scanner
     responses:
     - provider_type: inline::builtin
-    eval:
-    - provider_type: inline::builtin
-    datasetio:
-    - provider_type: remote::huggingface
-    - provider_type: inline::localfs
-    scoring:
-    - provider_type: inline::basic
-    - provider_type: inline::llm-as-judge
-    - provider_type: inline::braintrust
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
diff --git a/src/llama_stack/distributions/ci-tests/config.yaml b/src/llama_stack/distributions/ci-tests/config.yaml
index 388d787453..934f4d75bb 100644
--- a/src/llama_stack/distributions/ci-tests/config.yaml
+++ b/src/llama_stack/distributions/ci-tests/config.yaml
@@ -2,15 +2,12 @@ version: 2
 distro_name: ci-tests
 apis:
 - batches
-- datasetio
-- eval
 - file_processors
 - files
 - inference
 - messages
 - responses
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -214,35 +211,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -316,9 +284,6 @@ registered_resources:
     provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
     provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
   auth:
diff --git a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
index c96be30295..80211aac54 100644
--- a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
@@ -2,15 +2,12 @@ version: 2
 distro_name: ci-tests
 apis:
 - batches
-- datasetio
-- eval
 - file_processors
 - files
 - inference
 - messages
 - responses
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -214,35 +211,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -329,9 +297,6 @@ registered_resources:
     provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
     provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
   auth:
diff --git a/src/llama_stack/distributions/nvidia/config.yaml b/src/llama_stack/distributions/nvidia/config.yaml
index 80838d2af3..8e694665ce 100644
--- a/src/llama_stack/distributions/nvidia/config.yaml
+++ b/src/llama_stack/distributions/nvidia/config.yaml
@@ -1,13 +1,10 @@
 version: 2
 distro_name: nvidia
 apis:
-- datasetio
-- eval
 - files
 - inference
 - responses
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -43,22 +40,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
-  datasetio:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      api_key: ${env.NVIDIA_API_KEY:=}
-      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
-      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
-      datasets_url: ${env.NVIDIA_DATASETS_URL:=http://nemo.test}
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
   tool_runtime:
   - provider_id: file-search
     provider_type: inline::file-search
@@ -100,8 +81,5 @@ registered_resources:
   models: []
   shields: []
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/nvidia/nvidia.py b/src/llama_stack/distributions/nvidia/nvidia.py
index 8baccb8498..ec04aa2d2b 100644
--- a/src/llama_stack/distributions/nvidia/nvidia.py
+++ b/src/llama_stack/distributions/nvidia/nvidia.py
@@ -9,8 +9,6 @@
 from llama_stack.core.datatypes import BuildProvider, ModelInput, Provider, ShieldInput
 from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
 from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
-from llama_stack.providers.remote.datasetio.nvidia import NvidiaDatasetIOConfig
-from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
 from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
 from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig
 
@@ -29,12 +27,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
         "vector_io": [BuildProvider(provider_type="inline::faiss")],
         "safety": [BuildProvider(provider_type="remote::nvidia")],
         "responses": [BuildProvider(provider_type="inline::builtin")],
-        "eval": [BuildProvider(provider_type="remote::nvidia")],
-        "datasetio": [
-            BuildProvider(provider_type="inline::localfs"),
-            BuildProvider(provider_type="remote::nvidia"),
-        ],
-        "scoring": [BuildProvider(provider_type="inline::basic")],
         "tool_runtime": [BuildProvider(provider_type="inline::file-search")],
         "files": [BuildProvider(provider_type="inline::localfs")],
     }
@@ -49,16 +41,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
         provider_type="remote::nvidia",
         config=NVIDIASafetyConfig.sample_run_config(),
     )
-    datasetio_provider = Provider(
-        provider_id="nvidia",
-        provider_type="remote::nvidia",
-        config=NvidiaDatasetIOConfig.sample_run_config(),
-    )
-    eval_provider = Provider(
-        provider_id="nvidia",
-        provider_type="remote::nvidia",
-        config=NVIDIAEvalConfig.sample_run_config(),
-    )
     files_provider = Provider(
         provider_id="builtin-files",
         provider_type="inline::localfs",
@@ -76,7 +58,7 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
-        description="Use NVIDIA NIM for running LLM inference, evaluation and safety",
+        description="Use NVIDIA NIM for running LLM inference and safety",
         container_image=None,
         template_path=Path(__file__).parent / "doc_template.md",
         providers=providers,
@@ -84,8 +66,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
             "config.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [inference_provider],
-                    "datasetio": [datasetio_provider],
-                    "eval": [eval_provider],
                     "files": [files_provider],
                 },
             ),
@@ -95,7 +75,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
                         inference_provider,
                         safety_provider,
                     ],
-                    "eval": [eval_provider],
                     "files": [files_provider],
                 },
                 default_models=[inference_model, safety_model],
@@ -111,14 +90,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
                 "True",
                 "Whether to append the API version to the base_url",
             ),
-            "NVIDIA_DATASET_NAMESPACE": (
-                "default",
-                "NVIDIA Dataset Namespace",
-            ),
-            "NVIDIA_PROJECT_ID": (
-                "test-project",
-                "NVIDIA Project ID",
-            ),
             "GUARDRAILS_SERVICE_URL": (
                 "http://0.0.0.0:7331",
                 "URL for the NeMo Guardrails Service",
@@ -127,10 +98,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
                 "self-check",
                 "NVIDIA Guardrail Configuration ID",
             ),
-            "NVIDIA_EVALUATOR_URL": (
-                "http://0.0.0.0:7331",
-                "URL for the NeMo Evaluator Service",
-            ),
             "INFERENCE_MODEL": (
                 "Llama3.1-8B-Instruct",
                 "Inference model",
diff --git a/src/llama_stack/distributions/nvidia/run-with-safety.yaml b/src/llama_stack/distributions/nvidia/run-with-safety.yaml
index 901753eecd..0ea054397a 100644
--- a/src/llama_stack/distributions/nvidia/run-with-safety.yaml
+++ b/src/llama_stack/distributions/nvidia/run-with-safety.yaml
@@ -1,13 +1,10 @@
 version: 2
 distro_name: nvidia
 apis:
-- datasetio
-- eval
 - files
 - inference
 - responses
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -48,28 +45,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
-  datasetio:
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      api_key: ${env.NVIDIA_API_KEY:=}
-      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
-      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
-      datasets_url: ${env.NVIDIA_DATASETS_URL:=http://nemo.test}
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
   tool_runtime:
   - provider_id: file-search
     provider_type: inline::file-search
@@ -121,8 +96,5 @@ registered_resources:
   - shield_id: ${env.SAFETY_MODEL}
     provider_id: nvidia
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/oci/config.yaml b/src/llama_stack/distributions/oci/config.yaml
index 370038181e..00de02b760 100644
--- a/src/llama_stack/distributions/oci/config.yaml
+++ b/src/llama_stack/distributions/oci/config.yaml
@@ -1,13 +1,10 @@
 version: 2
 distro_name: oci
 apis:
-- datasetio
-- eval
 - files
 - inference
 - responses
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -45,35 +42,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -127,8 +95,5 @@ registered_resources:
   models: []
   shields: []
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/oci/oci.py b/src/llama_stack/distributions/oci/oci.py
index da0e363efb..30a8f4008a 100644
--- a/src/llama_stack/distributions/oci/oci.py
+++ b/src/llama_stack/distributions/oci/oci.py
@@ -31,16 +31,6 @@ def get_distribution_template(name: str = "oci") -> DistributionTemplate:
         ],
         "safety": [BuildProvider(provider_type="inline::llama-guard")],
         "responses": [BuildProvider(provider_type="inline::builtin")],
-        "eval": [BuildProvider(provider_type="inline::builtin")],
-        "datasetio": [
-            BuildProvider(provider_type="remote::huggingface"),
-            BuildProvider(provider_type="inline::localfs"),
-        ],
-        "scoring": [
-            BuildProvider(provider_type="inline::basic"),
-            BuildProvider(provider_type="inline::llm-as-judge"),
-            BuildProvider(provider_type="inline::braintrust"),
-        ],
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
diff --git a/src/llama_stack/distributions/open-benchmark/config.yaml b/src/llama_stack/distributions/open-benchmark/config.yaml
index 0d225c42bb..bbb0697d15 100644
--- a/src/llama_stack/distributions/open-benchmark/config.yaml
+++ b/src/llama_stack/distributions/open-benchmark/config.yaml
@@ -1,12 +1,9 @@
 version: 2
 distro_name: open-benchmark
 apis:
-- datasetio
-- eval
 - inference
 - responses
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -84,35 +81,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -184,74 +152,5 @@ registered_resources:
   shields:
   - shield_id: meta-llama/Llama-Guard-3-8B
   vector_dbs: []
-  datasets:
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/simpleqa?split=train
-    metadata: {}
-    dataset_id: simpleqa
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/mmlu_cot?split=test&name=all
-    metadata: {}
-    dataset_id: mmlu_cot
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/gpqa_0shot_cot?split=test&name=gpqa_main
-    metadata: {}
-    dataset_id: gpqa_cot
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/math_500?split=test
-    metadata: {}
-    dataset_id: math_500
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/IfEval?split=train
-    metadata: {}
-    dataset_id: ifeval
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/docvqa?split=val
-    metadata: {}
-    dataset_id: docvqa
-  scoring_fns: []
-  benchmarks:
-  - dataset_id: simpleqa
-    scoring_functions:
-    - llm-as-judge::405b-simpleqa
-    metadata: {}
-    benchmark_id: builtin-simpleqa
-  - dataset_id: mmlu_cot
-    scoring_functions:
-    - basic::regex_parser_multiple_choice_answer
-    metadata: {}
-    benchmark_id: builtin-mmlu-cot
-  - dataset_id: gpqa_cot
-    scoring_functions:
-    - basic::regex_parser_multiple_choice_answer
-    metadata: {}
-    benchmark_id: builtin-gpqa-cot
-  - dataset_id: math_500
-    scoring_functions:
-    - basic::regex_parser_math_response
-    metadata: {}
-    benchmark_id: builtin-math-500
-  - dataset_id: ifeval
-    scoring_functions:
-    - basic::ifeval
-    metadata: {}
-    benchmark_id: builtin-ifeval
-  - dataset_id: docvqa
-    scoring_functions:
-    - basic::docvqa
-    metadata: {}
-    benchmark_id: builtin-docvqa
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/open-benchmark/open_benchmark.py b/src/llama_stack/distributions/open-benchmark/open_benchmark.py
index bb1a1773dc..5300823f57 100644
--- a/src/llama_stack/distributions/open-benchmark/open_benchmark.py
+++ b/src/llama_stack/distributions/open-benchmark/open_benchmark.py
@@ -6,9 +6,7 @@
 
 
 from llama_stack.core.datatypes import (
-    BenchmarkInput,
     BuildProvider,
-    DatasetInput,
     ModelInput,
     Provider,
     ShieldInput,
@@ -31,7 +29,7 @@
     PGVectorVectorIOConfig,
 )
 from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
-from llama_stack_api import DatasetPurpose, ModelType, URIDataSource
+from llama_stack_api import ModelType
 
 
 def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderModelEntry]]]:
@@ -101,7 +99,7 @@ def get_distribution_template() -> DistributionTemplate:
     """Build the open-benchmark distribution template for running evaluations.
 
     Returns:
-        A DistributionTemplate configured with benchmark datasets and scoring functions.
+        A DistributionTemplate configured for open benchmarking.
     """
     inference_providers, available_models = get_inference_providers()
     providers = {
@@ -113,16 +111,6 @@ def get_distribution_template() -> DistributionTemplate:
         ],
         "safety": [BuildProvider(provider_type="inline::llama-guard")],
         "responses": [BuildProvider(provider_type="inline::builtin")],
-        "eval": [BuildProvider(provider_type="inline::builtin")],
-        "datasetio": [
-            BuildProvider(provider_type="remote::huggingface"),
-            BuildProvider(provider_type="inline::localfs"),
-        ],
-        "scoring": [
-            BuildProvider(provider_type="inline::basic"),
-            BuildProvider(provider_type="inline::llm-as-judge"),
-            BuildProvider(provider_type="inline::braintrust"),
-        ],
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
@@ -173,83 +161,6 @@ def get_distribution_template() -> DistributionTemplate:
         ),
     ]
 
-    default_datasets = [
-        DatasetInput(
-            dataset_id="simpleqa",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/simpleqa?split=train",
-            ),
-        ),
-        DatasetInput(
-            dataset_id="mmlu_cot",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/mmlu_cot?split=test&name=all",
-            ),
-        ),
-        DatasetInput(
-            dataset_id="gpqa_cot",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/gpqa_0shot_cot?split=test&name=gpqa_main",
-            ),
-        ),
-        DatasetInput(
-            dataset_id="math_500",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/math_500?split=test",
-            ),
-        ),
-        DatasetInput(
-            dataset_id="ifeval",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/IfEval?split=train",
-            ),
-        ),
-        DatasetInput(
-            dataset_id="docvqa",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/docvqa?split=val",
-            ),
-        ),
-    ]
-
-    default_benchmarks = [
-        BenchmarkInput(
-            benchmark_id="builtin-simpleqa",
-            dataset_id="simpleqa",
-            scoring_functions=["llm-as-judge::405b-simpleqa"],
-        ),
-        BenchmarkInput(
-            benchmark_id="builtin-mmlu-cot",
-            dataset_id="mmlu_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
-        ),
-        BenchmarkInput(
-            benchmark_id="builtin-gpqa-cot",
-            dataset_id="gpqa_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
-        ),
-        BenchmarkInput(
-            benchmark_id="builtin-math-500",
-            dataset_id="math_500",
-            scoring_functions=["basic::regex_parser_math_response"],
-        ),
-        BenchmarkInput(
-            benchmark_id="builtin-ifeval",
-            dataset_id="ifeval",
-            scoring_functions=["basic::ifeval"],
-        ),
-        BenchmarkInput(
-            benchmark_id="builtin-docvqa",
-            dataset_id="docvqa",
-            scoring_functions=["basic::docvqa"],
-        ),
-    ]
     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
@@ -266,8 +177,6 @@ def get_distribution_template() -> DistributionTemplate:
                 },
                 default_models=default_models,
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
-                default_datasets=default_datasets,
-                default_benchmarks=default_benchmarks,
             ),
         },
         run_config_env_vars={
diff --git a/src/llama_stack/distributions/postgres-demo/config.yaml b/src/llama_stack/distributions/postgres-demo/config.yaml
index b6c7f45543..c8c4d3bedb 100644
--- a/src/llama_stack/distributions/postgres-demo/config.yaml
+++ b/src/llama_stack/distributions/postgres-demo/config.yaml
@@ -110,8 +110,5 @@ registered_resources:
   shields:
   - shield_id: meta-llama/Llama-Guard-3-8B
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/starter/build.yaml b/src/llama_stack/distributions/starter/build.yaml
index 301c632482..c08224e9cc 100644
--- a/src/llama_stack/distributions/starter/build.yaml
+++ b/src/llama_stack/distributions/starter/build.yaml
@@ -38,16 +38,7 @@ distribution_spec:
     - provider_type: inline::code-scanner
     responses:
     - provider_type: inline::builtin
-    eval:
-    - provider_type: inline::builtin
-    datasetio:
-    - provider_type: remote::huggingface
-    - provider_type: inline::localfs
-    scoring:
-    - provider_type: inline::basic
-    - provider_type: inline::llm-as-judge
-    - provider_type: inline::braintrust
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
     - provider_type: inline::file-search
diff --git a/src/llama_stack/distributions/starter/config.yaml b/src/llama_stack/distributions/starter/config.yaml
index d8ac645efd..a31fa49fde 100644
--- a/src/llama_stack/distributions/starter/config.yaml
+++ b/src/llama_stack/distributions/starter/config.yaml
@@ -2,15 +2,12 @@ version: 2
 distro_name: starter
 apis:
 - batches
-- datasetio
-- eval
 - file_processors
 - files
 - inference
 - messages
 - responses
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -208,35 +205,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -295,9 +263,6 @@ registered_resources:
     provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
     provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
 vector_stores:
diff --git a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
index 1ccdeb5666..3295781c85 100644
--- a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
@@ -2,15 +2,12 @@ version: 2
 distro_name: starter
 apis:
 - batches
-- datasetio
-- eval
 - file_processors
 - files
 - inference
 - messages
 - responses
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -208,35 +205,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -308,9 +276,6 @@ registered_resources:
     provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
     provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
 vector_stores:
diff --git a/src/llama_stack/distributions/starter/starter.py b/src/llama_stack/distributions/starter/starter.py
index 61e969c01b..e001d7fd16 100644
--- a/src/llama_stack/distributions/starter/starter.py
+++ b/src/llama_stack/distributions/starter/starter.py
@@ -154,16 +154,6 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
         ],
         "messages": [BuildProvider(provider_type="inline::builtin")],
         "responses": [BuildProvider(provider_type="inline::builtin")],
-        "eval": [BuildProvider(provider_type="inline::builtin")],
-        "datasetio": [
-            BuildProvider(provider_type="remote::huggingface"),
-            BuildProvider(provider_type="inline::localfs"),
-        ],
-        "scoring": [
-            BuildProvider(provider_type="inline::basic"),
-            BuildProvider(provider_type="inline::llm-as-judge"),
-            BuildProvider(provider_type="inline::braintrust"),
-        ],
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
diff --git a/src/llama_stack/distributions/template.py b/src/llama_stack/distributions/template.py
index dfae2688a7..76880362bf 100644
--- a/src/llama_stack/distributions/template.py
+++ b/src/llama_stack/distributions/template.py
@@ -15,9 +15,7 @@
 from llama_stack.core.datatypes import (
     LLAMA_STACK_RUN_CONFIG_VERSION,
     Api,
-    BenchmarkInput,
     BuildProvider,
-    DatasetInput,
     ModelInput,
     Provider,
     SafetyConfig,
@@ -35,7 +33,7 @@
 from llama_stack.core.storage.sqlstore.sqlstore import SqliteSqlStoreConfig
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
-from llama_stack_api import ConnectorInput, ConnectorType, DatasetPurpose, ModelType
+from llama_stack_api import ConnectorInput, ConnectorType, ModelType
 
 
 def filter_empty_values(obj: Any) -> Any:
@@ -198,8 +196,6 @@ class RunConfigSettings(BaseModel):
     provider_overrides: dict[str, list[Provider]] = Field(default_factory=dict)
     default_models: list[ModelInput] | None = None
     default_shields: list[ShieldInput] | None = None
-    default_datasets: list[DatasetInput] | None = None
-    default_benchmarks: list[BenchmarkInput] | None = None
     default_connectors: list[ConnectorInput] | None = None
     vector_stores_config: VectorStoresConfig | None = None
     safety_config: SafetyConfig | None = None
@@ -299,9 +295,6 @@ def run_config(
                 "models": [m.model_dump(exclude_none=True) for m in (self.default_models or [])],
                 "shields": [s.model_dump(exclude_none=True) for s in (self.default_shields or [])],
                 "vector_dbs": [],
-                "datasets": [d.model_dump(exclude_none=True) for d in (self.default_datasets or [])],
-                "scoring_fns": [],
-                "benchmarks": [b.model_dump(exclude_none=True) for b in (self.default_benchmarks or [])],
             },
             "server": {
                 "port": 8321,
@@ -406,11 +399,9 @@ def enum_representer(dumper, data):
 
         # Register YAML representer for enums
         yaml.add_representer(ModelType, enum_representer)
-        yaml.add_representer(DatasetPurpose, enum_representer)
         yaml.add_representer(StorageBackendType, enum_representer)
         yaml.add_representer(ConnectorType, enum_representer)
         yaml.SafeDumper.add_representer(ModelType, enum_representer)
-        yaml.SafeDumper.add_representer(DatasetPurpose, enum_representer)
         yaml.SafeDumper.add_representer(StorageBackendType, enum_representer)
         yaml.SafeDumper.add_representer(ConnectorType, enum_representer)
 
diff --git a/src/llama_stack/distributions/watsonx/config.yaml b/src/llama_stack/distributions/watsonx/config.yaml
index b91ff10513..f6d3733af3 100644
--- a/src/llama_stack/distributions/watsonx/config.yaml
+++ b/src/llama_stack/distributions/watsonx/config.yaml
@@ -1,13 +1,10 @@
 version: 2
 distro_name: watsonx
 apis:
-- datasetio
-- eval
 - files
 - inference
 - responses
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -43,35 +40,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -125,8 +93,5 @@ registered_resources:
   models: []
   shields: []
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/watsonx/watsonx.py b/src/llama_stack/distributions/watsonx/watsonx.py
index a45fc91641..1ab5831bba 100644
--- a/src/llama_stack/distributions/watsonx/watsonx.py
+++ b/src/llama_stack/distributions/watsonx/watsonx.py
@@ -28,16 +28,6 @@ def get_distribution_template(name: str = "watsonx") -> DistributionTemplate:
         "vector_io": [BuildProvider(provider_type="inline::faiss")],
         "safety": [BuildProvider(provider_type="inline::llama-guard")],
         "responses": [BuildProvider(provider_type="inline::builtin")],
-        "eval": [BuildProvider(provider_type="inline::builtin")],
-        "datasetio": [
-            BuildProvider(provider_type="remote::huggingface"),
-            BuildProvider(provider_type="inline::localfs"),
-        ],
-        "scoring": [
-            BuildProvider(provider_type="inline::basic"),
-            BuildProvider(provider_type="inline::llm-as-judge"),
-            BuildProvider(provider_type="inline::braintrust"),
-        ],
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
diff --git a/src/llama_stack/log.py b/src/llama_stack/log.py
index 7c37bd1b82..2e896531ee 100644
--- a/src/llama_stack/log.py
+++ b/src/llama_stack/log.py
@@ -38,7 +38,6 @@ class LoggingConfig(BaseModel):
     "inference",
     "agents",
     "safety",
-    "eval",
     "tools",
     "client",
     "openai",
@@ -52,7 +51,6 @@ class LoggingConfig(BaseModel):
     "vector_io",
     "tool_runtime",
     "cli",
-    "scoring",
     "tests",
     "telemetry",
     "connectors",
diff --git a/src/llama_stack/providers/inline/README.md b/src/llama_stack/providers/inline/README.md
index b7b074d6e1..7b506a5a3c 100644
--- a/src/llama_stack/providers/inline/README.md
+++ b/src/llama_stack/providers/inline/README.md
@@ -12,9 +12,6 @@ inline/
   ios/                 # iOS on-device inference
   safety/              # Safety checks (llama-guard, code-scanner)
   vector_io/           # Vector storage (sqlite-vec, faiss, chroma, milvus, qdrant)
-  datasetio/           # Dataset I/O (local file handling)
-  eval/                # Evaluation orchestration
-  scoring/             # Scoring function implementations
   tool_runtime/        # Tool runtime (RAG context retrieval)
   files/               # File storage and management
   file_processor/      # File processing (text extraction, etc.)
diff --git a/src/llama_stack/providers/inline/datasetio/__init__.py b/src/llama_stack/providers/inline/datasetio/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/datasetio/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/datasetio/localfs/__init__.py b/src/llama_stack/providers/inline/datasetio/localfs/__init__.py
deleted file mode 100644
index 58aa6ffaf3..0000000000
--- a/src/llama_stack/providers/inline/datasetio/localfs/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from .config import LocalFSDatasetIOConfig
-
-
-async def get_provider_impl(
-    config: LocalFSDatasetIOConfig,
-    _deps: dict[str, Any],
-):
-    from .datasetio import LocalFSDatasetIOImpl
-
-    impl = LocalFSDatasetIOImpl(config)
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/inline/datasetio/localfs/config.py b/src/llama_stack/providers/inline/datasetio/localfs/config.py
deleted file mode 100644
index 400d2b737f..0000000000
--- a/src/llama_stack/providers/inline/datasetio/localfs/config.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel
-
-from llama_stack.core.storage.datatypes import KVStoreReference
-
-
-class LocalFSDatasetIOConfig(BaseModel):
-    """Configuration for the local filesystem dataset I/O provider."""
-
-    kvstore: KVStoreReference
-
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {
-            "kvstore": KVStoreReference(
-                backend="kv_default",
-                namespace="datasetio::localfs",
-            ).model_dump(exclude_none=True)
-        }
diff --git a/src/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/src/llama_stack/providers/inline/datasetio/localfs/datasetio.py
deleted file mode 100644
index 45d173890e..0000000000
--- a/src/llama_stack/providers/inline/datasetio/localfs/datasetio.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.core.storage.kvstore import kvstore_impl
-from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
-from llama_stack.providers.utils.pagination import paginate_records
-from llama_stack_api import Dataset, DatasetIO, DatasetsProtocolPrivate, PaginatedResponse
-
-from .config import LocalFSDatasetIOConfig
-
-DATASETS_PREFIX = "localfs_datasets:"
-
-
-class PandasDataframeDataset:
-    """Wraps a dataset definition with lazy pandas DataFrame loading."""
-
-    def __init__(self, dataset_def: Dataset, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.dataset_def = dataset_def
-        self.df = None
-
-    def __len__(self) -> int:
-        assert self.df is not None, "Dataset not loaded. Please call .load() first"
-        return len(self.df)
-
-    def __getitem__(self, idx):
-        assert self.df is not None, "Dataset not loaded. Please call .load() first"
-        if isinstance(idx, slice):
-            return self.df.iloc[idx].to_dict(orient="records")
-        else:
-            return self.df.iloc[idx].to_dict()
-
-    async def load(self) -> None:
-        if self.df is not None:
-            return
-
-        if self.dataset_def.source.type == "uri":
-            self.df = await get_dataframe_from_uri(self.dataset_def.source.uri)
-        elif self.dataset_def.source.type == "rows":
-            import pandas
-
-            self.df = pandas.DataFrame(self.dataset_def.source.rows)
-        else:
-            raise ValueError(f"Unsupported dataset source type: {self.dataset_def.source.type}")
-
-        if self.df is None:
-            raise ValueError(f"Failed to load dataset from {self.dataset_def.url}")
-
-
-class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
-    """Dataset I/O provider for reading and writing datasets using the local filesystem."""
-
-    def __init__(self, config: LocalFSDatasetIOConfig) -> None:
-        self.config = config
-        # local registry for keeping track of datasets within the provider
-        self.dataset_infos = {}
-        self.kvstore = None
-
-    async def initialize(self) -> None:
-        self.kvstore = await kvstore_impl(self.config.kvstore)
-        # Load existing datasets from kvstore
-        start_key = DATASETS_PREFIX
-        end_key = f"{DATASETS_PREFIX}\xff"
-        stored_datasets = await self.kvstore.values_in_range(start_key, end_key)
-
-        for dataset in stored_datasets:
-            dataset = Dataset.model_validate_json(dataset)
-            self.dataset_infos[dataset.identifier] = dataset
-
-    async def shutdown(self) -> None: ...
-
-    async def register_dataset(
-        self,
-        dataset_def: Dataset,
-    ) -> None:
-        # Store in kvstore
-        key = f"{DATASETS_PREFIX}{dataset_def.identifier}"
-        await self.kvstore.set(
-            key=key,
-            value=dataset_def.model_dump_json(),
-        )
-        self.dataset_infos[dataset_def.identifier] = dataset_def
-
-    async def unregister_dataset(self, dataset_id: str) -> None:
-        key = f"{DATASETS_PREFIX}{dataset_id}"
-        await self.kvstore.delete(key=key)
-        del self.dataset_infos[dataset_id]
-
-    async def iterrows(
-        self,
-        dataset_id: str,
-        start_index: int | None = None,
-        limit: int | None = None,
-    ) -> PaginatedResponse:
-        dataset_def = self.dataset_infos[dataset_id]
-        dataset_impl = PandasDataframeDataset(dataset_def)
-        await dataset_impl.load()
-
-        records = dataset_impl.df.to_dict("records")
-        return paginate_records(records, start_index, limit)
-
-    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
-        import pandas
-
-        dataset_def = self.dataset_infos[dataset_id]
-        dataset_impl = PandasDataframeDataset(dataset_def)
-        await dataset_impl.load()
-
-        new_rows_df = pandas.DataFrame(rows)
-        dataset_impl.df = pandas.concat([dataset_impl.df, new_rows_df], ignore_index=True)
diff --git a/src/llama_stack/providers/inline/eval/__init__.py b/src/llama_stack/providers/inline/eval/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/eval/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/eval/builtin/__init__.py b/src/llama_stack/providers/inline/eval/builtin/__init__.py
deleted file mode 100644
index a77330b5e2..0000000000
--- a/src/llama_stack/providers/inline/eval/builtin/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-
-from .config import BuiltinEvalConfig
-
-
-async def get_provider_impl(
-    config: BuiltinEvalConfig,
-    deps: dict[Api, Any],
-):
-    from .eval import BuiltinEvalImpl
-
-    impl = BuiltinEvalImpl(
-        config,
-        deps[Api.datasetio],
-        deps[Api.datasets],
-        deps[Api.scoring],
-        deps[Api.inference],
-        deps[Api.responses],
-    )
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/inline/eval/builtin/config.py b/src/llama_stack/providers/inline/eval/builtin/config.py
deleted file mode 100644
index 7af66e31a8..0000000000
--- a/src/llama_stack/providers/inline/eval/builtin/config.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel
-
-from llama_stack.core.storage.datatypes import KVStoreReference
-
-
-class BuiltinEvalConfig(BaseModel):
-    """Configuration for the built-in evaluation provider."""
-
-    kvstore: KVStoreReference
-
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {
-            "kvstore": KVStoreReference(
-                backend="kv_default",
-                namespace="eval",
-            ).model_dump(exclude_none=True)
-        }
diff --git a/src/llama_stack/providers/inline/eval/builtin/eval.py b/src/llama_stack/providers/inline/eval/builtin/eval.py
deleted file mode 100644
index 5d3f6db53e..0000000000
--- a/src/llama_stack/providers/inline/eval/builtin/eval.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import json
-from typing import Any
-
-from tqdm import tqdm
-
-from llama_stack.core.storage.kvstore import kvstore_impl
-from llama_stack.providers.utils.common.data_schema_validator import ColumnName
-from llama_stack_api import (
-    Benchmark,
-    BenchmarksProtocolPrivate,
-    DatasetIO,
-    Datasets,
-    Eval,
-    EvaluateResponse,
-    EvaluateRowsRequest,
-    Inference,
-    IterRowsRequest,
-    Job,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatus,
-    JobStatusRequest,
-    OpenAIChatCompletionRequestWithExtraBody,
-    OpenAICompletionRequestWithExtraBody,
-    OpenAISystemMessageParam,
-    OpenAIUserMessageParam,
-    Responses,
-    RunEvalRequest,
-    ScoreRequest,
-    Scoring,
-)
-
-from .config import BuiltinEvalConfig
-
-EVAL_TASKS_PREFIX = "benchmarks:"
-
-
-class BuiltinEvalImpl(
-    Eval,
-    BenchmarksProtocolPrivate,
-):
-    """Built-in evaluation provider that runs benchmarks using inference, scoring, and datasets."""
-
-    def __init__(
-        self,
-        config: BuiltinEvalConfig,
-        datasetio_api: DatasetIO,
-        datasets_api: Datasets,
-        scoring_api: Scoring,
-        inference_api: Inference,
-        responses_api: Responses,
-    ) -> None:
-        self.config = config
-        self.datasetio_api = datasetio_api
-        self.datasets_api = datasets_api
-        self.scoring_api = scoring_api
-        self.inference_api = inference_api
-        self.responses_api = responses_api
-
-        # TODO: assume sync job, will need jobs API for async scheduling
-        self.jobs = {}
-
-        self.benchmarks = {}
-
-    async def initialize(self) -> None:
-        self.kvstore = await kvstore_impl(self.config.kvstore)
-        # Load existing benchmarks from kvstore
-        start_key = EVAL_TASKS_PREFIX
-        end_key = f"{EVAL_TASKS_PREFIX}\xff"
-        stored_benchmarks = await self.kvstore.values_in_range(start_key, end_key)
-
-        for benchmark in stored_benchmarks:
-            benchmark = Benchmark.model_validate_json(benchmark)
-            self.benchmarks[benchmark.identifier] = benchmark
-
-    async def shutdown(self) -> None: ...
-
-    async def register_benchmark(self, task_def: Benchmark) -> None:
-        # Store in kvstore
-        key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}"
-        await self.kvstore.set(
-            key=key,
-            value=task_def.model_dump_json(),
-        )
-        self.benchmarks[task_def.identifier] = task_def
-
-    async def unregister_benchmark(self, benchmark_id: str) -> None:
-        if benchmark_id in self.benchmarks:
-            del self.benchmarks[benchmark_id]
-
-        key = f"{EVAL_TASKS_PREFIX}{benchmark_id}"
-        await self.kvstore.delete(key)
-
-    async def run_eval(
-        self,
-        request: RunEvalRequest,
-    ) -> Job:
-        task_def = self.benchmarks[request.benchmark_id]
-        dataset_id = task_def.dataset_id
-        scoring_functions = task_def.scoring_functions
-
-        # TODO (xiyan): validate dataset schema
-        # dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
-
-        all_rows = await self.datasetio_api.iterrows(
-            IterRowsRequest(
-                dataset_id=dataset_id,
-                limit=(-1 if request.benchmark_config.num_examples is None else request.benchmark_config.num_examples),
-            )
-        )
-        eval_rows_request = EvaluateRowsRequest(
-            benchmark_id=request.benchmark_id,
-            input_rows=all_rows.data,
-            scoring_functions=scoring_functions,
-            benchmark_config=request.benchmark_config,
-        )
-        res = await self.evaluate_rows(eval_rows_request)
-
-        # TODO: currently needs to wait for generation before returning
-        # need job scheduler queue (ray/celery) w/ jobs api
-        job_id = str(len(self.jobs))
-        self.jobs[job_id] = res
-        return Job(job_id=job_id, status=JobStatus.completed)
-
-    async def _run_model_generation(
-        self, input_rows: list[dict[str, Any]], request: EvaluateRowsRequest
-    ) -> list[dict[str, Any]]:
-        candidate = request.benchmark_config.eval_candidate
-        assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
-        sampling_params = {"max_tokens": candidate.sampling_params.max_tokens}
-
-        generations = []
-        for x in tqdm(input_rows):
-            if ColumnName.completion_input.value in x:
-                if candidate.sampling_params.stop:
-                    sampling_params["stop"] = candidate.sampling_params.stop
-
-                input_content = json.loads(x[ColumnName.completion_input.value])
-                params = OpenAICompletionRequestWithExtraBody(
-                    model=candidate.model,
-                    prompt=input_content,
-                    **sampling_params,
-                )
-                response = await self.inference_api.openai_completion(params)
-                generations.append({ColumnName.generated_answer.value: response.choices[0].text})
-            elif ColumnName.chat_completion_input.value in x:
-                chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value])
-                input_messages = [
-                    OpenAIUserMessageParam(**x) for x in chat_completion_input_json if x["role"] == "user"
-                ]
-
-                messages = []
-                if candidate.system_message:
-                    messages.append(candidate.system_message)
-
-                messages += [OpenAISystemMessageParam(**x) for x in chat_completion_input_json if x["role"] == "system"]
-
-                messages += input_messages
-                params = OpenAIChatCompletionRequestWithExtraBody(
-                    model=candidate.model,
-                    messages=messages,
-                    **sampling_params,
-                )
-                response = await self.inference_api.openai_chat_completion(params)
-                generations.append({ColumnName.generated_answer.value: response.choices[0].message.content})
-            else:
-                raise ValueError("Invalid input row")
-
-        return generations
-
-    async def evaluate_rows(
-        self,
-        request: EvaluateRowsRequest,
-    ) -> EvaluateResponse:
-        candidate = request.benchmark_config.eval_candidate
-        # Agent evaluation removed
-        if candidate.type == "model":
-            generations = await self._run_model_generation(request.input_rows, request)
-        else:
-            raise ValueError(f"Invalid candidate type: {candidate.type}")
-
-        # scoring with generated_answer
-        score_input_rows = [
-            input_r | generated_r for input_r, generated_r in zip(request.input_rows, generations, strict=False)
-        ]
-
-        if request.benchmark_config.scoring_params is not None:
-            scoring_functions_dict = {
-                scoring_fn_id: request.benchmark_config.scoring_params.get(scoring_fn_id, None)
-                for scoring_fn_id in request.scoring_functions
-            }
-        else:
-            scoring_functions_dict = dict.fromkeys(request.scoring_functions)
-
-        score_request = ScoreRequest(
-            input_rows=score_input_rows,
-            scoring_functions=scoring_functions_dict,
-        )
-        score_response = await self.scoring_api.score(score_request)
-
-        return EvaluateResponse(generations=generations, scores=score_response.results)
-
-    async def job_status(self, request: JobStatusRequest) -> Job:
-        if request.job_id in self.jobs:
-            return Job(job_id=request.job_id, status=JobStatus.completed)
-
-        raise ValueError(f"Job {request.job_id} not found")
-
-    async def job_cancel(self, request: JobCancelRequest) -> None:
-        raise NotImplementedError("Job cancel is not implemented yet")
-
-    async def job_result(self, request: JobResultRequest) -> EvaluateResponse:
-        job_status_request = JobStatusRequest(benchmark_id=request.benchmark_id, job_id=request.job_id)
-        job = await self.job_status(job_status_request)
-        status = job.status
-        if not status or status != JobStatus.completed:
-            raise ValueError(f"Job is not completed, Status: {status.value}")
-
-        return self.jobs[request.job_id]
diff --git a/src/llama_stack/providers/inline/scoring/__init__.py b/src/llama_stack/providers/inline/scoring/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/basic/__init__.py b/src/llama_stack/providers/inline/scoring/basic/__init__.py
deleted file mode 100644
index c996b9c2db..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-
-from .config import BasicScoringConfig
-
-
-async def get_provider_impl(
-    config: BasicScoringConfig,
-    deps: dict[Api, Any],
-):
-    from .scoring import BasicScoringImpl
-
-    impl = BasicScoringImpl(
-        config,
-        deps[Api.datasetio],
-        deps[Api.datasets],
-    )
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/inline/scoring/basic/config.py b/src/llama_stack/providers/inline/scoring/basic/config.py
deleted file mode 100644
index 2a33222250..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/config.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel
-
-
-class BasicScoringConfig(BaseModel):
-    """Configuration for the basic scoring provider."""
-
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {}
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring.py b/src/llama_stack/providers/inline/scoring/basic/scoring.py
deleted file mode 100644
index 2a630f1159..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    DatasetIO,
-    Datasets,
-    IterRowsRequest,
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    Scoring,
-    ScoringFn,
-    ScoringFunctionsProtocolPrivate,
-    ScoringResult,
-)
-
-from .config import BasicScoringConfig
-from .scoring_fn.docvqa_scoring_fn import DocVQAScoringFn
-from .scoring_fn.equality_scoring_fn import EqualityScoringFn
-from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn
-from .scoring_fn.regex_parser_math_response_scoring_fn import (
-    RegexParserMathResponseScoringFn,
-)
-from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn
-from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn
-
-FIXED_FNS = [
-    EqualityScoringFn,
-    SubsetOfScoringFn,
-    RegexParserScoringFn,
-    RegexParserMathResponseScoringFn,
-    IfEvalScoringFn,
-    DocVQAScoringFn,
-]
-
-
-class BasicScoringImpl(
-    Scoring,
-    ScoringFunctionsProtocolPrivate,
-):
-    """Scoring provider with built-in functions for equality, regex, DocVQA, and IFEval metrics."""
-
-    def __init__(
-        self,
-        config: BasicScoringConfig,
-        datasetio_api: DatasetIO,
-        datasets_api: Datasets,
-    ) -> None:
-        self.config = config
-        self.datasetio_api = datasetio_api
-        self.datasets_api = datasets_api
-        self.scoring_fn_id_impls = {}
-
-    async def initialize(self) -> None:
-        for fn in FIXED_FNS:
-            impl = fn()
-            for fn_defs in impl.get_supported_scoring_fn_defs():
-                self.scoring_fn_id_impls[fn_defs.identifier] = impl
-
-    async def shutdown(self) -> None: ...
-
-    async def list_scoring_functions(self) -> list[ScoringFn]:
-        scoring_fn_defs_list = [
-            fn_def for impl in self.scoring_fn_id_impls.values() for fn_def in impl.get_supported_scoring_fn_defs()
-        ]
-
-        for f in scoring_fn_defs_list:
-            assert f.identifier.startswith("basic"), "All basic scoring fn must have identifier prefixed with 'basic'! "
-
-        return scoring_fn_defs_list
-
-    async def register_scoring_function(self, function_def: ScoringFn) -> None:
-        raise NotImplementedError("Register scoring function not implemented yet")
-
-    async def score_batch(
-        self,
-        request: ScoreBatchRequest,
-    ) -> ScoreBatchResponse:
-        all_rows = await self.datasetio_api.iterrows(IterRowsRequest(dataset_id=request.dataset_id, limit=-1))
-        score_request = ScoreRequest(
-            input_rows=all_rows.data,
-            scoring_functions=request.scoring_functions,
-        )
-        res = await self.score(score_request)
-        if request.save_results_dataset:
-            # TODO: persist and register dataset on to server for reading
-            # self.datasets_api.register_dataset()
-            raise NotImplementedError("Save results dataset not implemented yet")
-
-        return ScoreBatchResponse(
-            results=res.results,
-        )
-
-    async def score(
-        self,
-        request: ScoreRequest,
-    ) -> ScoreResponse:
-        res = {}
-        for scoring_fn_id in request.scoring_functions.keys():
-            if scoring_fn_id not in self.scoring_fn_id_impls:
-                raise ValueError(f"Scoring function {scoring_fn_id} is not supported.")
-            scoring_fn = self.scoring_fn_id_impls[scoring_fn_id]
-            scoring_fn_params = request.scoring_functions.get(scoring_fn_id, None)
-            score_results = await scoring_fn.score(request.input_rows, scoring_fn_id, scoring_fn_params)
-            agg_results = await scoring_fn.aggregate(score_results, scoring_fn_id, scoring_fn_params)
-            res[scoring_fn_id] = ScoringResult(
-                score_rows=score_results,
-                aggregated_results=agg_results,
-            )
-
-        return ScoreResponse(
-            results=res,
-        )
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py
deleted file mode 100644
index 3d74406158..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py
+++ /dev/null
@@ -1,247 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import json
-import re
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringResultRow
-
-from .fn_defs.docvqa import docvqa
-
-CONTRACTIONS = {
-    "aint": "ain't",
-    "arent": "aren't",
-    "cant": "can't",
-    "couldve": "could've",
-    "couldnt": "couldn't",
-    "couldn'tve": "couldn't've",
-    "couldnt've": "couldn't've",
-    "didnt": "didn't",
-    "doesnt": "doesn't",
-    "dont": "don't",
-    "hadnt": "hadn't",
-    "hadnt've": "hadn't've",
-    "hadn'tve": "hadn't've",
-    "hasnt": "hasn't",
-    "havent": "haven't",
-    "hed": "he'd",
-    "hed've": "he'd've",
-    "he'dve": "he'd've",
-    "hes": "he's",
-    "howd": "how'd",
-    "howll": "how'll",
-    "hows": "how's",
-    "Id've": "I'd've",
-    "I'dve": "I'd've",
-    "Im": "I'm",
-    "Ive": "I've",
-    "isnt": "isn't",
-    "itd": "it'd",
-    "itd've": "it'd've",
-    "it'dve": "it'd've",
-    "itll": "it'll",
-    "let's": "let's",
-    "maam": "ma'am",
-    "mightnt": "mightn't",
-    "mightnt've": "mightn't've",
-    "mightn'tve": "mightn't've",
-    "mightve": "might've",
-    "mustnt": "mustn't",
-    "mustve": "must've",
-    "neednt": "needn't",
-    "notve": "not've",
-    "oclock": "o'clock",
-    "oughtnt": "oughtn't",
-    "ow's'at": "'ow's'at",
-    "'ows'at": "'ow's'at",
-    "'ow'sat": "'ow's'at",
-    "shant": "shan't",
-    "shed've": "she'd've",
-    "she'dve": "she'd've",
-    "she's": "she's",
-    "shouldve": "should've",
-    "shouldnt": "shouldn't",
-    "shouldnt've": "shouldn't've",
-    "shouldn'tve": "shouldn't've",
-    "somebody'd": "somebodyd",
-    "somebodyd've": "somebody'd've",
-    "somebody'dve": "somebody'd've",
-    "somebodyll": "somebody'll",
-    "somebodys": "somebody's",
-    "someoned": "someone'd",
-    "someoned've": "someone'd've",
-    "someone'dve": "someone'd've",
-    "someonell": "someone'll",
-    "someones": "someone's",
-    "somethingd": "something'd",
-    "somethingd've": "something'd've",
-    "something'dve": "something'd've",
-    "somethingll": "something'll",
-    "thats": "that's",
-    "thered": "there'd",
-    "thered've": "there'd've",
-    "there'dve": "there'd've",
-    "therere": "there're",
-    "theres": "there's",
-    "theyd": "they'd",
-    "theyd've": "they'd've",
-    "they'dve": "they'd've",
-    "theyll": "they'll",
-    "theyre": "they're",
-    "theyve": "they've",
-    "twas": "'twas",
-    "wasnt": "wasn't",
-    "wed've": "we'd've",
-    "we'dve": "we'd've",
-    "weve": "we've",
-    "werent": "weren't",
-    "whatll": "what'll",
-    "whatre": "what're",
-    "whats": "what's",
-    "whatve": "what've",
-    "whens": "when's",
-    "whered": "where'd",
-    "wheres": "where's",
-    "whereve": "where've",
-    "whod": "who'd",
-    "whod've": "who'd've",
-    "who'dve": "who'd've",
-    "wholl": "who'll",
-    "whos": "who's",
-    "whove": "who've",
-    "whyll": "why'll",
-    "whyre": "why're",
-    "whys": "why's",
-    "wont": "won't",
-    "wouldve": "would've",
-    "wouldnt": "wouldn't",
-    "wouldnt've": "wouldn't've",
-    "wouldn'tve": "wouldn't've",
-    "yall": "y'all",
-    "yall'll": "y'all'll",
-    "y'allll": "y'all'll",
-    "yall'd've": "y'all'd've",
-    "y'alld've": "y'all'd've",
-    "y'all'dve": "y'all'd've",
-    "youd": "you'd",
-    "youd've": "you'd've",
-    "you'dve": "you'd've",
-    "youll": "you'll",
-    "youre": "you're",
-    "youve": "you've",
-    "1st": "first",
-    "2nd": "second",
-    "3rd": "third",
-}
-NUMBERS = {
-    "none": "0",
-    "zero": "0",
-    "one": "1",
-    "two": "2",
-    "three": "3",
-    "four": "4",
-    "five": "5",
-    "six": "6",
-    "seven": "7",
-    "eight": "8",
-    "nine": "9",
-    "ten": "10",
-}
-ARTICLES = [
-    "a",
-    "an",
-    "the",
-    "to",
-    "in",
-    "from",
-    "by",
-]  # Contains a bit more than just articles, but we want to get rid of these elements influencing the accuracy
-PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
-COMMA_STRIP = re.compile(r"(\d)(\,)(\d)")
-PUNCTUATION = [
-    ";",
-    r"/",
-    "[",
-    "]",
-    '"',
-    "{",
-    "}",
-    "(",
-    ")",
-    "=",
-    "+",
-    "\\",
-    "_",
-    "-",
-    ">",
-    "<",
-    "@",
-    "`",
-    ",",
-    "?",
-    "!",
-]
-
-
-def normalize_answer(s: str) -> str:
-    """Normalize a DocVQA answer by removing punctuation, articles, and extra whitespace.
-
-    Args:
-        s: raw answer string
-
-    Returns:
-        Normalized answer string
-    """
-    # process punctuation
-    for p in PUNCTUATION:
-        if (p + " " in s or " " + p in s) or (re.search(COMMA_STRIP, s) is not None):
-            s = s.replace(p, "")
-        else:
-            s = s.replace(p, " ")
-        s = PERIOD_STRIP.sub("", s, re.UNICODE)
-
-    # process digits and articles
-    temp_text = s.lower().split()
-    out_text = []
-    for word in temp_text:
-        word = NUMBERS.setdefault(word, word)
-        if word not in ARTICLES:
-            out_text.append(word)
-
-    # standardize contractions
-    for word_id, word in enumerate(out_text):
-        if word in CONTRACTIONS:
-            out_text[word_id] = CONTRACTIONS[word]
-    return " ".join(out_text)
-
-
-class DocVQAScoringFn(RegisteredBaseScoringFn):
-    """
-    docvqa basically matches the generated answer against several allowed
-    choices, but we need to normalize the answer to avoid penalizing
-    trivial differences
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            docvqa.identifier: docvqa,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = "docvqa",
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        expected_answers = json.loads(input_row["expected_answer"])
-        generated_answer = input_row["generated_answer"]
-        score = 1.0 if normalize_answer(generated_answer) in [normalize_answer(s) for s in expected_answers] else 0.0
-        return {
-            "score": score,
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py
deleted file mode 100644
index 2e79240bec..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringResultRow
-
-from .fn_defs.equality import equality
-
-
-class EqualityScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn that assigns a score of 1.0 if the input string matches the target string, and 0.0 otherwise.
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            equality.identifier: equality,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = "equality",
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        assert "expected_answer" in input_row, "Expected answer not found in input row."
-        assert "generated_answer" in input_row, "Generated answer not found in input row."
-
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-        score = 1.0 if expected_answer == generated_answer else 0.0
-        return {
-            "score": score,
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py
deleted file mode 100644
index a7305d13aa..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-docvqa = ScoringFn(
-    identifier="basic::docvqa",
-    description="DocVQA Visual Question & Answer scoring function",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="docvqa",
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py
deleted file mode 100644
index f7d2f32ae3..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-equality = ScoringFn(
-    identifier="basic::equality",
-    description="Returns 1.0 if the input is equal to the target, 0.0 otherwise.",
-    provider_id="basic",
-    provider_resource_id="equality",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py
deleted file mode 100644
index a2ed1d695d..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-ifeval = ScoringFn(
-    identifier="basic::ifeval",
-    description="Eval intruction follow capacity by checkping how many instructions can be followed in each example",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="ifeval",
-    params=BasicScoringFnParams(
-        aggregation_functions=[AggregationFunctionType.weighted_average],
-    ),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py
deleted file mode 100644
index 4e2b49a1fd..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    NumberType,
-    RegexParserScoringFnParams,
-    ScoringFn,
-)
-
-MATH_ANSWER_REGEXES = [r".*final answer is:?\s*\$\\boxed{(?P<X>.*)}\$"]
-
-
-regex_parser_math_response = ScoringFn(
-    identifier="basic::regex_parser_math_response",
-    description="For math related benchmarks, extract answer from the generated response and expected_answer and see if they match",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="regex-parser-math-response",
-    params=RegexParserScoringFnParams(
-        parsing_regexes=MATH_ANSWER_REGEXES,
-        aggregation_functions=[AggregationFunctionType.accuracy],
-    ),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
deleted file mode 100644
index df0cf52d9d..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    NumberType,
-    RegexParserScoringFnParams,
-    ScoringFn,
-)
-
-MULTILINGUAL_ANSWER_REGEXES = [
-    r"The best answer is ",
-    r"Answer\s*:",
-    r"Answer\s*:​​​​​​",  # Korean invisible character
-    r"উত্তর\s*:",
-    r"उत्तर\s*:",
-    r"উত্তরঃ",
-    r"উত্তর\s*:",
-    r"Antwort\s*:",
-    r"답변\s*:",
-    r"정답\s*:",
-    r"답\s*:",
-    r"答案\s*：",
-    r"答案\s*:",
-    r"答\s*：",
-    r"答\s*:",
-    r"答复\s*：",
-    r"答曰\s*：",
-    r"الإجابة:",
-    r"الجواب:",
-    r"إجابة:",
-    r"الإجابة النهائية:",
-    r"الإجابة الصحيحة:",
-    r"الإجابة الصحيحة هي:",
-    r"الإجابة هي:",
-    r"Respuesta\s*:",
-    r"Risposta\s*:",
-    r"答え\s*:",
-    r"答え\s*：",
-    r"回答\s*:",
-    r"回答\s*：",
-    r"解答\s*:",
-    r"Jawaban\s*:",
-    r"Réponse\s*:",
-    r"Resposta\s*:",
-    r"Jibu\s*:",
-    r"Idahun\s*:",
-    r"Ìdáhùn\s*:",
-    r"Idáhùn\s*:",
-    r"Àmọ̀nà\s*:",
-    r"Àdáhùn\s*:",
-    r"Ànúgọ\s*:",
-    r"Àṣàyàn\s*:",
-]
-
-MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = r"(?i){}\s*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[Ａ]|[Ｂ]|[Ｃ]|[Ｄ])"
-
-regex_parser_multiple_choice_answer = ScoringFn(
-    identifier="basic::regex_parser_multiple_choice_answer",
-    description="Extract answer from response matching Answer: [the_answer_letter], and compare with expected result",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="regex-parser-multiple-choice-answer",
-    params=RegexParserScoringFnParams(
-        parsing_regexes=[MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(x) for x in MULTILINGUAL_ANSWER_REGEXES],
-        aggregation_functions=[AggregationFunctionType.accuracy],
-    ),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py
deleted file mode 100644
index 1f143c4a62..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-subset_of = ScoringFn(
-    identifier="basic::subset_of",
-    description="Returns 1.0 if the expected is included in generated, 0.0 otherwise.",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="subset-of",
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py
deleted file mode 100644
index 33b1c5a312..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringResultRow
-
-from .fn_defs.ifeval import (
-    ifeval,
-)
-
-
-class IfEvalScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn Instruction-Following Eval (IFEval) benchmark
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            ifeval.identifier: ifeval,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        from ..utils.ifeval_utils import INSTRUCTION_DICT, INSTRUCTION_LIST
-
-        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
-        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
-        if scoring_params is not None:
-            fn_def.params = scoring_params
-
-        instruction_list = input_row["instruction_id_list"]
-        generated_answer = input_row["generated_answer"].strip()
-
-        is_following_list = []
-        results = dict(
-            {k + "_correct": 0.0 for k in INSTRUCTION_LIST},
-            **{k + "_total": 0.0 for k in INSTRUCTION_LIST},
-        )
-
-        for index, instruction_id in enumerate(instruction_list):
-            instruction_cls = INSTRUCTION_DICT[instruction_id]
-            instruction = instruction_cls(instruction_id)
-            results[instruction_id + "_total"] += 1.0
-            results[instruction_id.split(":")[0] + "_total"] += 1.0
-
-            clean_input_row = {k: v for k, v in input_row["kwargs"][index].items() if v is not None}
-            print(clean_input_row)
-            instruction.build_description(**clean_input_row)
-            args = instruction.get_instruction_args()
-            if args and "prompt" in args:
-                instruction.build_description(prompt=input_row["prompt"])
-
-            if generated_answer and instruction.check_following(generated_answer):
-                is_following_list.append(True)
-                results[instruction_id + "_correct"] += 1.0
-                results[instruction_id.split(":")[0] + "_correct"] += 1.0
-            else:
-                is_following_list.append(False)
-
-        if len(is_following_list) == 0:
-            return {
-                "score": 0.0,
-                "weight": 0.0,
-            }
-
-        return {
-            "score": float(sum(is_following_list)) / float(len(is_following_list)),
-            "weight": float(len(is_following_list)),
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py
deleted file mode 100644
index 1f4f2f9794..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringFnParamsType, ScoringResultRow
-
-from ..utils.math_utils import first_answer, normalize_final_answer, try_evaluate_frac, try_evaluate_latex
-from .fn_defs.regex_parser_math_response import (
-    regex_parser_math_response,
-)
-
-
-class RegexParserMathResponseScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn for math benchamrks that parses answer from generated response according to context and check match with expected_answer.
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            regex_parser_math_response.identifier: regex_parser_math_response,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
-        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
-        if scoring_params is not None:
-            fn_def.params = scoring_params
-
-        assert fn_def.params is not None and fn_def.params.type == ScoringFnParamsType.regex_parser.value, (
-            f"RegexParserScoringFnParams not found for {fn_def}."
-        )
-
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-
-        parsing_regexes = fn_def.params.parsing_regexes
-        assert len(parsing_regexes) == 1, (
-            "Only one parsing regex is supported for regex_parser_math_response scoring function."
-        )
-        parsing_regexes = fn_def.params.parsing_regexes[0]
-
-        normalized_generated_answer = normalize_final_answer(
-            first_answer(generated_answer),
-            parsing_regexes,
-            match_first=True,
-        )
-        normalized_generated_answer = try_evaluate_frac(try_evaluate_latex(normalized_generated_answer))
-
-        normalized_expected_answer = normalize_final_answer(expected_answer, r".*")
-        normalized_expected_answer = try_evaluate_frac(try_evaluate_latex(normalized_expected_answer))
-
-        score = 1.0 if normalized_generated_answer == normalized_expected_answer else 0.0
-        return {
-            "score": score,
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py
deleted file mode 100644
index 1cc74f8746..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import re
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringFnParamsType, ScoringResultRow
-
-from .fn_defs.regex_parser_multiple_choice_answer import (
-    regex_parser_multiple_choice_answer,
-)
-
-
-class RegexParserScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn that parses answer from generated response according to context and check match with expected_answer.
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            regex_parser_multiple_choice_answer.identifier: regex_parser_multiple_choice_answer,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
-        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
-        if scoring_params is not None:
-            fn_def.params = scoring_params
-
-        assert fn_def.params is not None and fn_def.params.type == ScoringFnParamsType.regex_parser.value, (
-            f"RegexParserScoringFnParams not found for {fn_def}."
-        )
-
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-
-        # parse answer according to regex
-        parsed_answer = None
-        for regex in fn_def.params.parsing_regexes:
-            match = re.search(regex, generated_answer)
-            if match:
-                parsed_answer = match.group(1)
-                break
-
-        score = 1.0 if parsed_answer and parsed_answer == expected_answer else 0.0
-        return {
-            "score": score,
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py
deleted file mode 100644
index fe15a4972d..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringResultRow
-
-from .fn_defs.subset_of import subset_of
-
-
-class SubsetOfScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn that assigns a score of 1.0 if the expected string is included in the generated string, and 0.0 otherwise.
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            subset_of.identifier: subset_of,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = "subset_of",
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-        score = 1.0 if expected_answer in generated_answer else 0.0
-        return {
-            "score": score,
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/utils/__init__.py b/src/llama_stack/providers/inline/scoring/basic/utils/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/utils/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_checkers_core.py b/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_checkers_core.py
deleted file mode 100644
index 4ab79170f1..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_checkers_core.py
+++ /dev/null
@@ -1,819 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import random
-import re
-
-import langdetect
-from pythainlp.tokenize import sent_tokenize as sent_tokenize_thai
-from pythainlp.tokenize import word_tokenize as word_tokenize_thai
-
-from llama_stack.log import get_logger
-
-logger = get_logger(name=__name__, category="scoring")
-
-from llama_stack.providers.inline.scoring.basic.utils.ifeval_support import (
-    _COMPARISON_RELATION,
-    _CONSTRAINED_RESPONSE_OPTIONS,
-    _KEYWORD_FREQUENCY,
-    _LANGUAGES,
-    _MAX_NUM_SENTENCES,
-    _NUM_BULLETS,
-    _NUM_HIGHLIGHTED_SECTIONS,
-    _NUM_KEYWORDS,
-    _NUM_PARAGRAPHS,
-    _NUM_PLACEHOLDERS,
-    _NUM_SECTIONS,
-    _NUM_WORDS_LOWER_LIMIT,
-    _NUM_WORDS_UPPER_LIMIT,
-    _POSTSCRIPT_MARKER,
-    _SECTION_SPLITER,
-    _STARTER_OPTIONS,
-    count_sentences,
-    count_words,
-    count_words_cjk,
-    generate_keywords,
-    get_langid,
-    split_chinese_japanese_hindi,
-)
-
-
-class Instruction:
-    """An instruction template."""
-
-    def __init__(self, instruction_id):
-        self.id = instruction_id
-
-    def build_description(self, **kwargs):
-        raise NotImplementedError("`build_description` not implemented.")
-
-    def get_instruction_args(self):
-        raise NotImplementedError("`get_instruction_args` not implemented.")
-
-    def get_instruction_args_keys(self):
-        raise NotImplementedError("`get_instruction_args_keys` not implemented.")
-
-    def check_following(self, value):
-        raise NotImplementedError("`check_following` not implemented.")
-
-
-class ResponseLanguageChecker(Instruction):
-    """Check the language of the entire response."""
-
-    def build_description(self, *, language=None):
-        """Build the instruction description.
-
-        Args:
-          language: A string representing the expected language of the response. The
-            language has to comply to the 97 types defined in
-            `langid.py` (https://pypi.org/project/langid/1.1.5/), which follows
-            ISO 639-1 codes (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes);
-            for example, `en` for English, `zh` for Chinese, `fr` for French.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._language = language
-        if self._language is None:
-            self._language = random.choice(list(_LANGUAGES.keys()))
-
-        self._description_pattern = (
-            "Your ENTIRE response should be in {language} language, no other " + "language is allowed."
-        )
-        return self._description_pattern.format(language=_LANGUAGES[self._language])
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"language": self._language}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["language"]
-
-    def check_following(self, value):
-        """Check if the language of the entire response follows the instruction.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if the language of `value` follows instruction; otherwise False.
-        """
-        assert isinstance(value, str)
-
-        try:
-            return langdetect.detect(value) == self._language
-        except langdetect.LangDetectException as e:
-            # Count as instruction is followed.
-            logger.info("Unable to detect language", text=value, error=str(e))
-            return True
-
-
-class NumberOfSentences(Instruction):
-    """Check the number of sentences."""
-
-    def build_description(self, *, num_sentences=None, relation=None):
-        """Build the instruction description.
-
-        Args:
-          num_sentences: An integer specifying the number of sentences as a
-            threshold.
-          relation: A string in (`less than`, `at least`), defining the relational
-            operator for comparison.
-            Two relational comparisons are supported for now:
-            if 'less than', the actual number of sentences < the threshold;
-            if 'at least', the actual number of sentences >= the threshold.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        # The number of sentences as a threshold for comparison.
-        self._num_sentences_threshold = num_sentences
-        if self._num_sentences_threshold is None or self._num_sentences_threshold < 0:
-            self._num_sentences_threshold = random.randint(1, _MAX_NUM_SENTENCES)
-
-        if relation is None:
-            self._comparison_relation = random.choice(_COMPARISON_RELATION)
-        elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
-            )
-        else:
-            self._comparison_relation = relation
-
-        self._description_pattern = "Your response should contain {relation} {num_sentences} sentences."
-        return self._description_pattern.format(
-            relation=self._comparison_relation,
-            num_sentences=self._num_sentences_threshold,
-        )
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "num_sentences": self._num_sentences_threshold,
-            "relation": self._comparison_relation,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_sentences", "relation"]
-
-    def check_following(self, value):
-        """Check if the number of sentences follows the instruction.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if the response follows the instruction.
-
-        Raise:
-            ValueError if the string in `instruction_args` is not in
-            [`less_than`, `at_least`].
-        """
-        lang = get_langid(value)
-        if lang == "th":
-            # Counting Newline also as a new sentence:
-            num_sentences = sum([len(sent_tokenize_thai(line)) for line in value.splitlines()])
-        elif lang in ["zh", "zh-cn", "zh-tw", "ja", "hi"]:
-            num_sentences = len(list(split_chinese_japanese_hindi(value)))
-        else:
-            num_sentences = count_sentences(value)
-        if self._comparison_relation == _COMPARISON_RELATION[0]:
-            return num_sentences < self._num_sentences_threshold
-        elif self._comparison_relation == _COMPARISON_RELATION[1]:
-            return num_sentences >= self._num_sentences_threshold
-
-
-class PlaceholderChecker(Instruction):
-    """Check the placeholders in template writing."""
-
-    def build_description(self, *, num_placeholders=None):
-        """Build the instruction description.
-
-        Args:
-          num_placeholders: An integer denoting the minimum number of
-            placeholders required in the response.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._num_placeholders = num_placeholders
-        if self._num_placeholders is None or self._num_placeholders < 0:
-            self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
-        self._description_pattern = (
-            "The response must contain at least {num_placeholders} placeholders "
-            + "represented by square brackets, such as [address]."
-        )
-        return self._description_pattern.format(num_placeholders=self._num_placeholders)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"num_placeholders": self._num_placeholders}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_placeholders"]
-
-    def check_following(self, value):
-        """Check if the number of placeholders follows the instruction.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if the actual number of placeholders in the response is greater than
-          or equal to `num_placeholders`; otherwise, False.
-        """
-        placeholders = re.findall(r"\[.*?\]", value)
-        num_placeholders = len(placeholders)
-        return num_placeholders >= self._num_placeholders
-
-
-class BulletListChecker(Instruction):
-    """Checks the bullet list in the prompt."""
-
-    def build_description(self, *, num_bullets=None):
-        """Build the instruction description.
-
-        Args:
-          num_bullets: An integer specifying the exact number of bullet lists
-            that is required to appear in the response.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._num_bullets = num_bullets
-        if self._num_bullets is None or self._num_bullets < 0:
-            self._num_bullets = random.randint(1, _NUM_BULLETS)
-        self._description_pattern = (
-            "Your answer must contain exactly {num_bullets} bullet points. "
-            + "Use the markdown bullet points such as:\n"
-            + "* This is point 1. \n"
-            + "* This is point 2"
-        )
-        return self._description_pattern.format(num_bullets=self._num_bullets)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"num_bullets": self._num_bullets}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_bullets"]
-
-    def check_following(self, value):
-        r"""Check if the number of bullet lists meets the requirement.
-
-        Args:
-          value: A string representing the response. The response is expected to
-            contain some bullet lists that start with `\*`.
-
-        Returns:
-          True if the actual number of bullet lists in the response meets the
-          requirement.
-        """
-        bullet_lists = re.findall(r"^\s*\*[^\*].*$", value, flags=re.MULTILINE)
-        bullet_lists_2 = re.findall(r"^\s*-.*$", value, flags=re.MULTILINE)
-        num_bullet_lists = len(bullet_lists) + len(bullet_lists_2)
-        return num_bullet_lists == self._num_bullets
-
-
-class ConstrainedResponseChecker(Instruction):
-    """Checks the constrained response."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        # A sequence of string(s) representing the options of the expected response.
-        self._constrained_responses = _CONSTRAINED_RESPONSE_OPTIONS
-        self._description_pattern = "Answer with one of the following options: {response_options}"
-        return self._description_pattern.format(response_options=self._constrained_responses)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks if the response matches the constrained options.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if the actual response contains one of the options in the constrained
-          responses; otherwise False.
-        """
-        value = value.strip()
-        for constrained_response in self._constrained_responses:
-            if constrained_response in value:
-                return True
-        return False
-
-
-class ConstrainedStartChecker(Instruction):
-    """Checks the response start."""
-
-    def build_description(self, *, starter=None):
-        """Build the instruction description.
-
-        Args:
-          starter: A string representing the keyward that the response should start
-            with.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._starter = starter.strip() if isinstance(starter, str) else starter
-        if self._starter is None:
-            self._starter = random.choice(_STARTER_OPTIONS)
-        self._description_pattern = (
-            "During the conversation, when it is your turn, " + "please always start with {starter}"
-        )
-        return self._description_pattern.format(starter=self._starter)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"starter": self._starter}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["starter"]
-
-    def check_following(self, value):
-        """Checks if the response starts with the constrained keyword or phrase.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if the response starts with the given phrase or keyword that is
-          contained in `instruction_args`; otherwise, False.
-        """
-        response_pattern = r"^\s*" + self._starter + r".*$"
-        response_with_constrained_start = re.search(response_pattern, value, flags=re.MULTILINE)
-        return True if response_with_constrained_start else False
-
-
-class HighlightSectionChecker(Instruction):
-    """Checks the highlighted section."""
-
-    def build_description(self, *, num_highlights=None):
-        """Build the instruction description.
-
-        Args:
-          num_highlights: An integer specifying the minimum number of highlighted
-            sections.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._num_highlights = num_highlights
-        if self._num_highlights is None or self._num_highlights < 0:
-            self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)
-
-        self._description_pattern = (
-            "Highlight at least {num_highlights} sections in your answer with "
-            + "markdown, i.e. *highlighted section*."
-        )
-
-        return self._description_pattern.format(num_highlights=self._num_highlights)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"num_highlights": self._num_highlights}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_highlights"]
-
-    def check_following(self, value):
-        """Checks if the number of highlighted sections meets the requirement.
-
-        Args:
-          value: a string repesenting the response. The response is expected to
-            contain highlighted sections in the format of *highlighted*.
-
-        Returns:
-          True if the actual number of highlighted sections in the format of
-          *highlighed sections* meets the minimum requirement; otherwise False.
-        """
-        num_highlights = 0
-        highlights = re.findall(r"\*[^\n\*]*\*", value)
-        double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", value)
-        for highlight in highlights:
-            if highlight.strip("*").strip():
-                num_highlights += 1
-        for highlight in double_highlights:
-            if highlight.removeprefix("**").removesuffix("**").strip():
-                num_highlights += 1
-
-        return num_highlights >= self._num_highlights
-
-
-class SectionChecker(Instruction):
-    """Checks the sections."""
-
-    def build_description(self, *, section_spliter=None, num_sections=None):
-        """Build the instruction description.
-
-        Args:
-          section_spliter: A string represents the section spliter keyword that
-            marks a new section, i.e., `Section` or `SECTION`.
-          num_sections: An integer specifying the number of sections.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._section_spliter = section_spliter.strip() if isinstance(section_spliter, str) else section_spliter
-        if self._section_spliter is None:
-            self._section_spliter = random.choice(_SECTION_SPLITER)
-
-        self._num_sections = num_sections
-        if self._num_sections is None or self._num_sections < 0:
-            self._num_sections = random.randint(1, _NUM_SECTIONS)
-
-        self._description_pattern = (
-            "Your response must have {num_sections} sections. Mark the beginning "
-            + "of each section with {section_spliter} X, such as:\n"
-            + "{section_spliter} 1\n"
-            + "[content of section 1]\n"
-            + "{section_spliter} 2\n"
-            + "[content of section 2]"
-        )
-
-        return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "section_spliter": self._section_spliter,
-            "num_sections": self._num_sections,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["section_spliter", "num_sections"]
-
-    def check_following(self, value):
-        """Checks the response contains multiple sections.
-
-        Args:
-          value: A string representing the response. The response is expected
-            to contain multiple sections (number of sections is greater than 1).
-            A new section starts with `Section 1`, where the number denotes the
-            section index.
-
-        Returns:
-          True if the number of sections in the response is greater than or equal to
-          the minimum number of sections; otherwise, False.
-        """
-        section_splitter_patten = r"\s?" + self._section_spliter + r"\s?\d+\s?"
-        sections = re.split(section_splitter_patten, value)
-        num_sections = len(sections) - 1
-        return num_sections >= self._num_sections
-
-
-class ParagraphChecker(Instruction):
-    """Checks the paragraphs."""
-
-    def build_description(self, *, num_paragraphs=None):
-        """Build the instruction description.
-
-        Args:
-          num_paragraphs: An integer specifying the number of paragraphs.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._num_paragraphs = num_paragraphs
-        if self._num_paragraphs is None or self._num_paragraphs < 0:
-            self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
-
-        self._description_pattern = (
-            "There should be {num_paragraphs} paragraphs. " + "Paragraphs are separated with the markdown divider: ***"
-        )
-
-        return self._description_pattern.format(num_paragraphs=self._num_paragraphs)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"num_paragraphs": self._num_paragraphs}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_paragraphs"]
-
-    def check_following(self, value):
-        """Checks the response contains required number of paragraphs.
-
-        Args:
-          value: A string representing the response. The response may contain
-            paragraphs that are separated by the markdown divider: `***`.
-
-        Returns:
-          True if the actual number of paragraphs is the same as required;
-          otherwise, False.
-        """
-        paragraphs = re.split(r"\s?\*\*\*\s?", value)
-        num_paragraphs = len(paragraphs)
-
-        for index, paragraph in enumerate(paragraphs):
-            if not paragraph.strip():
-                if index == 0 or index == len(paragraphs) - 1:
-                    num_paragraphs -= 1
-                else:
-                    return False
-
-        return num_paragraphs == self._num_paragraphs
-
-
-class PostscriptChecker(Instruction):
-    """Checks the postscript."""
-
-    def build_description(self, *, postscript_marker=None):
-        """Build the instruction description.
-
-        Args:
-          postscript_marker: A string containing the keyword that marks the start
-            of the postscript section.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._postscript_marker = postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
-        if self._postscript_marker is None:
-            self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)
-
-        self._description_pattern = (
-            "At the end of your response, please explicitly add a postscript " + "starting with {postscript}"
-        )
-
-        return self._description_pattern.format(postscript=self._postscript_marker)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"postscript_marker": self._postscript_marker}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["postscript_marker"]
-
-    def check_following(self, value):
-        """Checks if the response follows the postscript format.
-
-        Args:
-          value: a string representing the response. The response is expected to
-            contain a postscript section.
-
-        Returns:
-          True if the response contains a postscript section starting with
-          the keyword containing in the `instruction_args`; otherwise False.
-        """
-        value = value.lower()
-        if self._postscript_marker == "P.P.S":
-            postscript_pattern = r"\s*p\.\s?p\.\s?s.*$"
-        elif self._postscript_marker == "P.S.":
-            postscript_pattern = r"\s*p\.\s?s\..*$"
-        else:
-            postscript_pattern = r"\s*" + self._postscript_marker.lower() + r".*$"
-        postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE)
-        return True if postscript else False
-
-
-class RephraseChecker(Instruction):
-    """Checks the repharse."""
-
-    def build_description(self, *, original_message):
-        """Build the instruction description.
-
-        Args:
-          original_message: A string representing the original message. The
-            rephrased response should only change its words/sentences in between
-            its two asterisks, for example, *change me*. Both original and rephrased
-            messages should contain the changes in the form of *change me*.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        if not self.is_change(original_message):
-            raise ValueError(f"Message {original_message} does not contain changes in the form of *change me*.")
-
-        self._reference_without_change = original_message
-        self._description = (
-            "Rephrasing: Your rephrased response should only"
-            + "change the words/sentences in between two asterisks"
-            + "such as *change me*."
-        )
-        return self._description
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"original_message": self._reference_without_change}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["original_message"]
-
-    def check_following(self, value):
-        r"""Checks if the rephrasing follows the instruction.
-
-        Args:
-          value: A string representing the response, which is expected to rephras
-            the string of `instruction_args`.
-
-        Returns:
-          True if `value` and `instruction_args` only differ by the words/sentences
-          in between two asterisks such as *change me*; otherwise, False.
-        """
-
-        if not self.is_change(value):
-            raise ValueError(f"value {value} does not contain changes in the form of *change me*.")
-
-        response_without_changes = self.strip_changes(value)
-        reference_without_changes = self.strip_changes(self._reference_without_change)
-
-        return response_without_changes == reference_without_changes
-
-    def is_change(self, response):
-        """Check if there is change in the response in the form of *change me*."""
-        return re.search(r"\*.*\*", response)
-
-    def strip_changes(self, response):
-        """Strips off the changes."""
-        return re.sub(r"\*.*\*", "", response)
-
-
-class KeywordChecker(Instruction):
-    """Check the exisitence of certain keywords."""
-
-    def build_description(self, *, keywords=None):
-        """Build the instruction description.
-
-        Args:
-          keywords: A sequence of strings representing the keywords that are
-            expected in the response.
-
-        Returns:
-          A string representing the instruction description.
-        """
-
-        if not keywords:
-            self._keywords = generate_keywords(num_keywords=_NUM_KEYWORDS)
-        else:
-            self._keywords = keywords
-        self._keywords = sorted(self._keywords)
-
-        self._description_pattern = "Include keywords {keywords} in the response."
-
-        return self._description_pattern.format(keywords=self._keywords)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"keywords": self._keywords}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["keywords"]
-
-    def check_following(self, value):
-        """Check if the response contain the expected keywords."""
-        for keyword in self._keywords:
-            if not re.search(keyword, value, flags=re.IGNORECASE):
-                return False
-        return True
-
-
-class KeywordFrequencyChecker(Instruction):
-    """Check the keyword frequency."""
-
-    def build_description(self, *, keyword=None, frequency=None, relation=None):
-        """Build the instruction description.
-
-        Args:
-          keyword: A string representing a keyword that is expected in the response.
-          frequency: An integer specifying the number of times `keyword` is expected
-            to appear in the response.
-          relation: A string in (`less than`, `at least`), defining the relational
-            operator for comparison.
-            Two relational comparisons are supported for now:
-            if 'less than', the actual number of occurrences < frequency;
-            if 'at least', the actual number of occurrences >= frequency.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        if not keyword:
-            self._keyword = generate_keywords(num_keywords=1)[0]
-        else:
-            self._keyword = keyword.strip()
-
-        self._frequency = frequency
-        if self._frequency is None or self._frequency < 0:
-            self._frequency = random.randint(1, _KEYWORD_FREQUENCY)
-
-        if relation is None:
-            self._comparison_relation = random.choice(_COMPARISON_RELATION)
-        elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
-            )
-        else:
-            self._comparison_relation = relation
-
-        self._description_pattern = (
-            "In your response, the word {keyword} should appear {relation} " + "{frequency} times."
-        )
-
-        return self._description_pattern.format(
-            keyword=self._keyword,
-            relation=self._comparison_relation,
-            frequency=self._frequency,
-        )
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "keyword": self._keyword,
-            "frequency": self._frequency,
-            "relation": self._comparison_relation,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["keyword", "frequency", "relation"]
-
-    def check_following(self, value):
-        """Checks if the response contain the keyword with required frequency."""
-        actual_occurrences = len(re.findall(self._keyword, value, flags=re.IGNORECASE))
-
-        if self._comparison_relation == _COMPARISON_RELATION[0]:
-            return actual_occurrences < self._frequency
-        elif self._comparison_relation == _COMPARISON_RELATION[1]:
-            return actual_occurrences >= self._frequency
-
-
-class NumberOfWords(Instruction):
-    """Checks the number of words."""
-
-    def build_description(self, *, num_words=None, relation=None):
-        """Build the instruction description.
-
-        Args:
-          num_words: An integer specifying the number of words contained in the
-            response.
-          relation: A string in (`less than`, `at least`), defining the relational
-            operator for comparison.
-            Two relational comparisons are supported for now:
-            if 'less than', the actual number of words < num_words;
-            if 'at least', the actual number of words >= num_words.
-
-        Returns:
-          A string representing the instruction description.
-        """
-
-        self._num_words = num_words
-        if self._num_words is None or self._num_words < 0:
-            self._num_words = random.randint(_NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT)
-
-        if relation is None:
-            self._comparison_relation = random.choice(_COMPARISON_RELATION)
-        elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
-            )
-        else:
-            self._comparison_relation = relation
-
-        self._description_pattern = "Answer with {relation} {num_words} words."
-
-        return self._description_pattern.format(relation=self._comparison_relation, num_words=self._num_words)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"num_words": self._num_words, "relation": self._comparison_relation}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_words", "relation"]
-
-    def check_following(self, value):
-        """Checks if the response contains the expected number of words."""
-        lang = get_langid(value)
-        if lang == "th":
-            num_words = len(word_tokenize_thai(value))
-        elif lang in ["zh", "zh-cn", "zh-tw", "ja", "ko"]:
-            num_words = count_words_cjk(value)
-        else:
-            num_words = count_words(value)
-
-        if self._comparison_relation == _COMPARISON_RELATION[0]:
-            return num_words < self._num_words
-        elif self._comparison_relation == _COMPARISON_RELATION[1]:
-            return num_words >= self._num_words
diff --git a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_checkers_format.py b/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_checkers_format.py
deleted file mode 100644
index 142b909811..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_checkers_format.py
+++ /dev/null
@@ -1,698 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import collections
-import json
-import random
-import re
-import string
-
-import langdetect
-import nltk
-
-from llama_stack.log import get_logger
-
-logger = get_logger(name=__name__, category="scoring")
-
-from llama_stack.providers.inline.scoring.basic.utils.ifeval_checkers_core import Instruction
-from llama_stack.providers.inline.scoring.basic.utils.ifeval_support import (
-    _ALL_CAPITAL_WORD_FREQUENCY,
-    _COMPARISON_RELATION,
-    _ENDING_OPTIONS,
-    _LETTER_FREQUENCY,
-    _NUM_KEYWORDS,
-    _NUM_PARAGRAPHS,
-    generate_keywords,
-    get_langid,
-    split_into_sentences,
-)
-
-
-class JsonFormat(Instruction):
-    """Check the Json format."""
-
-    def build_description(self):
-        self._description_pattern = (
-            "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```."
-        )
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        value = (
-            value.strip()
-            .removeprefix("```json")
-            .removeprefix("```Json")
-            .removeprefix("```JSON")
-            .removeprefix("```")
-            .removesuffix("```")
-            .strip()
-        )
-        try:
-            json.loads(value)
-        except ValueError as _:
-            return False
-        return True
-
-
-class ParagraphFirstWordCheck(Instruction):
-    """Check the paragraph and the first word of the nth paragraph."""
-
-    def build_description(self, num_paragraphs=None, nth_paragraph=None, first_word=None):
-        r"""Build the instruction description.
-
-        Args:
-          num_paragraphs: An integer indicating the number of paragraphs expected
-            in the response. A paragraph is a subset of the string that is
-            expected to be separated by '\n\n'.
-          nth_paragraph: An integer indicating the paragraph number that we look at.
-            Note that n starts from 1.
-          first_word: A string that represent the first word of the bth paragraph.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._num_paragraphs = num_paragraphs
-        if self._num_paragraphs is None or self._num_paragraphs < 0:
-            self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
-
-        self._nth_paragraph = nth_paragraph
-        if self._nth_paragraph is None or self._nth_paragraph <= 0 or self._nth_paragraph > self._num_paragraphs:
-            self._nth_paragraph = random.randint(1, self._num_paragraphs + 1)
-
-        self._first_word = first_word
-        if self._first_word is None:
-            self._first_word = generate_keywords(num_keywords=1)[0]
-        self._first_word = self._first_word.lower()
-
-        self._description_pattern = (
-            "There should be {num_paragraphs} paragraphs. "
-            + "Paragraphs and only paragraphs are separated with each other by two "
-            + "new lines as if it was '\\n\\n' in python. "
-            + "Paragraph {nth_paragraph} must start with word {first_word}."
-        )
-
-        return self._description_pattern.format(
-            num_paragraphs=self._num_paragraphs,
-            nth_paragraph=self._nth_paragraph,
-            first_word=self._first_word,
-        )
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "num_paragraphs": self._num_paragraphs,
-            "nth_paragraph": self._nth_paragraph,
-            "first_word": self._first_word,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_paragraphs", "nth_paragraph", "first_word"]
-
-    def check_following(self, value):
-        """Checks for required number of paragraphs and correct first word.
-
-        Args:
-          value: a string representing the response. The response may contain
-            paragraphs that are separated by two new lines and the first word of
-            the nth paragraph will have to match a specified word.
-
-        Returns:
-          True if the number of paragraphs is the same as required and the first
-          word of the specified paragraph is the same as required. Otherwise, false.
-        """
-
-        paragraphs = re.split(r"\n\n", value)
-        num_paragraphs = len(paragraphs)
-
-        for paragraph in paragraphs:
-            if not paragraph.strip():
-                num_paragraphs -= 1
-
-        # check that index doesn't go out of bounds
-        if self._nth_paragraph <= num_paragraphs:
-            paragraph = paragraphs[self._nth_paragraph - 1].strip()
-            if not paragraph:
-                return False
-        else:
-            return False
-
-        first_word = ""
-        punctuation = {".", ",", "?", "!", "'", '"'}
-
-        # get first word and remove punctuation
-        word = paragraph.split()[0].strip()
-        word = word.lstrip("'")
-        word = word.lstrip('"')
-
-        for letter in word:
-            if letter in punctuation:
-                break
-            first_word += letter.lower()
-
-        return num_paragraphs == self._num_paragraphs and first_word == self._first_word
-
-
-class KeySentenceChecker(Instruction):
-    """Check the existence of certain key sentences."""
-
-    def build_description(self, key_sentences=None, num_sentences=None):
-        """Build the instruction description.
-
-        Args:
-          key_sentences: A sequences of strings representing the key sentences that
-            are expected in the response.
-          num_sentences: The number of key sentences that are expected to be seen in
-            the response.
-
-        Returns:
-          A string representing the instruction description.
-        """
-
-        if not key_sentences:
-            self._key_sentences = {["For now, this is fine."]}
-        else:
-            self._key_sentences = key_sentences
-
-        if not num_sentences:
-            self._num_sentences = random.randint(1, len(self._key_sentences))
-        else:
-            self._num_sentences = num_sentences
-
-        self._description_pattern = "Include {num_sentences} of the following sentences {key_sentences}"
-
-        return self._description_pattern.format(num_sentences=self._num_sentences, key_sentences=self._key_sentences)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "num_sentences": self._num_sentences,
-            "key_sentences": list(self._key_sentences),
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_sentences", "key_sentences"]
-
-    def check_following(self, value):
-        """Checks if the response contains the expected key sentences."""
-        count = 0
-        sentences = split_into_sentences(value)
-        for sentence in self._key_sentences:
-            if sentence in sentences:
-                count += 1
-
-        return count == self._num_sentences
-
-
-class ForbiddenWords(Instruction):
-    """Checks that specified words are not used in response."""
-
-    def build_description(self, forbidden_words=None):
-        """Build the instruction description.
-
-        Args:
-          forbidden_words: A sequences of strings respresenting words that are not
-            allowed in the response.
-
-        Returns:
-          A string representing the instruction description.
-        """
-
-        if not forbidden_words:
-            self._forbidden_words = generate_keywords(num_keywords=_NUM_KEYWORDS)
-        else:
-            self._forbidden_words = list(set(forbidden_words))
-        self._forbidden_words = sorted(self._forbidden_words)
-        self._description_pattern = "Do not include keywords {forbidden_words} in the response."
-
-        return self._description_pattern.format(forbidden_words=self._forbidden_words)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"forbidden_words": self._forbidden_words}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["forbidden_words"]
-
-    def check_following(self, value):
-        """Check if the response does not contain the expected keywords."""
-        for word in self._forbidden_words:
-            if re.search(r"\b" + word + r"\b", value, flags=re.IGNORECASE):
-                return False
-        return True
-
-
-class RephraseParagraph(Instruction):
-    """Checks that the paragraph is rephrased."""
-
-    def build_description(self, *, original_paragraph, low, high):
-        """Builds the instruction description.
-
-        Args:
-          original_paragraph: A string presenting the original paragraph. The
-            rephrases response should have betweeb low-high words in common.
-          low: An integer presenting the lower bound of similar words.
-          high: An integer representing the upper bound of similar words.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._original_paragraph = original_paragraph
-        self._low = low
-        self._high = high
-
-        self._description = (
-            "Rephrase the following paragraph: "
-            + "{original_paragraph}\nYour response should have "
-            + "between {low} and {high} of the same words. "
-            + "Words are the same if and only if all of the "
-            + "letters, ignoring cases, are the same. For "
-            + "example, 'run' is the same as 'Run' but different "
-            + "to 'ran'."
-        )
-
-        return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "original_paragraph": self._original_paragraph,
-            "low": self._low,
-            "high": self._high,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["original_paragraph", "low", "high"]
-
-    def check_following(self, value):
-        val_words = re.findall(r"\w+", value.lower())
-        original_words = re.findall(r"\w+", self._original_paragraph.lower())
-        similar_words = 0
-
-        dict_val = collections.Counter(val_words)
-        dict_original = collections.Counter(original_words)
-
-        for word in dict_original:
-            similar_words += min(dict_original[word], dict_val[word])
-
-        return similar_words >= self._low and similar_words <= self._high
-
-
-class TwoResponsesChecker(Instruction):
-    """Check that two responses were given."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        self._description_pattern = (
-            "Give two different responses. Responses and only responses should"
-            " be separated by 6 asterisk symbols: ******."
-        )
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks if the response has two different answers.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if two responses are detected and false otherwise.
-        """
-        valid_responses = list()
-        responses = value.split("******")
-        for index, response in enumerate(responses):
-            if not response.strip():
-                if index != 0 and index != len(responses) - 1:
-                    return False
-            else:
-                valid_responses.append(response)
-        return len(valid_responses) == 2 and valid_responses[0].strip() != valid_responses[1].strip()
-
-
-class RepeatPromptThenAnswer(Instruction):
-    """Checks that Prompt is first repeated then answered."""
-
-    def build_description(self, *, prompt_to_repeat=None):
-        """Build the instruction description.
-
-        Args:
-          prompt_to_repeat: The prompt that is meant to be repeated.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        if not prompt_to_repeat:
-            raise ValueError("prompt_to_repeat must be set.")
-        else:
-            self._prompt_to_repeat = prompt_to_repeat
-        self._description_pattern = (
-            "First repeat the request word for word without change,"
-            " then give your answer (1. do not say any words or characters"
-            " before repeating the request; 2. the request you need to repeat"
-            " does not include this sentence)"
-        )
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        return {"prompt_to_repeat": self._prompt_to_repeat}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["prompt_to_repeat"]
-
-    def check_following(self, value):
-        if value.strip().lower().startswith(self._prompt_to_repeat.strip().lower()):
-            return True
-        return False
-
-
-class EndChecker(Instruction):
-    """Checks that the prompt ends with a given phrase."""
-
-    def build_description(self, *, end_phrase=None):
-        """Build the instruction description.
-
-        Args:
-          end_phrase: A string representing the phrase the response should end with.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._end_phrase = end_phrase.strip() if isinstance(end_phrase, str) else end_phrase
-        if self._end_phrase is None:
-            self._end_phrase = random.choice(_ENDING_OPTIONS)
-        self._description_pattern = (
-            "Finish your response with this exact phrase {ender}. No other words should follow this phrase."
-        )
-        return self._description_pattern.format(ender=self._end_phrase)
-
-    def get_instruction_args(self):
-        return {"end_phrase": self._end_phrase}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["end_phrase"]
-
-    def check_following(self, value):
-        """Checks if the response ends with the expected phrase."""
-        value = value.strip().strip('"').lower()
-        self._end_phrase = self._end_phrase.strip().lower()
-        return value.endswith(self._end_phrase)
-
-
-class TitleChecker(Instruction):
-    """Checks the response for a title."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        self._description_pattern = (
-            "Your answer must contain a title, wrapped in double angular brackets, such as <<poem of joy>>."
-        )
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks if the response contains a title."""
-        pattern = r"<<[^\n]+>>"
-        re_pattern = re.compile(pattern)
-        titles = re.findall(re_pattern, value)
-
-        for title in titles:
-            if title.lstrip("<").rstrip(">").strip():
-                return True
-        return False
-
-
-class LetterFrequencyChecker(Instruction):
-    """Checks letter frequency."""
-
-    def build_description(self, *, letter=None, let_frequency=None, let_relation=None):
-        """Build the instruction description.
-
-        Args:
-          letter: A string representing a letter that is expected in the response.
-          let_frequency: An integer specifying the number of times `keyword` is
-            expected to appear in the response.
-          let_relation: A string in (`less than`, `at least`), defining the
-            relational operator for comparison. Two relational comparisons are
-            supported for now; if 'less than', the actual number of
-            occurrences < frequency; if 'at least', the actual number of
-            occurrences >= frequency.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        if not letter or len(letter) > 1 or ord(letter.lower()) < 97 or ord(letter.lower()) > 122:
-            self._letter = random.choice(list(string.ascii_letters))
-        else:
-            self._letter = letter.strip()
-        self._letter = self._letter.lower()
-
-        self._frequency = let_frequency
-        if self._frequency is None or self._frequency < 0:
-            self._frequency = random.randint(1, _LETTER_FREQUENCY)
-
-        if let_relation is None:
-            self._comparison_relation = random.choice(_COMPARISON_RELATION)
-        elif let_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {let_relation} is given."
-            )
-        else:
-            self._comparison_relation = let_relation
-
-        self._description_pattern = (
-            "In your response, the letter {letter} should appear {let_relation} {let_frequency} times."
-        )
-
-        return self._description_pattern.format(
-            letter=self._letter,
-            let_frequency=self._frequency,
-            let_relation=self._comparison_relation,
-        )
-
-    def get_instruction_args(self):
-        """Returns the keyword args of build description."""
-        return {
-            "letter": self._letter,
-            "let_frequency": self._frequency,
-            "let_relation": self._comparison_relation,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["letter", "let_frequency", "let_relation"]
-
-    def check_following(self, value):
-        """Checks that the response contains the letter at the right frequency."""
-        value = value.lower()
-        letters = collections.Counter(value)
-
-        if self._comparison_relation == _COMPARISON_RELATION[0]:
-            return letters[self._letter] < self._frequency
-        else:
-            return letters[self._letter] >= self._frequency
-
-
-class CapitalLettersEnglishChecker(Instruction):
-    """Checks that the response is in english and is in all capital letters."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        self._description_pattern = "Your entire response should be in English, and in all capital letters."
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks that the response is in English and in all capital letters."""
-        assert isinstance(value, str)
-
-        try:
-            return value.isupper() and langdetect.detect(value) == "en"
-        except langdetect.LangDetectException as e:
-            # Count as instruction is followed.
-            logger.info("Unable to detect language", text=value, error=str(e))
-            return True
-
-
-class LowercaseLettersEnglishChecker(Instruction):
-    """Checks that the response is in english and is in all lowercase letters."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        self._description_pattern = (
-            "Your entire response should be in English, and in all lowercase letters. No capital letters are allowed."
-        )
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks that the response is in English and in all lowercase letters."""
-        assert isinstance(value, str)
-
-        try:
-            return value.islower() and langdetect.detect(value) == "en"
-        except langdetect.LangDetectException as e:
-            # Count as instruction is followed.
-            logger.info("Unable to detect language", text=value, error=str(e))
-            return True
-
-
-class CommaChecker(Instruction):
-    """Checks the response for no commas."""
-
-    def build_description(self, **kwargs):
-        """Build the instruction description."""
-        self._description_pattern = "In your entire response, refrain from the use of any commas."
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks that the response does not contain commas."""
-        return not re.search(r"\,", value)
-
-
-class CapitalWordFrequencyChecker(Instruction):
-    """Checks frequency of words with all capital letters."""
-
-    def build_description(
-        self,
-        capital_frequency=None,
-        capital_relation=None,
-    ):
-        """Build the instruction description.
-
-        Args:
-          capital_frequency: An integer that represents the number of words that
-            should be in all capital letters.
-          capital_relation: A string that is 'at least' or 'at most' that refers to
-            the frequency.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._frequency = capital_frequency
-        if self._frequency is None:
-            self._frequency = random.randint(1, _ALL_CAPITAL_WORD_FREQUENCY)
-
-        self._comparison_relation = capital_relation
-        if capital_relation is None:
-            self._comparison_relation = random.choice(_COMPARISON_RELATION)
-        elif capital_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-                "The supported relation for comparison must be in "
-                f"{_COMPARISON_RELATION}, but {capital_relation} is given."
-            )
-
-        self._description_pattern = (
-            "In your response, words with all capital letters should appear {relation} {frequency} times."
-        )
-
-        return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)
-
-    def get_instruction_args(self):
-        """Returns the keyword args of build description."""
-        return {
-            "capital_frequency": self._frequency,
-            "capital_relation": self._comparison_relation,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["capital_frequency", "capital_relation"]
-
-    def check_following(self, value):
-        """Checks the frequency of words with all capital letters."""
-        # Hyphenated words will count as one word
-        nltk.download("punkt_tab")
-        words = nltk.word_tokenize(value)
-        capital_words = [word for word in words if word.isupper()]
-
-        capital_words = len(capital_words)
-
-        if self._comparison_relation == _COMPARISON_RELATION[0]:
-            return capital_words < self._frequency
-        else:
-            return capital_words >= self._frequency
-
-
-class QuotationChecker(Instruction):
-    """Checks response is wrapped with double quotation marks."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        self._description_pattern = "Wrap your entire response with double quotation marks."
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        """Returns the keyword args of build description."""
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks if the response is wrapped with double quotation marks."""
-        quotations_map = {
-            "ja": "「」",
-            "ru": "«»",
-            "th": "“”",
-            "zh": "“”",
-            "zh-cn": "“”",
-            "zh-tw": "“”",
-        }
-        value = value.strip()
-        lang = get_langid(value)
-        quotes = quotations_map.get(lang, '""')
-        # TODO: We may wanna revisit this logic in new generations to only check of the response language's quotes.
-        return len(value) > 1 and value[0] in [quotes[0], '"'] and value[-1] in [quotes[1], '"']
-
-
-# Define instruction dicts
diff --git a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_support.py b/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_support.py
deleted file mode 100644
index 0ebfc5f763..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_support.py
+++ /dev/null
@@ -1,298 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import collections
-import functools
-import random
-import re
-from collections.abc import Iterable, Sequence
-from types import MappingProxyType
-
-import emoji
-import langdetect
-import nltk
-
-from llama_stack.log import get_logger
-
-logger = get_logger(name=__name__, category="scoring")
-
-from llama_stack.providers.inline.scoring.basic.utils.ifeval_word_list import WORD_LIST
-
-# ISO 639-1 codes to language names.
-LANGUAGE_CODES = MappingProxyType(
-    {
-        "en": "English",
-        "es": "Spanish",
-        "pt": "Portuguese",
-        "ar": "Arabic",
-        "hi": "Hindi",
-        "fr": "French",
-        "ru": "Russian",
-        "de": "German",
-        "ja": "Japanese",
-        "it": "Italian",
-        "bn": "Bengali",
-        "uk": "Ukrainian",
-        "th": "Thai",
-        "ur": "Urdu",
-        "ta": "Tamil",
-        "te": "Telugu",
-        "bg": "Bulgarian",
-        "ko": "Korean",
-        "pl": "Polish",
-        "he": "Hebrew",
-        "fa": "Persian",
-        "vi": "Vietnamese",
-        "ne": "Nepali",
-        "sw": "Swahili",
-        "kn": "Kannada",
-        "mr": "Marathi",
-        "gu": "Gujarati",
-        "pa": "Punjabi",
-        "ml": "Malayalam",
-        "fi": "Finnish",
-    }
-)
-
-# Chinese characters
-_CHINESE_CHARS_PATTERN = r"[\u4E00-\u9FFF\u3400-\u4DBF]"
-# Japanese Hiragana & Katakana
-_JAPANESE_CHARS_PATTERN = r"[\u3040-\u309f\u30a0-\u30ff]"
-# Korean (Hangul Syllables)
-_KOREAN_CHARS_PATTERN = r"[\uAC00-\uD7AF]"
-_ALPHABETS = "([A-Za-z])"
-_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
-_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
-_STARTERS = (
-    r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
-)
-_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
-_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
-_DIGITS = "([0-9])"
-_MULTIPLE_DOTS = r"\.{2,}"
-
-
-# Util functions
-def split_into_sentences(text):
-    """Split the text into sentences.
-
-    Args:
-      text: A string that consists of more than or equal to one sentences.
-
-    Returns:
-      A list of strings where each string is a sentence.
-    """
-    text = " " + text + "  "
-    text = text.replace("\n", " ")
-    text = re.sub(_PREFIXES, "\\1<prd>", text)
-    text = re.sub(_WEBSITES, "<prd>\\1", text)
-    text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1<prd>\\2", text)
-    text = re.sub(
-        _MULTIPLE_DOTS,
-        lambda match: "<prd>" * len(match.group(0)) + "<stop>",
-        text,
-    )
-    if "Ph.D" in text:
-        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
-    text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1<prd> ", text)
-    text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1<stop> \\2", text)
-    text = re.sub(
-        _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
-        "\\1<prd>\\2<prd>\\3<prd>",
-        text,
-    )
-    text = re.sub(_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text)
-    text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1<stop> \\2", text)
-    text = re.sub(" " + _SUFFIXES + "[.]", " \\1<prd>", text)
-    text = re.sub(" " + _ALPHABETS + "[.]", " \\1<prd>", text)
-    if "”" in text:
-        text = text.replace(".”", "”.")
-    if '"' in text:
-        text = text.replace('."', '".')
-    if "!" in text:
-        text = text.replace('!"', '"!')
-    if "?" in text:
-        text = text.replace('?"', '"?')
-    text = text.replace(".", ".<stop>")
-    text = text.replace("?", "?<stop>")
-    text = text.replace("!", "!<stop>")
-    text = text.replace("<prd>", ".")
-    sentences = text.split("<stop>")
-    sentences = [s.strip() for s in sentences]
-    if sentences and not sentences[-1]:
-        sentences = sentences[:-1]
-    return sentences
-
-
-def count_words(text):
-    """Counts the number of words."""
-    tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-    tokens = tokenizer.tokenize(text)
-    num_words = len(tokens)
-    return num_words
-
-
-def split_chinese_japanese_hindi(lines: str) -> Iterable[str]:
-    """
-    Split Chinese and Japanese text into sentences.
-    From https://stackoverflow.com/questions/27441191/splitting-chinese-document-into-sentences
-    Special question/exclamation marks were added upon inspection of our raw data,
-    Also supports multiple lines.
-    The separator for hindi is '।'
-    """
-    for line in lines.splitlines():
-        yield from re.findall(
-            r"[^!?。\.\!\?\！\？\．\n।]+[!?。\.\!\?\！\？\．\n।]?",
-            line.strip(),
-            flags=re.U,
-        )
-
-
-def count_words_cjk(text: str) -> int:
-    """Counts the number of words for Chinese and Japanese and Korean.
-    Can be extended to additional languages.
-    Source: https://stackoverflow.com/questions/49164507/how-to-count-the-number-of-chinese-korean-and-english-words withadditional modifications
-    Example:
-        >In: count_words_cjk('こんにちは、ジェイソンさん、Jason? Nice to meet you☺ ❤')
-        >Out: 19
-    """
-    # Non alpha numeric patterns in latin and asian languages.
-    non_alphanumeric_patterns = (
-        r"[\\.\!\?\．\/_,\{\}<>:;$%^&*(+\"\'+——！，。？、`~@#￥……（）：；《）《》“”()\[\]«»〔〕\-「」]+"
-    )
-    text = re.sub(non_alphanumeric_patterns, "", text)
-
-    emoji_cnt = emoji.emoji_count(text)  # count emojis
-    text = emoji.replace_emoji(text, "")  # remove emojis
-
-    foreign_chars_patterns = "|".join([_CHINESE_CHARS_PATTERN, _JAPANESE_CHARS_PATTERN, _KOREAN_CHARS_PATTERN])
-    asian_chars = re.findall(foreign_chars_patterns, text)
-    asian_chars_cnt = len(asian_chars)
-    non_asian_chars = re.sub(foreign_chars_patterns, " ", text)
-    non_asian_words_cnt = len(non_asian_chars.split())
-
-    return non_asian_words_cnt + asian_chars_cnt + emoji_cnt
-
-
-@functools.cache
-def _get_sentence_tokenizer():
-    return nltk.data.load("nltk:tokenizers/punkt/english.pickle")
-
-
-def count_sentences(text):
-    """Count the number of sentences."""
-    tokenizer = _get_sentence_tokenizer()
-    tokenized_sentences = tokenizer.tokenize(text)
-    return len(tokenized_sentences)
-
-
-def get_langid(text: str, lid_path: str | None = None) -> str:
-    """Detect the primary language of a text using per-line language detection.
-
-    Args:
-        text: input text to analyze
-        lid_path: unused, kept for interface compatibility
-
-    Returns:
-        ISO 639-1 language code, defaulting to "en" if detection fails
-    """
-    line_langs: list[str] = []
-    lines = [line.strip() for line in text.split("\n") if len(line.strip()) >= 4]
-
-    for line in lines:
-        try:
-            line_langs.append(langdetect.detect(line))
-        except langdetect.LangDetectException as e:
-            logger.info("Unable to detect language", text=line, error=str(e))
-
-    if len(line_langs) == 0:
-        return "en"
-    # select the text language to be the most commonly predicted language of the lines.
-    return collections.Counter(line_langs).most_common(1)[0][0]
-
-
-def generate_keywords(num_keywords):
-    """Randomly generates a few keywords."""
-    return random.sample(WORD_LIST, k=num_keywords)
-
-
-"""Library of instructions"""
-_InstructionArgsDtype = dict[str, int | str | Sequence[str]] | None
-
-_LANGUAGES = LANGUAGE_CODES
-
-# The relational operation for comparison.
-_COMPARISON_RELATION = ("less than", "at least")
-
-# The maximum number of sentences.
-_MAX_NUM_SENTENCES = 20
-
-# The number of placeholders.
-_NUM_PLACEHOLDERS = 4
-
-# The number of bullet lists.
-_NUM_BULLETS = 5
-
-# The options of constrained response.
-_CONSTRAINED_RESPONSE_OPTIONS = (
-    "My answer is yes.",
-    "My answer is no.",
-    "My answer is maybe.",
-)
-
-# The options of starter keywords.
-_STARTER_OPTIONS = (
-    "I would say",
-    "My answer is",
-    "I believe",
-    "In my opinion",
-    "I think",
-    "I reckon",
-    "I feel",
-    "From my perspective",
-    "As I see it",
-    "According to me",
-    "As far as I'm concerned",
-    "To my understanding",
-    "In my view",
-    "My take on it is",
-    "As per my perception",
-)
-
-# The options of ending keywords.
-# TODO(jeffreyzhou) add more ending options
-_ENDING_OPTIONS = ("Any other questions?", "Is there anything else I can help with?")
-
-# The number of highlighted sections.
-_NUM_HIGHLIGHTED_SECTIONS = 4
-
-# The section spliter.
-_SECTION_SPLITER = ("Section", "SECTION")
-
-# The number of sections.
-_NUM_SECTIONS = 5
-
-# The number of paragraphs.
-_NUM_PARAGRAPHS = 5
-
-# The postscript marker.
-_POSTSCRIPT_MARKER = ("P.S.", "P.P.S")
-
-# The number of keywords.
-_NUM_KEYWORDS = 2
-
-# The occurrences of a single keyword.
-_KEYWORD_FREQUENCY = 3
-
-# The occurrences of a single letter.
-_LETTER_FREQUENCY = 10
-
-# The occurrences of words with all capital letters.
-_ALL_CAPITAL_WORD_FREQUENCY = 20
-
-# The number of words in the response.
-_NUM_WORDS_LOWER_LIMIT = 100
-_NUM_WORDS_UPPER_LIMIT = 500
diff --git a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py b/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py
deleted file mode 100644
index acce13ea22..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.providers.inline.scoring.basic.utils.ifeval_checkers_core import (
-    BulletListChecker,
-    ConstrainedResponseChecker,
-    HighlightSectionChecker,
-    KeywordChecker,
-    KeywordFrequencyChecker,
-    NumberOfSentences,
-    NumberOfWords,
-    ParagraphChecker,
-    PlaceholderChecker,
-    PostscriptChecker,
-    ResponseLanguageChecker,
-    SectionChecker,
-)
-from llama_stack.providers.inline.scoring.basic.utils.ifeval_checkers_format import (
-    CapitalLettersEnglishChecker,
-    CapitalWordFrequencyChecker,
-    CommaChecker,
-    EndChecker,
-    ForbiddenWords,
-    JsonFormat,
-    LetterFrequencyChecker,
-    LowercaseLettersEnglishChecker,
-    ParagraphFirstWordCheck,
-    QuotationChecker,
-    RepeatPromptThenAnswer,
-    TitleChecker,
-    TwoResponsesChecker,
-)
-
-_KEYWORD = "keywords:"
-_LANGUAGE = "language:"
-_LENGTH = "length_constraints:"
-_CONTENT = "detectable_content:"
-_FORMAT = "detectable_format:"
-_MULTITURN = "multi-turn:"
-_COMBINATION = "combination:"
-_STARTEND = "startend:"
-_CHANGE_CASES = "change_case:"
-_PUNCTUATION = "punctuation:"
-
-INSTRUCTION_DICT = {
-    _KEYWORD + "existence": KeywordChecker,
-    _KEYWORD + "frequency": KeywordFrequencyChecker,
-    # _KEYWORD + "key_sentences": KeySentenceChecker,
-    _KEYWORD + "forbidden_words": ForbiddenWords,
-    _KEYWORD + "letter_frequency": LetterFrequencyChecker,
-    _LANGUAGE + "response_language": ResponseLanguageChecker,
-    _LENGTH + "number_sentences": NumberOfSentences,
-    _LENGTH + "number_paragraphs": ParagraphChecker,
-    _LENGTH + "number_words": NumberOfWords,
-    _LENGTH + "nth_paragraph_first_word": ParagraphFirstWordCheck,
-    _CONTENT + "number_placeholders": PlaceholderChecker,
-    _CONTENT + "postscript": PostscriptChecker,
-    _FORMAT + "number_bullet_lists": BulletListChecker,
-    # _CONTENT + "rephrase_paragraph": RephraseParagraph,
-    _FORMAT + "constrained_response": ConstrainedResponseChecker,
-    _FORMAT + "number_highlighted_sections": (HighlightSectionChecker),
-    _FORMAT + "multiple_sections": SectionChecker,
-    # _FORMAT + "rephrase": RephraseChecker,
-    _FORMAT + "json_format": JsonFormat,
-    _FORMAT + "title": TitleChecker,
-    # _MULTITURN + "constrained_start": ConstrainedStartChecker,
-    _COMBINATION + "two_responses": TwoResponsesChecker,
-    _COMBINATION + "repeat_prompt": RepeatPromptThenAnswer,
-    _STARTEND + "end_checker": EndChecker,
-    _CHANGE_CASES + "capital_word_frequency": CapitalWordFrequencyChecker,
-    _CHANGE_CASES + "english_capital": CapitalLettersEnglishChecker,
-    _CHANGE_CASES + "english_lowercase": LowercaseLettersEnglishChecker,
-    _PUNCTUATION + "no_comma": CommaChecker,
-    _STARTEND + "quotation": QuotationChecker,
-}
-
-INSTRUCTION_LIST = list(INSTRUCTION_DICT.keys()) + [
-    _KEYWORD[:-1],
-    _LANGUAGE[:-1],
-    _LENGTH[:-1],
-    _CONTENT[:-1],
-    _FORMAT[:-1],
-    _MULTITURN[:-1],
-    _COMBINATION[:-1],
-    _STARTEND[:-1],
-    _CHANGE_CASES[:-1],
-    _PUNCTUATION[:-1],
-]
diff --git a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_word_list.py b/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_word_list.py
deleted file mode 100644
index 679f483326..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_word_list.py
+++ /dev/null
@@ -1,1538 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack.log import get_logger
-
-logger = get_logger(name=__name__, category="scoring")
-
-WORD_LIST = [
-    "western",
-    "sentence",
-    "signal",
-    "dump",
-    "spot",
-    "opposite",
-    "bottom",
-    "potato",
-    "administration",
-    "working",
-    "welcome",
-    "morning",
-    "good",
-    "agency",
-    "primary",
-    "wish",
-    "responsibility",
-    "press",
-    "problem",
-    "president",
-    "steal",
-    "brush",
-    "read",
-    "type",
-    "beat",
-    "trainer",
-    "growth",
-    "lock",
-    "bone",
-    "case",
-    "equal",
-    "comfortable",
-    "region",
-    "replacement",
-    "performance",
-    "mate",
-    "walk",
-    "medicine",
-    "film",
-    "thing",
-    "rock",
-    "tap",
-    "total",
-    "competition",
-    "ease",
-    "south",
-    "establishment",
-    "gather",
-    "parking",
-    "world",
-    "plenty",
-    "breath",
-    "claim",
-    "alcohol",
-    "trade",
-    "dear",
-    "highlight",
-    "street",
-    "matter",
-    "decision",
-    "mess",
-    "agreement",
-    "studio",
-    "coach",
-    "assist",
-    "brain",
-    "wing",
-    "style",
-    "private",
-    "top",
-    "brown",
-    "leg",
-    "buy",
-    "procedure",
-    "method",
-    "speed",
-    "high",
-    "company",
-    "valuable",
-    "pie",
-    "analyst",
-    "session",
-    "pattern",
-    "district",
-    "pleasure",
-    "dinner",
-    "swimming",
-    "joke",
-    "order",
-    "plate",
-    "department",
-    "motor",
-    "cell",
-    "spend",
-    "cabinet",
-    "difference",
-    "power",
-    "examination",
-    "engine",
-    "horse",
-    "dimension",
-    "pay",
-    "toe",
-    "curve",
-    "literature",
-    "bother",
-    "fire",
-    "possibility",
-    "debate",
-    "activity",
-    "passage",
-    "hello",
-    "cycle",
-    "background",
-    "quiet",
-    "author",
-    "effect",
-    "actor",
-    "page",
-    "bicycle",
-    "error",
-    "throat",
-    "attack",
-    "character",
-    "phone",
-    "tea",
-    "increase",
-    "outcome",
-    "file",
-    "specific",
-    "inspector",
-    "internal",
-    "potential",
-    "staff",
-    "building",
-    "employer",
-    "shoe",
-    "hand",
-    "direction",
-    "garden",
-    "purchase",
-    "interview",
-    "study",
-    "recognition",
-    "member",
-    "spiritual",
-    "oven",
-    "sandwich",
-    "weird",
-    "passenger",
-    "particular",
-    "response",
-    "reaction",
-    "size",
-    "variation",
-    "a",
-    "cancel",
-    "candy",
-    "exit",
-    "guest",
-    "condition",
-    "fly",
-    "price",
-    "weakness",
-    "convert",
-    "hotel",
-    "great",
-    "mouth",
-    "mind",
-    "song",
-    "sugar",
-    "suspect",
-    "telephone",
-    "ear",
-    "roof",
-    "paint",
-    "refrigerator",
-    "organization",
-    "jury",
-    "reward",
-    "engineering",
-    "day",
-    "possession",
-    "crew",
-    "bar",
-    "road",
-    "description",
-    "celebration",
-    "score",
-    "mark",
-    "letter",
-    "shower",
-    "suggestion",
-    "sir",
-    "luck",
-    "national",
-    "progress",
-    "hall",
-    "stroke",
-    "theory",
-    "offer",
-    "story",
-    "tax",
-    "definition",
-    "history",
-    "ride",
-    "medium",
-    "opening",
-    "glass",
-    "elevator",
-    "stomach",
-    "question",
-    "ability",
-    "leading",
-    "village",
-    "computer",
-    "city",
-    "grand",
-    "confidence",
-    "candle",
-    "priest",
-    "recommendation",
-    "point",
-    "necessary",
-    "body",
-    "desk",
-    "secret",
-    "horror",
-    "noise",
-    "culture",
-    "warning",
-    "water",
-    "round",
-    "diet",
-    "flower",
-    "bus",
-    "tough",
-    "permission",
-    "week",
-    "prompt",
-    "connection",
-    "abuse",
-    "height",
-    "save",
-    "corner",
-    "border",
-    "stress",
-    "drive",
-    "stop",
-    "rip",
-    "meal",
-    "listen",
-    "confusion",
-    "girlfriend",
-    "living",
-    "relation",
-    "significance",
-    "plan",
-    "creative",
-    "atmosphere",
-    "blame",
-    "invite",
-    "housing",
-    "paper",
-    "drink",
-    "roll",
-    "silver",
-    "drunk",
-    "age",
-    "damage",
-    "smoke",
-    "environment",
-    "pack",
-    "savings",
-    "influence",
-    "tourist",
-    "rain",
-    "post",
-    "sign",
-    "grandmother",
-    "run",
-    "profit",
-    "push",
-    "clerk",
-    "final",
-    "wine",
-    "swim",
-    "pause",
-    "stuff",
-    "singer",
-    "funeral",
-    "average",
-    "source",
-    "scene",
-    "tradition",
-    "personal",
-    "snow",
-    "nobody",
-    "distance",
-    "sort",
-    "sensitive",
-    "animal",
-    "major",
-    "negotiation",
-    "click",
-    "mood",
-    "period",
-    "arrival",
-    "expression",
-    "holiday",
-    "repeat",
-    "dust",
-    "closet",
-    "gold",
-    "bad",
-    "sail",
-    "combination",
-    "clothes",
-    "emphasis",
-    "duty",
-    "black",
-    "step",
-    "school",
-    "jump",
-    "document",
-    "professional",
-    "lip",
-    "chemical",
-    "front",
-    "wake",
-    "while",
-    "inside",
-    "watch",
-    "row",
-    "subject",
-    "penalty",
-    "balance",
-    "possible",
-    "adult",
-    "aside",
-    "sample",
-    "appeal",
-    "wedding",
-    "depth",
-    "king",
-    "award",
-    "wife",
-    "blow",
-    "site",
-    "camp",
-    "music",
-    "safe",
-    "gift",
-    "fault",
-    "guess",
-    "act",
-    "shame",
-    "drama",
-    "capital",
-    "exam",
-    "stupid",
-    "record",
-    "sound",
-    "swing",
-    "novel",
-    "minimum",
-    "ratio",
-    "machine",
-    "shape",
-    "lead",
-    "operation",
-    "salary",
-    "cloud",
-    "affair",
-    "hit",
-    "chapter",
-    "stage",
-    "quantity",
-    "access",
-    "army",
-    "chain",
-    "traffic",
-    "kick",
-    "analysis",
-    "airport",
-    "time",
-    "vacation",
-    "philosophy",
-    "ball",
-    "chest",
-    "thanks",
-    "place",
-    "mountain",
-    "advertising",
-    "red",
-    "past",
-    "rent",
-    "return",
-    "tour",
-    "house",
-    "construction",
-    "net",
-    "native",
-    "war",
-    "figure",
-    "fee",
-    "spray",
-    "user",
-    "dirt",
-    "shot",
-    "task",
-    "stick",
-    "friend",
-    "software",
-    "promotion",
-    "interaction",
-    "surround",
-    "block",
-    "purpose",
-    "practice",
-    "conflict",
-    "routine",
-    "requirement",
-    "bonus",
-    "hole",
-    "state",
-    "junior",
-    "sweet",
-    "catch",
-    "tear",
-    "fold",
-    "wall",
-    "editor",
-    "life",
-    "position",
-    "pound",
-    "respect",
-    "bathroom",
-    "coat",
-    "script",
-    "job",
-    "teach",
-    "birth",
-    "view",
-    "resolve",
-    "theme",
-    "employee",
-    "doubt",
-    "market",
-    "education",
-    "serve",
-    "recover",
-    "tone",
-    "harm",
-    "miss",
-    "union",
-    "understanding",
-    "cow",
-    "river",
-    "association",
-    "concept",
-    "training",
-    "recipe",
-    "relationship",
-    "reserve",
-    "depression",
-    "proof",
-    "hair",
-    "revenue",
-    "independent",
-    "lift",
-    "assignment",
-    "temporary",
-    "amount",
-    "loss",
-    "edge",
-    "track",
-    "check",
-    "rope",
-    "estimate",
-    "pollution",
-    "stable",
-    "message",
-    "delivery",
-    "perspective",
-    "mirror",
-    "assistant",
-    "representative",
-    "witness",
-    "nature",
-    "judge",
-    "fruit",
-    "tip",
-    "devil",
-    "town",
-    "emergency",
-    "upper",
-    "drop",
-    "stay",
-    "human",
-    "neck",
-    "speaker",
-    "network",
-    "sing",
-    "resist",
-    "league",
-    "trip",
-    "signature",
-    "lawyer",
-    "importance",
-    "gas",
-    "choice",
-    "engineer",
-    "success",
-    "part",
-    "external",
-    "worker",
-    "simple",
-    "quarter",
-    "student",
-    "heart",
-    "pass",
-    "spite",
-    "shift",
-    "rough",
-    "lady",
-    "grass",
-    "community",
-    "garage",
-    "youth",
-    "standard",
-    "skirt",
-    "promise",
-    "blind",
-    "television",
-    "disease",
-    "commission",
-    "positive",
-    "energy",
-    "calm",
-    "presence",
-    "tune",
-    "basis",
-    "preference",
-    "head",
-    "common",
-    "cut",
-    "somewhere",
-    "presentation",
-    "current",
-    "thought",
-    "revolution",
-    "effort",
-    "master",
-    "implement",
-    "republic",
-    "floor",
-    "principle",
-    "stranger",
-    "shoulder",
-    "grade",
-    "button",
-    "tennis",
-    "police",
-    "collection",
-    "account",
-    "register",
-    "glove",
-    "divide",
-    "professor",
-    "chair",
-    "priority",
-    "combine",
-    "peace",
-    "extension",
-    "maybe",
-    "evening",
-    "frame",
-    "sister",
-    "wave",
-    "code",
-    "application",
-    "mouse",
-    "match",
-    "counter",
-    "bottle",
-    "half",
-    "cheek",
-    "resolution",
-    "back",
-    "knowledge",
-    "make",
-    "discussion",
-    "screw",
-    "length",
-    "accident",
-    "battle",
-    "dress",
-    "knee",
-    "log",
-    "package",
-    "it",
-    "turn",
-    "hearing",
-    "newspaper",
-    "layer",
-    "wealth",
-    "profile",
-    "imagination",
-    "answer",
-    "weekend",
-    "teacher",
-    "appearance",
-    "meet",
-    "bike",
-    "rise",
-    "belt",
-    "crash",
-    "bowl",
-    "equivalent",
-    "support",
-    "image",
-    "poem",
-    "risk",
-    "excitement",
-    "remote",
-    "secretary",
-    "public",
-    "produce",
-    "plane",
-    "display",
-    "money",
-    "sand",
-    "situation",
-    "punch",
-    "customer",
-    "title",
-    "shake",
-    "mortgage",
-    "option",
-    "number",
-    "pop",
-    "window",
-    "extent",
-    "nothing",
-    "experience",
-    "opinion",
-    "departure",
-    "dance",
-    "indication",
-    "boy",
-    "material",
-    "band",
-    "leader",
-    "sun",
-    "beautiful",
-    "muscle",
-    "farmer",
-    "variety",
-    "fat",
-    "handle",
-    "director",
-    "opportunity",
-    "calendar",
-    "outside",
-    "pace",
-    "bath",
-    "fish",
-    "consequence",
-    "put",
-    "owner",
-    "go",
-    "doctor",
-    "information",
-    "share",
-    "hurt",
-    "protection",
-    "career",
-    "finance",
-    "force",
-    "golf",
-    "garbage",
-    "aspect",
-    "kid",
-    "food",
-    "boot",
-    "milk",
-    "respond",
-    "objective",
-    "reality",
-    "raw",
-    "ring",
-    "mall",
-    "one",
-    "impact",
-    "area",
-    "news",
-    "international",
-    "series",
-    "impress",
-    "mother",
-    "shelter",
-    "strike",
-    "loan",
-    "month",
-    "seat",
-    "anything",
-    "entertainment",
-    "familiar",
-    "clue",
-    "year",
-    "glad",
-    "supermarket",
-    "natural",
-    "god",
-    "cost",
-    "conversation",
-    "tie",
-    "ruin",
-    "comfort",
-    "earth",
-    "storm",
-    "percentage",
-    "assistance",
-    "budget",
-    "strength",
-    "beginning",
-    "sleep",
-    "other",
-    "young",
-    "unit",
-    "fill",
-    "store",
-    "desire",
-    "hide",
-    "value",
-    "cup",
-    "maintenance",
-    "nurse",
-    "function",
-    "tower",
-    "role",
-    "class",
-    "camera",
-    "database",
-    "panic",
-    "nation",
-    "basket",
-    "ice",
-    "art",
-    "spirit",
-    "chart",
-    "exchange",
-    "feedback",
-    "statement",
-    "reputation",
-    "search",
-    "hunt",
-    "exercise",
-    "nasty",
-    "notice",
-    "male",
-    "yard",
-    "annual",
-    "collar",
-    "date",
-    "platform",
-    "plant",
-    "fortune",
-    "passion",
-    "friendship",
-    "spread",
-    "cancer",
-    "ticket",
-    "attitude",
-    "island",
-    "active",
-    "object",
-    "service",
-    "buyer",
-    "bite",
-    "card",
-    "face",
-    "steak",
-    "proposal",
-    "patient",
-    "heat",
-    "rule",
-    "resident",
-    "broad",
-    "politics",
-    "west",
-    "knife",
-    "expert",
-    "girl",
-    "design",
-    "salt",
-    "baseball",
-    "grab",
-    "inspection",
-    "cousin",
-    "couple",
-    "magazine",
-    "cook",
-    "dependent",
-    "security",
-    "chicken",
-    "version",
-    "currency",
-    "ladder",
-    "scheme",
-    "kitchen",
-    "employment",
-    "local",
-    "attention",
-    "manager",
-    "fact",
-    "cover",
-    "sad",
-    "guard",
-    "relative",
-    "county",
-    "rate",
-    "lunch",
-    "program",
-    "initiative",
-    "gear",
-    "bridge",
-    "breast",
-    "talk",
-    "dish",
-    "guarantee",
-    "beer",
-    "vehicle",
-    "reception",
-    "woman",
-    "substance",
-    "copy",
-    "lecture",
-    "advantage",
-    "park",
-    "cold",
-    "death",
-    "mix",
-    "hold",
-    "scale",
-    "tomorrow",
-    "blood",
-    "request",
-    "green",
-    "cookie",
-    "church",
-    "strip",
-    "forever",
-    "beyond",
-    "debt",
-    "tackle",
-    "wash",
-    "following",
-    "feel",
-    "maximum",
-    "sector",
-    "sea",
-    "property",
-    "economics",
-    "menu",
-    "bench",
-    "try",
-    "language",
-    "start",
-    "call",
-    "solid",
-    "address",
-    "income",
-    "foot",
-    "senior",
-    "honey",
-    "few",
-    "mixture",
-    "cash",
-    "grocery",
-    "link",
-    "map",
-    "form",
-    "factor",
-    "pot",
-    "model",
-    "writer",
-    "farm",
-    "winter",
-    "skill",
-    "anywhere",
-    "birthday",
-    "policy",
-    "release",
-    "husband",
-    "lab",
-    "hurry",
-    "mail",
-    "equipment",
-    "sink",
-    "pair",
-    "driver",
-    "consideration",
-    "leather",
-    "skin",
-    "blue",
-    "boat",
-    "sale",
-    "brick",
-    "two",
-    "feed",
-    "square",
-    "dot",
-    "rush",
-    "dream",
-    "location",
-    "afternoon",
-    "manufacturer",
-    "control",
-    "occasion",
-    "trouble",
-    "introduction",
-    "advice",
-    "bet",
-    "eat",
-    "kill",
-    "category",
-    "manner",
-    "office",
-    "estate",
-    "pride",
-    "awareness",
-    "slip",
-    "crack",
-    "client",
-    "nail",
-    "shoot",
-    "membership",
-    "soft",
-    "anybody",
-    "web",
-    "official",
-    "individual",
-    "pizza",
-    "interest",
-    "bag",
-    "spell",
-    "profession",
-    "queen",
-    "deal",
-    "resource",
-    "ship",
-    "guy",
-    "chocolate",
-    "joint",
-    "formal",
-    "upstairs",
-    "car",
-    "resort",
-    "abroad",
-    "dealer",
-    "associate",
-    "finger",
-    "surgery",
-    "comment",
-    "team",
-    "detail",
-    "crazy",
-    "path",
-    "tale",
-    "initial",
-    "arm",
-    "radio",
-    "demand",
-    "single",
-    "draw",
-    "yellow",
-    "contest",
-    "piece",
-    "quote",
-    "pull",
-    "commercial",
-    "shirt",
-    "contribution",
-    "cream",
-    "channel",
-    "suit",
-    "discipline",
-    "instruction",
-    "concert",
-    "speech",
-    "low",
-    "effective",
-    "hang",
-    "scratch",
-    "industry",
-    "breakfast",
-    "lay",
-    "join",
-    "metal",
-    "bedroom",
-    "minute",
-    "product",
-    "rest",
-    "temperature",
-    "many",
-    "give",
-    "argument",
-    "print",
-    "purple",
-    "laugh",
-    "health",
-    "credit",
-    "investment",
-    "sell",
-    "setting",
-    "lesson",
-    "egg",
-    "middle",
-    "marriage",
-    "level",
-    "evidence",
-    "phrase",
-    "love",
-    "self",
-    "benefit",
-    "guidance",
-    "affect",
-    "you",
-    "dad",
-    "anxiety",
-    "special",
-    "boyfriend",
-    "test",
-    "blank",
-    "payment",
-    "soup",
-    "obligation",
-    "reply",
-    "smile",
-    "deep",
-    "complaint",
-    "addition",
-    "review",
-    "box",
-    "towel",
-    "minor",
-    "fun",
-    "soil",
-    "issue",
-    "cigarette",
-    "internet",
-    "gain",
-    "tell",
-    "entry",
-    "spare",
-    "incident",
-    "family",
-    "refuse",
-    "branch",
-    "can",
-    "pen",
-    "grandfather",
-    "constant",
-    "tank",
-    "uncle",
-    "climate",
-    "ground",
-    "volume",
-    "communication",
-    "kind",
-    "poet",
-    "child",
-    "screen",
-    "mine",
-    "quit",
-    "gene",
-    "lack",
-    "charity",
-    "memory",
-    "tooth",
-    "fear",
-    "mention",
-    "marketing",
-    "reveal",
-    "reason",
-    "court",
-    "season",
-    "freedom",
-    "land",
-    "sport",
-    "audience",
-    "classroom",
-    "law",
-    "hook",
-    "win",
-    "carry",
-    "eye",
-    "smell",
-    "distribution",
-    "research",
-    "country",
-    "dare",
-    "hope",
-    "whereas",
-    "stretch",
-    "library",
-    "if",
-    "delay",
-    "college",
-    "plastic",
-    "book",
-    "present",
-    "use",
-    "worry",
-    "champion",
-    "goal",
-    "economy",
-    "march",
-    "election",
-    "reflection",
-    "midnight",
-    "slide",
-    "inflation",
-    "action",
-    "challenge",
-    "guitar",
-    "coast",
-    "apple",
-    "campaign",
-    "field",
-    "jacket",
-    "sense",
-    "way",
-    "visual",
-    "remove",
-    "weather",
-    "trash",
-    "cable",
-    "regret",
-    "buddy",
-    "beach",
-    "historian",
-    "courage",
-    "sympathy",
-    "truck",
-    "tension",
-    "permit",
-    "nose",
-    "bed",
-    "son",
-    "person",
-    "base",
-    "meat",
-    "usual",
-    "air",
-    "meeting",
-    "worth",
-    "game",
-    "independence",
-    "physical",
-    "brief",
-    "play",
-    "raise",
-    "board",
-    "she",
-    "key",
-    "writing",
-    "pick",
-    "command",
-    "party",
-    "yesterday",
-    "spring",
-    "candidate",
-    "physics",
-    "university",
-    "concern",
-    "development",
-    "change",
-    "string",
-    "target",
-    "instance",
-    "room",
-    "bitter",
-    "bird",
-    "football",
-    "normal",
-    "split",
-    "impression",
-    "wood",
-    "long",
-    "meaning",
-    "stock",
-    "cap",
-    "leadership",
-    "media",
-    "ambition",
-    "fishing",
-    "essay",
-    "salad",
-    "repair",
-    "today",
-    "designer",
-    "night",
-    "bank",
-    "drawing",
-    "inevitable",
-    "phase",
-    "vast",
-    "chip",
-    "anger",
-    "switch",
-    "cry",
-    "twist",
-    "personality",
-    "attempt",
-    "storage",
-    "being",
-    "preparation",
-    "bat",
-    "selection",
-    "white",
-    "technology",
-    "contract",
-    "side",
-    "section",
-    "station",
-    "till",
-    "structure",
-    "tongue",
-    "taste",
-    "truth",
-    "difficulty",
-    "group",
-    "limit",
-    "main",
-    "move",
-    "feeling",
-    "light",
-    "example",
-    "mission",
-    "might",
-    "wait",
-    "wheel",
-    "shop",
-    "host",
-    "classic",
-    "alternative",
-    "cause",
-    "agent",
-    "consist",
-    "table",
-    "airline",
-    "text",
-    "pool",
-    "craft",
-    "range",
-    "fuel",
-    "tool",
-    "partner",
-    "load",
-    "entrance",
-    "deposit",
-    "hate",
-    "article",
-    "video",
-    "summer",
-    "feature",
-    "extreme",
-    "mobile",
-    "hospital",
-    "flight",
-    "fall",
-    "pension",
-    "piano",
-    "fail",
-    "result",
-    "rub",
-    "gap",
-    "system",
-    "report",
-    "suck",
-    "ordinary",
-    "wind",
-    "nerve",
-    "ask",
-    "shine",
-    "note",
-    "line",
-    "mom",
-    "perception",
-    "brother",
-    "reference",
-    "bend",
-    "charge",
-    "treat",
-    "trick",
-    "term",
-    "homework",
-    "bake",
-    "bid",
-    "status",
-    "project",
-    "strategy",
-    "orange",
-    "let",
-    "enthusiasm",
-    "parent",
-    "concentrate",
-    "device",
-    "travel",
-    "poetry",
-    "business",
-    "society",
-    "kiss",
-    "end",
-    "vegetable",
-    "employ",
-    "schedule",
-    "hour",
-    "brave",
-    "focus",
-    "process",
-    "movie",
-    "illegal",
-    "general",
-    "coffee",
-    "ad",
-    "highway",
-    "chemistry",
-    "psychology",
-    "hire",
-    "bell",
-    "conference",
-    "relief",
-    "show",
-    "neat",
-    "funny",
-    "weight",
-    "quality",
-    "club",
-    "daughter",
-    "zone",
-    "touch",
-    "tonight",
-    "shock",
-    "burn",
-    "excuse",
-    "name",
-    "survey",
-    "landscape",
-    "advance",
-    "satisfaction",
-    "bread",
-    "disaster",
-    "item",
-    "hat",
-    "prior",
-    "shopping",
-    "visit",
-    "east",
-    "photo",
-    "home",
-    "idea",
-    "father",
-    "comparison",
-    "cat",
-    "pipe",
-    "winner",
-    "count",
-    "lake",
-    "fight",
-    "prize",
-    "foundation",
-    "dog",
-    "keep",
-    "ideal",
-    "fan",
-    "struggle",
-    "peak",
-    "safety",
-    "solution",
-    "hell",
-    "conclusion",
-    "population",
-    "strain",
-    "alarm",
-    "measurement",
-    "second",
-    "train",
-    "race",
-    "due",
-    "insurance",
-    "boss",
-    "tree",
-    "monitor",
-    "sick",
-    "course",
-    "drag",
-    "appointment",
-    "slice",
-    "still",
-    "care",
-    "patience",
-    "rich",
-    "escape",
-    "emotion",
-    "royal",
-    "female",
-    "childhood",
-    "government",
-    "picture",
-    "will",
-    "sock",
-    "big",
-    "gate",
-    "oil",
-    "cross",
-    "pin",
-    "improvement",
-    "championship",
-    "silly",
-    "help",
-    "sky",
-    "pitch",
-    "man",
-    "diamond",
-    "most",
-    "transition",
-    "work",
-    "science",
-    "committee",
-    "moment",
-    "fix",
-    "teaching",
-    "dig",
-    "specialist",
-    "complex",
-    "guide",
-    "people",
-    "dead",
-    "voice",
-    "original",
-    "break",
-    "topic",
-    "data",
-    "degree",
-    "reading",
-    "recording",
-    "bunch",
-    "reach",
-    "judgment",
-    "lie",
-    "regular",
-    "set",
-    "painting",
-    "mode",
-    "list",
-    "player",
-    "bear",
-    "north",
-    "wonder",
-    "carpet",
-    "heavy",
-    "officer",
-    "negative",
-    "clock",
-    "unique",
-    "baby",
-    "pain",
-    "assumption",
-    "disk",
-    "iron",
-    "bill",
-    "drawer",
-    "look",
-    "double",
-    "mistake",
-    "finish",
-    "future",
-    "brilliant",
-    "contact",
-    "math",
-    "rice",
-    "leave",
-    "restaurant",
-    "discount",
-    "sex",
-    "virus",
-    "bit",
-    "trust",
-    "event",
-    "wear",
-    "juice",
-    "failure",
-    "bug",
-    "context",
-    "mud",
-    "whole",
-    "wrap",
-    "intention",
-    "draft",
-    "pressure",
-    "cake",
-    "dark",
-    "explanation",
-    "space",
-    "angle",
-    "word",
-    "efficiency",
-    "management",
-    "habit",
-    "star",
-    "chance",
-    "finding",
-    "transportation",
-    "stand",
-    "criticism",
-    "flow",
-    "door",
-    "injury",
-    "insect",
-    "surprise",
-    "apartment",
-]  # pylint: disable=line-too-long
diff --git a/src/llama_stack/providers/inline/scoring/basic/utils/math_utils.py b/src/llama_stack/providers/inline/scoring/basic/utils/math_utils.py
deleted file mode 100644
index 618657980d..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/utils/math_utils.py
+++ /dev/null
@@ -1,365 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import re
-from collections.abc import Sequence
-
-from llama_stack.providers.utils.scoring.basic_scoring_utils import time_limit
-
-# from minerva
-SUBSTITUTIONS = [
-    ("an ", ""),
-    ("a ", ""),
-    (".$", "$"),
-    ("\\$", ""),
-    (r"\ ", ""),
-    (" ", ""),
-    ("mbox", "text"),
-    (",\\text{and}", ","),
-    ("\\text{and}", ","),
-    ("\\text{m}", "\\text{}"),
-]
-
-REMOVED_EXPRESSIONS = [
-    "square",
-    "ways",
-    "integers",
-    "dollars",
-    "mph",
-    "inches",
-    "ft",
-    "hours",
-    "km",
-    "units",
-    "\\ldots",
-    "sue",
-    "points",
-    "feet",
-    "minutes",
-    "digits",
-    "cents",
-    "degrees",
-    "cm",
-    "gm",
-    "pounds",
-    "meters",
-    "meals",
-    "edges",
-    "students",
-    "childrentickets",
-    "multiples",
-    "\\text{s}",
-    "\\text{.}",
-    "\\text{\ns}",
-    "\\text{}^2",
-    "\\text{}^3",
-    "\\text{\n}",
-    "\\text{}",
-    r"\mathrm{th}",
-    r"^\circ",
-    r"^{\circ}",
-    r"\;",
-    r",\!",
-    "{,}",
-    '"',
-    "\\dots",
-]
-
-
-def try_evaluate_frac(expression: str, fmt: str = "0.2e") -> str:
-    """Attempt to evaluate LaTeX frac expressions in a string.
-
-    Args:
-        expression: string potentially containing frac patterns
-        fmt: format specifier for evaluated fractions
-
-    Returns:
-        String with evaluated fractions replaced by their numeric values
-    """
-    if isinstance(expression, float):
-        return expression
-    new_expression = f"{expression}"
-    regex = re.compile(r"\\frac{([^}]+)}{([^}]+)}")
-    for match in re.finditer(regex, expression):
-        try:
-            value = float(match.group(1)) / float(match.group(2))
-            new_expression = new_expression.replace(
-                match.group(),
-                f"{{value:{fmt}}}".format(value=value),
-                1,
-            )
-        except Exception:
-            continue
-    return new_expression
-
-
-def try_evaluate_latex(expression: str, fmt: str = ".2e") -> str:
-    """Attempt to evaluate a LaTeX math expression using sympy.
-
-    Args:
-        expression: LaTeX math expression string
-        fmt: format specifier for the result
-
-    Returns:
-        Formatted numeric string or the original expression if evaluation fails
-    """
-    try:
-        with time_limit(seconds=5):
-            from sympy.parsing.latex import parse_latex
-
-            value = parse_latex(expression).evalf()  # type: ignore
-            return f"{{value:{fmt}}}".format(value=value)
-    except Exception:
-        return expression
-
-
-def first_answer(text: str, markers: Sequence[str] = ("Q:", "A:")) -> str:
-    """Extract the first answer segment before any marker strings.
-
-    Args:
-        text: input text potentially containing markers
-        markers: sequence of marker strings to split on
-
-    Returns:
-        Text before the first occurrence of any marker
-    """
-    for marker in markers:
-        text = text.split(marker)[0]
-    return text
-
-
-def extract_result_from_boxed(answer: str) -> str:
-    """Extract the value from a LaTeX boxed expression.
-
-    Args:
-        answer: string containing a boxed expression
-
-    Returns:
-        The content inside the boxed command, or empty string if not found
-    """
-    box_start = "\\boxed"
-    # format is `\\boxed <value>$` or `\\boxed{<value>}`, with potential white spaces framing `<value>`
-    start = answer.rfind(box_start)
-    if start < 0:
-        return ""
-    answer = answer[start + len(box_start) :].strip()
-    ends_with_curly = answer.startswith("{")
-    i = 0
-    open_braces = 0
-    while i < len(answer):
-        if answer[i] == "{":
-            open_braces += 1
-        elif answer[i] == "}":
-            open_braces -= 1
-        if open_braces == 0:
-            if ends_with_curly:
-                answer = answer[: i + 1].strip()
-                break
-            elif answer[i] == "$":
-                answer = answer[:i].strip()
-                break
-        i += 1
-    else:
-        return ""
-    # remove extra curly braces
-    while True:
-        if answer.startswith("{") and answer.endswith("}"):
-            answer = answer[1:-1].strip()
-        else:
-            break
-    return answer
-
-
-# from minerva paper + _normalise_result from xavierm
-def normalize_final_answer(final_answer: str, regex_pattern: str, match_first: bool = True) -> str:
-    """Extract and normalize a final answer to a quantitative reasoning question."""
-    match = re.findall(regex_pattern, final_answer)
-    extraction: str
-    if len(match) > 0:
-        if match_first:
-            extraction = match[0]
-        else:
-            extraction = match[-1]
-    else:
-        extraction = extract_result_from_boxed(final_answer)
-
-    if len(extraction) == 0:
-        return final_answer
-    else:
-        final_answer = extraction
-    final_answer = final_answer.split("=")[-1]
-    for before, after in SUBSTITUTIONS:
-        final_answer = final_answer.replace(before, after)
-    for expr in REMOVED_EXPRESSIONS:
-        final_answer = final_answer.replace(expr, "")
-    # Extract answer that is in LaTeX math, is bold,
-    # is surrounded by a box, etc.
-    final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
-    final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
-    final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
-    final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
-    final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)
-    # Normalize shorthand TeX:
-    # \fracab -> \frac{a}{b}
-    # \frac{abc}{bef} -> \frac{abc}{bef}
-    # \fracabc -> \frac{a}{b}c
-    # \sqrta -> \sqrt{a}
-    # \sqrtab -> sqrt{a}b
-    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
-    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
-    final_answer = final_answer.replace("$", "")
-    # Normalize 100,000 -> 100000
-    if final_answer.replace(",", "").isdigit():
-        final_answer = final_answer.replace(",", "")
-    # If the final answer is a single letter in parentheses, remove the parentheses
-    # Example: (a) -> a (but not (ab) -> ab)
-    if re.match(r"\([a-zA-Z]\)", final_answer):
-        final_answer = final_answer[1]
-    return _normalise_result(final_answer)
-
-
-def _normalise_result(string: str) -> str:
-    # linebreaks
-    string = string.replace("\n", "")
-
-    # remove inverse spaces
-    string = string.replace("\\!", "")
-
-    # replace \\ with \
-    string = string.replace("\\\\", "\\")
-
-    # replace tfrac and dfrac with frac
-    string = string.replace("cfrac", "frac")
-    string = string.replace("tfrac", "frac")
-    string = string.replace("dfrac", "frac")
-
-    # remove \left and \right
-    string = string.replace("\\left", "")
-    string = string.replace("\\le", "")
-    string = string.replace("\\right", "")
-
-    # Remove circ (degrees)
-    string = string.replace("^{\\circ}", "")
-    string = string.replace("^\\circ", "")
-
-    # remove dollar signs
-    string = string.replace("\\$", "")
-
-    # remove units (on the right)
-    string = _remove_right_units(string)
-
-    # remove percentage
-    string = string.replace("\\%", "")
-    string = string.replace(r"\%", "")
-
-    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
-    string = string.replace(" .", " 0.")
-    string = string.replace("{.", "{0.")
-    # if empty, return empty string
-    if len(string) == 0:
-        return string
-    if string[0] == ".":
-        string = "0" + string
-
-    # to consider: get rid of e.g. "k = " or "q = " at beginning
-    string = string.split("=")[-1]
-
-    # fix sqrt3 --> sqrt{3}
-    string = _fix_sqrt(string)
-
-    # remove spaces
-    string = string.replace(" ", "")
-
-    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
-    string = _fix_fracs(string)
-
-    # manually change 0.5 --> \frac{1}{2}
-    if string == "0.5":
-        string = "\\frac{1}{2}"
-
-    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
-    string = _fix_a_slash_b(string)
-
-    return string
-
-
-def _remove_right_units(string: str) -> str:
-    # "\\text{ " only ever occurs (at least in the val set) when describing units
-    try:
-        if "\\text{ " in string:
-            splits = string.split("\\text{ ")
-            assert len(splits) == 2
-            return splits[0]
-        else:
-            return string
-    except AssertionError:
-        return string
-
-
-def _fix_sqrt(string: str) -> str:
-    if "\\sqrt" not in string:
-        return string
-    splits = string.split("\\sqrt")
-    new_string = splits[0]
-    for split in splits[1:]:
-        if len(split) == 0:
-            return string
-        if split[0] != "{":
-            a = split[0]
-            new_substr = "\\sqrt{" + a + "}" + split[1:]
-        else:
-            new_substr = "\\sqrt" + split
-        new_string += new_substr
-    return new_string
-
-
-def _fix_fracs(string: str) -> str:
-    substrs = string.split("\\frac")
-    new_str = substrs[0]
-    if len(substrs) > 1:
-        substrs = substrs[1:]
-        for substr in substrs:
-            new_str += "\\frac"
-            if len(substr) == 0:
-                return string
-            if substr[0] == "{":
-                new_str += substr
-            else:
-                try:
-                    assert len(substr) >= 2
-                except AssertionError:
-                    return string
-                a = substr[0]
-                b = substr[1]
-                if b != "{":
-                    if len(substr) > 2:
-                        post_substr = substr[2:]
-                        new_str += "{" + a + "}{" + b + "}" + post_substr
-                    else:
-                        new_str += "{" + a + "}{" + b + "}"
-                else:
-                    if len(substr) > 2:
-                        post_substr = substr[2:]
-                        new_str += "{" + a + "}" + b + post_substr
-                    else:
-                        new_str += "{" + a + "}" + b
-    string = new_str
-    return string
-
-
-def _fix_a_slash_b(string: str) -> str:
-    if len(string.split("/")) != 2:
-        return string
-    a = string.split("/")[0]
-    b = string.split("/")[1]
-    try:
-        ia = int(a)
-        ib = int(b)
-        assert string == f"{ia}/{ib}"
-        new_string = "\\frac{" + str(ia) + "}{" + str(ib) + "}"
-        return new_string
-    except (ValueError, AssertionError):
-        return string
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/__init__.py b/src/llama_stack/providers/inline/scoring/braintrust/__init__.py
deleted file mode 100644
index aa526b1714..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/__init__.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel, SecretStr
-
-from llama_stack.core.datatypes import Api
-
-from .config import BraintrustScoringConfig
-
-
-class BraintrustProviderDataValidator(BaseModel):
-    """Validator for Braintrust provider data requiring an OpenAI API key."""
-
-    openai_api_key: SecretStr
-
-
-async def get_provider_impl(
-    config: BraintrustScoringConfig,
-    deps: dict[Api, Any],
-):
-    from .braintrust import BraintrustScoringImpl
-
-    impl = BraintrustScoringImpl(config, deps[Api.datasetio], deps[Api.datasets])
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/src/llama_stack/providers/inline/scoring/braintrust/braintrust.py
deleted file mode 100644
index 9b860670b1..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/braintrust.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import os
-import threading
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-from llama_stack.core.request_headers import NeedsRequestProviderData
-from llama_stack.providers.utils.common.data_schema_validator import (
-    get_valid_schemas,
-    validate_row_schema,
-)
-from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics
-from llama_stack_api import (
-    DatasetIO,
-    Datasets,
-    IterRowsRequest,
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    Scoring,
-    ScoringFn,
-    ScoringFunctionsProtocolPrivate,
-    ScoringResult,
-    ScoringResultRow,
-)
-
-from .config import BraintrustScoringConfig
-from .scoring_fn.fn_defs.answer_correctness import answer_correctness_fn_def
-from .scoring_fn.fn_defs.answer_relevancy import answer_relevancy_fn_def
-from .scoring_fn.fn_defs.answer_similarity import answer_similarity_fn_def
-from .scoring_fn.fn_defs.context_entity_recall import context_entity_recall_fn_def
-from .scoring_fn.fn_defs.context_precision import context_precision_fn_def
-from .scoring_fn.fn_defs.context_recall import context_recall_fn_def
-from .scoring_fn.fn_defs.context_relevancy import context_relevancy_fn_def
-from .scoring_fn.fn_defs.factuality import factuality_fn_def
-from .scoring_fn.fn_defs.faithfulness import faithfulness_fn_def
-
-# Mapping of scoring function identifiers to their definitions (lightweight, no heavy imports)
-SUPPORTED_BRAINTRUST_SCORING_FN_DEFS: dict[str, ScoringFn] = {
-    "braintrust::factuality": factuality_fn_def,
-    "braintrust::answer-correctness": answer_correctness_fn_def,
-    "braintrust::answer-relevancy": answer_relevancy_fn_def,
-    "braintrust::answer-similarity": answer_similarity_fn_def,
-    "braintrust::faithfulness": faithfulness_fn_def,
-    "braintrust::context-entity-recall": context_entity_recall_fn_def,
-    "braintrust::context-precision": context_precision_fn_def,
-    "braintrust::context-recall": context_recall_fn_def,
-    "braintrust::context-relevancy": context_relevancy_fn_def,
-}
-
-# Lazy-loaded evaluators (defers loading autoevals and its pyarrow dependency)
-_braintrust_evaluators: dict[str, Any] | None = None
-_braintrust_evaluators_lock = threading.Lock()
-
-
-def _get_braintrust_evaluators() -> dict[str, Any]:
-    """Lazily load autoevals evaluators on first use.
-
-    This defers importing autoevals (and its pyarrow dependency) until
-    braintrust scoring is actually needed, saving ~63MB of memory at startup.
-    """
-    global _braintrust_evaluators
-    if _braintrust_evaluators is not None:
-        return _braintrust_evaluators
-
-    with _braintrust_evaluators_lock:
-        if _braintrust_evaluators is not None:
-            return _braintrust_evaluators
-
-        from autoevals.llm import Factuality
-        from autoevals.ragas import (
-            AnswerCorrectness,
-            AnswerRelevancy,
-            AnswerSimilarity,
-            ContextEntityRecall,
-            ContextPrecision,
-            ContextRecall,
-            ContextRelevancy,
-            Faithfulness,
-        )
-
-        _braintrust_evaluators = {
-            "braintrust::factuality": Factuality(),
-            "braintrust::answer-correctness": AnswerCorrectness(),
-            "braintrust::answer-relevancy": AnswerRelevancy(),
-            "braintrust::answer-similarity": AnswerSimilarity(),
-            "braintrust::faithfulness": Faithfulness(),
-            "braintrust::context-entity-recall": ContextEntityRecall(),
-            "braintrust::context-precision": ContextPrecision(),
-            "braintrust::context-recall": ContextRecall(),
-            "braintrust::context-relevancy": ContextRelevancy(),
-        }
-        return _braintrust_evaluators
-
-
-class BraintrustScoringImpl(
-    Scoring,
-    ScoringFunctionsProtocolPrivate,
-    NeedsRequestProviderData,
-):
-    """Scoring provider using Braintrust evaluators for LLM output assessment."""
-
-    def __init__(
-        self,
-        config: BraintrustScoringConfig,
-        datasetio_api: DatasetIO,
-        datasets_api: Datasets,
-    ) -> None:
-        self.config = config
-        self.datasetio_api = datasetio_api
-        self.datasets_api = datasets_api
-        self.supported_fn_defs_registry = SUPPORTED_BRAINTRUST_SCORING_FN_DEFS
-
-    async def initialize(self) -> None: ...
-
-    async def shutdown(self) -> None: ...
-
-    async def list_scoring_functions(self) -> list[ScoringFn]:
-        scoring_fn_defs_list = list(self.supported_fn_defs_registry.values())
-        for f in scoring_fn_defs_list:
-            assert f.identifier.startswith("braintrust"), (
-                "All braintrust scoring fn must have identifier prefixed with 'braintrust'! "
-            )
-
-        return scoring_fn_defs_list
-
-    async def register_scoring_function(self, scoring_fn: ScoringFn) -> None:
-        raise NotImplementedError("Registering scoring function not allowed for braintrust provider")
-
-    async def set_api_key(self) -> None:
-        # api key is in the request headers
-        if not self.config.openai_api_key:
-            provider_data = self.get_request_provider_data()
-            if provider_data is None or not provider_data.openai_api_key:
-                raise ValueError(
-                    'Pass OpenAI API Key in the header X-LlamaStack-Provider-Data as { "openai_api_key": <your api key>}'
-                )
-            self.config.openai_api_key = provider_data.openai_api_key
-
-        os.environ["OPENAI_API_KEY"] = self.config.openai_api_key
-
-    async def score_batch(
-        self,
-        request: ScoreBatchRequest,
-    ) -> ScoreBatchResponse:
-        await self.set_api_key()
-
-        all_rows = await self.datasetio_api.iterrows(IterRowsRequest(dataset_id=request.dataset_id, limit=-1))
-        score_request = ScoreRequest(
-            input_rows=all_rows.data,
-            scoring_functions=request.scoring_functions,
-        )
-        res = await self.score(score_request)
-        if request.save_results_dataset:
-            # TODO: persist and register dataset on to server for reading
-            # self.datasets_api.register_dataset()
-            raise NotImplementedError("Save results dataset not implemented yet")
-
-        return ScoreBatchResponse(
-            results=res.results,
-        )
-
-    async def score_row(self, input_row: dict[str, Any], scoring_fn_identifier: str | None = None) -> ScoringResultRow:
-        validate_row_schema(input_row, get_valid_schemas(Api.scoring.value))
-        await self.set_api_key()
-        assert scoring_fn_identifier is not None, "scoring_fn_identifier cannot be None"
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-        input_query = input_row["input_query"]
-        evaluators = _get_braintrust_evaluators()
-        evaluator = evaluators[scoring_fn_identifier]
-
-        result = evaluator(
-            generated_answer,
-            expected_answer,
-            input=input_query,
-            context=input_row["context"] if "context" in input_row else None,
-        )
-        score = result.score
-        return {"score": score, "metadata": result.metadata}
-
-    async def score(
-        self,
-        request: ScoreRequest,
-    ) -> ScoreResponse:
-        await self.set_api_key()
-        res = {}
-        for scoring_fn_id in request.scoring_functions:
-            if scoring_fn_id not in self.supported_fn_defs_registry:
-                raise ValueError(f"Scoring function {scoring_fn_id} is not supported.")
-
-            score_results = [await self.score_row(input_row, scoring_fn_id) for input_row in request.input_rows]
-            aggregation_functions = self.supported_fn_defs_registry[scoring_fn_id].params.aggregation_functions
-
-            # override scoring_fn params if provided
-            if request.scoring_functions[scoring_fn_id] is not None:
-                override_params = request.scoring_functions[scoring_fn_id]
-                if override_params.aggregation_functions:
-                    aggregation_functions = override_params.aggregation_functions
-
-            agg_results = aggregate_metrics(score_results, aggregation_functions)
-            res[scoring_fn_id] = ScoringResult(
-                score_rows=score_results,
-                aggregated_results=agg_results,
-            )
-
-        return ScoreResponse(
-            results=res,
-        )
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/config.py b/src/llama_stack/providers/inline/scoring/braintrust/config.py
deleted file mode 100644
index 1ef1915516..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/config.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-
-class BraintrustScoringConfig(BaseModel):
-    """Configuration for the Braintrust scoring provider."""
-
-    openai_api_key: str | None = Field(
-        default=None,
-        description="The OpenAI API Key",
-    )
-
-    @classmethod
-    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
-        return {
-            "openai_api_key": "${env.OPENAI_API_KEY:=}",
-        }
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py
deleted file mode 100644
index b058305b45..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-answer_correctness_fn_def = ScoringFn(
-    identifier="braintrust::answer-correctness",
-    description=(
-        "Scores the correctness of the answer based on the ground truth. "
-        "Uses Braintrust LLM-based scorer from autoevals library."
-    ),
-    provider_id="braintrust",
-    provider_resource_id="answer-correctness",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py
deleted file mode 100644
index d619d38a80..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-answer_relevancy_fn_def = ScoringFn(
-    identifier="braintrust::answer-relevancy",
-    description=(
-        "Test output relevancy against the input query using Braintrust LLM scorer. "
-        "See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="answer-relevancy",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py
deleted file mode 100644
index 34354a1fc2..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-answer_similarity_fn_def = ScoringFn(
-    identifier="braintrust::answer-similarity",
-    description=(
-        "Test output similarity against expected value using Braintrust LLM scorer. "
-        "See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="answer-similarity",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py
deleted file mode 100644
index 4092ccc4ad..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-context_entity_recall_fn_def = ScoringFn(
-    identifier="braintrust::context-entity-recall",
-    description=(
-        "Evaluates how well the context captures the named entities present in the "
-        "reference answer. See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="context-entity-recall",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py
deleted file mode 100644
index 2b32b9eec8..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-context_precision_fn_def = ScoringFn(
-    identifier="braintrust::context-precision",
-    description=(
-        "Measures how much of the provided context is actually relevant to answering the "
-        "question. See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="context-precision",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py
deleted file mode 100644
index 4d6547002d..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-context_recall_fn_def = ScoringFn(
-    identifier="braintrust::context-recall",
-    description=(
-        "Evaluates how well the context covers the information needed to answer the "
-        "question. See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="context-recall",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py
deleted file mode 100644
index 739dfd7bdb..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-context_relevancy_fn_def = ScoringFn(
-    identifier="braintrust::context-relevancy",
-    description=(
-        "Assesses how relevant the provided context is to the given question. See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="context-relevancy",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py
deleted file mode 100644
index 59ed5949bc..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-factuality_fn_def = ScoringFn(
-    identifier="braintrust::factuality",
-    description=(
-        "Test output factuality against expected value using Braintrust LLM scorer. "
-        "See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="factuality",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py
deleted file mode 100644
index 96c36d226a..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-faithfulness_fn_def = ScoringFn(
-    identifier="braintrust::faithfulness",
-    description=(
-        "Test output faithfulness to the input query using Braintrust LLM scorer. "
-        "See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="faithfulness",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py
deleted file mode 100644
index 76735fcb34..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-
-from .config import LlmAsJudgeScoringConfig
-
-
-async def get_provider_impl(
-    config: LlmAsJudgeScoringConfig,
-    deps: dict[Api, Any],
-):
-    from .scoring import LlmAsJudgeScoringImpl
-
-    impl = LlmAsJudgeScoringImpl(config, deps[Api.datasetio], deps[Api.datasets], deps[Api.inference])
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/config.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/config.py
deleted file mode 100644
index c5333afb98..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/config.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel
-
-
-class LlmAsJudgeScoringConfig(BaseModel):
-    """Configuration for the LLM-as-judge scoring provider."""
-
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {}
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
deleted file mode 100644
index fb96df3627..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    DatasetIO,
-    Datasets,
-    Inference,
-    IterRowsRequest,
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    Scoring,
-    ScoringFn,
-    ScoringFunctionsProtocolPrivate,
-    ScoringResult,
-)
-
-from .config import LlmAsJudgeScoringConfig
-from .scoring_fn.llm_as_judge_scoring_fn import LlmAsJudgeScoringFn
-
-LLM_JUDGE_FN = LlmAsJudgeScoringFn
-
-
-class LlmAsJudgeScoringImpl(
-    Scoring,
-    ScoringFunctionsProtocolPrivate,
-):
-    """Scoring provider that uses an LLM to evaluate and judge response quality."""
-
-    def __init__(
-        self,
-        config: LlmAsJudgeScoringConfig,
-        datasetio_api: DatasetIO,
-        datasets_api: Datasets,
-        inference_api: Inference,
-    ) -> None:
-        self.config = config
-        self.datasetio_api = datasetio_api
-        self.datasets_api = datasets_api
-        self.inference_api = inference_api
-
-    async def initialize(self) -> None:
-        impl = LLM_JUDGE_FN(inference_api=self.inference_api)
-        self.llm_as_judge_fn = impl
-
-    async def shutdown(self) -> None: ...
-
-    async def list_scoring_functions(self) -> list[ScoringFn]:
-        scoring_fn_defs_list = self.llm_as_judge_fn.get_supported_scoring_fn_defs()
-
-        for f in self.llm_as_judge_fn.get_supported_scoring_fn_defs():
-            assert f.identifier.startswith("llm-as-judge"), (
-                "All llm-as-judge scoring fn must have identifier prefixed with 'llm-as-judge'! "
-            )
-
-        return scoring_fn_defs_list
-
-    async def register_scoring_function(self, function_def: ScoringFn) -> None:
-        self.llm_as_judge_fn.register_scoring_fn_def(function_def)
-
-    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
-        self.llm_as_judge_fn.unregister_scoring_fn_def(scoring_fn_id)
-
-    async def score_batch(
-        self,
-        request: ScoreBatchRequest,
-    ) -> ScoreBatchResponse:
-        all_rows = await self.datasetio_api.iterrows(IterRowsRequest(dataset_id=request.dataset_id, limit=-1))
-        score_request = ScoreRequest(
-            input_rows=all_rows.data,
-            scoring_functions=request.scoring_functions,
-        )
-        res = await self.score(score_request)
-        if request.save_results_dataset:
-            # TODO: persist and register dataset on to server for reading
-            # self.datasets_api.register_dataset()
-            raise NotImplementedError("Save results dataset not implemented yet")
-
-        return ScoreBatchResponse(
-            results=res.results,
-        )
-
-    async def score(
-        self,
-        request: ScoreRequest,
-    ) -> ScoreResponse:
-        res = {}
-        for scoring_fn_id in request.scoring_functions.keys():
-            scoring_fn = self.llm_as_judge_fn
-            scoring_fn_params = request.scoring_functions.get(scoring_fn_id, None)
-            score_results = await scoring_fn.score(request.input_rows, scoring_fn_id, scoring_fn_params)
-            agg_results = await scoring_fn.aggregate(score_results, scoring_fn_id, scoring_fn_params)
-            res[scoring_fn_id] = ScoringResult(
-                score_rows=score_results,
-                aggregated_results=agg_results,
-            )
-
-        return ScoreResponse(
-            results=res,
-        )
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py
deleted file mode 100644
index ed26169a5a..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    LLMAsJudgeScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-GRADER_TEMPLATE = """
-Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
-First, I will give examples of each grade, and then you will grade a new example.
-The following are examples of CORRECT predicted answers.
-```
-Question: What are the names of Barack Obama's children?
-Gold target: Malia Obama and Sasha Obama
-Predicted answer 1: sasha and malia obama
-Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
-Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
-```
-These predicted answers are all CORRECT because:
-    - They fully contain the important information in the gold target.
-    - They do not contain any information that contradicts the gold target.
-    - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
-    - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
-The following are examples of INCORRECT predicted answers.
-```
-Question: What are the names of Barack Obama's children?
-Gold target: Malia and Sasha
-Predicted answer 1: Malia.
-Predicted answer 2: Malia, Sasha, and Susan.
-Predicted answer 3: Barack Obama does not have any children.
-Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
-Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
-Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
-Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
-```
-These predicted answers are all INCORRECT because:
-    - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
-The following are examples of NOT_ATTEMPTED predicted answers.
-```
-Question: What are the names of Barack Obama's children?
-Gold target: Malia and Sasha
-Predicted answer 1: I don't know.
-Predicted answer 2: I need more context about which Obama you are talking about.
-Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
-Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
-```
-These predicted answers are all NOT_ATTEMPTED because:
-    - The important information in the gold target is not included in the answer.
-    - No statements in the answer contradict the gold target.
-Also note the following things:
-- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
-    - Predicted answers "120k", "124k", and 115k" are all CORRECT.
-    - Predicted answers "100k" and "113k" are INCORRECT.
-    - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
-- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
-    - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
-- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
-    - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
-    - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
-    - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
-    - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
-- Do not punish for typos in people's name if it's clearly the same name.
-    - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
-Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
-```
-Question: {input_query}
-Gold target: {expected_answer}
-Predicted answer: {generated_answer}
-```
-Grade the predicted answer of this new question as one of:
-A: CORRECT
-B: INCORRECT
-C: NOT_ATTEMPTED
-Just return the letters "A", "B", or "C", with no text around it.
-""".strip()
-
-
-llm_as_judge_405b_simpleqa = ScoringFn(
-    identifier="llm-as-judge::405b-simpleqa",
-    description="Llm As Judge Scoring Function for SimpleQA Benchmark (https://github.com/openai/simple-evals/blob/main/simpleqa_eval.py)",
-    return_type=NumberType(),
-    provider_id="llm-as-judge",
-    provider_resource_id="llm-as-judge-405b-simpleqa",
-    params=LLMAsJudgeScoringFnParams(
-        judge_model="meta-llama/Llama-3.1-405B-Instruct",
-        prompt_template=GRADER_TEMPLATE,
-        judge_score_regexes=[r"(A|B|C)"],
-        aggregation_functions=[AggregationFunctionType.categorical_count.value],
-    ),
-)
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py
deleted file mode 100644
index bffffd878c..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import LLMAsJudgeScoringFnParams, NumberType, ScoringFn
-
-llm_as_judge_base = ScoringFn(
-    identifier="llm-as-judge::base",
-    description="Llm As Judge Scoring Function",
-    return_type=NumberType(),
-    provider_id="llm-as-judge",
-    provider_resource_id="llm-as-judge-base",
-    params=LLMAsJudgeScoringFnParams(
-        judge_model="meta-llama/Llama-3.1-405B-Instruct",
-        prompt_template="Enter custom LLM as Judge Prompt Template",
-    ),
-)
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
deleted file mode 100644
index 73ce82cda2..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import re
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import Inference, OpenAIChatCompletionRequestWithExtraBody, ScoringFnParams, ScoringResultRow
-
-from .fn_defs.llm_as_judge_405b_simpleqa import llm_as_judge_405b_simpleqa
-from .fn_defs.llm_as_judge_base import llm_as_judge_base
-
-
-class LlmAsJudgeScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn that assigns
-    """
-
-    def __init__(self, inference_api: Inference, *arg, **kwargs) -> None:
-        super().__init__(*arg, **kwargs)
-        self.inference_api = inference_api
-        self.supported_fn_defs_registry = {
-            llm_as_judge_base.identifier: llm_as_judge_base,
-            llm_as_judge_405b_simpleqa.identifier: llm_as_judge_405b_simpleqa,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
-        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
-
-        # override params if scoring_params is provided
-        if scoring_params is not None:
-            fn_def.params = scoring_params
-
-        assert fn_def.params is not None, f"LLMAsJudgeparams not found for {fn_def}."
-        assert fn_def.params.prompt_template is not None, "LLM Judge prompt_template not found."
-        assert fn_def.params.judge_score_regexes is not None, "LLM Judge judge_score_regexes not found."
-
-        input_query = input_row["input_query"]
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-
-        judge_input_msg = fn_def.params.prompt_template.format(
-            input_query=input_query,
-            expected_answer=expected_answer,
-            generated_answer=generated_answer,
-        )
-
-        params = OpenAIChatCompletionRequestWithExtraBody(
-            model=fn_def.params.judge_model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": judge_input_msg,
-                }
-            ],
-        )
-        judge_response = await self.inference_api.openai_chat_completion(params)
-        content = judge_response.choices[0].message.content
-        rating_regexes = fn_def.params.judge_score_regexes
-
-        judge_rating = None
-        for regex in rating_regexes:
-            match = re.search(regex, content)
-            if match:
-                judge_rating = match.group(1)
-                break
-
-        return {
-            "score": judge_rating,
-            "judge_feedback": content,
-        }
diff --git a/src/llama_stack/providers/registry/README.md b/src/llama_stack/providers/registry/README.md
index 4be26b52e3..36147e9d73 100644
--- a/src/llama_stack/providers/registry/README.md
+++ b/src/llama_stack/providers/registry/README.md
@@ -8,14 +8,11 @@ Provider spec declarations. Each file defines which providers are available for
 registry/
   __init__.py
   batches.py           # Batch processing providers
-  datasetio.py         # Dataset I/O providers
-  eval.py              # Evaluation providers
   file_processors.py   # File processor providers
   files.py             # File storage providers
   inference.py         # Inference providers (20+ remote + 2 inline)
   responses.py         # Responses API providers (inline::builtin)
   safety.py            # Safety providers (llama-guard, bedrock, etc.)
-  scoring.py           # Scoring function providers
   tool_runtime.py      # Tool runtime providers
   vector_io.py         # Vector I/O providers
 ```
diff --git a/src/llama_stack/providers/registry/datasetio.py b/src/llama_stack/providers/registry/datasetio.py
deleted file mode 100644
index d115af694a..0000000000
--- a/src/llama_stack/providers/registry/datasetio.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack_api import (
-    Api,
-    InlineProviderSpec,
-    ProviderSpec,
-    RemoteProviderSpec,
-)
-
-
-def available_providers() -> list[ProviderSpec]:
-    """Return the list of available dataset I/O provider specifications.
-
-    Returns:
-        List of ProviderSpec objects describing available providers
-    """
-    return [
-        InlineProviderSpec(
-            api=Api.datasetio,
-            provider_type="inline::localfs",
-            pip_packages=["pandas"],
-            module="llama_stack.providers.inline.datasetio.localfs",
-            config_class="llama_stack.providers.inline.datasetio.localfs.LocalFSDatasetIOConfig",
-            api_dependencies=[],
-            description="Local filesystem-based dataset I/O provider for reading and writing datasets to local storage.",
-        ),
-        RemoteProviderSpec(
-            api=Api.datasetio,
-            adapter_type="huggingface",
-            provider_type="remote::huggingface",
-            pip_packages=[
-                "datasets>=4.0.0",
-            ],
-            module="llama_stack.providers.remote.datasetio.huggingface",
-            config_class="llama_stack.providers.remote.datasetio.huggingface.HuggingfaceDatasetIOConfig",
-            description="HuggingFace datasets provider for accessing and managing datasets from the HuggingFace Hub.",
-        ),
-        RemoteProviderSpec(
-            api=Api.datasetio,
-            adapter_type="nvidia",
-            provider_type="remote::nvidia",
-            module="llama_stack.providers.remote.datasetio.nvidia",
-            config_class="llama_stack.providers.remote.datasetio.nvidia.NvidiaDatasetIOConfig",
-            pip_packages=[
-                "datasets>=4.0.0",
-            ],
-            description="NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data platform.",
-        ),
-    ]
diff --git a/src/llama_stack/providers/registry/eval.py b/src/llama_stack/providers/registry/eval.py
deleted file mode 100644
index 5917b00f80..0000000000
--- a/src/llama_stack/providers/registry/eval.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack_api import Api, InlineProviderSpec, ProviderSpec, RemoteProviderSpec
-
-
-def available_providers() -> list[ProviderSpec]:
-    """Return the list of available evaluation provider specifications.
-
-    Returns:
-        List of ProviderSpec objects describing available providers
-    """
-    return [
-        InlineProviderSpec(
-            api=Api.eval,
-            provider_type="inline::builtin",
-            pip_packages=["tree_sitter", "pythainlp", "langdetect", "emoji", "nltk>=3.9.4"],
-            module="llama_stack.providers.inline.eval.builtin",
-            config_class="llama_stack.providers.inline.eval.builtin.BuiltinEvalConfig",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-                Api.scoring,
-                Api.inference,
-                Api.responses,
-            ],
-            description="Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.",
-        ),
-        RemoteProviderSpec(
-            api=Api.eval,
-            adapter_type="nvidia",
-            pip_packages=[
-                "requests",
-            ],
-            provider_type="remote::nvidia",
-            module="llama_stack.providers.remote.eval.nvidia",
-            config_class="llama_stack.providers.remote.eval.nvidia.NVIDIAEvalConfig",
-            description="NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-                Api.scoring,
-                Api.inference,
-                Api.responses,
-            ],
-        ),
-    ]
diff --git a/src/llama_stack/providers/registry/scoring.py b/src/llama_stack/providers/registry/scoring.py
deleted file mode 100644
index 37a2534ceb..0000000000
--- a/src/llama_stack/providers/registry/scoring.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack_api import Api, InlineProviderSpec, ProviderSpec
-
-
-def available_providers() -> list[ProviderSpec]:
-    """Return the list of available scoring provider specifications.
-
-    Returns:
-        List of ProviderSpec objects describing available providers
-    """
-    return [
-        InlineProviderSpec(
-            api=Api.scoring,
-            provider_type="inline::basic",
-            pip_packages=["requests"],
-            module="llama_stack.providers.inline.scoring.basic",
-            config_class="llama_stack.providers.inline.scoring.basic.BasicScoringConfig",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-            ],
-            description="Basic scoring provider for simple evaluation metrics and scoring functions.",
-        ),
-        InlineProviderSpec(
-            api=Api.scoring,
-            provider_type="inline::llm-as-judge",
-            pip_packages=[],
-            module="llama_stack.providers.inline.scoring.llm_as_judge",
-            config_class="llama_stack.providers.inline.scoring.llm_as_judge.LlmAsJudgeScoringConfig",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-                Api.inference,
-            ],
-            description="LLM-as-judge scoring provider that uses language models to evaluate and score responses.",
-        ),
-        InlineProviderSpec(
-            api=Api.scoring,
-            provider_type="inline::braintrust",
-            pip_packages=["autoevals"],
-            module="llama_stack.providers.inline.scoring.braintrust",
-            config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-            ],
-            provider_data_validator="llama_stack.providers.inline.scoring.braintrust.BraintrustProviderDataValidator",
-            description="Braintrust scoring provider for evaluation and scoring using the Braintrust platform.",
-        ),
-    ]
diff --git a/src/llama_stack/providers/remote/datasetio/__init__.py b/src/llama_stack/providers/remote/datasetio/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/remote/datasetio/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/remote/datasetio/huggingface/__init__.py b/src/llama_stack/providers/remote/datasetio/huggingface/__init__.py
deleted file mode 100644
index db803d1838..0000000000
--- a/src/llama_stack/providers/remote/datasetio/huggingface/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .config import HuggingfaceDatasetIOConfig
-
-
-async def get_adapter_impl(
-    config: HuggingfaceDatasetIOConfig,
-    _deps,
-):
-    from .huggingface import HuggingfaceDatasetIOImpl
-
-    impl = HuggingfaceDatasetIOImpl(config)
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/remote/datasetio/huggingface/config.py b/src/llama_stack/providers/remote/datasetio/huggingface/config.py
deleted file mode 100644
index 76f5aeca58..0000000000
--- a/src/llama_stack/providers/remote/datasetio/huggingface/config.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel
-
-from llama_stack.core.storage.datatypes import KVStoreReference
-
-
-class HuggingfaceDatasetIOConfig(BaseModel):
-    """Configuration for the HuggingFace dataset I/O provider."""
-
-    kvstore: KVStoreReference
-
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {
-            "kvstore": KVStoreReference(
-                backend="kv_default",
-                namespace="datasetio::huggingface",
-            ).model_dump(exclude_none=True)
-        }
diff --git a/src/llama_stack/providers/remote/datasetio/huggingface/huggingface.py b/src/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
deleted file mode 100644
index 6ec5b4446b..0000000000
--- a/src/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-from urllib.parse import parse_qs, urlparse
-
-from llama_stack.core.storage.kvstore import kvstore_impl
-from llama_stack.providers.utils.pagination import paginate_records
-from llama_stack_api import Dataset, DatasetIO, DatasetsProtocolPrivate, PaginatedResponse
-
-from .config import HuggingfaceDatasetIOConfig
-
-DATASETS_PREFIX = "datasets:"
-
-
-def parse_hf_params(dataset_def: Dataset):
-    """Parse HuggingFace dataset URI into path and query parameters.
-
-    Args:
-        dataset_def: dataset definition containing the source URI
-
-    Returns:
-        Tuple of (dataset_path, params_dict)
-    """
-    uri = dataset_def.source.uri
-    parsed_uri = urlparse(uri)
-    params = parse_qs(parsed_uri.query)
-    params = {k: v[0] for k, v in params.items()}
-    path = parsed_uri.path.lstrip("/")
-
-    return path, params
-
-
-class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
-    """Dataset I/O provider for loading datasets from HuggingFace Hub."""
-
-    def __init__(self, config: HuggingfaceDatasetIOConfig) -> None:
-        self.config = config
-        # local registry for keeping track of datasets within the provider
-        self.dataset_infos = {}
-        self.kvstore = None
-
-    async def initialize(self) -> None:
-        self.kvstore = await kvstore_impl(self.config.kvstore)
-        # Load existing datasets from kvstore
-        start_key = DATASETS_PREFIX
-        end_key = f"{DATASETS_PREFIX}\xff"
-        stored_datasets = await self.kvstore.values_in_range(start_key, end_key)
-
-        for dataset in stored_datasets:
-            dataset = Dataset.model_validate_json(dataset)
-            self.dataset_infos[dataset.identifier] = dataset
-
-    async def shutdown(self) -> None: ...
-
-    async def register_dataset(
-        self,
-        dataset_def: Dataset,
-    ) -> None:
-        # Store in kvstore
-        key = f"{DATASETS_PREFIX}{dataset_def.identifier}"
-        await self.kvstore.set(
-            key=key,
-            value=dataset_def.model_dump_json(),
-        )
-        self.dataset_infos[dataset_def.identifier] = dataset_def
-
-    async def unregister_dataset(self, dataset_id: str) -> None:
-        key = f"{DATASETS_PREFIX}{dataset_id}"
-        await self.kvstore.delete(key=key)
-        del self.dataset_infos[dataset_id]
-
-    async def iterrows(
-        self,
-        dataset_id: str,
-        start_index: int | None = None,
-        limit: int | None = None,
-    ) -> PaginatedResponse:
-        import datasets as hf_datasets
-
-        dataset_def = self.dataset_infos[dataset_id]
-        path, params = parse_hf_params(dataset_def)
-        loaded_dataset = hf_datasets.load_dataset(path, **params)
-
-        records = [loaded_dataset[i] for i in range(len(loaded_dataset))]
-        return paginate_records(records, start_index, limit)
-
-    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
-        import datasets as hf_datasets
-
-        dataset_def = self.dataset_infos[dataset_id]
-        path, params = parse_hf_params(dataset_def)
-        loaded_dataset = hf_datasets.load_dataset(path, **params)
-
-        # Convert rows to HF Dataset format
-        new_dataset = hf_datasets.Dataset.from_list(rows)
-
-        # Concatenate the new rows with existing dataset
-        updated_dataset = hf_datasets.concatenate_datasets([loaded_dataset, new_dataset])
-
-        if dataset_def.metadata.get("path", None):
-            updated_dataset.push_to_hub(dataset_def.metadata["path"])
-        else:
-            raise NotImplementedError("Uploading to URL-based datasets is not supported yet")
diff --git a/src/llama_stack/providers/remote/datasetio/nvidia/README.md b/src/llama_stack/providers/remote/datasetio/nvidia/README.md
deleted file mode 100644
index a872c61303..0000000000
--- a/src/llama_stack/providers/remote/datasetio/nvidia/README.md
+++ /dev/null
@@ -1,74 +0,0 @@
-# NVIDIA DatasetIO Provider for LlamaStack
-
-This provider enables dataset management using NVIDIA's NeMo Customizer service.
-
-## Features
-
-- Register datasets for fine-tuning LLMs
-- Unregister datasets
-
-## Getting Started
-
-### Prerequisites
-
-- LlamaStack with NVIDIA configuration
-- Access to Hosted NVIDIA NeMo Microservice
-- API key for authentication with the NVIDIA service
-
-### Setup
-
-Build the NVIDIA environment:
-
-```bash
-uv pip install llama-stack-client
-uv run llama stack list-deps nvidia | xargs -L1 uv pip install
-```
-
-### Basic Usage using the LlamaStack Python Client
-
-#### Initialize the client
-
-```python
-import os
-
-os.environ["NVIDIA_API_KEY"] = "your-api-key"
-os.environ["NVIDIA_CUSTOMIZER_URL"] = "http://nemo.test"
-os.environ["NVIDIA_DATASET_NAMESPACE"] = "default"
-os.environ["NVIDIA_PROJECT_ID"] = "test-project"
-from llama_stack.core.library_client import LlamaStackAsLibraryClient
-
-client = LlamaStackAsLibraryClient("nvidia")
-client.initialize()
-```
-
-#### Register a dataset
-
-```python
-client.datasets.register(
-    purpose="eval/question-answer",
-    dataset_id="my-eval-dataset",
-    source={"type": "uri", "uri": "hf://datasets/default/sample-dataset"},
-    metadata={
-        "format": "json",
-        "description": "Dataset for evaluation",
-        "provider": "nvidia",
-    },
-)
-```
-
-#### Get a list of all registered datasets
-
-```python
-datasets = client.datasets.list()
-for dataset in datasets:
-    print(f"Dataset ID: {dataset.identifier}")
-    print(f"Description: {dataset.metadata.get('description', '')}")
-    print(f"Source: {dataset.source.uri}")
-    print("---")
-```
-
-#### Unregister a dataset
-
-```python
-client.datasets.unregister(dataset_id="my-training-dataset")
-```
diff --git a/src/llama_stack/providers/remote/datasetio/nvidia/__init__.py b/src/llama_stack/providers/remote/datasetio/nvidia/__init__.py
deleted file mode 100644
index 418daec8d8..0000000000
--- a/src/llama_stack/providers/remote/datasetio/nvidia/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .config import NvidiaDatasetIOConfig
-
-
-async def get_adapter_impl(
-    config: NvidiaDatasetIOConfig,
-    _deps,
-):
-    from .datasetio import NvidiaDatasetIOAdapter
-
-    if not isinstance(config, NvidiaDatasetIOConfig):
-        raise RuntimeError(f"Unexpected config type: {type(config)}")
-
-    impl = NvidiaDatasetIOAdapter(config)
-    return impl
-
-
-__all__ = ["get_adapter_impl", "NvidiaDatasetIOAdapter"]
diff --git a/src/llama_stack/providers/remote/datasetio/nvidia/config.py b/src/llama_stack/providers/remote/datasetio/nvidia/config.py
deleted file mode 100644
index addce6c1f0..0000000000
--- a/src/llama_stack/providers/remote/datasetio/nvidia/config.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-import warnings
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-
-class NvidiaDatasetIOConfig(BaseModel):
-    """Configuration for NVIDIA DatasetIO implementation."""
-
-    api_key: str | None = Field(
-        default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
-        description="The NVIDIA API key.",
-    )
-
-    dataset_namespace: str | None = Field(
-        default_factory=lambda: os.getenv("NVIDIA_DATASET_NAMESPACE", "default"),
-        description="The NVIDIA dataset namespace.",
-    )
-
-    project_id: str | None = Field(
-        default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-project"),
-        description="The NVIDIA project ID.",
-    )
-
-    datasets_url: str = Field(
-        default_factory=lambda: os.getenv("NVIDIA_DATASETS_URL", "http://nemo.test"),
-        description="Base URL for the NeMo Dataset API",
-    )
-
-    # warning for default values
-    def __post_init__(self):
-        default_values = []
-        if os.getenv("NVIDIA_PROJECT_ID") is None:
-            default_values.append("project_id='test-project'")
-        if os.getenv("NVIDIA_DATASET_NAMESPACE") is None:
-            default_values.append("dataset_namespace='default'")
-        if os.getenv("NVIDIA_DATASETS_URL") is None:
-            default_values.append("datasets_url='http://nemo.test'")
-
-        if default_values:
-            warnings.warn(
-                f"Using default values: {', '.join(default_values)}. \
-                          Please set the environment variables to avoid this default behavior.",
-                stacklevel=2,
-            )
-
-    @classmethod
-    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
-        return {
-            "api_key": "${env.NVIDIA_API_KEY:=}",
-            "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:=default}",
-            "project_id": "${env.NVIDIA_PROJECT_ID:=test-project}",
-            "datasets_url": "${env.NVIDIA_DATASETS_URL:=http://nemo.test}",
-        }
diff --git a/src/llama_stack/providers/remote/datasetio/nvidia/datasetio.py b/src/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
deleted file mode 100644
index 2f5548fa96..0000000000
--- a/src/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-import aiohttp
-
-from llama_stack_api import URL, Dataset, PaginatedResponse, ParamType
-
-from .config import NvidiaDatasetIOConfig
-
-
-class NvidiaDatasetIOAdapter:
-    """Nvidia NeMo DatasetIO API."""
-
-    def __init__(self, config: NvidiaDatasetIOConfig):
-        self.config = config
-        self.headers = {}
-
-    async def _make_request(
-        self,
-        method: str,
-        path: str,
-        headers: dict[str, Any] | None = None,
-        params: dict[str, Any] | None = None,
-        json: dict[str, Any] | None = None,
-        **kwargs,
-    ) -> dict[str, Any]:
-        """Helper method to make HTTP requests to the Customizer API."""
-        url = f"{self.config.datasets_url}{path}"
-        request_headers = self.headers.copy()
-
-        # Set default Content-Type for JSON requests
-        if json is not None:
-            request_headers["Content-Type"] = "application/json"
-
-        if headers:
-            request_headers.update(headers)
-
-        async with aiohttp.ClientSession(headers=request_headers) as session:
-            async with session.request(method, url, params=params, json=json, **kwargs) as response:
-                if response.status != 200:
-                    error_data = await response.json()
-                    raise Exception(f"API request failed: {error_data}")
-                return await response.json()
-
-    async def register_dataset(
-        self,
-        dataset_def: Dataset,
-    ) -> Dataset:
-        """Register a new dataset.
-
-        Args:
-            dataset_def [Dataset]: The dataset definition.
-                dataset_id [str]: The ID of the dataset.
-                source [DataSource]: The source of the dataset.
-                metadata [Dict[str, Any]]: The metadata of the dataset.
-                    format [str]: The format of the dataset.
-                    description [str]: The description of the dataset.
-        Returns:
-            Dataset
-        """
-        # add warnings for unsupported params
-        request_body = {
-            "name": dataset_def.identifier,
-            "namespace": self.config.dataset_namespace,
-            "files_url": dataset_def.source.uri,
-            "project": self.config.project_id,
-        }
-        if dataset_def.metadata:
-            request_body["format"] = dataset_def.metadata.get("format")
-            request_body["description"] = dataset_def.metadata.get("description")
-        await self._make_request(
-            "POST",
-            "/v1/datasets",
-            json=request_body,
-        )
-        return dataset_def
-
-    async def update_dataset(
-        self,
-        dataset_id: str,
-        dataset_schema: dict[str, ParamType],
-        url: URL,
-        provider_dataset_id: str | None = None,
-        provider_id: str | None = None,
-        metadata: dict[str, Any] | None = None,
-    ) -> None:
-        raise NotImplementedError("Not implemented")
-
-    async def unregister_dataset(
-        self,
-        dataset_id: str,
-    ) -> None:
-        await self._make_request(
-            "DELETE",
-            f"/v1/datasets/{self.config.dataset_namespace}/{dataset_id}",
-            headers={"Accept": "application/json", "Content-Type": "application/json"},
-        )
-
-    async def iterrows(
-        self,
-        dataset_id: str,
-        start_index: int | None = None,
-        limit: int | None = None,
-    ) -> PaginatedResponse:
-        raise NotImplementedError("Not implemented")
-
-    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
-        raise NotImplementedError("Not implemented")
diff --git a/src/llama_stack/providers/remote/eval/__init__.py b/src/llama_stack/providers/remote/eval/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/remote/eval/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/remote/eval/nvidia/README.md b/src/llama_stack/providers/remote/eval/nvidia/README.md
deleted file mode 100644
index 4443d484be..0000000000
--- a/src/llama_stack/providers/remote/eval/nvidia/README.md
+++ /dev/null
@@ -1,142 +0,0 @@
-# NVIDIA NeMo Evaluator Eval Provider
-
-## Overview
-
-For the first integration, Benchmarks are mapped to Evaluation Configs on in the NeMo Evaluator. The full evaluation config object is provided as part of the meta-data. The `dataset_id` and `scoring_functions` are not used.
-
-Below are a few examples of how to register a benchmark, which in turn will create an evaluation config in NeMo Evaluator and how to trigger an evaluation.
-
-### Example for register an academic benchmark
-
-```text
-POST /eval/benchmarks
-```
-
-```json
-{
-  "benchmark_id": "mmlu",
-  "dataset_id": "",
-  "scoring_functions": [],
-  "metadata": {
-    "type": "mmlu"
-  }
-}
-```
-
-### Example for register a custom evaluation
-
-```text
-POST /eval/benchmarks
-```
-
-```json
-{
-  "benchmark_id": "my-custom-benchmark",
-  "dataset_id": "",
-  "scoring_functions": [],
-  "metadata": {
-    "type": "custom",
-    "params": {
-      "parallelism": 8
-    },
-    "tasks": {
-      "qa": {
-        "type": "completion",
-        "params": {
-          "template": {
-            "prompt": "{{prompt}}",
-            "max_tokens": 200
-          }
-        },
-        "dataset": {
-          "files_url": "hf://datasets/default/sample-basic-test/testing/testing.jsonl"
-        },
-        "metrics": {
-          "bleu": {
-            "type": "bleu",
-            "params": {
-              "references": [
-                "{{ideal_response}}"
-              ]
-            }
-          }
-        }
-      }
-    }
-  }
-}
-```
-
-### Example for triggering a benchmark/custom evaluation
-
-```text
-POST /eval/benchmarks/{benchmark_id}/jobs
-```
-
-```json
-{
-  "benchmark_id": "my-custom-benchmark",
-  "benchmark_config": {
-    "eval_candidate": {
-      "type": "model",
-      "model": "meta-llama/Llama3.1-8B-Instruct",
-      "sampling_params": {
-        "max_tokens": 100,
-        "temperature": 0.7
-      }
-    },
-    "scoring_params": {}
-  }
-}
-```
-
-Response example:
-
-```json
-{
-    "job_id": "eval-1234",
-    "status": "in_progress"
-}
-```
-
-### Example for getting the status of a job
-
-```text
-GET /eval/benchmarks/{benchmark_id}/jobs/{job_id}
-```
-
-Response example:
-
-```json
-{
-  "job_id": "eval-1234",
-  "status": "in_progress"
-}
-```
-
-### Example for cancelling a job
-
-```text
-POST /eval/benchmarks/{benchmark_id}/jobs/{job_id}/cancel
-```
-
-### Example for getting the results
-
-```text
-GET /eval/benchmarks/{benchmark_id}/results
-```
-
-```json
-{
-  "generations": [],
-  "scores": {
-    "{benchmark_id}": {
-      "score_rows": [],
-      "aggregated_results": {
-        "tasks": {},
-        "groups": {}
-      }
-    }
-  }
-}
-```
diff --git a/src/llama_stack/providers/remote/eval/nvidia/__init__.py b/src/llama_stack/providers/remote/eval/nvidia/__init__.py
deleted file mode 100644
index 7d7da74c46..0000000000
--- a/src/llama_stack/providers/remote/eval/nvidia/__init__.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-
-from .config import NVIDIAEvalConfig
-
-
-async def get_adapter_impl(
-    config: NVIDIAEvalConfig,
-    deps: dict[Api, Any],
-):
-    from .eval import NVIDIAEvalImpl
-
-    impl = NVIDIAEvalImpl(
-        config,
-        deps[Api.datasetio],
-        deps[Api.datasets],
-        deps[Api.scoring],
-        deps[Api.inference],
-        deps[Api.responses],
-    )
-    await impl.initialize()
-    return impl
-
-
-__all__ = ["get_adapter_impl", "NVIDIAEvalImpl"]
diff --git a/src/llama_stack/providers/remote/eval/nvidia/config.py b/src/llama_stack/providers/remote/eval/nvidia/config.py
deleted file mode 100644
index 7a1c04304f..0000000000
--- a/src/llama_stack/providers/remote/eval/nvidia/config.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import os
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-
-class NVIDIAEvalConfig(BaseModel):
-    """
-     Configuration for the NVIDIA NeMo Evaluator microservice endpoint.
-
-    Attributes:
-        evaluator_url (str): A base url for accessing the NVIDIA evaluation endpoint, e.g. http://localhost:8000.
-    """
-
-    evaluator_url: str = Field(
-        default_factory=lambda: os.getenv("NVIDIA_EVALUATOR_URL", "http://0.0.0.0:7331"),
-        description="The url for accessing the evaluator service",
-    )
-
-    @classmethod
-    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
-        return {
-            "evaluator_url": "${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}",
-        }
diff --git a/src/llama_stack/providers/remote/eval/nvidia/eval.py b/src/llama_stack/providers/remote/eval/nvidia/eval.py
deleted file mode 100644
index 1b223c0350..0000000000
--- a/src/llama_stack/providers/remote/eval/nvidia/eval.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-import httpx
-
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
-from llama_stack_api import (
-    Benchmark,
-    BenchmarksProtocolPrivate,
-    DatasetIO,
-    Datasets,
-    Eval,
-    EvaluateResponse,
-    EvaluateRowsRequest,
-    Inference,
-    Job,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatus,
-    JobStatusRequest,
-    Responses,
-    RunEvalRequest,
-    Scoring,
-    ScoringResult,
-)
-
-from .config import NVIDIAEvalConfig
-
-DEFAULT_NAMESPACE = "nvidia"
-
-
-class NVIDIAEvalImpl(
-    Eval,
-    BenchmarksProtocolPrivate,
-    ModelRegistryHelper,
-):
-    """Evaluation provider implementation using NVIDIA evaluation services."""
-
-    def __init__(
-        self,
-        config: NVIDIAEvalConfig,
-        datasetio_api: DatasetIO,
-        datasets_api: Datasets,
-        scoring_api: Scoring,
-        inference_api: Inference,
-        responses_api: Responses,
-    ) -> None:
-        self.config = config
-        self.datasetio_api = datasetio_api
-        self.datasets_api = datasets_api
-        self.scoring_api = scoring_api
-        self.inference_api = inference_api
-        self.responses_api = responses_api
-
-        ModelRegistryHelper.__init__(self)
-        self._client: httpx.AsyncClient | None = None
-
-    @property
-    def client(self) -> httpx.AsyncClient:
-        if self._client is None:
-            raise RuntimeError("Client not initialized. Call initialize() first.")
-        return self._client
-
-    async def initialize(self) -> None:
-        self._client = httpx.AsyncClient(timeout=httpx.Timeout(30.0))
-
-    async def shutdown(self) -> None:
-        if self._client:
-            await self._client.aclose()
-
-    async def _evaluator_get(self, path: str):
-        """Helper for making GET requests to the evaluator service."""
-        response = await self.client.get(url=f"{self.config.evaluator_url}{path}")
-        response.raise_for_status()
-        return response.json()
-
-    async def _evaluator_post(self, path: str, data: dict[str, Any]):
-        """Helper for making POST requests to the evaluator service."""
-        response = await self.client.post(url=f"{self.config.evaluator_url}{path}", json=data)
-        response.raise_for_status()
-        return response.json()
-
-    async def _evaluator_delete(self, path: str) -> None:
-        """Helper for making DELETE requests to the evaluator service."""
-        response = await self.client.delete(url=f"{self.config.evaluator_url}{path}")
-        response.raise_for_status()
-
-    async def register_benchmark(self, task_def: Benchmark) -> None:
-        """Register a benchmark as an evaluation configuration."""
-        await self._evaluator_post(
-            "/v1/evaluation/configs",
-            {
-                "namespace": DEFAULT_NAMESPACE,
-                "name": task_def.benchmark_id,
-                # metadata is copied to request body as-is
-                **task_def.metadata,
-            },
-        )
-
-    async def unregister_benchmark(self, benchmark_id: str) -> None:
-        """Unregister a benchmark evaluation configuration from NeMo Evaluator."""
-        await self._evaluator_delete(f"/v1/evaluation/configs/{DEFAULT_NAMESPACE}/{benchmark_id}")
-
-    async def run_eval(
-        self,
-        request: RunEvalRequest,
-    ) -> Job:
-        """Run an evaluation job for a benchmark."""
-        model = (
-            request.benchmark_config.eval_candidate.model
-            if request.benchmark_config.eval_candidate.type == "model"
-            else request.benchmark_config.eval_candidate.config.model
-        )
-        nvidia_model = self.get_provider_model_id(model) or model
-
-        result = await self._evaluator_post(
-            "/v1/evaluation/jobs",
-            {
-                "config": f"{DEFAULT_NAMESPACE}/{request.benchmark_id}",
-                "target": {"type": "model", "model": nvidia_model},
-            },
-        )
-
-        return Job(job_id=result["id"], status=JobStatus.in_progress)
-
-    async def evaluate_rows(
-        self,
-        request: EvaluateRowsRequest,
-    ) -> EvaluateResponse:
-        raise NotImplementedError()
-
-    async def job_status(self, request: JobStatusRequest) -> Job:
-        """Get the status of an evaluation job.
-
-        EvaluatorStatus: "created", "pending", "running", "cancelled", "cancelling", "failed", "completed".
-        JobStatus: "scheduled", "in_progress", "completed", "cancelled", "failed"
-        """
-        result = await self._evaluator_get(f"/v1/evaluation/jobs/{request.job_id}")
-        result_status = result["status"]
-
-        job_status = JobStatus.failed
-        if result_status in ["created", "pending"]:
-            job_status = JobStatus.scheduled
-        elif result_status in ["running"]:
-            job_status = JobStatus.in_progress
-        elif result_status in ["completed"]:
-            job_status = JobStatus.completed
-        elif result_status in ["cancelled"]:
-            job_status = JobStatus.cancelled
-
-        return Job(job_id=request.job_id, status=job_status)
-
-    async def job_cancel(self, request: JobCancelRequest) -> None:
-        """Cancel the evaluation job."""
-        await self._evaluator_post(f"/v1/evaluation/jobs/{request.job_id}/cancel", {})
-
-    async def job_result(self, request: JobResultRequest) -> EvaluateResponse:
-        """Returns the results of the evaluation job."""
-
-        job_status_request = JobStatusRequest(benchmark_id=request.benchmark_id, job_id=request.job_id)
-        job = await self.job_status(job_status_request)
-        status = job.status
-        if not status or status != JobStatus.completed:
-            raise ValueError(f"Job {request.job_id} not completed. Status: {status.value}")
-
-        result = await self._evaluator_get(f"/v1/evaluation/jobs/{request.job_id}/results")
-
-        return EvaluateResponse(
-            # TODO: these are stored in detailed results on NeMo Evaluator side; can be added
-            generations=[],
-            scores={
-                request.benchmark_id: ScoringResult(
-                    score_rows=[],
-                    aggregated_results=result,
-                )
-            },
-        )
diff --git a/src/llama_stack/providers/utils/common/data_schema_validator.py b/src/llama_stack/providers/utils/common/data_schema_validator.py
deleted file mode 100644
index b466ec7412..0000000000
--- a/src/llama_stack/providers/utils/common/data_schema_validator.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-from llama_stack_api import ChatCompletionInputType, CompletionInputType, StringType
-
-
-class ColumnName(Enum):
-    """Enumeration of recognized column names for dataset schemas."""
-
-    input_query = "input_query"
-    expected_answer = "expected_answer"
-    chat_completion_input = "chat_completion_input"
-    completion_input = "completion_input"
-    generated_answer = "generated_answer"
-    context = "context"
-    dialog = "dialog"
-    function = "function"
-    language = "language"
-    id = "id"
-    ground_truth = "ground_truth"
-
-
-VALID_SCHEMAS_FOR_SCORING = [
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.generated_answer.value: StringType(),
-    },
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.generated_answer.value: StringType(),
-        ColumnName.context.value: StringType(),
-    },
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.generated_answer.value: StringType(),
-        ColumnName.function.value: StringType(),
-        ColumnName.language.value: StringType(),
-        ColumnName.id.value: StringType(),
-        ColumnName.ground_truth.value: StringType(),
-    },
-]
-
-VALID_SCHEMAS_FOR_EVAL = [
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.chat_completion_input.value: ChatCompletionInputType(),
-    },
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.completion_input.value: CompletionInputType(),
-    },
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.generated_answer.value: StringType(),
-        ColumnName.function.value: StringType(),
-        ColumnName.language.value: StringType(),
-        ColumnName.id.value: StringType(),
-        ColumnName.ground_truth.value: StringType(),
-    },
-]
-
-
-def get_valid_schemas(api_str: str):
-    """Return the valid dataset schemas for the given API.
-
-    Args:
-        api_str: API identifier string (e.g. "scoring" or "eval")
-
-    Returns:
-        List of valid schema dictionaries for the specified API
-    """
-    if api_str == Api.scoring.value:
-        return VALID_SCHEMAS_FOR_SCORING
-    elif api_str == Api.eval.value:
-        return VALID_SCHEMAS_FOR_EVAL
-    else:
-        raise ValueError(f"Invalid API string: {api_str}")
-
-
-def validate_dataset_schema(
-    dataset_schema: dict[str, Any],
-    expected_schemas: list[dict[str, Any]],
-):
-    """Validate that a dataset schema matches one of the expected schemas.
-
-    Args:
-        dataset_schema: the schema to validate
-        expected_schemas: list of acceptable schema definitions
-    """
-    if dataset_schema not in expected_schemas:
-        raise ValueError(f"Dataset {dataset_schema} does not have a correct input schema in {expected_schemas}")
-
-
-def validate_row_schema(
-    input_row: dict[str, Any],
-    expected_schemas: list[dict[str, Any]],
-):
-    """Validate that an input row contains keys from at least one expected schema.
-
-    Args:
-        input_row: dictionary representing a data row
-        expected_schemas: list of acceptable schema definitions
-    """
-    for schema in expected_schemas:
-        if all(key in input_row for key in schema):
-            return
-
-    raise ValueError(f"Input row {input_row} does not match any of the expected schemas in {expected_schemas}")
diff --git a/src/llama_stack/providers/utils/datasetio/__init__.py b/src/llama_stack/providers/utils/datasetio/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/utils/datasetio/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/utils/datasetio/url_utils.py b/src/llama_stack/providers/utils/datasetio/url_utils.py
deleted file mode 100644
index df88ee82ad..0000000000
--- a/src/llama_stack/providers/utils/datasetio/url_utils.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import asyncio
-import base64
-import io
-from urllib.parse import unquote
-
-from llama_stack.providers.utils.common.data_url import parse_data_url
-
-
-async def get_dataframe_from_uri(uri: str):
-    """Load a pandas DataFrame from a URI pointing to CSV, Excel, or data URL.
-
-    Args:
-        uri: file path, URL, or data URL to load
-
-    Returns:
-        A pandas DataFrame with the loaded data
-    """
-    import pandas
-
-    df = None
-    if uri.endswith(".csv"):
-        # Moving to its own thread to avoid io from blocking the eventloop
-        # This isn't ideal as it moves more then just the IO to a new thread
-        # but it is as close as we can easly get
-        df = await asyncio.to_thread(pandas.read_csv, uri)
-    elif uri.endswith(".xlsx"):
-        df = await asyncio.to_thread(pandas.read_excel, uri)
-    elif uri.startswith("data:"):
-        parts = parse_data_url(uri)
-        data = parts["data"]
-        if parts["is_base64"]:
-            data = base64.b64decode(data)
-        else:
-            data = unquote(data)
-            encoding = parts["encoding"] or "utf-8"
-            data = data.encode(encoding)
-
-        mime_type = parts["mimetype"]
-        mime_category = mime_type.split("/")[0]
-        data_bytes = io.BytesIO(data)
-
-        if mime_category == "text":
-            df = pandas.read_csv(data_bytes)
-        else:
-            df = pandas.read_excel(data_bytes)
-    else:
-        raise ValueError(f"Unsupported file type: {uri}")
-
-    return df
diff --git a/src/llama_stack/providers/utils/scoring/__init__.py b/src/llama_stack/providers/utils/scoring/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/utils/scoring/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/utils/scoring/aggregation_utils.py b/src/llama_stack/providers/utils/scoring/aggregation_utils.py
deleted file mode 100644
index b730db0cc8..0000000000
--- a/src/llama_stack/providers/utils/scoring/aggregation_utils.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import statistics
-from typing import Any
-
-from llama_stack_api import AggregationFunctionType, ScoringResultRow
-
-
-def aggregate_accuracy(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
-    """Compute accuracy metrics from scoring results.
-
-    Args:
-        scoring_results: list of scoring result rows with score values
-
-    Returns:
-        Dictionary with accuracy, num_correct, and num_total
-    """
-    num_correct = sum(result["score"] for result in scoring_results)
-    avg_score = num_correct / len(scoring_results)
-
-    return {
-        "accuracy": avg_score,
-        "num_correct": num_correct,
-        "num_total": len(scoring_results),
-    }
-
-
-def aggregate_average(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
-    """Compute the arithmetic average of non-null scores.
-
-    Args:
-        scoring_results: list of scoring result rows with score values
-
-    Returns:
-        Dictionary with average score
-    """
-    return {
-        "average": sum(result["score"] for result in scoring_results if result["score"] is not None)
-        / len([_ for _ in scoring_results if _["score"] is not None]),
-    }
-
-
-def aggregate_weighted_average(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
-    """Compute the weighted average of non-null scores.
-
-    Args:
-        scoring_results: list of scoring result rows with score and weight values
-
-    Returns:
-        Dictionary with weighted_average score
-    """
-    return {
-        "weighted_average": sum(
-            result["score"] * result["weight"]
-            for result in scoring_results
-            if result["score"] is not None and result["weight"] is not None
-        )
-        / sum(result["weight"] for result in scoring_results if result["weight"] is not None),
-    }
-
-
-def aggregate_categorical_count(
-    scoring_results: list[ScoringResultRow],
-) -> dict[str, Any]:
-    """Count occurrences of each unique score category.
-
-    Args:
-        scoring_results: list of scoring result rows with score values
-
-    Returns:
-        Dictionary with categorical_count mapping each category to its count
-    """
-    scores = [str(r["score"]) for r in scoring_results]
-    unique_scores = sorted(set(scores))
-    return {"categorical_count": {s: scores.count(s) for s in unique_scores}}
-
-
-def aggregate_median(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
-    """Compute the median of non-null scores.
-
-    Args:
-        scoring_results: list of scoring result rows with score values
-
-    Returns:
-        Dictionary with median score value
-    """
-    scores = [r["score"] for r in scoring_results if r["score"] is not None]
-    median = statistics.median(scores) if scores else None
-    return {"median": median}
-
-
-# TODO: decide whether we want to make aggregation functions as a registerable resource
-AGGREGATION_FUNCTIONS = {
-    AggregationFunctionType.accuracy: aggregate_accuracy,
-    AggregationFunctionType.average: aggregate_average,
-    AggregationFunctionType.weighted_average: aggregate_weighted_average,
-    AggregationFunctionType.categorical_count: aggregate_categorical_count,
-    AggregationFunctionType.median: aggregate_median,
-}
-
-
-def aggregate_metrics(
-    scoring_results: list[ScoringResultRow], metrics: list[AggregationFunctionType]
-) -> dict[str, Any]:
-    """Aggregate scoring results using the specified metric functions.
-
-    Args:
-        scoring_results: list of scoring result rows
-        metrics: list of aggregation function types to apply
-
-    Returns:
-        Dictionary mapping each metric to its aggregated result
-    """
-    agg_results = {}
-    for metric in metrics:
-        if metric not in AGGREGATION_FUNCTIONS:
-            raise ValueError(f"Aggregation function {metric} not found")
-        agg_fn = AGGREGATION_FUNCTIONS[metric]
-        agg_results[metric] = agg_fn(scoring_results)
-    return agg_results
diff --git a/src/llama_stack/providers/utils/scoring/base_scoring_fn.py b/src/llama_stack/providers/utils/scoring/base_scoring_fn.py
deleted file mode 100644
index f372db8b52..0000000000
--- a/src/llama_stack/providers/utils/scoring/base_scoring_fn.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from abc import ABC, abstractmethod
-from typing import Any
-
-from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics
-from llama_stack_api import ScoringFn, ScoringFnParams, ScoringResultRow
-
-
-class BaseScoringFn(ABC):
-    """
-    Base interface class for Scoring Functions.
-    Each scoring function needs to implement the following methods:
-    - score_row(self, row)
-    - aggregate(self, scoring_fn_results)
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-
-    def __str__(self) -> str:
-        return self.__class__.__name__
-
-    @abstractmethod
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        raise NotImplementedError()
-
-    @abstractmethod
-    async def aggregate(
-        self,
-        scoring_results: list[ScoringResultRow],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> dict[str, Any]:
-        raise NotImplementedError()
-
-    @abstractmethod
-    async def score(
-        self,
-        input_rows: list[dict[str, Any]],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> list[ScoringResultRow]:
-        raise NotImplementedError()
-
-
-class RegisteredBaseScoringFn(BaseScoringFn):
-    """
-    Interface for native scoring functions that are registered in LlamaStack.
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {}
-
-    def __str__(self) -> str:
-        return self.__class__.__name__
-
-    def get_supported_scoring_fn_defs(self) -> list[ScoringFn]:
-        return list(self.supported_fn_defs_registry.values())
-
-    def register_scoring_fn_def(self, scoring_fn: ScoringFn) -> None:
-        if scoring_fn.identifier in self.supported_fn_defs_registry:
-            raise ValueError(f"Scoring function def with identifier {scoring_fn.identifier} already exists.")
-        self.supported_fn_defs_registry[scoring_fn.identifier] = scoring_fn
-
-    def unregister_scoring_fn_def(self, scoring_fn_id: str) -> None:
-        if scoring_fn_id not in self.supported_fn_defs_registry:
-            raise ValueError(f"Scoring function def with identifier {scoring_fn_id} does not exist.")
-        del self.supported_fn_defs_registry[scoring_fn_id]
-
-    @abstractmethod
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        raise NotImplementedError()
-
-    async def aggregate(
-        self,
-        scoring_results: list[ScoringResultRow],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> dict[str, Any]:
-        params = self.supported_fn_defs_registry[scoring_fn_identifier].params
-        if scoring_params is not None:
-            if params is None:
-                params = scoring_params
-            else:
-                params.aggregation_functions = scoring_params.aggregation_functions
-
-        aggregation_functions = []
-        if params and hasattr(params, "aggregation_functions") and params.aggregation_functions:
-            aggregation_functions.extend(params.aggregation_functions)
-        return aggregate_metrics(scoring_results, aggregation_functions)
-
-    async def score(
-        self,
-        input_rows: list[dict[str, Any]],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> list[ScoringResultRow]:
-        return [await self.score_row(input_row, scoring_fn_identifier, scoring_params) for input_row in input_rows]
diff --git a/src/llama_stack/providers/utils/scoring/basic_scoring_utils.py b/src/llama_stack/providers/utils/scoring/basic_scoring_utils.py
deleted file mode 100644
index e1b3b81083..0000000000
--- a/src/llama_stack/providers/utils/scoring/basic_scoring_utils.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import contextlib
-import signal
-from collections.abc import Iterator
-from types import FrameType
-
-
-class TimeoutError(Exception):
-    """Raised when a timed operation exceeds its allowed duration."""
-
-    pass
-
-
-@contextlib.contextmanager
-def time_limit(seconds: float) -> Iterator[None]:
-    """Context manager that raises TimeoutError after the specified number of seconds.
-
-    Args:
-        seconds: maximum allowed execution time
-    """
-
-    def signal_handler(signum: int, frame: FrameType | None) -> None:
-        raise TimeoutError("Timed out!")
-
-    signal.setitimer(signal.ITIMER_REAL, seconds)
-    signal.signal(signal.SIGALRM, signal_handler)
-    try:
-        yield
-    finally:
-        signal.setitimer(signal.ITIMER_REAL, 0)
diff --git a/src/llama_stack_api/__init__.py b/src/llama_stack_api/__init__.py
index 04d814dd9b..b5318f92ca 100644
--- a/src/llama_stack_api/__init__.py
+++ b/src/llama_stack_api/__init__.py
@@ -78,17 +78,6 @@
     ListBatchesResponse,
     RetrieveBatchRequest,
 )
-from .benchmarks import (
-    Benchmark,
-    BenchmarkInput,
-    Benchmarks,
-    CommonBenchmarkFields,
-    GetBenchmarkRequest,
-    ListBenchmarksRequest,
-    ListBenchmarksResponse,
-    RegisterBenchmarkRequest,
-    UnregisterBenchmarkRequest,
-)
 
 # Import commonly used types from common submodule
 from .common.content_types import (
@@ -106,7 +95,6 @@
     ConnectorToolNotFoundError,
     ConversationItemNotFoundError,
     ConversationNotFoundError,
-    DatasetNotFoundError,
     InternalServerError,
     InvalidParameterError,
     ModelNotFoundError,
@@ -161,29 +149,8 @@
     RetrieveItemRequest,
     UpdateConversationRequest,
 )
-from .datasetio import (
-    AppendRowsParams,
-    AppendRowsRequest,
-    DatasetIO,
-    DatasetStore,
-    IterRowsRequest,
-)
-from .datasets import (
-    CommonDatasetFields,
-    Dataset,
-    DatasetInput,
-    DatasetPurpose,
-    Datasets,
-    DatasetType,
-    DataSource,
-    ListDatasetsResponse,
-    RowsDataSource,
-    URIDataSource,
-)
 from .datatypes import (
     Api,
-    BenchmarksProtocolPrivate,
-    DatasetsProtocolPrivate,
     DynamicApiMeta,
     Error,
     ExternalApiSpec,
@@ -195,32 +162,10 @@
     RemoteProviderConfig,
     RemoteProviderSpec,
     RoutingTable,
-    ScoringFunctionsProtocolPrivate,
     ShieldsProtocolPrivate,
     ToolGroupsProtocolPrivate,
     VectorStoresProtocolPrivate,
 )
-from .eval import (
-    BenchmarkConfig,
-    BenchmarkIdRequest,
-    Eval,
-    EvalCandidate,
-    EvaluateResponse,
-    EvaluateRowsBodyRequest,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    ModelCandidate,
-    RunEvalBodyRequest,
-    RunEvalRequest,
-    # Backward compatibility helpers
-    resolve_evaluate_rows_request,
-    resolve_job_cancel_request,
-    resolve_job_result_request,
-    resolve_job_status_request,
-    resolve_run_eval_request,
-)
 from .file_processors import FileProcessors, ProcessFileRequest, ProcessFileResponse
 from .filters import COMPARISON_FILTER_TYPES, COMPOUND_FILTER_TYPES, ComparisonFilter, CompoundFilter, Filter
 from .files import (
@@ -501,33 +446,6 @@
     ViolationLevel,
 )
 
-from .scoring import (
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    Scoring,
-    ScoringFunctionStore,
-    ScoringResult,
-    ScoringResultRow,
-)
-from .scoring_functions import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    CommonScoringFnFields,
-    GetScoringFunctionRequest,
-    ListScoringFunctionsRequest,
-    ListScoringFunctionsResponse,
-    LLMAsJudgeScoringFnParams,
-    RegexParserScoringFnParams,
-    RegisterScoringFunctionRequest,
-    ScoringFn,
-    ScoringFnInput,
-    ScoringFnParams,
-    ScoringFnParamsType,
-    ScoringFunctions,
-    UnregisterScoringFunctionRequest,
-)
 from .shields import (
     CommonShieldFields,
     GetShieldRequest,
@@ -617,7 +535,6 @@
     "LLAMA_STACK_API_V1BETA",
     # API Symbols
     "Responses",
-    "AggregationFunctionType",
     # Responses Request Models
     "CancelResponseRequest",
     "CreateResponseRequest",
@@ -629,18 +546,12 @@
     "Api",
     "ApiFilter",
     "ApprovalFilter",
-    "BasicScoringFnParams",
     "Batches",
     "BatchNotFoundError",
     "BatchObject",
     "CancelBatchRequest",
     "CreateBatchRequest",
     "ListBatchesRequest",
-    "Benchmark",
-    "BenchmarkConfig",
-    "BenchmarkInput",
-    "Benchmarks",
-    "BenchmarksProtocolPrivate",
     "Bf16QuantizationConfig",
     "CallableT",
     "ChatCompletionInputType",
@@ -652,11 +563,8 @@
     "DEFAULT_CHUNK_SIZE_TOKENS",
     "DeleteChunksRequest",
     "EmbeddedChunk",
-    "CommonBenchmarkFields",
     "ConflictError",
-    "CommonDatasetFields",
     "CommonModelFields",
-    "CommonScoringFnFields",
     "CommonShieldFields",
     "CompletionInputType",
     "CompletionRequest",
@@ -685,19 +593,6 @@
     "ListItemsRequest",
     "RetrieveItemRequest",
     "UpdateConversationRequest",
-    "DataSource",
-    "Dataset",
-    "DatasetIO",
-    "DatasetInput",
-    "DatasetPurpose",
-    "DatasetNotFoundError",
-    "DatasetStore",
-    "DatasetType",
-    "AppendRowsParams",
-    "AppendRowsRequest",
-    "IterRowsRequest",
-    "Datasets",
-    "DatasetsProtocolPrivate",
     "DefaultRAGQueryGeneratorConfig",
     "DeleteFileRequest",
     "Docstring",
@@ -705,23 +600,6 @@
     "EmbeddingTaskType",
     "EmbeddingsResponse",
     "Error",
-    "Eval",
-    "EvalCandidate",
-    "EvaluateResponse",
-    "EvaluateRowsBodyRequest",
-    "EvaluateRowsRequest",
-    "BenchmarkIdRequest",
-    "JobCancelRequest",
-    "JobResultRequest",
-    "JobStatusRequest",
-    "RunEvalBodyRequest",
-    "RunEvalRequest",
-    # Backward compatibility helpers
-    "resolve_run_eval_request",
-    "resolve_evaluate_rows_request",
-    "resolve_job_status_request",
-    "resolve_job_cancel_request",
-    "resolve_job_result_request",
     "ExpiresAfter",
     "ExternalApiSpec",
     "ExtraBodyField",
@@ -769,20 +647,13 @@
     "JsonSchemaGenerator",
     "JsonSchemaResponseFormat",
     "JsonType",
-    "LLMAsJudgeScoringFnParams",
     "LLMRAGQueryGeneratorConfig",
     "ListBatchesResponse",
     "RetrieveBatchRequest",
-    "GetBenchmarkRequest",
-    "ListBenchmarksRequest",
-    "ListBenchmarksResponse",
-    "RegisterBenchmarkRequest",
-    "UnregisterBenchmarkRequest",
     "GetConnectorRequest",
     "GetConnectorToolRequest",
     "ListConnectorToolsRequest",
     "ListConnectorsResponse",
-    "ListDatasetsResponse",
     "ListFilesRequest",
     "ListModelsResponse",
     "GetChatCompletionRequest",
@@ -795,7 +666,6 @@
     "ListProvidersResponse",
     "ListRoutesRequest",
     "ListRoutesResponse",
-    "ListScoringFunctionsResponse",
     "ListShieldsResponse",
     "ListToolDefsResponse",
     "ListToolGroupsResponse",
@@ -806,7 +676,6 @@
     "MCPListToolsTool",
     "Metadata",
     "Model",
-    "ModelCandidate",
     "ModelInput",
     "ModelNotFoundError",
     "ModelStore",
@@ -1015,7 +884,6 @@
     "register_schema",
     "RRFRanker",
     "Ranker",
-    "RegexParserScoringFnParams",
     "RemoteProviderConfig",
     "RemoteProviderSpec",
     "RerankData",
@@ -1037,7 +905,6 @@
     "RetrieveFileRequest",
     "RouteInfo",
     "RoutingTable",
-    "RowsDataSource",
     "RunModerationRequest",
     "RunShieldRequest",
     "RunShieldResponse",
@@ -1045,24 +912,6 @@
     "SafetyViolation",
     "SamplingParams",
     "SamplingStrategy",
-    "ScoreBatchRequest",
-    "ScoreBatchResponse",
-    "ScoreRequest",
-    "ScoreResponse",
-    "Scoring",
-    "ScoringFn",
-    "ScoringFnInput",
-    "ScoringFnParams",
-    "ScoringFnParamsType",
-    "ScoringFunctionStore",
-    "ScoringFunctions",
-    "ScoringFunctionsProtocolPrivate",
-    "ScoringResult",
-    "GetScoringFunctionRequest",
-    "ListScoringFunctionsRequest",
-    "RegisterScoringFunctionRequest",
-    "UnregisterScoringFunctionRequest",
-    "ScoringResultRow",
     "Schema",
     "SchemaInfo",
     "SchemaOptions",
@@ -1103,7 +952,6 @@
     "unwrap_optional_type",
     "unwrap_union_types",
     "UploadFileRequest",
-    "URIDataSource",
     "URL",
     "_URLOrData",
     "UserMessage",
diff --git a/src/llama_stack_api/benchmarks/__init__.py b/src/llama_stack_api/benchmarks/__init__.py
deleted file mode 100644
index 9c5652dce2..0000000000
--- a/src/llama_stack_api/benchmarks/__init__.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Benchmarks API protocol and models.
-
-This module contains the Benchmarks protocol definition.
-Pydantic models are defined in llama_stack_api.benchmarks.models.
-The FastAPI router is defined in llama_stack_api.benchmarks.fastapi_routes.
-"""
-
-# Import fastapi_routes for router factory access
-from . import fastapi_routes
-
-# Import protocol for re-export
-from .api import Benchmarks
-
-# Import models for re-export
-from .models import (
-    Benchmark,
-    BenchmarkInput,
-    CommonBenchmarkFields,
-    GetBenchmarkRequest,
-    ListBenchmarksRequest,
-    ListBenchmarksResponse,
-    RegisterBenchmarkRequest,
-    UnregisterBenchmarkRequest,
-)
-
-__all__ = [
-    "Benchmarks",
-    "Benchmark",
-    "BenchmarkInput",
-    "CommonBenchmarkFields",
-    "ListBenchmarksResponse",
-    "ListBenchmarksRequest",
-    "GetBenchmarkRequest",
-    "RegisterBenchmarkRequest",
-    "UnregisterBenchmarkRequest",
-    "fastapi_routes",
-]
diff --git a/src/llama_stack_api/benchmarks/api.py b/src/llama_stack_api/benchmarks/api.py
deleted file mode 100644
index 0d6b44063e..0000000000
--- a/src/llama_stack_api/benchmarks/api.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Protocol, runtime_checkable
-
-from .models import (
-    Benchmark,
-    GetBenchmarkRequest,
-    ListBenchmarksRequest,
-    ListBenchmarksResponse,
-    RegisterBenchmarkRequest,
-    UnregisterBenchmarkRequest,
-)
-
-
-@runtime_checkable
-class Benchmarks(Protocol):
-    """Protocol for managing benchmark resources."""
-
-    async def list_benchmarks(
-        self,
-        request: ListBenchmarksRequest,
-    ) -> ListBenchmarksResponse: ...
-
-    async def get_benchmark(
-        self,
-        request: GetBenchmarkRequest,
-    ) -> Benchmark: ...
-
-    async def register_benchmark(
-        self,
-        request: RegisterBenchmarkRequest,
-    ) -> None: ...
-
-    async def unregister_benchmark(
-        self,
-        request: UnregisterBenchmarkRequest,
-    ) -> None: ...
diff --git a/src/llama_stack_api/benchmarks/fastapi_routes.py b/src/llama_stack_api/benchmarks/fastapi_routes.py
deleted file mode 100644
index 461939ab95..0000000000
--- a/src/llama_stack_api/benchmarks/fastapi_routes.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""FastAPI router for the Benchmarks API.
-
-This module defines the FastAPI router for the Benchmarks API using standard
-FastAPI route decorators. The router is defined in the API package to keep
-all API-related code together.
-"""
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body, Depends
-
-from llama_stack_api.router_utils import create_path_dependency, create_query_dependency, standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1ALPHA
-
-from .api import Benchmarks
-from .models import (
-    Benchmark,
-    GetBenchmarkRequest,
-    ListBenchmarksRequest,
-    ListBenchmarksResponse,
-    RegisterBenchmarkRequest,
-    UnregisterBenchmarkRequest,
-)
-
-# Automatically generate dependency functions from Pydantic models
-# This ensures the models are the single source of truth for descriptions
-get_list_benchmarks_request = create_query_dependency(ListBenchmarksRequest)
-get_get_benchmark_request = create_path_dependency(GetBenchmarkRequest)
-get_unregister_benchmark_request = create_path_dependency(UnregisterBenchmarkRequest)
-
-
-def create_router(impl: Benchmarks) -> APIRouter:
-    """Create a FastAPI router for the Benchmarks API.
-
-    Args:
-        impl: The Benchmarks implementation instance
-
-    Returns:
-        APIRouter configured for the Benchmarks API
-    """
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1ALPHA}",
-        tags=["Benchmarks"],
-        responses=standard_responses,
-    )
-
-    @router.get(
-        "/eval/benchmarks",
-        response_model=ListBenchmarksResponse,
-        summary="List all benchmarks.",
-        description="List all benchmarks.",
-        responses={
-            200: {"description": "A ListBenchmarksResponse."},
-        },
-    )
-    async def list_benchmarks(
-        request: Annotated[ListBenchmarksRequest, Depends(get_list_benchmarks_request)],
-    ) -> ListBenchmarksResponse:
-        return await impl.list_benchmarks(request)
-
-    @router.get(
-        "/eval/benchmarks/{benchmark_id}",
-        response_model=Benchmark,
-        summary="Get a benchmark by its ID.",
-        description="Get a benchmark by its ID.",
-        responses={
-            200: {"description": "A Benchmark."},
-        },
-    )
-    async def get_benchmark(
-        request: Annotated[GetBenchmarkRequest, Depends(get_get_benchmark_request)],
-    ) -> Benchmark:
-        return await impl.get_benchmark(request)
-
-    @router.post(
-        "/eval/benchmarks",
-        summary="Register a benchmark.",
-        description="Register a benchmark.",
-        responses={
-            200: {"description": "The benchmark was successfully registered."},
-        },
-        deprecated=True,
-    )
-    async def register_benchmark(
-        request: Annotated[RegisterBenchmarkRequest, Body(...)],
-    ) -> None:
-        return await impl.register_benchmark(request)
-
-    @router.delete(
-        "/eval/benchmarks/{benchmark_id}",
-        summary="Unregister a benchmark.",
-        description="Unregister a benchmark.",
-        responses={
-            200: {"description": "The benchmark was successfully unregistered."},
-        },
-        deprecated=True,
-    )
-    async def unregister_benchmark(
-        request: Annotated[UnregisterBenchmarkRequest, Depends(get_unregister_benchmark_request)],
-    ) -> None:
-        return await impl.unregister_benchmark(request)
-
-    return router
diff --git a/src/llama_stack_api/benchmarks/models.py b/src/llama_stack_api/benchmarks/models.py
deleted file mode 100644
index 1f76675c28..0000000000
--- a/src/llama_stack_api/benchmarks/models.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Pydantic models for Benchmarks API requests and responses.
-
-This module defines the request and response models for the Benchmarks API
-using Pydantic with Field descriptions for OpenAPI schema generation.
-"""
-
-from typing import Any, Literal
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.resource import Resource, ResourceType
-from llama_stack_api.schema_utils import json_schema_type
-
-
-@json_schema_type
-class ListBenchmarksRequest(BaseModel):
-    """Request model for listing benchmarks."""
-
-    pass
-
-
-@json_schema_type
-class GetBenchmarkRequest(BaseModel):
-    """Request model for getting a benchmark."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark to get.")
-
-
-@json_schema_type
-class RegisterBenchmarkRequest(BaseModel):
-    """Request model for registering a benchmark."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark to register.")
-    dataset_id: str = Field(..., description="The ID of the dataset to use for the benchmark.")
-    scoring_functions: list[str] = Field(..., description="The scoring functions to use for the benchmark.")
-    provider_benchmark_id: str | None = Field(
-        default=None, description="The ID of the provider benchmark to use for the benchmark."
-    )
-    provider_id: str | None = Field(default=None, description="The ID of the provider to use for the benchmark.")
-    metadata: dict[str, Any] | None = Field(default=None, description="The metadata to use for the benchmark.")
-
-
-@json_schema_type
-class UnregisterBenchmarkRequest(BaseModel):
-    """Request model for unregistering a benchmark."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark to unregister.")
-
-
-class CommonBenchmarkFields(BaseModel):
-    """Common fields shared across benchmark creation and retrieval."""
-
-    dataset_id: str = Field(..., description="Identifier of the dataset to use for the benchmark evaluation.")
-    scoring_functions: list[str] = Field(
-        ..., description="List of scoring function identifiers to apply during evaluation."
-    )
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Metadata for this evaluation task.",
-    )
-
-
-@json_schema_type
-class Benchmark(CommonBenchmarkFields, Resource):
-    """A benchmark resource for evaluating model performance."""
-
-    type: Literal[ResourceType.benchmark] = Field(
-        default=ResourceType.benchmark,
-        description="The resource type, always benchmark.",
-    )
-
-    @property
-    def benchmark_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_benchmark_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class BenchmarkInput(CommonBenchmarkFields, BaseModel):
-    """Input model for registering a new benchmark."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark.")
-    provider_id: str | None = Field(default=None, description="The ID of the provider to use for the benchmark.")
-    provider_benchmark_id: str | None = Field(
-        default=None, description="The ID of the provider benchmark to use for the benchmark."
-    )
-
-
-@json_schema_type
-class ListBenchmarksResponse(BaseModel):
-    """Response containing a list of benchmark objects."""
-
-    data: list[Benchmark] = Field(..., description="List of benchmark objects.")
-
-
-__all__ = [
-    "ListBenchmarksRequest",
-    "GetBenchmarkRequest",
-    "RegisterBenchmarkRequest",
-    "UnregisterBenchmarkRequest",
-    "CommonBenchmarkFields",
-    "Benchmark",
-    "BenchmarkInput",
-    "ListBenchmarksResponse",
-]
diff --git a/src/llama_stack_api/datasetio/__init__.py b/src/llama_stack_api/datasetio/__init__.py
deleted file mode 100644
index e696d14145..0000000000
--- a/src/llama_stack_api/datasetio/__init__.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""DatasetIO API protocol and models.
-
-This module contains the DatasetIO protocol definition.
-Pydantic models are defined in llama_stack_api.datasetio.models.
-The FastAPI router is defined in llama_stack_api.datasetio.fastapi_routes.
-"""
-
-# Import fastapi_routes for router factory access
-from . import fastapi_routes
-
-# Import protocol for FastAPI router
-from .api import DatasetIO, DatasetStore
-
-# Import models for re-export
-from .models import (
-    AppendRowsParams,
-    AppendRowsRequest,
-    IterRowsRequest,
-    PaginatedResponse,
-)
-
-__all__ = [
-    "DatasetIO",
-    "DatasetStore",
-    "AppendRowsParams",
-    "AppendRowsRequest",
-    "IterRowsRequest",
-    "PaginatedResponse",
-    "fastapi_routes",
-]
diff --git a/src/llama_stack_api/datasetio/api.py b/src/llama_stack_api/datasetio/api.py
deleted file mode 100644
index 05e74f1882..0000000000
--- a/src/llama_stack_api/datasetio/api.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""DatasetIO API protocol definition.
-
-This module contains the DatasetIO protocol definition.
-Pydantic models are defined in llama_stack_api.datasetio.models.
-The FastAPI router is defined in llama_stack_api.datasetio.fastapi_routes.
-"""
-
-from typing import Protocol, runtime_checkable
-
-from llama_stack_api.datasets import Dataset
-
-from .models import (
-    AppendRowsParams,
-    IterRowsRequest,
-    PaginatedResponse,
-)
-
-
-class DatasetStore(Protocol):
-    """Protocol for storing and retrieving dataset definitions."""
-
-    def get_dataset(self, dataset_id: str) -> Dataset: ...
-
-
-@runtime_checkable
-class DatasetIO(Protocol):
-    """Protocol for dataset I/O operations.
-
-    The DatasetIO API provides operations for reading and writing data to datasets.
-    This includes iterating over rows and appending new rows to existing datasets.
-    """
-
-    # keeping for aligning with inference/safety, but this is not used
-    dataset_store: DatasetStore
-
-    async def iterrows(self, request: IterRowsRequest) -> PaginatedResponse: ...
-
-    async def append_rows(self, params: AppendRowsParams) -> None: ...
diff --git a/src/llama_stack_api/datasetio/fastapi_routes.py b/src/llama_stack_api/datasetio/fastapi_routes.py
deleted file mode 100644
index 040c8e9b3c..0000000000
--- a/src/llama_stack_api/datasetio/fastapi_routes.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""FastAPI router for the DatasetIO API.
-
-This module defines the FastAPI router for the DatasetIO API using standard
-FastAPI route decorators.
-"""
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body, Path, Query
-
-from llama_stack_api.common.responses import PaginatedResponse
-from llama_stack_api.router_utils import standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1BETA
-
-from .api import DatasetIO
-from .models import (
-    AppendRowsParams,
-    AppendRowsRequest,
-    IterRowsRequest,
-)
-
-
-def create_router(impl: DatasetIO) -> APIRouter:
-    """Create a FastAPI router for the DatasetIO API.
-
-    Args:
-        impl: The DatasetIO implementation instance
-
-    Returns:
-        APIRouter configured for the DatasetIO API
-    """
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1BETA}",
-        tags=["DatasetIO"],
-        responses=standard_responses,
-    )
-
-    @router.get(
-        "/datasetio/iterrows/{dataset_id:path}",
-        response_model=PaginatedResponse,
-        summary="Get a paginated list of rows from a dataset.",
-        description="""Get a paginated list of rows from a dataset.
-
-Uses offset-based pagination where:
-- start_index: The starting index (0-based). If None, starts from beginning.
-- limit: Number of items to return. If None or -1, returns all items.
-
-The response includes:
-- data: List of items for the current page.
-- has_more: Whether there are more items available after this set.""",
-        responses={
-            200: {"description": "A PaginatedResponse containing the rows."},
-        },
-    )
-    async def iterrows(
-        dataset_id: Annotated[str, Path(description="The ID of the dataset to get the rows from.")],
-        start_index: Annotated[
-            int | None, Query(description="Index into dataset for the first row to get. Get all rows if None.")
-        ] = None,
-        limit: Annotated[int | None, Query(description="The number of rows to get.")] = None,
-    ) -> PaginatedResponse:
-        request = IterRowsRequest(
-            dataset_id=dataset_id,
-            start_index=start_index,
-            limit=limit,
-        )
-        return await impl.iterrows(request)
-
-    @router.post(
-        "/datasetio/append-rows/{dataset_id:path}",
-        status_code=204,
-        summary="Append rows to a dataset.",
-        description="Append rows to a dataset.",
-        responses={
-            204: {"description": "Rows were successfully appended."},
-        },
-    )
-    async def append_rows(
-        dataset_id: Annotated[str, Path(description="The ID of the dataset to append the rows to.")],
-        body: Annotated[AppendRowsRequest, Body(...)],
-    ) -> None:
-        # Combine path parameter with request body
-        params = AppendRowsParams(
-            dataset_id=dataset_id,
-            rows=body.rows,
-        )
-        return await impl.append_rows(params)
-
-    return router
diff --git a/src/llama_stack_api/datasetio/models.py b/src/llama_stack_api/datasetio/models.py
deleted file mode 100644
index 62e941ed9a..0000000000
--- a/src/llama_stack_api/datasetio/models.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Pydantic models for DatasetIO API requests and responses.
-
-This module defines the request and response models for the DatasetIO API
-using Pydantic with Field descriptions for OpenAPI schema generation.
-"""
-
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.common.responses import PaginatedResponse
-from llama_stack_api.schema_utils import json_schema_type
-
-
-@json_schema_type
-class IterRowsRequest(BaseModel):
-    """Request model for iterating over rows in a dataset."""
-
-    dataset_id: str = Field(..., description="The ID of the dataset to get the rows from.")
-    start_index: int | None = Field(
-        default=None,
-        description="Index into dataset for the first row to get. Get all rows if None.",
-    )
-    limit: int | None = Field(
-        default=None,
-        description="The number of rows to get.",
-    )
-
-
-@json_schema_type
-class AppendRowsRequest(BaseModel):
-    """Request body for appending rows to a dataset."""
-
-    rows: list[dict[str, Any]] = Field(..., description="The rows to append to the dataset.")
-
-
-class AppendRowsParams(BaseModel):
-    """Internal parameters for appending rows to a dataset (includes dataset_id)."""
-
-    dataset_id: str = Field(..., description="The ID of the dataset to append the rows to.")
-    rows: list[dict[str, Any]] = Field(..., description="The rows to append to the dataset.")
-
-
-__all__ = [
-    "AppendRowsRequest",
-    "AppendRowsParams",
-    "IterRowsRequest",
-    "PaginatedResponse",
-]
diff --git a/src/llama_stack_api/datasets/__init__.py b/src/llama_stack_api/datasets/__init__.py
deleted file mode 100644
index cff53476e8..0000000000
--- a/src/llama_stack_api/datasets/__init__.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Datasets API protocol and models.
-
-This module contains the Datasets protocol definition.
-Pydantic models are defined in llama_stack_api.datasets.models.
-The FastAPI router is defined in llama_stack_api.datasets.fastapi_routes.
-"""
-
-# Import fastapi_routes for router factory access
-from . import fastapi_routes
-
-# Import new protocol for FastAPI router
-from .api import Datasets
-
-# Import models for re-export
-from .models import (
-    CommonDatasetFields,
-    Dataset,
-    DatasetPurpose,
-    DatasetType,
-    DataSource,
-    GetDatasetRequest,
-    ListDatasetsResponse,
-    RegisterDatasetRequest,
-    RowsDataSource,
-    UnregisterDatasetRequest,
-    URIDataSource,
-)
-
-
-# Define DatasetInput for backward compatibility
-class DatasetInput(CommonDatasetFields):
-    """Input parameters for dataset operations.
-
-    :param dataset_id: Unique identifier for the dataset
-    """
-
-    dataset_id: str
-
-
-__all__ = [
-    "Datasets",
-    "Dataset",
-    "CommonDatasetFields",
-    "DatasetPurpose",
-    "DataSource",
-    "DatasetInput",
-    "DatasetType",
-    "RowsDataSource",
-    "URIDataSource",
-    "ListDatasetsResponse",
-    "RegisterDatasetRequest",
-    "GetDatasetRequest",
-    "UnregisterDatasetRequest",
-    "fastapi_routes",
-]
diff --git a/src/llama_stack_api/datasets/api.py b/src/llama_stack_api/datasets/api.py
deleted file mode 100644
index 981b438f0c..0000000000
--- a/src/llama_stack_api/datasets/api.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Datasets API protocol definition.
-
-This module contains the Datasets protocol definition.
-Pydantic models are defined in llama_stack_api.datasets.models.
-The FastAPI router is defined in llama_stack_api.datasets.fastapi_routes.
-"""
-
-from typing import Protocol, runtime_checkable
-
-from .models import (
-    Dataset,
-    GetDatasetRequest,
-    ListDatasetsResponse,
-    RegisterDatasetRequest,
-    UnregisterDatasetRequest,
-)
-
-
-@runtime_checkable
-class Datasets(Protocol):
-    """Protocol for dataset management operations."""
-
-    async def register_dataset(self, request: RegisterDatasetRequest) -> Dataset: ...
-
-    async def get_dataset(self, request: GetDatasetRequest) -> Dataset: ...
-
-    async def list_datasets(self) -> ListDatasetsResponse: ...
-
-    async def unregister_dataset(self, request: UnregisterDatasetRequest) -> None: ...
diff --git a/src/llama_stack_api/datasets/fastapi_routes.py b/src/llama_stack_api/datasets/fastapi_routes.py
deleted file mode 100644
index 07a32a59f4..0000000000
--- a/src/llama_stack_api/datasets/fastapi_routes.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""FastAPI router for the Datasets API.
-
-This module defines the FastAPI router for the Datasets API using standard
-FastAPI route decorators.
-"""
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body, Depends
-
-from llama_stack_api.router_utils import create_path_dependency, standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1BETA
-
-from .api import Datasets
-from .models import (
-    Dataset,
-    GetDatasetRequest,
-    ListDatasetsResponse,
-    RegisterDatasetRequest,
-    UnregisterDatasetRequest,
-)
-
-# Path parameter dependencies for single-field models
-get_dataset_request = create_path_dependency(GetDatasetRequest)
-unregister_dataset_request = create_path_dependency(UnregisterDatasetRequest)
-
-
-def create_router(impl: Datasets) -> APIRouter:
-    """Create a FastAPI router for the Datasets API.
-
-    Args:
-        impl: The Datasets implementation instance
-
-    Returns:
-        APIRouter configured for the Datasets API
-    """
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1BETA}",
-        tags=["Datasets"],
-        responses=standard_responses,
-    )
-
-    @router.post(
-        "/datasets",
-        response_model=Dataset,
-        summary="Register a new dataset.",
-        description="Register a new dataset.",
-        responses={
-            200: {"description": "The registered dataset object."},
-        },
-        deprecated=True,
-    )
-    async def register_dataset(
-        request: Annotated[RegisterDatasetRequest, Body(...)],
-    ) -> Dataset:
-        return await impl.register_dataset(request)
-
-    @router.get(
-        "/datasets/{dataset_id:path}",
-        response_model=Dataset,
-        summary="Get a dataset by its ID.",
-        description="Get a dataset by its ID.",
-        responses={
-            200: {"description": "The dataset object."},
-        },
-    )
-    async def get_dataset(
-        request: Annotated[GetDatasetRequest, Depends(get_dataset_request)],
-    ) -> Dataset:
-        return await impl.get_dataset(request)
-
-    @router.get(
-        "/datasets",
-        response_model=ListDatasetsResponse,
-        summary="List all datasets.",
-        description="List all datasets.",
-        responses={
-            200: {"description": "A list of dataset objects."},
-        },
-    )
-    async def list_datasets() -> ListDatasetsResponse:
-        return await impl.list_datasets()
-
-    @router.delete(
-        "/datasets/{dataset_id:path}",
-        summary="Unregister a dataset by its ID.",
-        description="Unregister a dataset by its ID.",
-        responses={
-            200: {"description": "The dataset was successfully unregistered."},
-        },
-        deprecated=True,
-    )
-    async def unregister_dataset(
-        request: Annotated[UnregisterDatasetRequest, Depends(unregister_dataset_request)],
-    ) -> None:
-        return await impl.unregister_dataset(request)
-
-    return router
diff --git a/src/llama_stack_api/datasets/models.py b/src/llama_stack_api/datasets/models.py
deleted file mode 100644
index 6fda228e43..0000000000
--- a/src/llama_stack_api/datasets/models.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Pydantic models for Datasets API requests and responses.
-
-This module defines the request and response models for the Datasets API
-using Pydantic with Field descriptions for OpenAPI schema generation.
-"""
-
-from enum import Enum, StrEnum
-from typing import Annotated, Any, Literal
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.resource import Resource, ResourceType
-from llama_stack_api.schema_utils import json_schema_type, register_schema
-
-
-class DatasetPurpose(StrEnum):
-    """Purpose of the dataset. Each purpose has a required input data schema."""
-
-    eval_question_answer = "eval/question-answer"
-    """The dataset contains a question column and an answer column."""
-    eval_messages_answer = "eval/messages-answer"
-    """The dataset contains a messages column with list of messages and an answer column."""
-
-
-class DatasetType(Enum):
-    """Type of the dataset source."""
-
-    uri = "uri"
-    """The dataset can be obtained from a URI."""
-    rows = "rows"
-    """The dataset is stored in rows."""
-
-
-@json_schema_type
-class URIDataSource(BaseModel):
-    """A dataset that can be obtained from a URI."""
-
-    type: Literal["uri"] = Field(default="uri", description="The type of data source.")
-    uri: str = Field(
-        ...,
-        description='The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"',
-    )
-
-
-@json_schema_type
-class RowsDataSource(BaseModel):
-    """A dataset stored in rows."""
-
-    type: Literal["rows"] = Field(default="rows", description="The type of data source.")
-    rows: list[dict[str, Any]] = Field(
-        ...,
-        description='The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]',
-    )
-
-
-DataSource = Annotated[
-    URIDataSource | RowsDataSource,
-    Field(discriminator="type"),
-]
-register_schema(DataSource, name="DataSource")
-
-
-class CommonDatasetFields(BaseModel):
-    """Common fields for a dataset."""
-
-    purpose: DatasetPurpose = Field(..., description="Purpose of the dataset indicating its intended use")
-    source: DataSource = Field(..., description="Data source configuration for the dataset")
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this dataset",
-    )
-
-
-@json_schema_type
-class Dataset(CommonDatasetFields, Resource):
-    """Dataset resource for storing and accessing training or evaluation data."""
-
-    type: Literal[ResourceType.dataset] = Field(
-        default=ResourceType.dataset,
-        description="Type of resource, always 'dataset' for datasets",
-    )
-
-    @property
-    def dataset_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_dataset_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-@json_schema_type
-class ListDatasetsResponse(BaseModel):
-    """Response from listing datasets."""
-
-    data: list[Dataset] = Field(..., description="List of datasets")
-
-
-# Request models for each endpoint
-
-
-@json_schema_type
-class RegisterDatasetRequest(BaseModel):
-    """Request model for registering a dataset."""
-
-    purpose: DatasetPurpose = Field(..., description="The purpose of the dataset.")
-    source: DataSource = Field(..., description="The data source of the dataset.")
-    metadata: dict[str, Any] | None = Field(
-        default=None,
-        description="The metadata for the dataset.",
-    )
-    dataset_id: str | None = Field(
-        default=None,
-        description="The ID of the dataset. If not provided, an ID will be generated.",
-    )
-
-
-@json_schema_type
-class GetDatasetRequest(BaseModel):
-    """Request model for getting a dataset by ID."""
-
-    dataset_id: str = Field(..., description="The ID of the dataset to get.")
-
-
-@json_schema_type
-class UnregisterDatasetRequest(BaseModel):
-    """Request model for unregistering a dataset."""
-
-    dataset_id: str = Field(..., description="The ID of the dataset to unregister.")
-
-
-__all__ = [
-    "CommonDatasetFields",
-    "Dataset",
-    "DatasetPurpose",
-    "DatasetType",
-    "DataSource",
-    "RowsDataSource",
-    "URIDataSource",
-    "ListDatasetsResponse",
-    "RegisterDatasetRequest",
-    "GetDatasetRequest",
-    "UnregisterDatasetRequest",
-]
diff --git a/src/llama_stack_api/datatypes.py b/src/llama_stack_api/datatypes.py
index 900529bac2..1411d4bb20 100644
--- a/src/llama_stack_api/datatypes.py
+++ b/src/llama_stack_api/datatypes.py
@@ -11,11 +11,8 @@
 
 from pydantic import BaseModel, Field
 
-from llama_stack_api.benchmarks import Benchmark
-from llama_stack_api.datasets import Dataset
 from llama_stack_api.models import Model
 from llama_stack_api.schema_utils import json_schema_type
-from llama_stack_api.scoring_functions import ScoringFn
 from llama_stack_api.shields import Shield
 from llama_stack_api.tools import ToolGroup
 from llama_stack_api.vector_stores import VectorStore
@@ -100,16 +97,10 @@ class Api(Enum, metaclass=DynamicApiMeta):
     :cvar responses: Response orchestration and execution
     :cvar batches: Batch processing for asynchronous API requests
     :cvar vector_io: Vector database operations and queries
-    :cvar datasetio: Dataset input/output operations
-    :cvar scoring: Model output evaluation and scoring
-    :cvar eval: Model evaluation and benchmarking framework
     :cvar tool_runtime: Tool execution and management
     :cvar telemetry: Observability and system monitoring
     :cvar models: Model metadata and management
     :cvar shields: Safety shield implementations
-    :cvar datasets: Dataset creation and management
-    :cvar scoring_functions: Scoring function definitions
-    :cvar benchmarks: Benchmark suite management
     :cvar tool_groups: Tool group organization
     :cvar files: File storage and management
     :cvar file_processors: File parsing and processing operations
@@ -125,17 +116,11 @@ class Api(Enum, metaclass=DynamicApiMeta):
     responses = "responses"
     batches = "batches"
     vector_io = "vector_io"
-    datasetio = "datasetio"
-    scoring = "scoring"
-    eval = "eval"
     tool_runtime = "tool_runtime"
 
     models = "models"
     shields = "shields"
     vector_stores = "vector_stores"  # only used for routing table
-    datasets = "datasets"
-    scoring_functions = "scoring_functions"
-    benchmarks = "benchmarks"
     tool_groups = "tool_groups"
     files = "files"
     file_processors = "file_processors"
@@ -237,28 +222,6 @@ async def register_vector_store(self, vector_store: VectorStore) -> None: ...
     async def unregister_vector_store(self, vector_store_id: str) -> None: ...
 
 
-class DatasetsProtocolPrivate(Protocol):
-    """Protocol for provider-side dataset registration and unregistration."""
-
-    async def register_dataset(self, dataset: Dataset) -> None: ...
-
-    async def unregister_dataset(self, dataset_id: str) -> None: ...
-
-
-class ScoringFunctionsProtocolPrivate(Protocol):
-    """Protocol for provider-side scoring function listing and registration."""
-
-    async def list_scoring_functions(self) -> list[ScoringFn]: ...
-
-    async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ...
-
-
-class BenchmarksProtocolPrivate(Protocol):
-    """Protocol for provider-side benchmark registration."""
-
-    async def register_benchmark(self, benchmark: Benchmark) -> None: ...
-
-
 class ToolGroupsProtocolPrivate(Protocol):
     """Protocol for provider-side tool group registration and unregistration."""
 
diff --git a/src/llama_stack_api/eval/__init__.py b/src/llama_stack_api/eval/__init__.py
deleted file mode 100644
index 0f97a1d244..0000000000
--- a/src/llama_stack_api/eval/__init__.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api.common.job_types import Job
-
-from . import fastapi_routes
-from .api import Eval
-from .compat import (
-    resolve_evaluate_rows_request,
-    resolve_job_cancel_request,
-    resolve_job_result_request,
-    resolve_job_status_request,
-    resolve_run_eval_request,
-)
-from .models import (
-    BenchmarkConfig,
-    BenchmarkIdRequest,
-    EvalCandidate,
-    EvaluateResponse,
-    EvaluateRowsBodyRequest,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    ModelCandidate,
-    RunEvalBodyRequest,
-    RunEvalRequest,
-)
-
-__all__ = [
-    "Eval",
-    "BenchmarkConfig",
-    "BenchmarkIdRequest",
-    "EvalCandidate",
-    "EvaluateResponse",
-    "EvaluateRowsBodyRequest",
-    "EvaluateRowsRequest",
-    "Job",
-    "JobCancelRequest",
-    "JobResultRequest",
-    "JobStatusRequest",
-    "ModelCandidate",
-    "RunEvalBodyRequest",
-    "RunEvalRequest",
-    "fastapi_routes",
-    # Backward compatibility helpers
-    "resolve_run_eval_request",
-    "resolve_evaluate_rows_request",
-    "resolve_job_status_request",
-    "resolve_job_cancel_request",
-    "resolve_job_result_request",
-]
diff --git a/src/llama_stack_api/eval/api.py b/src/llama_stack_api/eval/api.py
deleted file mode 100644
index 547b0f3757..0000000000
--- a/src/llama_stack_api/eval/api.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Protocol, runtime_checkable
-
-from llama_stack_api.common.job_types import Job
-
-from .models import (
-    EvaluateResponse,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    RunEvalRequest,
-)
-
-
-@runtime_checkable
-class Eval(Protocol):
-    """Evaluations
-
-    Llama Stack Evaluation API for running evaluations on model and agent candidates."""
-
-    async def run_eval(
-        self,
-        request: RunEvalRequest,
-    ) -> Job:
-        """Run an evaluation on a benchmark."""
-        ...
-
-    async def evaluate_rows(
-        self,
-        request: EvaluateRowsRequest,
-    ) -> EvaluateResponse:
-        """Evaluate a list of rows on a benchmark."""
-        ...
-
-    async def job_status(self, request: JobStatusRequest) -> Job:
-        """Get the status of a job."""
-        ...
-
-    async def job_cancel(self, request: JobCancelRequest) -> None:
-        """Cancel a job."""
-        ...
-
-    async def job_result(self, request: JobResultRequest) -> EvaluateResponse:
-        """Get the result of a job."""
-        ...
diff --git a/src/llama_stack_api/eval/compat.py b/src/llama_stack_api/eval/compat.py
deleted file mode 100644
index 81ff485803..0000000000
--- a/src/llama_stack_api/eval/compat.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""
-Backward compatibility helpers for the Eval API.
-
-This module provides utilities to support both the old-style (individual parameters)
-and new-style (request objects) calling conventions for Eval API methods.
-
-The old-style parameters are deprecated and will be removed in a future release.
-
-Note: When both a request object AND individual parameters are provided, the request
-object takes precedence and individual parameters are ignored.
-"""
-
-import warnings
-from typing import Any
-
-from .models import (
-    BenchmarkConfig,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    RunEvalRequest,
-)
-
-_DEPRECATION_TARGET = "0.6.0"
-
-_DEPRECATION_MESSAGE = (
-    "Passing individual parameters to {method_name}() is deprecated. "
-    "Please use {request_class}(benchmark_id=..., ...) instead. "
-    "This will be removed in version {target}."
-)
-
-
-def _emit_deprecation_warning(method_name: str, request_class: str) -> None:
-    """Emit a deprecation warning for old-style parameter usage."""
-    warnings.warn(
-        _DEPRECATION_MESSAGE.format(method_name=method_name, request_class=request_class, target=_DEPRECATION_TARGET),
-        DeprecationWarning,
-        stacklevel=4,
-    )
-
-
-def _format_missing_params(required: list[str], provided: dict[str, Any]) -> str:
-    """Format error message showing which parameters are missing."""
-    missing = [p for p in required if provided.get(p) is None]
-    provided_names = [p for p in required if provided.get(p) is not None]
-
-    parts = []
-    if missing:
-        parts.append(f"missing: {', '.join(missing)}")
-    if provided_names:
-        parts.append(f"provided: {', '.join(provided_names)}")
-
-    return "; ".join(parts)
-
-
-def _validate_not_empty(value: Any, name: str) -> None:
-    """Validate that a value is not None, empty string, or empty list."""
-    if not value:
-        raise ValueError(f"'{name}' cannot be None or empty. Provided: {value}")
-
-
-def resolve_run_eval_request(
-    request: RunEvalRequest | None = None,
-    *,
-    benchmark_id: str | None = None,
-    benchmark_config: BenchmarkConfig | None = None,
-) -> RunEvalRequest:
-    """
-    Resolve run_eval parameters to a RunEvalRequest object.
-
-    Supports both new-style (request object) and old-style (individual parameters).
-    Old-style usage emits a DeprecationWarning.
-
-    Note: If both request object and individual parameters are provided, the request
-    object takes precedence and individual parameters are ignored.
-
-    Args:
-        request: The new-style request object (preferred)
-        benchmark_id: (Deprecated) The benchmark ID
-        benchmark_config: (Deprecated) The benchmark configuration
-
-    Returns:
-        RunEvalRequest object
-    """
-    if request is not None:
-        _validate_not_empty(request.benchmark_id, "benchmark_id")
-        _validate_not_empty(request.benchmark_config, "benchmark_config")
-        return request
-
-    # Old-style parameters
-    if benchmark_id and benchmark_config:
-        _emit_deprecation_warning("run_eval", "RunEvalRequest")
-        return RunEvalRequest(
-            benchmark_id=benchmark_id,
-            benchmark_config=benchmark_config,
-        )
-
-    required = ["benchmark_id", "benchmark_config"]
-    provided = {"benchmark_id": benchmark_id, "benchmark_config": benchmark_config}
-    raise ValueError(
-        f"Either 'request' (RunEvalRequest) or both 'benchmark_id' and 'benchmark_config' "
-        f"must be provided. {_format_missing_params(required, provided)}"
-    )
-
-
-def resolve_evaluate_rows_request(
-    request: EvaluateRowsRequest | None = None,
-    *,
-    benchmark_id: str | None = None,
-    input_rows: list[dict[str, Any]] | None = None,
-    scoring_functions: list[str] | None = None,
-    benchmark_config: BenchmarkConfig | None = None,
-) -> EvaluateRowsRequest:
-    """
-    Resolve evaluate_rows parameters to an EvaluateRowsRequest object.
-
-    Supports both new-style (request object) and old-style (individual parameters).
-    Old-style usage emits a DeprecationWarning.
-
-    Note: If both request object and individual parameters are provided, the request
-    object takes precedence and individual parameters are ignored.
-
-    Args:
-        request: The new-style request object (preferred)
-        benchmark_id: (Deprecated) The benchmark ID
-        input_rows: (Deprecated) The rows to evaluate
-        scoring_functions: (Deprecated) The scoring functions to use
-        benchmark_config: (Deprecated) The benchmark configuration
-
-    Returns:
-        EvaluateRowsRequest object
-    """
-    if request is not None:
-        _validate_not_empty(request.benchmark_id, "benchmark_id")
-        _validate_not_empty(request.input_rows, "input_rows")
-        _validate_not_empty(request.scoring_functions, "scoring_functions")
-        _validate_not_empty(request.benchmark_config, "benchmark_config")
-        return request
-
-    # Old-style parameters
-    if benchmark_id and input_rows and scoring_functions and benchmark_config:
-        _emit_deprecation_warning("evaluate_rows", "EvaluateRowsRequest")
-        return EvaluateRowsRequest(
-            benchmark_id=benchmark_id,
-            input_rows=input_rows,
-            scoring_functions=scoring_functions,
-            benchmark_config=benchmark_config,
-        )
-
-    required = ["benchmark_id", "input_rows", "scoring_functions", "benchmark_config"]
-    provided = {
-        "benchmark_id": benchmark_id,
-        "input_rows": input_rows,
-        "scoring_functions": scoring_functions,
-        "benchmark_config": benchmark_config,
-    }
-    raise ValueError(
-        f"Either 'request' (EvaluateRowsRequest) or all of 'benchmark_id', 'input_rows', "
-        f"'scoring_functions', and 'benchmark_config' must be provided. "
-        f"{_format_missing_params(required, provided)}"
-    )
-
-
-def resolve_job_status_request(
-    request: JobStatusRequest | None = None,
-    *,
-    benchmark_id: str | None = None,
-    job_id: str | None = None,
-) -> JobStatusRequest:
-    """
-    Resolve job_status parameters to a JobStatusRequest object.
-
-    Supports both new-style (request object) and old-style (individual parameters).
-    Old-style usage emits a DeprecationWarning.
-
-    Note: If both request object and individual parameters are provided, the request
-    object takes precedence and individual parameters are ignored.
-
-    Args:
-        request: The new-style request object (preferred)
-        benchmark_id: (Deprecated) The benchmark ID
-        job_id: (Deprecated) The job ID
-
-    Returns:
-        JobStatusRequest object
-    """
-    if request is not None:
-        _validate_not_empty(request.benchmark_id, "benchmark_id")
-        _validate_not_empty(request.job_id, "job_id")
-        return request
-
-    # Old-style parameters
-    if benchmark_id and job_id:
-        _emit_deprecation_warning("job_status", "JobStatusRequest")
-        return JobStatusRequest(
-            benchmark_id=benchmark_id,
-            job_id=job_id,
-        )
-
-    required = ["benchmark_id", "job_id"]
-    provided = {"benchmark_id": benchmark_id, "job_id": job_id}
-    raise ValueError(
-        f"Either 'request' (JobStatusRequest) or both 'benchmark_id' and 'job_id' "
-        f"must be provided. {_format_missing_params(required, provided)}"
-    )
-
-
-def resolve_job_cancel_request(
-    request: JobCancelRequest | None = None,
-    *,
-    benchmark_id: str | None = None,
-    job_id: str | None = None,
-) -> JobCancelRequest:
-    """
-    Resolve job_cancel parameters to a JobCancelRequest object.
-
-    Supports both new-style (request object) and old-style (individual parameters).
-    Old-style usage emits a DeprecationWarning.
-
-    Note: If both request object and individual parameters are provided, the request
-    object takes precedence and individual parameters are ignored.
-
-    Args:
-        request: The new-style request object (preferred)
-        benchmark_id: (Deprecated) The benchmark ID
-        job_id: (Deprecated) The job ID
-
-    Returns:
-        JobCancelRequest object
-    """
-    if request is not None:
-        _validate_not_empty(request.benchmark_id, "benchmark_id")
-        _validate_not_empty(request.job_id, "job_id")
-        return request
-
-    # Old-style parameters
-    if benchmark_id and job_id:
-        _emit_deprecation_warning("job_cancel", "JobCancelRequest")
-        return JobCancelRequest(
-            benchmark_id=benchmark_id,
-            job_id=job_id,
-        )
-
-    required = ["benchmark_id", "job_id"]
-    provided = {"benchmark_id": benchmark_id, "job_id": job_id}
-    raise ValueError(
-        f"Either 'request' (JobCancelRequest) or both 'benchmark_id' and 'job_id' "
-        f"must be provided. {_format_missing_params(required, provided)}"
-    )
-
-
-def resolve_job_result_request(
-    request: JobResultRequest | None = None,
-    *,
-    benchmark_id: str | None = None,
-    job_id: str | None = None,
-) -> JobResultRequest:
-    """
-    Resolve job_result parameters to a JobResultRequest object.
-
-    Supports both new-style (request object) and old-style (individual parameters).
-    Old-style usage emits a DeprecationWarning.
-
-    Note: If both request object and individual parameters are provided, the request
-    object takes precedence and individual parameters are ignored.
-
-    Args:
-        request: The new-style request object (preferred)
-        benchmark_id: (Deprecated) The benchmark ID
-        job_id: (Deprecated) The job ID
-
-    Returns:
-        JobResultRequest object
-    """
-    if request is not None:
-        _validate_not_empty(request.benchmark_id, "benchmark_id")
-        _validate_not_empty(request.job_id, "job_id")
-        return request
-
-    # Old-style parameters
-    if benchmark_id and job_id:
-        _emit_deprecation_warning("job_result", "JobResultRequest")
-        return JobResultRequest(
-            benchmark_id=benchmark_id,
-            job_id=job_id,
-        )
-
-    required = ["benchmark_id", "job_id"]
-    provided = {"benchmark_id": benchmark_id, "job_id": job_id}
-    raise ValueError(
-        f"Either 'request' (JobResultRequest) or both 'benchmark_id' and 'job_id' "
-        f"must be provided. {_format_missing_params(required, provided)}"
-    )
diff --git a/src/llama_stack_api/eval/fastapi_routes.py b/src/llama_stack_api/eval/fastapi_routes.py
deleted file mode 100644
index b6e2b812d4..0000000000
--- a/src/llama_stack_api/eval/fastapi_routes.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body, Depends
-
-from llama_stack_api.common.job_types import Job
-from llama_stack_api.router_utils import create_path_dependency, standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1ALPHA
-
-from .api import Eval
-from .models import (
-    BenchmarkIdRequest,
-    EvaluateResponse,
-    EvaluateRowsBodyRequest,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    RunEvalBodyRequest,
-    RunEvalRequest,
-)
-
-get_benchmark_id_request = create_path_dependency(BenchmarkIdRequest)
-
-
-def create_router(impl: Eval) -> APIRouter:
-    """Create a FastAPI router for the Eval API."""
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1ALPHA}",
-        tags=["Eval"],
-        responses=standard_responses,
-    )
-
-    @router.post(
-        "/eval/benchmarks/{benchmark_id}/jobs",
-        response_model=Job,
-        summary="Run Eval",
-        description="Run an evaluation on a benchmark.",
-        responses={
-            200: {"description": "The job that was created to run the evaluation."},
-        },
-    )
-    async def run_eval(
-        benchmark_id_request: Annotated[BenchmarkIdRequest, Depends(get_benchmark_id_request)],
-        body_request: Annotated[RunEvalBodyRequest, Body(...)],
-    ) -> Job:
-        request = RunEvalRequest(
-            benchmark_id=benchmark_id_request.benchmark_id,
-            benchmark_config=body_request.benchmark_config,
-        )
-        return await impl.run_eval(request)
-
-    @router.post(
-        "/eval/benchmarks/{benchmark_id}/evaluations",
-        response_model=EvaluateResponse,
-        summary="Evaluate Rows",
-        description="Evaluate a list of rows on a benchmark.",
-        responses={
-            200: {"description": "EvaluateResponse object containing generations and scores."},
-        },
-    )
-    async def evaluate_rows(
-        benchmark_id_request: Annotated[BenchmarkIdRequest, Depends(get_benchmark_id_request)],
-        body_request: Annotated[EvaluateRowsBodyRequest, Body(...)],
-    ) -> EvaluateResponse:
-        request = EvaluateRowsRequest(
-            benchmark_id=benchmark_id_request.benchmark_id,
-            input_rows=body_request.input_rows,
-            scoring_functions=body_request.scoring_functions,
-            benchmark_config=body_request.benchmark_config,
-        )
-        return await impl.evaluate_rows(request)
-
-    @router.get(
-        "/eval/benchmarks/{benchmark_id}/jobs/{job_id}",
-        response_model=Job,
-        summary="Job Status",
-        description="Get the status of a job.",
-        responses={
-            200: {"description": "The status of the evaluation job."},
-        },
-    )
-    async def job_status(
-        benchmark_id: str,
-        job_id: str,
-    ) -> Job:
-        request = JobStatusRequest(benchmark_id=benchmark_id, job_id=job_id)
-        return await impl.job_status(request)
-
-    @router.delete(
-        "/eval/benchmarks/{benchmark_id}/jobs/{job_id}",
-        summary="Job Cancel",
-        description="Cancel a job.",
-        responses={
-            200: {"description": "Successful Response"},
-        },
-    )
-    async def job_cancel(
-        benchmark_id: str,
-        job_id: str,
-    ) -> None:
-        request = JobCancelRequest(benchmark_id=benchmark_id, job_id=job_id)
-        return await impl.job_cancel(request)
-
-    @router.get(
-        "/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result",
-        response_model=EvaluateResponse,
-        summary="Job Result",
-        description="Get the result of a job.",
-        responses={
-            200: {"description": "The result of the job."},
-        },
-    )
-    async def job_result(
-        benchmark_id: str,
-        job_id: str,
-    ) -> EvaluateResponse:
-        request = JobResultRequest(benchmark_id=benchmark_id, job_id=job_id)
-        return await impl.job_result(request)
-
-    return router
diff --git a/src/llama_stack_api/eval/models.py b/src/llama_stack_api/eval/models.py
deleted file mode 100644
index ec5db00b7f..0000000000
--- a/src/llama_stack_api/eval/models.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Literal
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.inference import SamplingParams, SystemMessage
-from llama_stack_api.schema_utils import json_schema_type
-from llama_stack_api.scoring import ScoringResult
-from llama_stack_api.scoring_functions import ScoringFnParams
-
-
-@json_schema_type
-class ModelCandidate(BaseModel):
-    """A model candidate for evaluation."""
-
-    type: Literal["model"] = "model"
-    model: str = Field(..., description="The model ID to evaluate", min_length=1)
-    sampling_params: SamplingParams = Field(..., description="The sampling parameters for the model")
-    system_message: SystemMessage | None = Field(
-        None, description="The system message providing instructions or context to the model"
-    )
-
-
-EvalCandidate = ModelCandidate
-
-
-@json_schema_type
-class BenchmarkConfig(BaseModel):
-    """A benchmark configuration for evaluation."""
-
-    eval_candidate: EvalCandidate = Field(..., description="The candidate to evaluate")
-    scoring_params: dict[str, ScoringFnParams] = Field(
-        default_factory=dict,
-        description="Map between scoring function id and parameters for each scoring function you want to run",
-    )
-    num_examples: int | None = Field(
-        None,
-        description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
-        ge=1,
-    )
-    # we could optinally add any specific dataset config here
-
-
-@json_schema_type
-class EvaluateResponse(BaseModel):
-    """The response from an evaluation."""
-
-    generations: list[dict[str, Any]] = Field(..., description="The generations from the evaluation")
-    scores: dict[str, ScoringResult] = Field(
-        ..., description="The scores from the evaluation. Each key in the dict is a scoring function name"
-    )
-
-
-@json_schema_type
-class BenchmarkIdRequest(BaseModel):
-    """Request model containing benchmark_id path parameter."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark", min_length=1)
-
-
-@json_schema_type
-class RunEvalRequest(BaseModel):
-    """Request model for running an evaluation on a benchmark."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark to run the evaluation on", min_length=1)
-    benchmark_config: BenchmarkConfig = Field(..., description="The configuration for the benchmark")
-
-
-@json_schema_type
-class RunEvalBodyRequest(BaseModel):
-    """Request body model for running an evaluation (without path parameter)."""
-
-    benchmark_config: BenchmarkConfig = Field(..., description="The configuration for the benchmark")
-
-
-@json_schema_type
-class EvaluateRowsRequest(BaseModel):
-    """Request model for evaluating a list of rows on a benchmark."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark to run the evaluation on", min_length=1)
-    input_rows: list[dict[str, Any]] = Field(..., description="The rows to evaluate", min_length=1)
-    scoring_functions: list[str] = Field(
-        ..., description="The scoring functions to use for the evaluation", min_length=1
-    )
-    benchmark_config: BenchmarkConfig = Field(..., description="The configuration for the benchmark")
-
-
-@json_schema_type
-class EvaluateRowsBodyRequest(BaseModel):
-    """Request body model for evaluating rows (without path parameter)."""
-
-    input_rows: list[dict[str, Any]] = Field(..., description="The rows to evaluate", min_length=1)
-    scoring_functions: list[str] = Field(
-        ..., description="The scoring functions to use for the evaluation", min_length=1
-    )
-    benchmark_config: BenchmarkConfig = Field(..., description="The configuration for the benchmark")
-
-
-@json_schema_type
-class JobStatusRequest(BaseModel):
-    """Request model for getting the status of a job."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark associated with the job", min_length=1)
-    job_id: str = Field(..., description="The ID of the job to get the status of", min_length=1)
-
-
-@json_schema_type
-class JobCancelRequest(BaseModel):
-    """Request model for canceling a job."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark associated with the job", min_length=1)
-    job_id: str = Field(..., description="The ID of the job to cancel", min_length=1)
-
-
-@json_schema_type
-class JobResultRequest(BaseModel):
-    """Request model for getting the result of a job."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark associated with the job", min_length=1)
-    job_id: str = Field(..., description="The ID of the job to get the result of", min_length=1)
-
-
-__all__ = [
-    "ModelCandidate",
-    "EvalCandidate",
-    "BenchmarkConfig",
-    "EvaluateResponse",
-    "BenchmarkIdRequest",
-    "RunEvalRequest",
-    "RunEvalBodyRequest",
-    "EvaluateRowsRequest",
-    "EvaluateRowsBodyRequest",
-    "JobStatusRequest",
-    "JobCancelRequest",
-    "JobResultRequest",
-]
diff --git a/src/llama_stack_api/pyproject.toml b/src/llama_stack_api/pyproject.toml
index c8e2f40b35..4b3f67a011 100644
--- a/src/llama_stack_api/pyproject.toml
+++ b/src/llama_stack_api/pyproject.toml
@@ -46,12 +46,8 @@ packages = [
     "llama_stack_api.admin",
     "llama_stack_api.responses",
     "llama_stack_api.batches",
-    "llama_stack_api.benchmarks",
     "llama_stack_api.common",
     "llama_stack_api.conversations",
-    "llama_stack_api.datasetio",
-    "llama_stack_api.datasets",
-    "llama_stack_api.eval",
     "llama_stack_api.file_processors",
     "llama_stack_api.files",
     "llama_stack_api.inspect_api",
@@ -62,9 +58,7 @@ packages = [
 
     "llama_stack_api.providers",
     "llama_stack_api.shields",
-    "llama_stack_api.scoring_functions",
     "llama_stack_api.prompts",
-    "llama_stack_api.scoring",
     "llama_stack_api.safety",
     "llama_stack_api.tools",
     "llama_stack_api.vector_io",
@@ -85,7 +79,6 @@ py-modules = [
     "llama_stack_api.vector_stores",
     "llama_stack_api.version",
     "llama_stack_api.validators",
-    "llama_stack_api.helpers",
 ]
 
 [tool.setuptools.package-data]
diff --git a/src/llama_stack_api/resource.py b/src/llama_stack_api/resource.py
index 3311597ee5..f02ec85a2a 100644
--- a/src/llama_stack_api/resource.py
+++ b/src/llama_stack_api/resource.py
@@ -15,9 +15,6 @@ class ResourceType(StrEnum):
     model = "model"
     shield = "shield"
     vector_store = "vector_store"
-    dataset = "dataset"
-    scoring_function = "scoring_function"
-    benchmark = "benchmark"
     tool = "tool"
     tool_group = "tool_group"
     prompt = "prompt"
diff --git a/src/llama_stack_api/scoring/__init__.py b/src/llama_stack_api/scoring/__init__.py
deleted file mode 100644
index db5fcab4d1..0000000000
--- a/src/llama_stack_api/scoring/__init__.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Scoring API protocol and models.
-
-This module contains the Scoring protocol definition.
-Pydantic models are defined in llama_stack_api.scoring.models.
-The FastAPI router is defined in llama_stack_api.scoring.fastapi_routes.
-"""
-
-# Import fastapi_routes for router factory access
-# Import scoring_functions for re-export
-from llama_stack_api.scoring_functions import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    CommonScoringFnFields,
-    ListScoringFunctionsResponse,
-    LLMAsJudgeScoringFnParams,
-    RegexParserScoringFnParams,
-    ScoringFn,
-    ScoringFnInput,
-    ScoringFnParams,
-    ScoringFnParamsType,
-    ScoringFunctions,
-)
-
-from . import fastapi_routes
-
-# Import protocol for FastAPI router
-from .api import Scoring, ScoringFunctionStore
-
-# Import models for re-export
-from .models import (
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    ScoringResult,
-    ScoringResultRow,
-)
-
-__all__ = [
-    "Scoring",
-    "ScoringFunctionStore",
-    "ScoringResult",
-    "ScoringResultRow",
-    "ScoreBatchResponse",
-    "ScoreResponse",
-    "ScoreRequest",
-    "ScoreBatchRequest",
-    "AggregationFunctionType",
-    "BasicScoringFnParams",
-    "CommonScoringFnFields",
-    "LLMAsJudgeScoringFnParams",
-    "ListScoringFunctionsResponse",
-    "RegexParserScoringFnParams",
-    "ScoringFn",
-    "ScoringFnInput",
-    "ScoringFnParams",
-    "ScoringFnParamsType",
-    "ScoringFunctions",
-    "fastapi_routes",
-]
diff --git a/src/llama_stack_api/scoring/api.py b/src/llama_stack_api/scoring/api.py
deleted file mode 100644
index 9263eb06cd..0000000000
--- a/src/llama_stack_api/scoring/api.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Scoring API protocol definition.
-
-This module contains the Scoring protocol definition.
-Pydantic models are defined in llama_stack_api.scoring.models.
-The FastAPI router is defined in llama_stack_api.scoring.fastapi_routes.
-"""
-
-from typing import Protocol, runtime_checkable
-
-from llama_stack_api.scoring_functions import ScoringFn
-
-from .models import ScoreBatchRequest, ScoreBatchResponse, ScoreRequest, ScoreResponse
-
-
-class ScoringFunctionStore(Protocol):
-    """Protocol for storing and retrieving scoring functions."""
-
-    def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn: ...
-
-
-@runtime_checkable
-class Scoring(Protocol):
-    """Protocol for scoring operations."""
-
-    scoring_function_store: ScoringFunctionStore
-
-    async def score_batch(self, request: ScoreBatchRequest) -> ScoreBatchResponse: ...
-
-    async def score(self, request: ScoreRequest) -> ScoreResponse: ...
diff --git a/src/llama_stack_api/scoring/fastapi_routes.py b/src/llama_stack_api/scoring/fastapi_routes.py
deleted file mode 100644
index fe1df9a289..0000000000
--- a/src/llama_stack_api/scoring/fastapi_routes.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""FastAPI router for the Scoring API.
-
-This module defines the FastAPI router for the Scoring API using standard
-FastAPI route decorators.
-"""
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body
-
-from llama_stack_api.router_utils import standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1
-
-from .api import Scoring
-from .models import ScoreBatchRequest, ScoreBatchResponse, ScoreRequest, ScoreResponse
-
-
-def create_router(impl: Scoring) -> APIRouter:
-    """Create a FastAPI router for the Scoring API.
-
-    Args:
-        impl: The Scoring implementation instance
-
-    Returns:
-        APIRouter configured for the Scoring API
-    """
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1}",
-        tags=["Scoring"],
-        responses=standard_responses,
-    )
-
-    @router.post(
-        "/scoring/score",
-        response_model=ScoreResponse,
-        summary="Score a list of rows.",
-        description="Score a list of rows.",
-        responses={
-            200: {"description": "A ScoreResponse object containing rows and aggregated results."},
-        },
-    )
-    async def score(
-        request: Annotated[ScoreRequest, Body(...)],
-    ) -> ScoreResponse:
-        return await impl.score(request)
-
-    @router.post(
-        "/scoring/score-batch",
-        response_model=ScoreBatchResponse,
-        summary="Score a batch of rows.",
-        description="Score a batch of rows.",
-        responses={
-            200: {"description": "A ScoreBatchResponse."},
-        },
-    )
-    async def score_batch(
-        request: Annotated[ScoreBatchRequest, Body(...)],
-    ) -> ScoreBatchResponse:
-        return await impl.score_batch(request)
-
-    return router
diff --git a/src/llama_stack_api/scoring/models.py b/src/llama_stack_api/scoring/models.py
deleted file mode 100644
index 77edfc74d1..0000000000
--- a/src/llama_stack_api/scoring/models.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Pydantic models for Scoring API requests and responses.
-
-This module defines the request and response models for the Scoring API
-using Pydantic with Field descriptions for OpenAPI schema generation.
-"""
-
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.schema_utils import json_schema_type
-from llama_stack_api.scoring_functions import ScoringFnParams
-
-# mapping of metric to value
-ScoringResultRow = dict[str, Any]
-
-
-@json_schema_type
-class ScoringResult(BaseModel):
-    """
-    A scoring result for a single row.
-    """
-
-    score_rows: list[ScoringResultRow] = Field(
-        ..., description="The scoring result for each row. Each row is a map of column name to value."
-    )
-    aggregated_results: dict[str, Any] = Field(..., description="Map of metric name to aggregated value")
-
-
-@json_schema_type
-class ScoreBatchResponse(BaseModel):
-    """Response from batch scoring operations on datasets."""
-
-    dataset_id: str | None = Field(default=None, description="(Optional) The identifier of the dataset that was scored")
-    results: dict[str, ScoringResult] = Field(..., description="A map of scoring function name to ScoringResult")
-
-
-@json_schema_type
-class ScoreResponse(BaseModel):
-    """
-    The response from scoring.
-    """
-
-    results: dict[str, ScoringResult] = Field(..., description="A map of scoring function name to ScoringResult.")
-
-
-@json_schema_type
-class ScoreRequest(BaseModel):
-    """Request model for scoring a list of rows."""
-
-    input_rows: list[dict[str, Any]] = Field(..., description="The rows to score.")
-    scoring_functions: dict[str, ScoringFnParams | None] = Field(
-        ..., description="The scoring functions to use for the scoring."
-    )
-
-
-@json_schema_type
-class ScoreBatchRequest(BaseModel):
-    """Request model for scoring a batch of rows from a dataset."""
-
-    dataset_id: str = Field(..., description="The ID of the dataset to score.")
-    scoring_functions: dict[str, ScoringFnParams | None] = Field(
-        ..., description="The scoring functions to use for the scoring."
-    )
-    save_results_dataset: bool = Field(default=False, description="Whether to save the results to a dataset.")
-
-
-__all__ = [
-    "ScoringResult",
-    "ScoringResultRow",
-    "ScoreBatchResponse",
-    "ScoreResponse",
-    "ScoreRequest",
-    "ScoreBatchRequest",
-]
diff --git a/src/llama_stack_api/scoring_functions/__init__.py b/src/llama_stack_api/scoring_functions/__init__.py
deleted file mode 100644
index db9047e26f..0000000000
--- a/src/llama_stack_api/scoring_functions/__init__.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""ScoringFunctions API protocol and models.
-
-This module contains the ScoringFunctions protocol definition.
-Pydantic models are defined in llama_stack_api.scoring_functions.models.
-The FastAPI router is defined in llama_stack_api.scoring_functions.fastapi_routes.
-"""
-
-from . import fastapi_routes
-from .api import ScoringFunctions
-from .models import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    CommonScoringFnFields,
-    GetScoringFunctionRequest,
-    ListScoringFunctionsRequest,
-    ListScoringFunctionsResponse,
-    LLMAsJudgeScoringFnParams,
-    RegexParserScoringFnParams,
-    RegisterScoringFunctionRequest,
-    ScoringFn,
-    ScoringFnInput,
-    ScoringFnParams,
-    ScoringFnParamsType,
-    UnregisterScoringFunctionRequest,
-)
-
-__all__ = [
-    "ScoringFunctions",
-    "ScoringFn",
-    "ScoringFnInput",
-    "ScoringFnParams",
-    "ScoringFnParamsType",
-    "AggregationFunctionType",
-    "LLMAsJudgeScoringFnParams",
-    "RegexParserScoringFnParams",
-    "BasicScoringFnParams",
-    "CommonScoringFnFields",
-    "ListScoringFunctionsResponse",
-    "ListScoringFunctionsRequest",
-    "GetScoringFunctionRequest",
-    "RegisterScoringFunctionRequest",
-    "UnregisterScoringFunctionRequest",
-    "fastapi_routes",
-]
diff --git a/src/llama_stack_api/scoring_functions/api.py b/src/llama_stack_api/scoring_functions/api.py
deleted file mode 100644
index bdb4bd0bcb..0000000000
--- a/src/llama_stack_api/scoring_functions/api.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Protocol, runtime_checkable
-
-from .models import (
-    GetScoringFunctionRequest,
-    ListScoringFunctionsRequest,
-    ListScoringFunctionsResponse,
-    RegisterScoringFunctionRequest,
-    ScoringFn,
-    UnregisterScoringFunctionRequest,
-)
-
-
-@runtime_checkable
-class ScoringFunctions(Protocol):
-    """Protocol for managing scoring function resources."""
-
-    async def list_scoring_functions(
-        self,
-        request: ListScoringFunctionsRequest,
-    ) -> ListScoringFunctionsResponse: ...
-
-    async def get_scoring_function(
-        self,
-        request: GetScoringFunctionRequest,
-    ) -> ScoringFn: ...
-
-    async def register_scoring_function(
-        self,
-        request: RegisterScoringFunctionRequest,
-    ) -> None: ...
-
-    async def unregister_scoring_function(
-        self,
-        request: UnregisterScoringFunctionRequest,
-    ) -> None: ...
diff --git a/src/llama_stack_api/scoring_functions/fastapi_routes.py b/src/llama_stack_api/scoring_functions/fastapi_routes.py
deleted file mode 100644
index 4d85d7b358..0000000000
--- a/src/llama_stack_api/scoring_functions/fastapi_routes.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""FastAPI router for the ScoringFunctions API.
-
-This module defines the FastAPI router for the ScoringFunctions API using standard
-FastAPI route decorators.
-
-The router is defined in the API package to keep all API-related code together.
-"""
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body, Depends
-
-from llama_stack_api.router_utils import create_path_dependency, create_query_dependency, standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1
-
-from .api import ScoringFunctions
-from .models import (
-    GetScoringFunctionRequest,
-    ListScoringFunctionsRequest,
-    ListScoringFunctionsResponse,
-    RegisterScoringFunctionRequest,
-    ScoringFn,
-    UnregisterScoringFunctionRequest,
-)
-
-get_list_scoring_functions_request = create_query_dependency(ListScoringFunctionsRequest)
-get_get_scoring_function_request = create_path_dependency(GetScoringFunctionRequest)
-get_unregister_scoring_function_request = create_path_dependency(UnregisterScoringFunctionRequest)
-
-
-def create_router(impl: ScoringFunctions) -> APIRouter:
-    """Create a FastAPI router for the ScoringFunctions API.
-
-    Args:
-        impl: The ScoringFunctions implementation instance
-
-    Returns:
-        APIRouter configured for the ScoringFunctions API
-    """
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1}",
-        tags=["Scoring Functions"],
-        responses=standard_responses,
-    )
-
-    @router.get(
-        "/scoring-functions",
-        response_model=ListScoringFunctionsResponse,
-        summary="List all scoring functions.",
-        description="List all scoring functions.",
-        responses={
-            200: {"description": "A ListScoringFunctionsResponse."},
-        },
-    )
-    async def list_scoring_functions(
-        request: Annotated[ListScoringFunctionsRequest, Depends(get_list_scoring_functions_request)],
-    ) -> ListScoringFunctionsResponse:
-        return await impl.list_scoring_functions(request)
-
-    @router.get(
-        "/scoring-functions/{scoring_fn_id:path}",
-        response_model=ScoringFn,
-        summary="Get a scoring function by its ID.",
-        description="Get a scoring function by its ID.",
-        responses={
-            200: {"description": "A ScoringFn."},
-        },
-    )
-    async def get_scoring_function(
-        request: Annotated[GetScoringFunctionRequest, Depends(get_get_scoring_function_request)],
-    ) -> ScoringFn:
-        return await impl.get_scoring_function(request)
-
-    @router.post(
-        "/scoring-functions",
-        summary="Register a scoring function.",
-        description="Register a scoring function.",
-        responses={
-            200: {"description": "The scoring function was successfully registered."},
-        },
-        deprecated=True,
-    )
-    async def register_scoring_function(
-        request: Annotated[RegisterScoringFunctionRequest, Body(...)],
-    ) -> None:
-        return await impl.register_scoring_function(request)
-
-    @router.delete(
-        "/scoring-functions/{scoring_fn_id:path}",
-        summary="Unregister a scoring function.",
-        description="Unregister a scoring function.",
-        responses={
-            200: {"description": "The scoring function was successfully unregistered."},
-        },
-        deprecated=True,
-    )
-    async def unregister_scoring_function(
-        request: Annotated[UnregisterScoringFunctionRequest, Depends(get_unregister_scoring_function_request)],
-    ) -> None:
-        return await impl.unregister_scoring_function(request)
-
-    return router
diff --git a/src/llama_stack_api/scoring_functions/models.py b/src/llama_stack_api/scoring_functions/models.py
deleted file mode 100644
index b3874b287f..0000000000
--- a/src/llama_stack_api/scoring_functions/models.py
+++ /dev/null
@@ -1,218 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Pydantic models for ScoringFunctions API requests and responses.
-
-This module defines the request and response models for the ScoringFunctions API
-using Pydantic with Field descriptions for OpenAPI schema generation.
-"""
-
-from enum import StrEnum
-from typing import Annotated, Any, Literal
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.common.type_system import ParamType
-from llama_stack_api.resource import Resource, ResourceType
-from llama_stack_api.schema_utils import json_schema_type, register_schema
-
-
-@json_schema_type
-class ScoringFnParamsType(StrEnum):
-    """Types of scoring function parameter configurations.
-    :cvar llm_as_judge: Use an LLM model to evaluate and score responses
-    :cvar regex_parser: Use regex patterns to extract and score specific parts of responses
-    :cvar basic: Basic scoring with simple aggregation functions
-    """
-
-    llm_as_judge = "llm_as_judge"
-    regex_parser = "regex_parser"
-    basic = "basic"
-
-
-@json_schema_type
-class AggregationFunctionType(StrEnum):
-    """Types of aggregation functions for scoring results.
-    :cvar average: Calculate the arithmetic mean of scores
-    :cvar weighted_average: Calculate a weighted average of scores
-    :cvar median: Calculate the median value of scores
-    :cvar categorical_count: Count occurrences of categorical values
-    :cvar accuracy: Calculate accuracy as the proportion of correct answers
-    """
-
-    average = "average"
-    weighted_average = "weighted_average"
-    median = "median"
-    categorical_count = "categorical_count"
-    accuracy = "accuracy"
-
-
-@json_schema_type
-class LLMAsJudgeScoringFnParams(BaseModel):
-    """Parameters for LLM-as-judge scoring function configuration.
-    :param type: The type of scoring function parameters, always llm_as_judge
-    :param judge_model: Identifier of the LLM model to use as a judge for scoring
-    :param prompt_template: (Optional) Custom prompt template for the judge model
-    :param judge_score_regexes: Regexes to extract the answer from generated response
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.llm_as_judge] = ScoringFnParamsType.llm_as_judge
-    judge_model: str
-    prompt_template: str | None = None
-    judge_score_regexes: list[str] = Field(
-        description="Regexes to extract the answer from generated response",
-        default_factory=lambda: [],
-    )
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=lambda: [],
-    )
-
-
-@json_schema_type
-class RegexParserScoringFnParams(BaseModel):
-    """Parameters for regex parser scoring function configuration.
-    :param type: The type of scoring function parameters, always regex_parser
-    :param parsing_regexes: Regex to extract the answer from generated response
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.regex_parser] = ScoringFnParamsType.regex_parser
-    parsing_regexes: list[str] = Field(
-        description="Regex to extract the answer from generated response",
-        default_factory=lambda: [],
-    )
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=lambda: [],
-    )
-
-
-@json_schema_type
-class BasicScoringFnParams(BaseModel):
-    """Parameters for basic scoring function configuration.
-    :param type: The type of scoring function parameters, always basic
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.basic] = ScoringFnParamsType.basic
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
-    )
-
-
-ScoringFnParams = Annotated[
-    LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams,
-    Field(discriminator="type"),
-]
-register_schema(ScoringFnParams, name="ScoringFnParams")
-
-
-@json_schema_type
-class ListScoringFunctionsRequest(BaseModel):
-    """Request model for listing scoring functions."""
-
-    pass
-
-
-@json_schema_type
-class GetScoringFunctionRequest(BaseModel):
-    """Request model for getting a scoring function."""
-
-    scoring_fn_id: str = Field(..., description="The ID of the scoring function to get.")
-
-
-@json_schema_type
-class RegisterScoringFunctionRequest(BaseModel):
-    """Request model for registering a scoring function."""
-
-    scoring_fn_id: str = Field(..., description="The ID of the scoring function to register.")
-    description: str = Field(..., description="The description of the scoring function.")
-    return_type: ParamType = Field(..., description="The return type of the scoring function.")
-    provider_scoring_fn_id: str | None = Field(
-        default=None, description="The ID of the provider scoring function to use for the scoring function."
-    )
-    provider_id: str | None = Field(default=None, description="The ID of the provider to use for the scoring function.")
-    params: ScoringFnParams | None = Field(
-        default=None,
-        description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval.",
-    )
-
-
-@json_schema_type
-class UnregisterScoringFunctionRequest(BaseModel):
-    """Request model for unregistering a scoring function."""
-
-    scoring_fn_id: str = Field(..., description="The ID of the scoring function to unregister.")
-
-
-class CommonScoringFnFields(BaseModel):
-    """Common fields shared across scoring function creation and retrieval."""
-
-    description: str | None = None
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this definition",
-    )
-    return_type: ParamType = Field(
-        description="The return type of the deterministic function",
-    )
-    params: ScoringFnParams | None = Field(
-        description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval",
-        default=None,
-    )
-
-
-@json_schema_type
-class ScoringFn(CommonScoringFnFields, Resource):
-    """A scoring function resource for evaluating model outputs.
-    :param type: The resource type, always scoring_function
-    """
-
-    type: Literal[ResourceType.scoring_function] = ResourceType.scoring_function
-
-    @property
-    def scoring_fn_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_scoring_fn_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class ScoringFnInput(CommonScoringFnFields, BaseModel):
-    """Input model for registering a new scoring function."""
-
-    scoring_fn_id: str
-    provider_id: str | None = None
-    provider_scoring_fn_id: str | None = None
-
-
-@json_schema_type
-class ListScoringFunctionsResponse(BaseModel):
-    """Response containing a list of scoring function objects."""
-
-    data: list[ScoringFn] = Field(..., description="List of scoring function objects.")
-
-
-__all__ = [
-    "ScoringFnParamsType",
-    "AggregationFunctionType",
-    "LLMAsJudgeScoringFnParams",
-    "RegexParserScoringFnParams",
-    "BasicScoringFnParams",
-    "ScoringFnParams",
-    "ListScoringFunctionsRequest",
-    "GetScoringFunctionRequest",
-    "RegisterScoringFunctionRequest",
-    "UnregisterScoringFunctionRequest",
-    "CommonScoringFnFields",
-    "ScoringFn",
-    "ScoringFnInput",
-    "ListScoringFunctionsResponse",
-]
diff --git a/tests/backward_compat/test_eval_compat.py b/tests/backward_compat/test_eval_compat.py
deleted file mode 100644
index fa15045671..0000000000
--- a/tests/backward_compat/test_eval_compat.py
+++ /dev/null
@@ -1,533 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""
-Tests for Eval API backward compatibility.
-
-These tests verify that both old-style (individual parameters) and new-style
-(request objects) calling conventions work correctly, and that old-style usage
-emits appropriate deprecation warnings.
-"""
-
-import warnings
-
-import pytest
-
-from llama_stack_api import (
-    BenchmarkConfig,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    ModelCandidate,
-    RunEvalRequest,
-    resolve_evaluate_rows_request,
-    resolve_job_cancel_request,
-    resolve_job_result_request,
-    resolve_job_status_request,
-    resolve_run_eval_request,
-)
-from llama_stack_api.inference import SamplingParams, TopPSamplingStrategy
-
-
-@pytest.fixture
-def sample_benchmark_config():
-    return BenchmarkConfig(
-        eval_candidate=ModelCandidate(
-            model="test-model",
-            sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-        )
-    )
-
-
-class TestResolveRunEvalRequest:
-    """Tests for resolve_run_eval_request."""
-
-    def test_new_style_with_request_object(self, sample_benchmark_config):
-        """Test that new-style (request object) works without deprecation warning."""
-        request = RunEvalRequest(benchmark_id="bench-123", benchmark_config=sample_benchmark_config)
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_run_eval_request(request)
-
-            # No deprecation warning should be emitted
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "bench-123"
-        assert result.benchmark_config == sample_benchmark_config
-
-    def test_old_style_with_individual_params(self, sample_benchmark_config):
-        """Test that old-style (individual parameters) works and emits deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_run_eval_request(
-                benchmark_id="bench-123",
-                benchmark_config=sample_benchmark_config,
-            )
-
-            # Deprecation warning should be emitted
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 1
-            assert "run_eval" in str(deprecation_warnings[0].message)
-            assert "RunEvalRequest" in str(deprecation_warnings[0].message)
-
-        assert result.benchmark_id == "bench-123"
-        assert result.benchmark_config == sample_benchmark_config
-
-    def test_request_object_takes_precedence_over_individual_params(self, sample_benchmark_config):
-        """Test that request object takes precedence when both are provided."""
-        request = RunEvalRequest(benchmark_id="from-request", benchmark_config=sample_benchmark_config)
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_run_eval_request(
-                request,
-                benchmark_id="from-param",  # Should be ignored
-                benchmark_config=sample_benchmark_config,
-            )
-
-            # No deprecation warning since request object is used
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        # Request object values should be used
-        assert result.benchmark_id == "from-request"
-
-    def test_missing_parameters_raises_error(self, sample_benchmark_config):
-        """Test that missing parameters raises ValueError with helpful message."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request()
-        assert "Either 'request'" in str(exc_info.value)
-        assert "missing:" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request(benchmark_id="bench-123")  # missing benchmark_config
-        assert "missing: benchmark_config" in str(exc_info.value)
-        assert "provided: benchmark_id" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request(benchmark_config=sample_benchmark_config)  # missing benchmark_id
-        assert "missing: benchmark_id" in str(exc_info.value)
-        assert "provided: benchmark_config" in str(exc_info.value)
-
-
-class TestResolveEvaluateRowsRequest:
-    """Tests for resolve_evaluate_rows_request."""
-
-    def test_new_style_with_request_object(self, sample_benchmark_config):
-        """Test that new-style (request object) works without deprecation warning."""
-        request = EvaluateRowsRequest(
-            benchmark_id="bench-123",
-            input_rows=[{"test": "data"}],
-            scoring_functions=["func1"],
-            benchmark_config=sample_benchmark_config,
-        )
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_evaluate_rows_request(request)
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "bench-123"
-        assert result.input_rows == [{"test": "data"}]
-        assert result.scoring_functions == ["func1"]
-
-    def test_old_style_with_individual_params(self, sample_benchmark_config):
-        """Test that old-style (individual parameters) works and emits deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_evaluate_rows_request(
-                benchmark_id="bench-123",
-                input_rows=[{"test": "data"}],
-                scoring_functions=["func1"],
-                benchmark_config=sample_benchmark_config,
-            )
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 1
-            assert "evaluate_rows" in str(deprecation_warnings[0].message)
-            assert "EvaluateRowsRequest" in str(deprecation_warnings[0].message)
-
-        assert result.benchmark_id == "bench-123"
-        assert result.input_rows == [{"test": "data"}]
-        assert result.scoring_functions == ["func1"]
-
-    def test_request_object_takes_precedence_over_individual_params(self, sample_benchmark_config):
-        """Test that request object takes precedence when both are provided."""
-        request = EvaluateRowsRequest(
-            benchmark_id="from-request",
-            input_rows=[{"from": "request"}],
-            scoring_functions=["request-func"],
-            benchmark_config=sample_benchmark_config,
-        )
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_evaluate_rows_request(
-                request,
-                benchmark_id="from-param",
-                input_rows=[{"from": "param"}],
-                scoring_functions=["param-func"],
-                benchmark_config=sample_benchmark_config,
-            )
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "from-request"
-        assert result.input_rows == [{"from": "request"}]
-
-    def test_missing_parameters_raises_error(self, sample_benchmark_config):
-        """Test that missing parameters raises ValueError with helpful message."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request()
-        assert "missing:" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request(
-                benchmark_id="bench-123",
-                input_rows=[{"test": "data"}],
-                # missing scoring_functions and benchmark_config
-            )
-        assert "missing: scoring_functions, benchmark_config" in str(exc_info.value)
-
-
-class TestResolveJobStatusRequest:
-    """Tests for resolve_job_status_request."""
-
-    def test_new_style_with_request_object(self):
-        """Test that new-style (request object) works without deprecation warning."""
-        request = JobStatusRequest(benchmark_id="bench-123", job_id="job-456")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_status_request(request)
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_old_style_with_individual_params(self):
-        """Test that old-style (individual parameters) works and emits deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_status_request(benchmark_id="bench-123", job_id="job-456")
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 1
-            assert "job_status" in str(deprecation_warnings[0].message)
-            assert "JobStatusRequest" in str(deprecation_warnings[0].message)
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_request_object_takes_precedence_over_individual_params(self):
-        """Test that request object takes precedence when both are provided."""
-        request = JobStatusRequest(benchmark_id="from-request", job_id="job-from-request")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_status_request(
-                request,
-                benchmark_id="from-param",
-                job_id="job-from-param",
-            )
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "from-request"
-        assert result.job_id == "job-from-request"
-
-    def test_missing_parameters_raises_error(self):
-        """Test that missing parameters raises ValueError with helpful message."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_status_request()
-        assert "missing:" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_status_request(benchmark_id="bench-123")  # missing job_id
-        assert "missing: job_id" in str(exc_info.value)
-        assert "provided: benchmark_id" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_status_request(job_id="job-456")  # missing benchmark_id
-        assert "missing: benchmark_id" in str(exc_info.value)
-        assert "provided: job_id" in str(exc_info.value)
-
-
-class TestResolveJobCancelRequest:
-    """Tests for resolve_job_cancel_request."""
-
-    def test_new_style_with_request_object(self):
-        """Test that new-style (request object) works without deprecation warning."""
-        request = JobCancelRequest(benchmark_id="bench-123", job_id="job-456")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_cancel_request(request)
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_old_style_with_individual_params(self):
-        """Test that old-style (individual parameters) works and emits deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_cancel_request(benchmark_id="bench-123", job_id="job-456")
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 1
-            assert "job_cancel" in str(deprecation_warnings[0].message)
-            assert "JobCancelRequest" in str(deprecation_warnings[0].message)
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_request_object_takes_precedence_over_individual_params(self):
-        """Test that request object takes precedence when both are provided."""
-        request = JobCancelRequest(benchmark_id="from-request", job_id="job-from-request")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_cancel_request(
-                request,
-                benchmark_id="from-param",
-                job_id="job-from-param",
-            )
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "from-request"
-        assert result.job_id == "job-from-request"
-
-    def test_missing_parameters_raises_error(self):
-        """Test that missing parameters raises ValueError with helpful message."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_cancel_request()
-        assert "missing:" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_cancel_request(benchmark_id="bench-123")  # missing job_id
-        assert "missing: job_id" in str(exc_info.value)
-        assert "provided: benchmark_id" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_cancel_request(job_id="job-456")  # missing benchmark_id
-        assert "missing: benchmark_id" in str(exc_info.value)
-        assert "provided: job_id" in str(exc_info.value)
-
-
-class TestResolveJobResultRequest:
-    """Tests for resolve_job_result_request."""
-
-    def test_new_style_with_request_object(self):
-        """Test that new-style (request object) works without deprecation warning."""
-        request = JobResultRequest(benchmark_id="bench-123", job_id="job-456")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_result_request(request)
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_old_style_with_individual_params(self):
-        """Test that old-style (individual parameters) works and emits deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_result_request(benchmark_id="bench-123", job_id="job-456")
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 1
-            assert "job_result" in str(deprecation_warnings[0].message)
-            assert "JobResultRequest" in str(deprecation_warnings[0].message)
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_request_object_takes_precedence_over_individual_params(self):
-        """Test that request object takes precedence when both are provided."""
-        request = JobResultRequest(benchmark_id="from-request", job_id="job-from-request")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_result_request(
-                request,
-                benchmark_id="from-param",
-                job_id="job-from-param",
-            )
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "from-request"
-        assert result.job_id == "job-from-request"
-
-    def test_missing_parameters_raises_error(self):
-        """Test that missing parameters raises ValueError with helpful message."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_result_request()
-        assert "missing:" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_result_request(benchmark_id="bench-123")  # missing job_id
-        assert "missing: job_id" in str(exc_info.value)
-        assert "provided: benchmark_id" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_result_request(job_id="job-456")  # missing benchmark_id
-        assert "missing: benchmark_id" in str(exc_info.value)
-        assert "provided: job_id" in str(exc_info.value)
-
-
-class TestEmptyValueValidation:
-    """Tests for validation of None, empty strings, and empty lists."""
-
-    def test_empty_benchmark_id_old_style(self, sample_benchmark_config):
-        """Empty benchmark_id is rejected when using old-style parameters (treated as missing)."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request(benchmark_id="", benchmark_config=sample_benchmark_config)
-        # Empty string is falsy, so it's treated as missing
-        assert "benchmark_id" in str(exc_info.value)
-
-    def test_empty_benchmark_id_in_request_object(self, sample_benchmark_config):
-        """Empty benchmark_id in request object (via model_construct) is rejected."""
-        request = RunEvalRequest.model_construct(
-            benchmark_id="",
-            benchmark_config=sample_benchmark_config,
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request(request)
-        assert "benchmark_id" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
-
-    def test_none_benchmark_id_in_request_object(self, sample_benchmark_config):
-        """None benchmark_id in request object (via model_construct) is rejected."""
-        request = RunEvalRequest.model_construct(
-            benchmark_id=None,
-            benchmark_config=sample_benchmark_config,
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request(request)
-        assert "benchmark_id" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
-
-    @pytest.mark.parametrize(
-        "resolver,request_class",
-        [
-            (resolve_job_status_request, JobStatusRequest),
-            (resolve_job_cancel_request, JobCancelRequest),
-            (resolve_job_result_request, JobResultRequest),
-        ],
-    )
-    def test_empty_job_id_old_style(self, resolver, request_class):
-        """Empty job_id is rejected when using old-style parameters (treated as missing)."""
-        with pytest.raises(ValueError) as exc_info:
-            resolver(benchmark_id="bench-123", job_id="")
-        # Empty string is falsy, so it's treated as missing
-        assert "job_id" in str(exc_info.value)
-
-    @pytest.mark.parametrize(
-        "resolver,request_class",
-        [
-            (resolve_job_status_request, JobStatusRequest),
-            (resolve_job_cancel_request, JobCancelRequest),
-            (resolve_job_result_request, JobResultRequest),
-        ],
-    )
-    def test_empty_job_id_in_request_object(self, resolver, request_class):
-        """Empty job_id in request object (via model_construct) is rejected."""
-        request = request_class.model_construct(
-            benchmark_id="bench-123",
-            job_id="",
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolver(request)
-        assert "job_id" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
-
-    @pytest.mark.parametrize(
-        "resolver,request_class",
-        [
-            (resolve_job_status_request, JobStatusRequest),
-            (resolve_job_cancel_request, JobCancelRequest),
-            (resolve_job_result_request, JobResultRequest),
-        ],
-    )
-    def test_none_job_id_in_request_object(self, resolver, request_class):
-        """None job_id in request object (via model_construct) is rejected."""
-        request = request_class.model_construct(
-            benchmark_id="bench-123",
-            job_id=None,
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolver(request)
-        assert "job_id" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
-
-    def test_empty_input_rows_old_style(self, sample_benchmark_config):
-        """Empty input_rows is rejected when using old-style parameters (treated as missing)."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request(
-                benchmark_id="bench-123",
-                input_rows=[],
-                scoring_functions=["func1"],
-                benchmark_config=sample_benchmark_config,
-            )
-        # Empty list is falsy, so it's treated as missing
-        assert "input_rows" in str(exc_info.value)
-
-    def test_empty_scoring_functions_old_style(self, sample_benchmark_config):
-        """Empty scoring_functions is rejected when using old-style parameters (treated as missing)."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request(
-                benchmark_id="bench-123",
-                input_rows=[{"test": "data"}],
-                scoring_functions=[],
-                benchmark_config=sample_benchmark_config,
-            )
-        # Empty list is falsy, so it's treated as missing
-        assert "scoring_functions" in str(exc_info.value)
-
-    def test_empty_input_rows_in_request_object(self, sample_benchmark_config):
-        """Empty input_rows in request object (via model_construct) is rejected."""
-        request = EvaluateRowsRequest.model_construct(
-            benchmark_id="bench-123",
-            input_rows=[],
-            scoring_functions=["func1"],
-            benchmark_config=sample_benchmark_config,
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request(request)
-        assert "input_rows" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
-
-    def test_none_input_rows_in_request_object(self, sample_benchmark_config):
-        """None input_rows in request object (via model_construct) is rejected."""
-        request = EvaluateRowsRequest.model_construct(
-            benchmark_id="bench-123",
-            input_rows=None,
-            scoring_functions=["func1"],
-            benchmark_config=sample_benchmark_config,
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request(request)
-        assert "input_rows" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
diff --git a/tests/external/llama-stack-provider-lmeval/config.yaml b/tests/external/llama-stack-provider-lmeval/config.yaml
index 966689bae7..ea2fe77f40 100644
--- a/tests/external/llama-stack-provider-lmeval/config.yaml
+++ b/tests/external/llama-stack-provider-lmeval/config.yaml
@@ -2,19 +2,11 @@ version: 2
 distro_name: external-provider-test
 apis:
 - inference
-- eval
 providers:
   inference:
   - provider_id: ollama
     provider_type: remote::ollama
     config:
       base_url: ${env.OLLAMA_URL:=http://localhost:11434}
-  eval:
-  - provider_id: trustyai_lmeval
-    provider_type: remote::trustyai_lmeval
-    module: llama_stack_provider_lmeval
-    config:
-      use_k8s: ${env.TRUSTYAI_LMEVAL_USE_K8S:=false}
-      base_url: ${env.OLLAMA_URL:=http://localhost:11434}
 server:
   port: 8321
diff --git a/tests/integration/datasets/__init__.py b/tests/integration/datasets/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/tests/integration/datasets/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/tests/integration/datasets/test_dataset.csv b/tests/integration/datasets/test_dataset.csv
deleted file mode 100644
index 7fc1c3623a..0000000000
--- a/tests/integration/datasets/test_dataset.csv
+++ /dev/null
@@ -1,6 +0,0 @@
-input_query,generated_answer,expected_answer,chat_completion_input
-What is the capital of France?,London,Paris,"[{""role"": ""user"", ""content"": ""What is the capital of France?""}]"
-Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{""role"": ""user"", ""content"": ""Who is the CEO of Meta?""}]"
-What is the largest planet in our solar system?,Jupiter,Jupiter,"[{""role"": ""user"", ""content"": ""What is the largest planet in our solar system?""}]"
-What is the smallest country in the world?,China,Vatican City,"[{""role"": ""user"", ""content"": ""What is the smallest country in the world?""}]"
-What is the currency of Japan?,Yen,Yen,"[{""role"": ""user"", ""content"": ""What is the currency of Japan?""}]"
diff --git a/tests/integration/datasets/test_datasets.py b/tests/integration/datasets/test_datasets.py
deleted file mode 100644
index 3ad5570f07..0000000000
--- a/tests/integration/datasets/test_datasets.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-import base64
-import mimetypes
-import os
-
-import pytest
-
-# How to run this test:
-#
-# LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/datasets
-
-
-def data_url_from_file(file_path: str) -> str:
-    if not os.path.exists(file_path):
-        raise FileNotFoundError(f"File not found: {file_path}")
-
-    with open(file_path, "rb") as file:
-        file_content = file.read()
-
-    base64_content = base64.b64encode(file_content).decode("utf-8")
-    mime_type, _ = mimetypes.guess_type(file_path)
-
-    data_url = f"data:{mime_type};base64,{base64_content}"
-
-    return data_url
-
-
-@pytest.mark.parametrize(
-    "purpose, source, provider_id, limit",
-    [
-        (
-            "eval/messages-answer",
-            {
-                "type": "uri",
-                "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
-            },
-            "huggingface",
-            10,
-        ),
-        (
-            "eval/messages-answer",
-            {
-                "type": "rows",
-                "rows": [
-                    {
-                        "messages": [{"role": "user", "content": "Hello, world!"}],
-                        "answer": "Hello, world!",
-                    },
-                    {
-                        "messages": [
-                            {
-                                "role": "user",
-                                "content": "What is the capital of France?",
-                            }
-                        ],
-                        "answer": "Paris",
-                    },
-                ],
-            },
-            "localfs",
-            2,
-        ),
-        (
-            "eval/messages-answer",
-            {
-                "type": "uri",
-                "uri": data_url_from_file(os.path.join(os.path.dirname(__file__), "test_dataset.csv")),
-            },
-            "localfs",
-            5,
-        ),
-    ],
-)
-def test_register_and_iterrows(llama_stack_client, purpose, source, provider_id, limit):
-    dataset = llama_stack_client.beta.datasets.register(
-        purpose=purpose,
-        source=source,
-    )
-    assert dataset.identifier is not None
-    assert dataset.provider_id == provider_id
-    iterrow_response = llama_stack_client.beta.datasets.iterrows(dataset.identifier, limit=limit)
-    assert len(iterrow_response.data) == limit
-
-    dataset_list = llama_stack_client.beta.datasets.list()
-    assert dataset.identifier in [d.identifier for d in dataset_list]
-
-    llama_stack_client.beta.datasets.unregister(dataset.identifier)
-    dataset_list = llama_stack_client.beta.datasets.list()
-    assert dataset.identifier not in [d.identifier for d in dataset_list]
diff --git a/tests/integration/datasets/test_rag_dataset.csv b/tests/integration/datasets/test_rag_dataset.csv
deleted file mode 100644
index a0e1fce72b..0000000000
--- a/tests/integration/datasets/test_rag_dataset.csv
+++ /dev/null
@@ -1,6 +0,0 @@
-input_query,context,generated_answer,expected_answer
-What is the capital of France?,"France is a country in Western Europe with a population of about 67 million people. Its capital city has been a major European cultural center since the 17th century and is known for landmarks like the Eiffel Tower and the Louvre Museum.",London,Paris
-Who is the CEO of Meta?,"Meta Platforms, formerly known as Facebook, is one of the world's largest technology companies. Founded by Mark Zuckerberg in 2004, the company has expanded to include platforms like Instagram, WhatsApp, and virtual reality technologies.",Mark Zuckerberg,Mark Zuckerberg
-What is the largest planet in our solar system?,"The solar system consists of eight planets orbiting around the Sun. These planets, in order from the Sun, are Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Gas giants are significantly larger than terrestrial planets.",Jupiter,Jupiter
-What is the smallest country in the world?,"Independent city-states and micronations are among the world's smallest sovereign territories. Some notable examples include Monaco, San Marino, and Vatican City, which is an enclave within Rome, Italy.",China,Vatican City
-What is the currency of Japan?,"Japan is an island country in East Asia with a rich cultural heritage and one of the world's largest economies. Its financial system has been established since the Meiji period, with its modern currency being introduced in 1871.",Yen,Yen
diff --git a/tests/integration/eval/__init__.py b/tests/integration/eval/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/tests/integration/eval/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/tests/integration/eval/constants.py b/tests/integration/eval/constants.py
deleted file mode 100644
index 0fb1a44c49..0000000000
--- a/tests/integration/eval/constants.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-JUDGE_PROMPT = """
-You will be given a question, a expected_answer, and a system_answer.
-Your task is to provide a 'total rating' scoring how well the system_answer answers compared with ground truth in expected_answer in terms of factual correctness to the question.
-Give your answer as a integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question.
-Provide your feedback as follows:
-Feedback:::
-Total rating: (your rating, as a int between 0 and 5)
-Now here are the question, expected_answer, system_answer.
-Question: {input_query}
-Expected Answer: {expected_answer}
-System Answer: {generated_answer}
-Feedback:::
-Total rating:
-"""
diff --git a/tests/integration/eval/recordings/0a2ea52bcc4c7e04d0b4b844ad94bed06bcbaa03d13b228f61e2b36e23093469.json b/tests/integration/eval/recordings/0a2ea52bcc4c7e04d0b4b844ad94bed06bcbaa03d13b228f61e2b36e23093469.json
deleted file mode 100644
index e9e69b231d..0000000000
--- a/tests/integration/eval/recordings/0a2ea52bcc4c7e04d0b4b844ad94bed06bcbaa03d13b228f61e2b36e23093469.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the largest planet in our solar system?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-0a2ea52bcc4c",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The largest planet in our solar system is Jupiter. It has a diameter of approximately 142,984 kilometers (88,846 miles), which is more than 11 times the diameter of Earth and about 2.5 times the mass of all the other planets in our solar system combined.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 59,
-          "prompt_tokens": 35,
-          "total_tokens": 94,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/171c4dcb3dc848196f5d7fd87efd4626e70673c405ae1cd72b8dd0617104263e.json b/tests/integration/eval/recordings/171c4dcb3dc848196f5d7fd87efd4626e70673c405ae1cd72b8dd0617104263e.json
deleted file mode 100644
index 3ef9e1e026..0000000000
--- a/tests/integration/eval/recordings/171c4dcb3dc848196f5d7fd87efd4626e70673c405ae1cd72b8dd0617104263e.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "Who is the CEO of Meta?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-171c4dcb3dc8",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 24,
-          "prompt_tokens": 32,
-          "total_tokens": 56,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/1b2720589d2a4273b5eb2c06b50ab45674040195c15013c9ea43bc6331e1a831.json b/tests/integration/eval/recordings/1b2720589d2a4273b5eb2c06b50ab45674040195c15013c9ea43bc6331e1a831.json
deleted file mode 100644
index 86c415a4d1..0000000000
--- a/tests/integration/eval/recordings/1b2720589d2a4273b5eb2c06b50ab45674040195c15013c9ea43bc6331e1a831.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the currency of Japan?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-1b2720589d2a",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The currency of Japan is the Japanese yen (\u00a5).",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 12,
-          "prompt_tokens": 32,
-          "total_tokens": 44,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/3e5ea35cb3dc92835d230456b6e2fc61593f964148d6c05df5c4a387a5389e6b.json b/tests/integration/eval/recordings/3e5ea35cb3dc92835d230456b6e2fc61593f964148d6c05df5c4a387a5389e6b.json
deleted file mode 100644
index c7c8576425..0000000000
--- a/tests/integration/eval/recordings/3e5ea35cb3dc92835d230456b6e2fc61593f964148d6c05df5c4a387a5389e6b.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the smallest country in the world?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-3e5ea35cb3dc",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The smallest country in the world is the Vatican City, with an area of approximately 0.44 km\u00b2 (0.17 square miles). It is an independent city-state located within Rome, Italy, and serves as the headquarters of the Catholic Church. The Vatican City has a population of around 800 people and is home to numerous iconic landmarks, including St. Peter's Basilica and the Sistine Chapel.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 84,
-          "prompt_tokens": 34,
-          "total_tokens": 118,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/58177cd1c0d7d8de9e20515c3e8fe465b66d8436754b30ff4da28c7c03c094a4.json b/tests/integration/eval/recordings/58177cd1c0d7d8de9e20515c3e8fe465b66d8436754b30ff4da28c7c03c094a4.json
deleted file mode 100644
index df2f664e76..0000000000
--- a/tests/integration/eval/recordings/58177cd1c0d7d8de9e20515c3e8fe465b66d8436754b30ff4da28c7c03c094a4.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "Who is the CEO of Meta?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-58177cd1c0d7",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 24,
-          "prompt_tokens": 32,
-          "total_tokens": 56,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/6de6d1ebc3128dfaba1efe654ca1453f12cd31ce2e294d20868c0c498b7d9136.json b/tests/integration/eval/recordings/6de6d1ebc3128dfaba1efe654ca1453f12cd31ce2e294d20868c0c498b7d9136.json
deleted file mode 100644
index 5fadb9186d..0000000000
--- a/tests/integration/eval/recordings/6de6d1ebc3128dfaba1efe654ca1453f12cd31ce2e294d20868c0c498b7d9136.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the capital of France?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-6de6d1ebc312",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The capital of France is Paris.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 8,
-          "prompt_tokens": 32,
-          "total_tokens": 40,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/752abf1ef7f71bbe7028eae85814e6d567d1e8243e9b0d18f8803cb9b7c8f92f.json b/tests/integration/eval/recordings/752abf1ef7f71bbe7028eae85814e6d567d1e8243e9b0d18f8803cb9b7c8f92f.json
deleted file mode 100644
index a9affde521..0000000000
--- a/tests/integration/eval/recordings/752abf1ef7f71bbe7028eae85814e6d567d1e8243e9b0d18f8803cb9b7c8f92f.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the capital of France?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-752abf1ef7f7",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The capital of France is Paris.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 8,
-          "prompt_tokens": 32,
-          "total_tokens": 40,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/94e3d8dba56da92e1014a6ee81b61fe8e95d98692c189e7308724338f918678d.json b/tests/integration/eval/recordings/94e3d8dba56da92e1014a6ee81b61fe8e95d98692c189e7308724338f918678d.json
deleted file mode 100644
index ae2fe160cb..0000000000
--- a/tests/integration/eval/recordings/94e3d8dba56da92e1014a6ee81b61fe8e95d98692c189e7308724338f918678d.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the capital of France?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-94e3d8dba56d",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The capital of France is Paris.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 8,
-          "prompt_tokens": 32,
-          "total_tokens": 40,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/9ebe1e04fc3a8d41f88992428a7c99669c7e19b3d551090eb6bec83b33de2a18.json b/tests/integration/eval/recordings/9ebe1e04fc3a8d41f88992428a7c99669c7e19b3d551090eb6bec83b33de2a18.json
deleted file mode 100644
index cc55f2a777..0000000000
--- a/tests/integration/eval/recordings/9ebe1e04fc3a8d41f88992428a7c99669c7e19b3d551090eb6bec83b33de2a18.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "Who is the CEO of Meta?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-9ebe1e04fc3a",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 24,
-          "prompt_tokens": 32,
-          "total_tokens": 56,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/aa20023c358a0dc718355082cc244a231426700a772b8dc64abf05d8b126a736.json b/tests/integration/eval/recordings/aa20023c358a0dc718355082cc244a231426700a772b8dc64abf05d8b126a736.json
deleted file mode 100644
index 56746ef9e4..0000000000
--- a/tests/integration/eval/recordings/aa20023c358a0dc718355082cc244a231426700a772b8dc64abf05d8b126a736.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the largest planet in our solar system?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-aa20023c358a",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The largest planet in our solar system is Jupiter. It has a diameter of approximately 142,984 kilometers (88,846 miles), which is more than 11 times the diameter of the Earth! Jupiter is a gas giant and is known for its massive size, stormy atmosphere, and numerous moons.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 62,
-          "prompt_tokens": 35,
-          "total_tokens": 97,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/b52a054b314c8b42634c4a9ef76280591f73cf26c00b7308dde7d19a1ced016c.json b/tests/integration/eval/recordings/b52a054b314c8b42634c4a9ef76280591f73cf26c00b7308dde7d19a1ced016c.json
deleted file mode 100644
index f6290abcd8..0000000000
--- a/tests/integration/eval/recordings/b52a054b314c8b42634c4a9ef76280591f73cf26c00b7308dde7d19a1ced016c.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the capital of France?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-b52a054b314c",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The capital of France is Paris.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 8,
-          "prompt_tokens": 32,
-          "total_tokens": 40,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/bf6b37511a044df8ad1c6113d3936b1e7f4a9d4f7f8ba8bd844d50265067f417.json b/tests/integration/eval/recordings/bf6b37511a044df8ad1c6113d3936b1e7f4a9d4f7f8ba8bd844d50265067f417.json
deleted file mode 100644
index 1a95b7cadd..0000000000
--- a/tests/integration/eval/recordings/bf6b37511a044df8ad1c6113d3936b1e7f4a9d4f7f8ba8bd844d50265067f417.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the smallest country in the world?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-bf6b37511a04",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The smallest country in the world is the Vatican City, with an area of approximately 0.44 km\u00b2 (0.17 sq mi). It is an independent city-state located within Rome, Italy, and is home to the Pope and the central government of the Catholic Church.\n\nVatican City is so small that it's actually the smallest internationally recognized sovereign state in the world, with a population of just over 800 people. Despite its tiny size, it has its own government, currency, postal system, and even its own branch of the military, known as the Pontifical Swiss Guard.\n\nInterestingly, Vatican City is also home to numerous famous landmarks, including St. Peter's Basilica, the Sistine Chapel, and the Vatican Museums, which attract millions of visitors each year.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 160,
-          "prompt_tokens": 34,
-          "total_tokens": 194,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/c07b01fe99467efcfa99f6ac9c60acc212cf2ac3bdd4192aabb5f98359236572.json b/tests/integration/eval/recordings/c07b01fe99467efcfa99f6ac9c60acc212cf2ac3bdd4192aabb5f98359236572.json
deleted file mode 100644
index 532a7d6b3f..0000000000
--- a/tests/integration/eval/recordings/c07b01fe99467efcfa99f6ac9c60acc212cf2ac3bdd4192aabb5f98359236572.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the largest planet in our solar system?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-c07b01fe9946",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "Jupiter is the largest planet in our solar system, with a diameter of approximately 142,984 kilometers (88,846 miles). It is a gas giant and composed mainly of hydrogen and helium. Jupiter's large size and mass are more than 2.5 times that of all the other planets in our solar system combined.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 67,
-          "prompt_tokens": 35,
-          "total_tokens": 102,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/c4ef767672c890e77ceaa15b6239e9d5a9a5ad6ee7bcad0b12853979b1e43ede.json b/tests/integration/eval/recordings/c4ef767672c890e77ceaa15b6239e9d5a9a5ad6ee7bcad0b12853979b1e43ede.json
deleted file mode 100644
index 0663e23c22..0000000000
--- a/tests/integration/eval/recordings/c4ef767672c890e77ceaa15b6239e9d5a9a5ad6ee7bcad0b12853979b1e43ede.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the currency of Japan?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-c4ef767672c8",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The currency of Japan is the Yen (JPY). It is divided into 100 sen, but the sen is no longer in circulation. The yen is widely accepted and used for most transactions in Japan.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 42,
-          "prompt_tokens": 32,
-          "total_tokens": 74,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/cbf92825593fd79fe76e0ad0193ebe742743cd3042654efefa86714e357b86f6.json b/tests/integration/eval/recordings/cbf92825593fd79fe76e0ad0193ebe742743cd3042654efefa86714e357b86f6.json
deleted file mode 100644
index ace935a78b..0000000000
--- a/tests/integration/eval/recordings/cbf92825593fd79fe76e0ad0193ebe742743cd3042654efefa86714e357b86f6.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the largest planet in our solar system?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-cbf92825593f",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The largest planet in our solar system is Jupiter. It is a gas giant, with a diameter of approximately 142,984 kilometers (88,846 miles). This is more than 11 times the diameter of Earth and is the fifth-largest object in the solar system overall. Despite its large size, Jupiter is relatively lightweight compared to solid objects, due to its composition of mostly hydrogen and helium gases.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 81,
-          "prompt_tokens": 35,
-          "total_tokens": 116,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/dcf3c9afad420e66c3cc7434a48169a1235798c2b3ad9abbb29acf1f1b2952fa.json b/tests/integration/eval/recordings/dcf3c9afad420e66c3cc7434a48169a1235798c2b3ad9abbb29acf1f1b2952fa.json
deleted file mode 100644
index 92d07571b2..0000000000
--- a/tests/integration/eval/recordings/dcf3c9afad420e66c3cc7434a48169a1235798c2b3ad9abbb29acf1f1b2952fa.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "Who is the CEO of Meta?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-dcf3c9afad42",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 24,
-          "prompt_tokens": 32,
-          "total_tokens": 56,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/test_eval.py b/tests/integration/eval/test_eval.py
deleted file mode 100644
index e042008dd2..0000000000
--- a/tests/integration/eval/test_eval.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import uuid
-from pathlib import Path
-
-import pytest
-
-from ..datasets.test_datasets import data_url_from_file
-
-# How to run this test:
-#
-# LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/eval
-
-
-@pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
-def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
-    dataset = llama_stack_client.beta.datasets.register(
-        purpose="eval/messages-answer",
-        source={
-            "type": "uri",
-            "uri": data_url_from_file(Path(__file__).parent.parent / "datasets" / "test_dataset.csv"),
-        },
-    )
-    response = llama_stack_client.beta.datasets.list()
-    assert any(x.identifier == dataset.identifier for x in response)
-
-    rows = llama_stack_client.beta.datasets.iterrows(
-        dataset_id=dataset.identifier,
-        limit=3,
-    )
-    assert len(rows.data) == 3
-
-    scoring_functions = [
-        scoring_fn_id,
-    ]
-    benchmark_id = str(uuid.uuid4())
-    llama_stack_client.alpha.benchmarks.register(
-        benchmark_id=benchmark_id,
-        dataset_id=dataset.identifier,
-        scoring_functions=scoring_functions,
-    )
-    list_benchmarks = llama_stack_client.alpha.benchmarks.list()
-    assert any(x.identifier == benchmark_id for x in list_benchmarks)
-
-    response = llama_stack_client.alpha.eval.evaluate_rows(
-        benchmark_id=benchmark_id,
-        input_rows=rows.data,
-        scoring_functions=scoring_functions,
-        benchmark_config={
-            "eval_candidate": {
-                "type": "model",
-                "model": text_model_id,
-                "sampling_params": {
-                    "temperature": 0.0,
-                    "max_tokens": 512,
-                },
-            },
-        },
-    )
-
-    assert len(response.generations) == 3
-    assert scoring_fn_id in response.scores
-
-
-@pytest.mark.parametrize("scoring_fn_id", ["basic::subset_of"])
-def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
-    dataset = llama_stack_client.beta.datasets.register(
-        purpose="eval/messages-answer",
-        source={
-            "type": "uri",
-            "uri": data_url_from_file(Path(__file__).parent.parent / "datasets" / "test_dataset.csv"),
-        },
-    )
-    benchmark_id = str(uuid.uuid4())
-    llama_stack_client.alpha.benchmarks.register(
-        benchmark_id=benchmark_id,
-        dataset_id=dataset.identifier,
-        scoring_functions=[scoring_fn_id],
-    )
-
-    response = llama_stack_client.alpha.eval.run_eval(
-        benchmark_id=benchmark_id,
-        benchmark_config={
-            "eval_candidate": {
-                "type": "model",
-                "model": text_model_id,
-                "sampling_params": {
-                    "temperature": 0.0,
-                    "max_tokens": 512,
-                },
-            },
-        },
-    )
-    assert response.job_id == "0"
-    job_status = llama_stack_client.alpha.eval.jobs.status(job_id=response.job_id, benchmark_id=benchmark_id)
-    assert job_status and job_status.status == "completed"
-
-    eval_response = llama_stack_client.alpha.eval.jobs.retrieve(job_id=response.job_id, benchmark_id=benchmark_id)
-    assert eval_response is not None
-    assert len(eval_response.generations) == 5
-    assert scoring_fn_id in eval_response.scores
diff --git a/tests/integration/scoring/__init__.py b/tests/integration/scoring/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/tests/integration/scoring/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py
deleted file mode 100644
index 8ca11fc897..0000000000
--- a/tests/integration/scoring/test_scoring.py
+++ /dev/null
@@ -1,251 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from pathlib import Path
-
-import pandas as pd
-import pytest
-import requests
-
-
-@pytest.fixture
-def sample_judge_prompt_template():
-    return "Output a number response in the following format: Score: <number>, where <number> is the number between 0 and 9."
-
-
-@pytest.fixture
-def sample_scoring_fn_id():
-    return "llm-as-judge-test-prompt"
-
-
-def register_scoring_function(
-    llama_stack_client,
-    provider_id,
-    scoring_fn_id,
-    judge_model_id,
-    judge_prompt_template,
-):
-    llama_stack_client.scoring_functions.register(
-        scoring_fn_id=scoring_fn_id,
-        provider_id=provider_id,
-        description="LLM as judge scoring function with test prompt",
-        return_type={
-            "type": "string",
-        },
-        params={
-            "type": "llm_as_judge",
-            "judge_model": judge_model_id,
-            "prompt_template": judge_prompt_template,
-        },
-    )
-
-
-def unregister_scoring_function(llama_stack_client, scoring_fn_id):
-    try:
-        base_url = llama_stack_client.base_url
-    except AttributeError:
-        pytest.skip("No server base_url available; cannot test HTTP unregister in library mode")
-
-    resp = requests.delete(f"{base_url}/v1/scoring-functions/{scoring_fn_id}", timeout=30)
-    assert resp.status_code in (200, 204)
-
-
-def test_scoring_functions_list(llama_stack_client):
-    response = llama_stack_client.scoring_functions.list()
-    assert isinstance(response, list)
-    assert len(response) > 0
-
-
-def test_scoring_functions_register(
-    llama_stack_client,
-    sample_scoring_fn_id,
-    judge_model_id,
-    sample_judge_prompt_template,
-):
-    llm_as_judge_provider = [
-        x
-        for x in llama_stack_client.providers.list()
-        if x.api == "scoring" and x.provider_type == "inline::llm-as-judge"
-    ]
-    if len(llm_as_judge_provider) == 0:
-        pytest.skip("No llm-as-judge provider found, cannot test registeration")
-
-    llm_as_judge_provider_id = llm_as_judge_provider[0].provider_id
-    register_scoring_function(
-        llama_stack_client,
-        llm_as_judge_provider_id,
-        sample_scoring_fn_id,
-        judge_model_id,
-        sample_judge_prompt_template,
-    )
-
-    list_response = llama_stack_client.scoring_functions.list()
-    assert isinstance(list_response, list)
-    assert len(list_response) > 0
-    assert any(x.identifier == sample_scoring_fn_id for x in list_response)
-
-
-def test_scoring_functions_unregister(
-    llama_stack_client,
-    sample_scoring_fn_id,
-    judge_model_id,
-    sample_judge_prompt_template,
-):
-    llm_as_judge_provider = [
-        x
-        for x in llama_stack_client.providers.list()
-        if x.api == "scoring" and x.provider_type == "inline::llm-as-judge"
-    ]
-    if len(llm_as_judge_provider) == 0:
-        pytest.skip("No llm-as-judge provider found, cannot test unregister")
-
-    llm_as_judge_provider_id = llm_as_judge_provider[0].provider_id
-
-    # Ensure a clean state: shared server runs can keep a prior registration, and
-    # re-registering the same identifier would fail with a 400.
-    unregister_scoring_function(llama_stack_client, sample_scoring_fn_id)
-
-    # Register first
-    register_scoring_function(
-        llama_stack_client,
-        llm_as_judge_provider_id,
-        sample_scoring_fn_id,
-        judge_model_id,
-        sample_judge_prompt_template,
-    )
-
-    # Ensure it is present
-    list_response = llama_stack_client.scoring_functions.list()
-    assert any(x.identifier == sample_scoring_fn_id for x in list_response)
-
-    # Unregister scoring fn
-    try:
-        base_url = llama_stack_client.base_url
-    except AttributeError:
-        pytest.skip("No server base_url available; cannot test HTTP unregister in library mode")
-
-    resp = requests.delete(f"{base_url}/v1/scoring-functions/{sample_scoring_fn_id}", timeout=30)
-    assert resp.status_code in (200, 204)
-    list_after = llama_stack_client.scoring_functions.list()
-    assert all(x.identifier != sample_scoring_fn_id for x in list_after)
-
-
-@pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
-def test_scoring_score(llama_stack_client, scoring_fn_id):
-    # scoring individual rows
-    df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv")
-    rows = df.to_dict(orient="records")
-
-    scoring_functions = {
-        scoring_fn_id: None,
-    }
-
-    response = llama_stack_client.scoring.score(
-        input_rows=rows,
-        scoring_functions=scoring_functions,
-    )
-    assert len(response.results) == len(scoring_functions)
-    for x in scoring_functions:
-        assert x in response.results
-        assert len(response.results[x].score_rows) == len(rows)
-
-
-def test_scoring_score_with_params_llm_as_judge(
-    llama_stack_client,
-    sample_judge_prompt_template,
-    judge_model_id,
-):
-    # scoring individual rows
-    df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv")
-    rows = df.to_dict(orient="records")
-
-    scoring_functions = {
-        "llm-as-judge::base": dict(
-            type="llm_as_judge",
-            judge_model=judge_model_id,
-            prompt_template=sample_judge_prompt_template,
-            judge_score_regexes=[r"Score: (\d+)"],
-            aggregation_functions=[
-                "categorical_count",
-            ],
-        )
-    }
-
-    response = llama_stack_client.scoring.score(
-        input_rows=rows,
-        scoring_functions=scoring_functions,
-    )
-    assert len(response.results) == len(scoring_functions)
-    for x in scoring_functions:
-        assert x in response.results
-        assert len(response.results[x].score_rows) == len(rows)
-
-
-@pytest.mark.parametrize(
-    "provider_id",
-    [
-        "basic",
-        "llm-as-judge",
-        "braintrust",
-    ],
-)
-def test_scoring_score_with_aggregation_functions(
-    llama_stack_client,
-    sample_judge_prompt_template,
-    judge_model_id,
-    provider_id,
-):
-    df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv")
-    rows = df.to_dict(orient="records")
-
-    scoring_fns_list = [x for x in llama_stack_client.scoring_functions.list() if x.provider_id == provider_id]
-    if len(scoring_fns_list) == 0:
-        pytest.skip(f"No scoring functions found for provider {provider_id}, skipping")
-
-    scoring_functions = {}
-    aggr_fns = [
-        "accuracy",
-        "median",
-        "categorical_count",
-        "average",
-    ]
-
-    scoring_fn = scoring_fns_list[0]
-    if scoring_fn.provider_id == "llm-as-judge":
-        aggr_fns = ["categorical_count"]
-        scoring_functions[scoring_fn.identifier] = dict(
-            type="llm_as_judge",
-            judge_model=judge_model_id,
-            prompt_template=sample_judge_prompt_template,
-            judge_score_regexes=[r"Score: (\d+)"],
-            aggregation_functions=aggr_fns,
-        )
-    elif scoring_fn.provider_id == "basic" or scoring_fn.provider_id == "braintrust":
-        if "regex_parser" in scoring_fn.identifier:
-            scoring_functions[scoring_fn.identifier] = dict(
-                type="regex_parser",
-                parsing_regexes=[r"Score: (\d+)"],
-                aggregation_functions=aggr_fns,
-            )
-        else:
-            scoring_functions[scoring_fn.identifier] = dict(
-                type="basic",
-                aggregation_functions=aggr_fns,
-            )
-    else:
-        scoring_functions[scoring_fn.identifier] = None
-
-    response = llama_stack_client.scoring.score(
-        input_rows=rows,
-        scoring_functions=scoring_functions,
-    )
-
-    assert len(response.results) == len(scoring_functions)
-    for x in scoring_functions:
-        assert x in response.results
-        assert len(response.results[x].score_rows) == len(rows)
-        assert len(response.results[x].aggregated_results) == len(aggr_fns)
diff --git a/tests/unit/distribution/routers/test_routing_tables.py b/tests/unit/distribution/routers/test_routing_tables.py
index 6f2603965d..bbf5a9aecc 100644
--- a/tests/unit/distribution/routers/test_routing_tables.py
+++ b/tests/unit/distribution/routers/test_routing_tables.py
@@ -11,38 +11,23 @@
 import pytest
 
 from llama_stack.core.datatypes import RegistryEntrySource
-from llama_stack.core.routing_tables.benchmarks import BenchmarksRoutingTable
-from llama_stack.core.routing_tables.datasets import DatasetsRoutingTable
 from llama_stack.core.routing_tables.models import ModelsRoutingTable
-from llama_stack.core.routing_tables.scoring_functions import ScoringFunctionsRoutingTable
 from llama_stack.core.routing_tables.shields import ShieldsRoutingTable
 from llama_stack.core.routing_tables.toolgroups import ToolGroupsRoutingTable
 from llama_stack_api import (
     URL,
     Api,
-    Dataset,
-    DatasetPurpose,
-    GetBenchmarkRequest,
     GetShieldRequest,
-    ListBenchmarksRequest,
     ListToolDefsResponse,
     ListToolsRequest,
     Model,
     ModelNotFoundError,
     ModelType,
-    NumberType,
-    RegisterBenchmarkRequest,
     RegisterShieldRequest,
     Shield,
     ToolDef,
     ToolGroup,
-    UnregisterBenchmarkRequest,
     UnregisterShieldRequest,
-    URIDataSource,
-)
-from llama_stack_api.datasets import (
-    RegisterDatasetRequest,
-    UnregisterDatasetRequest,
 )
 
 
@@ -103,42 +88,6 @@ async def unregister_shield(self, shield_id: str):
         return shield_id
 
 
-class DatasetsImpl(Impl):
-    def __init__(self):
-        super().__init__(Api.datasetio)
-
-    async def register_dataset(self, dataset: Dataset):
-        return dataset
-
-    async def unregister_dataset(self, dataset_id: str):
-        return dataset_id
-
-
-class ScoringFunctionsImpl(Impl):
-    def __init__(self):
-        super().__init__(Api.scoring)
-
-    async def list_scoring_functions(self):
-        return []
-
-    async def register_scoring_function(self, scoring_fn):
-        return scoring_fn
-
-    async def unregister_scoring_function(self, scoring_fn_id: str):
-        return scoring_fn_id
-
-
-class BenchmarksImpl(Impl):
-    def __init__(self):
-        super().__init__(Api.eval)
-
-    async def register_benchmark(self, benchmark):
-        return benchmark
-
-    async def unregister_benchmark(self, benchmark_id: str):
-        return benchmark_id
-
-
 class ToolGroupsImpl(Impl):
     def __init__(self):
         super().__init__(Api.tool_runtime)
@@ -264,83 +213,6 @@ async def test_shields_routing_table(cached_disk_dist_registry):
         await table.unregister_shield(UnregisterShieldRequest(identifier="non-existent"))
 
 
-async def test_datasets_routing_table(cached_disk_dist_registry):
-    table = DatasetsRoutingTable({"localfs": DatasetsImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    # Register multiple datasets and verify listing
-    await table.register_dataset(
-        RegisterDatasetRequest(
-            dataset_id="test-dataset",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(uri="test-uri"),
-        )
-    )
-    await table.register_dataset(
-        RegisterDatasetRequest(
-            dataset_id="test-dataset-2",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(uri="test-uri-2"),
-        )
-    )
-    datasets = await table.list_datasets()
-
-    assert len(datasets.data) == 2
-    dataset_ids = {d.identifier for d in datasets.data}
-    assert "test-dataset" in dataset_ids
-    assert "test-dataset-2" in dataset_ids
-
-    await table.unregister_dataset(UnregisterDatasetRequest(dataset_id="test-dataset"))
-    await table.unregister_dataset(UnregisterDatasetRequest(dataset_id="test-dataset-2"))
-
-    datasets = await table.list_datasets()
-    assert len(datasets.data) == 0
-
-
-async def test_scoring_functions_routing_table(cached_disk_dist_registry):
-    table = ScoringFunctionsRoutingTable({"test_provider": ScoringFunctionsImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    # Register multiple scoring functions and verify listing
-    from llama_stack_api import (
-        ListScoringFunctionsRequest,
-        RegisterScoringFunctionRequest,
-        UnregisterScoringFunctionRequest,
-    )
-
-    await table.register_scoring_function(
-        RegisterScoringFunctionRequest(
-            scoring_fn_id="test-scoring-fn",
-            provider_id="test_provider",
-            description="Test scoring function",
-            return_type=NumberType(),
-        )
-    )
-    await table.register_scoring_function(
-        RegisterScoringFunctionRequest(
-            scoring_fn_id="test-scoring-fn-2",
-            provider_id="test_provider",
-            description="Another test scoring function",
-            return_type=NumberType(),
-        )
-    )
-    scoring_functions = await table.list_scoring_functions(ListScoringFunctionsRequest())
-
-    assert len(scoring_functions.data) == 2
-    scoring_fn_ids = {fn.identifier for fn in scoring_functions.data}
-    assert "test-scoring-fn" in scoring_fn_ids
-    assert "test-scoring-fn-2" in scoring_fn_ids
-
-    # Unregister scoring functions and verify listing
-    for i in range(len(scoring_functions.data)):
-        await table.unregister_scoring_function(
-            UnregisterScoringFunctionRequest(scoring_fn_id=scoring_functions.data[i].scoring_fn_id)
-        )
-
-    scoring_functions_list_after_deletion = await table.list_scoring_functions(ListScoringFunctionsRequest())
-    assert len(scoring_functions_list_after_deletion.data) == 0
-
-
 async def test_double_registration_models_positive(cached_disk_dist_registry):
     """Test that registering the same model twice with identical data succeeds."""
     table = ModelsRoutingTable({"test_provider": InferenceImpl()}, cached_disk_dist_registry, {})
@@ -373,68 +245,6 @@ async def test_double_registration_models_negative(cached_disk_dist_registry):
         )
 
 
-async def test_double_registration_scoring_functions_positive(cached_disk_dist_registry):
-    """Test that registering the same scoring function twice with identical data succeeds."""
-    from llama_stack_api import ListScoringFunctionsRequest, RegisterScoringFunctionRequest
-
-    table = ScoringFunctionsRoutingTable({"test_provider": ScoringFunctionsImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    # Register a scoring function
-    await table.register_scoring_function(
-        RegisterScoringFunctionRequest(
-            scoring_fn_id="test-scoring-fn",
-            provider_id="test_provider",
-            description="Test scoring function",
-            return_type=NumberType(),
-        )
-    )
-
-    # Register the exact same scoring function again - should succeed (idempotent)
-    await table.register_scoring_function(
-        RegisterScoringFunctionRequest(
-            scoring_fn_id="test-scoring-fn",
-            provider_id="test_provider",
-            description="Test scoring function",
-            return_type=NumberType(),
-        )
-    )
-
-    # Verify only one scoring function exists
-    scoring_functions = await table.list_scoring_functions(ListScoringFunctionsRequest())
-    assert len(scoring_functions.data) == 1
-    assert scoring_functions.data[0].identifier == "test-scoring-fn"
-
-
-async def test_double_registration_scoring_functions_negative(cached_disk_dist_registry):
-    """Test that registering the same scoring function with conflicting data fails."""
-    from llama_stack_api import RegisterScoringFunctionRequest
-
-    table = ScoringFunctionsRoutingTable({"test_provider": ScoringFunctionsImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    # Register a scoring function
-    await table.register_scoring_function(
-        RegisterScoringFunctionRequest(
-            scoring_fn_id="test-scoring-fn",
-            provider_id="test_provider",
-            description="Test scoring function",
-            return_type=NumberType(),
-        )
-    )
-
-    # Try to register the same scoring function with conflicting description - should fail
-    with pytest.raises(ValueError, match="conflicting field values"):
-        await table.register_scoring_function(
-            RegisterScoringFunctionRequest(
-                scoring_fn_id="test-scoring-fn",
-                provider_id="test_provider",
-                description="Different description",
-                return_type=NumberType(),
-            )
-        )
-
-
 async def test_double_registration_different_providers(cached_disk_dist_registry):
     """Test that registering objects with same ID but different providers succeeds."""
     impl1 = InferenceImpl()
@@ -454,60 +264,6 @@ async def test_double_registration_different_providers(cached_disk_dist_registry
     assert "provider2/shared-model" in model_ids
 
 
-async def test_benchmarks_routing_table(cached_disk_dist_registry):
-    table = BenchmarksRoutingTable({"test_provider": BenchmarksImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    # Register multiple benchmarks and verify listing
-    await table.register_benchmark(
-        RegisterBenchmarkRequest(
-            benchmark_id="test-benchmark",
-            dataset_id="test-dataset",
-            scoring_functions=["test-scoring-fn", "test-scoring-fn-2"],
-        )
-    )
-    benchmarks = await table.list_benchmarks(ListBenchmarksRequest())
-
-    assert len(benchmarks.data) == 1
-    benchmark_ids = {b.identifier for b in benchmarks.data}
-    assert "test-benchmark" in benchmark_ids
-
-    # Unregister the benchmark and verify removal
-    await table.unregister_benchmark(UnregisterBenchmarkRequest(benchmark_id="test-benchmark"))
-    benchmarks_after = await table.list_benchmarks(ListBenchmarksRequest())
-    assert len(benchmarks_after.data) == 0
-
-    # Unregistering a non-existent benchmark should raise a clear error
-    with pytest.raises(ValueError, match="Benchmark 'dummy_benchmark' not found"):
-        await table.unregister_benchmark(UnregisterBenchmarkRequest(benchmark_id="dummy_benchmark"))
-
-
-async def test_benchmarks_routing_table_stores_dataset_id(cached_disk_dist_registry):
-    """Test that register_benchmark correctly stores dataset_id on the benchmark."""
-    table = BenchmarksRoutingTable({"test_provider": BenchmarksImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    test_dataset_id = "my-evaluation-dataset"
-    test_scoring_functions = ["accuracy", "f1-score"]
-
-    await table.register_benchmark(
-        RegisterBenchmarkRequest(
-            benchmark_id="test-benchmark-with-dataset",
-            dataset_id=test_dataset_id,
-            scoring_functions=test_scoring_functions,
-        )
-    )
-
-    benchmark = await table.get_benchmark(GetBenchmarkRequest(benchmark_id="test-benchmark-with-dataset"))
-
-    assert benchmark is not None
-    assert benchmark.identifier == "test-benchmark-with-dataset"
-    assert benchmark.dataset_id == test_dataset_id
-    assert benchmark.scoring_functions == test_scoring_functions
-
-    await table.unregister_benchmark(UnregisterBenchmarkRequest(benchmark_id="test-benchmark-with-dataset"))
-
-
 async def test_tool_groups_routing_table(cached_disk_dist_registry):
     table = ToolGroupsRoutingTable({"test_provider": ToolGroupsImpl()}, cached_disk_dist_registry, {})
     await table.initialize()
diff --git a/tests/unit/providers/nvidia/test_datastore.py b/tests/unit/providers/nvidia/test_datastore.py
deleted file mode 100644
index 2a1acd6131..0000000000
--- a/tests/unit/providers/nvidia/test_datastore.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-from unittest.mock import patch
-
-import pytest
-
-from llama_stack.providers.remote.datasetio.nvidia.config import NvidiaDatasetIOConfig
-from llama_stack.providers.remote.datasetio.nvidia.datasetio import NvidiaDatasetIOAdapter
-from llama_stack_api import Dataset, DatasetPurpose, ResourceType, URIDataSource
-
-
-@pytest.fixture
-def nvidia_adapter():
-    """Fixture to set up NvidiaDatasetIOAdapter with mocked requests."""
-    os.environ["NVIDIA_DATASETS_URL"] = "http://nemo.test/datasets"
-
-    config = NvidiaDatasetIOConfig(
-        datasets_url=os.environ["NVIDIA_DATASETS_URL"], dataset_namespace="default", project_id="default"
-    )
-    adapter = NvidiaDatasetIOAdapter(config)
-
-    with patch(
-        "llama_stack.providers.remote.datasetio.nvidia.datasetio.NvidiaDatasetIOAdapter._make_request"
-    ) as mock_make_request:
-        yield adapter, mock_make_request
-
-
-def _assert_request(mock_call, expected_method, expected_path, expected_json=None):
-    """Helper function to verify request details in mock calls."""
-    call_args = mock_call.call_args
-
-    assert call_args[0][0] == expected_method
-    assert call_args[0][1] == expected_path
-
-    if expected_json:
-        for key, value in expected_json.items():
-            assert call_args[1]["json"][key] == value
-
-
-def test_register_dataset(nvidia_adapter, run_async):
-    adapter, mock_make_request = nvidia_adapter
-    mock_make_request.return_value = {
-        "id": "dataset-123456",
-        "name": "test-dataset",
-        "namespace": "default",
-    }
-
-    dataset_def = Dataset(
-        identifier="test-dataset",
-        type=ResourceType.dataset,
-        provider_resource_id="",
-        provider_id="",
-        purpose=DatasetPurpose.eval_question_answer,
-        source=URIDataSource(uri="https://example.com/data.jsonl"),
-        metadata={"provider_id": "nvidia", "format": "jsonl", "description": "Test dataset description"},
-    )
-
-    run_async(adapter.register_dataset(dataset_def))
-
-    mock_make_request.assert_called_once()
-    _assert_request(
-        mock_make_request,
-        "POST",
-        "/v1/datasets",
-        expected_json={
-            "name": "test-dataset",
-            "namespace": "default",
-            "files_url": "https://example.com/data.jsonl",
-            "project": "default",
-            "format": "jsonl",
-            "description": "Test dataset description",
-        },
-    )
-
-
-def test_unregister_dataset(nvidia_adapter, run_async):
-    adapter, mock_make_request = nvidia_adapter
-    mock_make_request.return_value = {
-        "message": "Resource deleted successfully.",
-        "id": "dataset-81RSQp7FKX3rdBtKvF9Skn",
-        "deleted_at": None,
-    }
-    dataset_id = "test-dataset"
-
-    run_async(adapter.unregister_dataset(dataset_id))
-
-    mock_make_request.assert_called_once()
-    _assert_request(mock_make_request, "DELETE", "/v1/datasets/default/test-dataset")
-
-
-def test_register_dataset_with_custom_namespace_project(run_async):
-    """Test with custom namespace and project configuration."""
-    os.environ["NVIDIA_DATASETS_URL"] = "http://nemo.test/datasets"
-
-    custom_config = NvidiaDatasetIOConfig(
-        datasets_url=os.environ["NVIDIA_DATASETS_URL"],
-        dataset_namespace="custom-namespace",
-        project_id="custom-project",
-    )
-    custom_adapter = NvidiaDatasetIOAdapter(custom_config)
-
-    with patch(
-        "llama_stack.providers.remote.datasetio.nvidia.datasetio.NvidiaDatasetIOAdapter._make_request"
-    ) as mock_make_request:
-        mock_make_request.return_value = {
-            "id": "dataset-123456",
-            "name": "test-dataset",
-            "namespace": "custom-namespace",
-        }
-
-        dataset_def = Dataset(
-            identifier="test-dataset",
-            type=ResourceType.dataset,
-            provider_resource_id="",
-            provider_id="",
-            purpose=DatasetPurpose.eval_question_answer,
-            source=URIDataSource(uri="https://example.com/data.jsonl"),
-            metadata={"format": "jsonl"},
-        )
-
-        run_async(custom_adapter.register_dataset(dataset_def))
-
-        mock_make_request.assert_called_once()
-        _assert_request(
-            mock_make_request,
-            "POST",
-            "/v1/datasets",
-            expected_json={
-                "name": "test-dataset",
-                "namespace": "custom-namespace",
-                "files_url": "https://example.com/data.jsonl",
-                "project": "custom-project",
-                "format": "jsonl",
-            },
-        )
diff --git a/tests/unit/providers/nvidia/test_eval.py b/tests/unit/providers/nvidia/test_eval.py
deleted file mode 100644
index 39487d5a49..0000000000
--- a/tests/unit/providers/nvidia/test_eval.py
+++ /dev/null
@@ -1,234 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
-from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl
-from llama_stack_api import (
-    Benchmark,
-    BenchmarkConfig,
-    EvaluateResponse,
-    Job,
-    JobStatus,
-    ModelCandidate,
-    ResourceType,
-    SamplingParams,
-    TopPSamplingStrategy,
-)
-from llama_stack_api.eval.models import (
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    RunEvalRequest,
-)
-
-MOCK_DATASET_ID = "default/test-dataset"
-MOCK_BENCHMARK_ID = "test-benchmark"
-
-
-@pytest.fixture
-def nvidia_eval_setup():
-    """Set up the NVIDIA eval implementation with mocked dependencies."""
-    os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"
-
-    # Create mock APIs
-    datasetio_api = MagicMock()
-    datasets_api = MagicMock()
-    scoring_api = MagicMock()
-    inference_api = MagicMock()
-    responses_api = MagicMock()
-
-    config = NVIDIAEvalConfig(
-        evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"],
-    )
-
-    eval_impl = NVIDIAEvalImpl(
-        config=config,
-        datasetio_api=datasetio_api,
-        datasets_api=datasets_api,
-        scoring_api=scoring_api,
-        inference_api=inference_api,
-        responses_api=responses_api,
-    )
-
-    # Mock the HTTP request methods
-    with (
-        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get") as mock_evaluator_get,
-        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post") as mock_evaluator_post,
-    ):
-        yield {
-            "eval_impl": eval_impl,
-            "mock_evaluator_get": mock_evaluator_get,
-            "mock_evaluator_post": mock_evaluator_post,
-            "datasetio_api": datasetio_api,
-            "datasets_api": datasets_api,
-            "scoring_api": scoring_api,
-            "inference_api": inference_api,
-            "responses_api": responses_api,
-        }
-
-
-def _assert_request_body(mock_evaluator_post, expected_json):
-    """Helper method to verify request body in Evaluator POST request is correct"""
-    call_args = mock_evaluator_post.call_args
-    actual_json = call_args[0][1]
-
-    # Check that all expected keys contain the expected values in the actual JSON
-    for key, value in expected_json.items():
-        assert key in actual_json, f"Key '{key}' missing in actual JSON"
-
-        if isinstance(value, dict):
-            for nested_key, nested_value in value.items():
-                assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
-                assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
-        else:
-            assert actual_json[key] == value, f"Value mismatch for '{key}'"
-
-
-async def test_register_benchmark(nvidia_eval_setup):
-    eval_impl = nvidia_eval_setup["eval_impl"]
-    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
-
-    eval_config = {
-        "type": "custom",
-        "params": {"parallelism": 8},
-        "tasks": {
-            "qa": {
-                "type": "completion",
-                "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
-                "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
-                "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
-            }
-        },
-    }
-
-    benchmark = Benchmark(
-        provider_id="nvidia",
-        type=ResourceType.benchmark,
-        identifier=MOCK_BENCHMARK_ID,
-        dataset_id=MOCK_DATASET_ID,
-        scoring_functions=["basic::equality"],
-        metadata=eval_config,
-    )
-
-    # Mock Evaluator API response
-    mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
-    mock_evaluator_post.return_value = mock_evaluator_response
-
-    # Register the benchmark
-    await eval_impl.register_benchmark(benchmark)
-
-    # Verify the Evaluator API was called correctly
-    mock_evaluator_post.assert_called_once()
-    _assert_request_body(
-        mock_evaluator_post, {"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config}
-    )
-
-
-async def test_run_eval(nvidia_eval_setup):
-    eval_impl = nvidia_eval_setup["eval_impl"]
-    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
-
-    benchmark_config = BenchmarkConfig(
-        eval_candidate=ModelCandidate(
-            type="model",
-            model="Llama3.1-8B-Instruct",
-            sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-        )
-    )
-
-    # Mock Evaluator API response
-    mock_evaluator_response = {"id": "job-123", "status": "created"}
-    mock_evaluator_post.return_value = mock_evaluator_response
-
-    # Run the Evaluation job
-    result = await eval_impl.run_eval(
-        request=RunEvalRequest(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
-    )
-
-    # Verify the Evaluator API was called correctly
-    mock_evaluator_post.assert_called_once()
-    _assert_request_body(
-        mock_evaluator_post,
-        {
-            "config": f"nvidia/{MOCK_BENCHMARK_ID}",
-            "target": {"type": "model", "model": "Llama3.1-8B-Instruct"},
-        },
-    )
-
-    # Verify the result
-    assert isinstance(result, Job)
-    assert result.job_id == "job-123"
-    assert result.status == JobStatus.in_progress
-
-
-async def test_job_status(nvidia_eval_setup):
-    eval_impl = nvidia_eval_setup["eval_impl"]
-    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]
-
-    # Mock Evaluator API response
-    mock_evaluator_response = {"id": "job-123", "status": "completed"}
-    mock_evaluator_get.return_value = mock_evaluator_response
-
-    # Get the Evaluation job
-    result = await eval_impl.job_status(request=JobStatusRequest(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-    # Verify the result
-    assert isinstance(result, Job)
-    assert result.job_id == "job-123"
-    assert result.status == JobStatus.completed
-
-    # Verify the API was called correctly
-    mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")
-
-
-async def test_job_cancel(nvidia_eval_setup):
-    eval_impl = nvidia_eval_setup["eval_impl"]
-    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
-
-    # Mock Evaluator API response
-    mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
-    mock_evaluator_post.return_value = mock_evaluator_response
-
-    # Cancel the Evaluation job
-    await eval_impl.job_cancel(request=JobCancelRequest(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-    # Verify the API was called correctly
-    mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})
-
-
-async def test_job_result(nvidia_eval_setup):
-    eval_impl = nvidia_eval_setup["eval_impl"]
-    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]
-
-    # Mock Evaluator API responses
-    mock_job_status_response = {"id": "job-123", "status": "completed"}
-    mock_job_results_response = {
-        "id": "job-123",
-        "status": "completed",
-        "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
-    }
-    mock_evaluator_get.side_effect = [
-        mock_job_status_response,  # First call to retrieve job
-        mock_job_results_response,  # Second call to retrieve job results
-    ]
-
-    # Get the Evaluation job results
-    result = await eval_impl.job_result(request=JobResultRequest(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-    # Verify the result
-    assert isinstance(result, EvaluateResponse)
-    assert MOCK_BENCHMARK_ID in result.scores
-    assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85
-
-    # Verify the API was called correctly
-    assert mock_evaluator_get.call_count == 2
-    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
-    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")
diff --git a/tests/unit/providers/test_lazy_imports.py b/tests/unit/providers/test_lazy_imports.py
index d484426a68..99c1d8c68e 100644
--- a/tests/unit/providers/test_lazy_imports.py
+++ b/tests/unit/providers/test_lazy_imports.py
@@ -77,23 +77,6 @@ def test_no_torch_transformers_on_import(self):
         )
 
 
-class TestBraintrustLazyImports:
-    """Test that braintrust scoring provider doesn't load autoevals/pyarrow at import time."""
-
-    def test_braintrust_import_no_autoevals(self):
-        """Verify braintrust module import doesn't load autoevals or pyarrow."""
-        result = _check_module_import_isolation(
-            "from llama_stack.providers.inline.scoring.braintrust import braintrust",
-            ["autoevals", "pyarrow"],
-        )
-
-        assert result.get("success"), f"Import failed: {result.get('error', 'unknown error')}"
-        assert not result["loaded"], (
-            f"Heavy modules loaded unexpectedly during braintrust import: {result['loaded']}. "
-            "These should be lazily loaded only when scoring is performed."
-        )
-
-
 def _check_no_forbidden_imports(module_path: str, forbidden: list[str]) -> tuple[bool, str]:
     """Import a module in a subprocess and check that forbidden modules are not loaded."""
     code = f"""
diff --git a/tests/unit/server/test_replace_env_vars.py b/tests/unit/server/test_replace_env_vars.py
index 477015bccc..63c431c746 100644
--- a/tests/unit/server/test_replace_env_vars.py
+++ b/tests/unit/server/test_replace_env_vars.py
@@ -104,36 +104,36 @@ def test_explicit_strings_preserved(setup_env_vars):
     assert replace_env_vars(data) == expected
 
 
-def test_resource_with_empty_benchmark_id_skipped(setup_env_vars):
-    """Test that resources with empty benchmark_id from conditional env vars are skipped."""
+def test_resource_with_empty_vector_store_id_skipped(setup_env_vars):
+    """Test that resources with empty vector_store_id from conditional env vars are skipped."""
     data = {
-        "benchmarks": [
-            {"benchmark_id": "${env.BENCHMARK_ID:+my-benchmark}", "dataset_id": "test-dataset"},
-            {"benchmark_id": "always-present", "dataset_id": "another-dataset"},
+        "vector_stores": [
+            {"vector_store_id": "${env.VECTOR_STORE_ID:+my-store}", "provider_id": "test-provider"},
+            {"vector_store_id": "always-present", "provider_id": "another-provider"},
         ]
     }
-    # BENCHMARK_ID is not set, so first benchmark should be skipped
+    # VECTOR_STORE_ID is not set, so first vector store should be skipped
     result = replace_env_vars(data)
-    assert len(result["benchmarks"]) == 1
-    assert result["benchmarks"][0]["benchmark_id"] == "always-present"
+    assert len(result["vector_stores"]) == 1
+    assert result["vector_stores"][0]["vector_store_id"] == "always-present"
 
 
-def test_resource_with_set_benchmark_id_not_skipped(setup_env_vars):
-    """Test that resources with set benchmark_id are not skipped."""
-    os.environ["BENCHMARK_ID"] = "enabled"
+def test_resource_with_set_vector_store_id_not_skipped(setup_env_vars):
+    """Test that resources with set vector_store_id are not skipped."""
+    os.environ["VECTOR_STORE_ID"] = "enabled"
     try:
         data = {
-            "benchmarks": [
-                {"benchmark_id": "${env.BENCHMARK_ID:+my-benchmark}", "dataset_id": "test-dataset"},
-                {"benchmark_id": "always-present", "dataset_id": "another-dataset"},
+            "vector_stores": [
+                {"vector_store_id": "${env.VECTOR_STORE_ID:+my-store}", "provider_id": "test-provider"},
+                {"vector_store_id": "always-present", "provider_id": "another-provider"},
             ]
         }
         result = replace_env_vars(data)
-        assert len(result["benchmarks"]) == 2
-        assert result["benchmarks"][0]["benchmark_id"] == "my-benchmark"
-        assert result["benchmarks"][1]["benchmark_id"] == "always-present"
+        assert len(result["vector_stores"]) == 2
+        assert result["vector_stores"][0]["vector_store_id"] == "my-store"
+        assert result["vector_stores"][1]["vector_store_id"] == "always-present"
     finally:
-        del os.environ["BENCHMARK_ID"]
+        del os.environ["VECTOR_STORE_ID"]
 
 
 def test_resource_with_empty_model_id_skipped(setup_env_vars):
@@ -166,25 +166,25 @@ def test_resource_with_empty_shield_id_skipped(setup_env_vars):
 
 def test_multiple_resources_with_conditional_ids(setup_env_vars):
     """Test that multiple resource types with conditional IDs are handled correctly."""
-    os.environ["INCLUDE_BENCHMARK"] = "yes"
+    os.environ["INCLUDE_SHIELD"] = "yes"
     try:
         data = {
-            "benchmarks": [
-                {"benchmark_id": "${env.INCLUDE_BENCHMARK:+included-benchmark}", "dataset_id": "ds1"},
-                {"benchmark_id": "${env.EXCLUDE_BENCHMARK:+excluded-benchmark}", "dataset_id": "ds2"},
+            "shields": [
+                {"shield_id": "${env.INCLUDE_SHIELD:+included-shield}", "provider_id": "p1"},
+                {"shield_id": "${env.EXCLUDE_SHIELD:+excluded-shield}", "provider_id": "p2"},
             ],
             "models": [
                 {"model_id": "${env.EXCLUDE_MODEL:+excluded-model}", "provider_id": "p1"},
             ],
         }
         result = replace_env_vars(data)
-        # Only the benchmark with INCLUDE_BENCHMARK set should remain
-        assert len(result["benchmarks"]) == 1
-        assert result["benchmarks"][0]["benchmark_id"] == "included-benchmark"
+        # Only the shield with INCLUDE_SHIELD set should remain
+        assert len(result["shields"]) == 1
+        assert result["shields"][0]["shield_id"] == "included-shield"
         # Model with unset env var should be skipped
         assert len(result["models"]) == 0
     finally:
-        del os.environ["INCLUDE_BENCHMARK"]
+        del os.environ["INCLUDE_SHIELD"]
 
 
 def test_auth_provider_disabled_when_type_not_set(setup_env_vars):
diff --git a/tests/unit/test_eval_models.py b/tests/unit/test_eval_models.py
deleted file mode 100644
index 2b6bcd9419..0000000000
--- a/tests/unit/test_eval_models.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import pytest
-from pydantic import ValidationError
-
-from llama_stack_api.eval.models import (
-    BenchmarkConfig,
-    EvaluateResponse,
-    EvaluateRowsRequest,
-    ModelCandidate,
-    RunEvalRequest,
-)
-from llama_stack_api.inference import SamplingParams, TopPSamplingStrategy
-from llama_stack_api.scoring import ScoringResult
-
-
-def test_model_candidate_valid():
-    mc = ModelCandidate(
-        model="test-model",
-        sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-    )
-    assert mc.model == "test-model"
-    assert mc.type == "model"
-
-
-def test_benchmark_config_valid():
-    mc = ModelCandidate(
-        model="test-model",
-        sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-    )
-    bc = BenchmarkConfig(eval_candidate=mc, num_examples=5)
-    assert bc.num_examples == 5
-    assert bc.scoring_params == {}
-
-
-def test_evaluate_response_valid():
-    er = EvaluateResponse(
-        generations=[{"input": "test", "output": "result"}],
-        scores={
-            "accuracy": ScoringResult(
-                score_rows=[{"score": 0.9}],
-                aggregated_results={"average": 0.9},
-            )
-        },
-    )
-    assert len(er.generations) == 1
-    assert "accuracy" in er.scores
-
-
-def test_run_eval_request_valid():
-    mc = ModelCandidate(
-        model="test-model",
-        sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-    )
-    bc = BenchmarkConfig(eval_candidate=mc)
-    req = RunEvalRequest(benchmark_id="bench-123", benchmark_config=bc)
-    assert req.benchmark_id == "bench-123"
-
-
-def test_evaluate_rows_request_empty_arrays_fail():
-    mc = ModelCandidate(
-        model="test-model",
-        sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-    )
-    bc = BenchmarkConfig(eval_candidate=mc)
-
-    with pytest.raises(ValidationError):
-        EvaluateRowsRequest(
-            benchmark_id="bench-123",
-            input_rows=[],
-            scoring_functions=["func1"],
-            benchmark_config=bc,
-        )
-
-    with pytest.raises(ValidationError):
-        EvaluateRowsRequest(
-            benchmark_id="bench-123",
-            input_rows=[{"test": "data"}],
-            scoring_functions=[],
-            benchmark_config=bc,
-        )