diff --git a/.github/workflows/backward-compat.yml b/.github/workflows/backward-compat.yml index ffc19c6d70..f454f99a00 100644 --- a/.github/workflows/backward-compat.yml +++ b/.github/workflows/backward-compat.yml @@ -98,9 +98,24 @@ jobs: env: GH_TOKEN: ${{ github.token }} PR_TITLE: ${{ github.event.pull_request.title }} + PR_NUMBER: ${{ github.event.pull_request.number }} run: | echo "Breaking changes detected. Checking for acknowledgment..." + # In merge_group context, github.event.pull_request is empty. + # Extract PR number from the merge queue branch name + # (format: gh-readonly-queue/main/pr-NUMBER-SHA) + if [ -z "$PR_TITLE" ]; then + PR_NUM="$PR_NUMBER" + if [ -z "$PR_NUM" ]; then + PR_NUM=$(echo "${GITHUB_REF_NAME}" | sed -n 's|.*pr-\([0-9]*\)-.*|\1|p') + fi + if [ -n "$PR_NUM" ]; then + PR_TITLE=$(gh pr view "$PR_NUM" --json title --jq '.title' 2>/dev/null || echo "") + echo "Resolved PR title from PR #${PR_NUM}: $PR_TITLE" + fi + fi + # Check PR title for '!:' marker (conventional commits) if [[ "$PR_TITLE" =~ ^[a-z]+\!: ]]; then echo "✓ Breaking change acknowledged in PR title" @@ -190,9 +205,24 @@ jobs: env: GH_TOKEN: ${{ github.token }} PR_TITLE: ${{ github.event.pull_request.title }} + PR_NUMBER: ${{ github.event.pull_request.number }} run: | echo "Integration tests failed. Checking for acknowledgment..." + # In merge_group context, github.event.pull_request is empty. 
+ # Extract PR number from the merge queue branch name + # (format: gh-readonly-queue/main/pr-NUMBER-SHA) + if [ -z "$PR_TITLE" ]; then + PR_NUM="$PR_NUMBER" + if [ -z "$PR_NUM" ]; then + PR_NUM=$(echo "${GITHUB_REF_NAME}" | sed -n 's|.*pr-\([0-9]*\)-.*|\1|p') + fi + if [ -n "$PR_NUM" ]; then + PR_TITLE=$(gh pr view "$PR_NUM" --json title --jq '.title' 2>/dev/null || echo "") + echo "Resolved PR title from PR #${PR_NUM}: $PR_TITLE" + fi + fi + # Check PR title for '!:' marker (conventional commits) if [[ "$PR_TITLE" =~ ^[a-z]+\!: ]]; then echo "✓ Breaking change acknowledged in PR title" diff --git a/.github/workflows/file-processors-tests.yml b/.github/workflows/file-processors-tests.yml index cdb8c4566c..863b9d38b5 100644 --- a/.github/workflows/file-processors-tests.yml +++ b/.github/workflows/file-processors-tests.yml @@ -51,6 +51,8 @@ jobs: run: uv pip install docling - name: Start Llama Stack server with docling + env: + LLAMA_STACK_DISABLE_VERSION_CHECK: "1" run: | uv run --no-sync llama stack run \ --providers "file_processors=inline::docling,files=inline::localfs" \ diff --git a/.github/workflows/test-external-provider-module.yml b/.github/workflows/test-external-provider-module.yml index b492e097cd..a3bfb96de7 100644 --- a/.github/workflows/test-external-provider-module.yml +++ b/.github/workflows/test-external-provider-module.yml @@ -2,18 +2,10 @@ name: Test External Providers Installed via Module run-name: Test External Provider installation via Python module +# Disabled until we find a suitable external provider for CI. +# The lmeval provider depends on the eval API which is being removed. 
on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - paths: - - 'src/llama_stack/**' - - 'tests/integration/**' - - 'uv.lock' - - 'pyproject.toml' - - 'tests/external/*' - - '.github/workflows/test-external-provider-module.yml' # This workflow + workflow_dispatch: {} jobs: test-external-providers-from-module: @@ -25,16 +17,22 @@ jobs: - name: Install dependencies uses: ./.github/actions/setup-runner - - name: Install lmeval provider + - name: Install weather external API and kaze provider run: | - uv pip install llama-stack-provider-lmeval + uv pip install tests/external/llama-stack-api-weather + uv pip install tests/external/llama-stack-provider-kaze + + - name: Configure external API and provider + run: | + mkdir -p ~/.llama/apis.d ~/.llama/providers.d + cp tests/external/weather.yaml ~/.llama/apis.d/ + cp tests/external/kaze.yaml ~/.llama/providers.d/ - name: Start Llama Stack server in background env: - TRUSTYAI_LMEVAL_USE_K8S: "false" LLAMA_STACK_LOG_FILE: "server.log" run: | - nohup uv run llama stack run tests/external/llama-stack-provider-lmeval/config.yaml > server.log 2>&1 & + nohup uv run llama stack run tests/external/config.yaml > server.log 2>&1 & - name: Wait for Llama Stack server to be ready run: | @@ -54,10 +52,10 @@ jobs: run: | response=$(curl -s http://localhost:8321/v1/providers) echo "$response" | python3 -m json.tool - if echo "$response" | grep -q "trustyai_lmeval"; then - echo "lmeval external provider loaded successfully" + if echo "$response" | grep -q "kaze"; then + echo "kaze external provider loaded successfully" else - echo "ERROR: lmeval provider not found in providers list" + echo "ERROR: kaze provider not found in providers list" exit 1 fi diff --git a/client-sdks/stainless/config.yml b/client-sdks/stainless/config.yml index 70cfbeb1c8..06d0b9847a 100644 --- a/client-sdks/stainless/config.yml +++ b/client-sdks/stainless/config.yml @@ -43,26 +43,6 @@ client_settings: environments: production: 
http://any-hosted-llama-stack.com pagination: -- name: datasets_iterrows - type: offset - request: - dataset_id: - type: string - start_index: - type: integer - x-stainless-pagination-property: - purpose: offset_count_param - limit: - type: integer - response: - data: - type: array - items: - type: object - next_index: - type: integer - x-stainless-pagination-property: - purpose: offset_count_start_field - name: openai_cursor_page type: cursor request: @@ -105,39 +85,6 @@ settings: ' openapi: transformations: - - command: mergeObject - reason: Better return_type using enum - args: - target: - - $.components.schemas - object: - ReturnType: - additionalProperties: false - properties: - type: - enum: - - string - - number - - boolean - - array - - object - - json - - union - - chat_completion_input - - completion_input - - agent_turn_input - required: - - type - type: object - - command: replaceProperties - reason: Replace return type properties with better model (see above) - args: - filter: - only: - - $.components.schemas.ScoringFn.properties.return_type - - $.components.schemas.RegisterScoringFunctionRequest.properties.return_type - value: - $ref: '#/components/schemas/ReturnType' - command: oneOfToAnyOf reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants @@ -163,7 +110,6 @@ resources: param_type: ParamType safety_violation: SafetyViolation sampling_params: SamplingParams - scoring_result: ScoringResult system_message: SystemMessage health_info: HealthInfo provider_info: ProviderInfo @@ -365,22 +311,6 @@ resources: endpoint: get /v1/shields register: post /v1/shields delete: delete /v1/shields/{identifier} - scoring: - methods: - score: post /v1/scoring/score - score_batch: post /v1/scoring/score-batch - scoring_functions: - models: - scoring_fn: ScoringFn - scoring_fn_params: ScoringFnParams - list_scoring_functions_response: ListScoringFunctionsResponse - methods: - retrieve: get 
/v1/scoring-functions/{scoring_fn_id} - list: - paginated: false - endpoint: get /v1/scoring-functions - register: post /v1/scoring-functions - unregister: delete /v1/scoring-functions/{scoring_fn_id} files: models: file: OpenAIFileObject @@ -400,33 +330,6 @@ resources: cancel: post /v1/batches/{batch_id}/cancel alpha: subresources: - benchmarks: - models: - benchmark: Benchmark - list_benchmarks_response: ListBenchmarksResponse - methods: - retrieve: get /v1alpha/eval/benchmarks/{benchmark_id} - list: - paginated: false - endpoint: get /v1alpha/eval/benchmarks - register: post /v1alpha/eval/benchmarks - unregister: delete /v1alpha/eval/benchmarks/{benchmark_id} - eval: - models: - evaluate_response: EvaluateResponse - benchmark_config: BenchmarkConfig - job: Job - methods: - evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations - run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs - evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations - run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs - subresources: - jobs: - methods: - cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id} - status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id} - retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result admin: methods: list_providers: get /v1alpha/admin/providers @@ -437,17 +340,3 @@ resources: inference: methods: rerank: post /v1alpha/inference/rerank - beta: - subresources: - datasets: - models: - list_datasets_response: ListDatasetsResponse - methods: - register: post /v1beta/datasets - retrieve: get /v1beta/datasets/{dataset_id} - list: - paginated: false - endpoint: get /v1beta/datasets - unregister: delete /v1beta/datasets/{dataset_id} - iterrows: get /v1beta/datasetio/iterrows/{dataset_id} - appendrows: post /v1beta/datasetio/append-rows/{dataset_id} diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml index d903796d75..57c8f3d955 
100644 --- a/client-sdks/stainless/openapi.yml +++ b/client-sdks/stainless/openapi.yml @@ -1948,190 +1948,6 @@ paths: schema: $ref: '#/components/schemas/RunShieldRequest' required: true - /v1/scoring-functions: - get: - responses: - '200': - description: A ListScoringFunctionsResponse. - content: - application/json: - schema: - $ref: '#/components/schemas/ListScoringFunctionsResponse' - '400': - description: Bad Request - $ref: '#/components/responses/BadRequest400' - '429': - description: Too Many Requests - $ref: '#/components/responses/TooManyRequests429' - '500': - description: Internal Server Error - $ref: '#/components/responses/InternalServerError500' - default: - description: Default Response - $ref: '#/components/responses/DefaultError' - tags: - - Scoring Functions - summary: List all scoring functions. - description: List all scoring functions. - operationId: list_scoring_functions_v1_scoring_functions_get - post: - responses: - '400': - description: Bad Request - $ref: '#/components/responses/BadRequest400' - '429': - description: Too Many Requests - $ref: '#/components/responses/TooManyRequests429' - '500': - description: Internal Server Error - $ref: '#/components/responses/InternalServerError500' - default: - description: Default Response - $ref: '#/components/responses/DefaultError' - '204': - description: The scoring function was successfully registered. - tags: - - Scoring Functions - summary: Register a scoring function. - description: Register a scoring function. - operationId: register_scoring_function_v1_scoring_functions_post - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterScoringFunctionRequest' - required: true - deprecated: true - /v1/scoring-functions/{scoring_fn_id}: - get: - responses: - '200': - description: A ScoringFn. 
- content: - application/json: - schema: - $ref: '#/components/schemas/ScoringFn' - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - tags: - - Scoring Functions - summary: Get a scoring function by its ID. - description: Get a scoring function by its ID. - operationId: get_scoring_function_v1_scoring_functions__scoring_fn_id__get - parameters: - - name: scoring_fn_id - in: path - required: true - schema: - type: string - description: The ID of the scoring function to get. - title: Scoring Fn Id - description: The ID of the scoring function to get. - delete: - responses: - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - '204': - description: The scoring function was successfully unregistered. - tags: - - Scoring Functions - summary: Unregister a scoring function. - description: Unregister a scoring function. - operationId: unregister_scoring_function_v1_scoring_functions__scoring_fn_id__delete - parameters: - - name: scoring_fn_id - in: path - required: true - schema: - type: string - description: The ID of the scoring function to unregister. - title: Scoring Fn Id - description: The ID of the scoring function to unregister. - deprecated: true - /v1/scoring/score: - post: - responses: - '200': - description: A ScoreResponse object containing rows and aggregated results. 
- content: - application/json: - schema: - $ref: '#/components/schemas/ScoreResponse' - '400': - description: Bad Request - $ref: '#/components/responses/BadRequest400' - '429': - description: Too Many Requests - $ref: '#/components/responses/TooManyRequests429' - '500': - description: Internal Server Error - $ref: '#/components/responses/InternalServerError500' - default: - description: Default Response - $ref: '#/components/responses/DefaultError' - tags: - - Scoring - summary: Score a list of rows. - description: Score a list of rows. - operationId: score_v1_scoring_score_post - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreRequest' - required: true - /v1/scoring/score-batch: - post: - responses: - '200': - description: A ScoreBatchResponse. - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreBatchResponse' - '400': - description: Bad Request - $ref: '#/components/responses/BadRequest400' - '429': - description: Too Many Requests - $ref: '#/components/responses/TooManyRequests429' - '500': - description: Internal Server Error - $ref: '#/components/responses/InternalServerError500' - default: - description: Default Response - $ref: '#/components/responses/DefaultError' - tags: - - Scoring - summary: Score a batch of rows. - description: Score a batch of rows. - operationId: score_batch_v1_scoring_score_batch_post - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreBatchRequest' - required: true /v1/shields: get: responses: @@ -3380,116 +3196,15 @@ paths: description: Get the version of the service. operationId: version_v1_version_get x-public: true - /v1beta/datasetio/append-rows/{dataset_id}: + /v1alpha/inference/rerank: post: - responses: - '204': - description: Rows were successfully appended. 
- '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - tags: - - DatasetIO - summary: Append rows to a dataset. - description: Append rows to a dataset. - operationId: append_rows_v1beta_datasetio_append_rows__dataset_id__post - parameters: - - name: dataset_id - in: path - required: true - schema: - type: string - description: The ID of the dataset to append the rows to. - title: Dataset Id - description: The ID of the dataset to append the rows to. - requestBody: - required: true - content: - application/json: - schema: - $ref: '#/components/schemas/AppendRowsRequest' - /v1beta/datasetio/iterrows/{dataset_id}: - get: - responses: - '200': - description: A PaginatedResponse containing the rows. - content: - application/json: - schema: - $ref: '#/components/schemas/PaginatedResponse' - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - tags: - - DatasetIO - summary: Get a paginated list of rows from a dataset. - description: |- - Get a paginated list of rows from a dataset. - - Uses offset-based pagination where: - - start_index: The starting index (0-based). If None, starts from beginning. - - limit: Number of items to return. If None or -1, returns all items. - - The response includes: - - data: List of items for the current page. - - has_more: Whether there are more items available after this set. 
- operationId: iterrows_v1beta_datasetio_iterrows__dataset_id__get - parameters: - - name: dataset_id - in: path - required: true - schema: - type: string - description: The ID of the dataset to get the rows from. - title: Dataset Id - description: The ID of the dataset to get the rows from. - - name: start_index - in: query - required: false - schema: - anyOf: - - type: integer - - type: 'null' - description: Index into dataset for the first row to get. Get all rows if None. - title: Start Index - description: Index into dataset for the first row to get. Get all rows if None. - - name: limit - in: query - required: false - schema: - anyOf: - - type: integer - - type: 'null' - description: The number of rows to get. - title: Limit - description: The number of rows to get. - /v1beta/datasets: - get: responses: '200': - description: A list of dataset objects. + description: RerankResponse with indices sorted by relevance score (descending). content: application/json: schema: - $ref: '#/components/schemas/ListDatasetsResponse' + $ref: '#/components/schemas/RerankResponse' '400': description: Bad Request $ref: '#/components/responses/BadRequest400' @@ -3503,18 +3218,25 @@ paths: description: Default Response $ref: '#/components/responses/DefaultError' tags: - - Datasets - summary: List all datasets. - description: List all datasets. - operationId: list_datasets_v1beta_datasets_get - post: + - Inference + summary: Rerank documents based on relevance to a query. + description: Rerank a list of documents based on their relevance to a query. + operationId: rerank_v1alpha_inference_rerank_post + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RerankRequest' + required: true + /v1alpha/admin/providers: + get: responses: '200': - description: The registered dataset object. + description: A list of provider information objects. 
content: application/json: schema: - $ref: '#/components/schemas/Dataset' + $ref: '#/components/schemas/ListProvidersResponse' '400': description: Bad Request $ref: '#/components/responses/BadRequest400' @@ -3528,26 +3250,19 @@ paths: description: Default Response $ref: '#/components/responses/DefaultError' tags: - - Datasets - summary: Register a new dataset. - description: Register a new dataset. - operationId: register_dataset_v1beta_datasets_post - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterDatasetRequest' - required: true - deprecated: true - /v1beta/datasets/{dataset_id}: + - Admin + summary: List all available providers + description: List all available providers with their configuration and health status. + operationId: list_providers_v1alpha_admin_providers_get + /v1alpha/admin/providers/{provider_id}: get: responses: '200': - description: The dataset object. + description: The provider information object. content: application/json: schema: - $ref: '#/components/schemas/Dataset' + $ref: '#/components/schemas/ProviderInfo' '400': $ref: '#/components/responses/BadRequest400' description: Bad Request @@ -3560,469 +3275,31 @@ paths: default: $ref: '#/components/responses/DefaultError' description: Default Response + '404': + description: Provider not found. tags: - - Datasets - summary: Get a dataset by its ID. - description: Get a dataset by its ID. - operationId: get_dataset_v1beta_datasets__dataset_id__get + - Admin + summary: Get provider details + description: Get detailed information about a specific provider. + operationId: inspect_provider_v1alpha_admin_providers__provider_id__get parameters: - - name: dataset_id + - name: provider_id in: path required: true schema: type: string - description: The ID of the dataset to get. - title: Dataset Id - description: The ID of the dataset to get. - delete: + description: The ID of the provider to inspect. 
+ title: Provider Id + description: The ID of the provider to inspect. + /v1alpha/admin/inspect/routes: + get: responses: - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - '204': - description: The dataset was successfully unregistered. - tags: - - Datasets - summary: Unregister a dataset by its ID. - description: Unregister a dataset by its ID. - operationId: unregister_dataset_v1beta_datasets__dataset_id__delete - parameters: - - name: dataset_id - in: path - required: true - schema: - type: string - description: The ID of the dataset to unregister. - title: Dataset Id - description: The ID of the dataset to unregister. - deprecated: true - /v1alpha/eval/benchmarks: - get: - responses: - '200': - description: A ListBenchmarksResponse. - content: - application/json: - schema: - $ref: '#/components/schemas/ListBenchmarksResponse' - '400': - description: Bad Request - $ref: '#/components/responses/BadRequest400' - '429': - description: Too Many Requests - $ref: '#/components/responses/TooManyRequests429' - '500': - description: Internal Server Error - $ref: '#/components/responses/InternalServerError500' - default: - description: Default Response - $ref: '#/components/responses/DefaultError' - tags: - - Benchmarks - summary: List all benchmarks. - description: List all benchmarks. 
- operationId: list_benchmarks_v1alpha_eval_benchmarks_get - post: - responses: - '400': - description: Bad Request - $ref: '#/components/responses/BadRequest400' - '429': - description: Too Many Requests - $ref: '#/components/responses/TooManyRequests429' - '500': - description: Internal Server Error - $ref: '#/components/responses/InternalServerError500' - default: - description: Default Response - $ref: '#/components/responses/DefaultError' - '204': - description: The benchmark was successfully registered. - tags: - - Benchmarks - summary: Register a benchmark. - description: Register a benchmark. - operationId: register_benchmark_v1alpha_eval_benchmarks_post - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterBenchmarkRequest' - required: true - deprecated: true - /v1alpha/eval/benchmarks/{benchmark_id}: - get: - responses: - '200': - description: A Benchmark. - content: - application/json: - schema: - $ref: '#/components/schemas/Benchmark' - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - tags: - - Benchmarks - summary: Get a benchmark by its ID. - description: Get a benchmark by its ID. - operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get - parameters: - - name: benchmark_id - in: path - required: true - schema: - type: string - description: The ID of the benchmark to get. - title: Benchmark Id - description: The ID of the benchmark to get. 
- delete: - responses: - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - '204': - description: The benchmark was successfully unregistered. - tags: - - Benchmarks - summary: Unregister a benchmark. - description: Unregister a benchmark. - operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete - parameters: - - name: benchmark_id - in: path - required: true - schema: - type: string - description: The ID of the benchmark to unregister. - title: Benchmark Id - description: The ID of the benchmark to unregister. - deprecated: true - /v1alpha/eval/benchmarks/{benchmark_id}/evaluations: - post: - responses: - '200': - description: EvaluateResponse object containing generations and scores. - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - tags: - - Eval - summary: Evaluate Rows - description: Evaluate a list of rows on a benchmark. 
- operationId: evaluate_rows_v1alpha_eval_benchmarks__benchmark_id__evaluations_post - parameters: - - name: benchmark_id - in: path - required: true - schema: - type: string - description: The ID of the benchmark - title: Benchmark Id - description: The ID of the benchmark - requestBody: - required: true - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateRowsBodyRequest' - /v1alpha/eval/benchmarks/{benchmark_id}/jobs: - post: - responses: - '200': - description: The job that was created to run the evaluation. - content: - application/json: - schema: - $ref: '#/components/schemas/Job' - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - tags: - - Eval - summary: Run Eval - description: Run an evaluation on a benchmark. - operationId: run_eval_v1alpha_eval_benchmarks__benchmark_id__jobs_post - parameters: - - name: benchmark_id - in: path - required: true - schema: - type: string - description: The ID of the benchmark - title: Benchmark Id - description: The ID of the benchmark - requestBody: - required: true - content: - application/json: - schema: - $ref: '#/components/schemas/RunEvalBodyRequest' - /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}: - get: - responses: - '200': - description: The status of the evaluation job. 
- content: - application/json: - schema: - $ref: '#/components/schemas/Job' - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - tags: - - Eval - summary: Job Status - description: Get the status of a job. - operationId: job_status_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__get - parameters: - - name: benchmark_id - in: path - required: true - schema: - type: string - title: Benchmark Id - - name: job_id - in: path - required: true - schema: - type: string - title: Job Id - delete: - responses: - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - '204': - description: Successful Response - tags: - - Eval - summary: Job Cancel - description: Cancel a job. - operationId: job_cancel_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__delete - parameters: - - name: benchmark_id - in: path - required: true - schema: - type: string - title: Benchmark Id - - name: job_id - in: path - required: true - schema: - type: string - title: Job Id - /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: - get: - responses: - '200': - description: The result of the job. 
- content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - tags: - - Eval - summary: Job Result - description: Get the result of a job. - operationId: job_result_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__result_get - parameters: - - name: benchmark_id - in: path - required: true - schema: - type: string - title: Benchmark Id - - name: job_id - in: path - required: true - schema: - type: string - title: Job Id - /v1alpha/inference/rerank: - post: - responses: - '200': - description: RerankResponse with indices sorted by relevance score (descending). - content: - application/json: - schema: - $ref: '#/components/schemas/RerankResponse' - '400': - description: Bad Request - $ref: '#/components/responses/BadRequest400' - '429': - description: Too Many Requests - $ref: '#/components/responses/TooManyRequests429' - '500': - description: Internal Server Error - $ref: '#/components/responses/InternalServerError500' - default: - description: Default Response - $ref: '#/components/responses/DefaultError' - tags: - - Inference - summary: Rerank documents based on relevance to a query. - description: Rerank a list of documents based on their relevance to a query. - operationId: rerank_v1alpha_inference_rerank_post - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RerankRequest' - required: true - /v1alpha/admin/providers: - get: - responses: - '200': - description: A list of provider information objects. 
- content: - application/json: - schema: - $ref: '#/components/schemas/ListProvidersResponse' - '400': - description: Bad Request - $ref: '#/components/responses/BadRequest400' - '429': - description: Too Many Requests - $ref: '#/components/responses/TooManyRequests429' - '500': - description: Internal Server Error - $ref: '#/components/responses/InternalServerError500' - default: - description: Default Response - $ref: '#/components/responses/DefaultError' - tags: - - Admin - summary: List all available providers - description: List all available providers with their configuration and health status. - operationId: list_providers_v1alpha_admin_providers_get - /v1alpha/admin/providers/{provider_id}: - get: - responses: - '200': - description: The provider information object. - content: - application/json: - schema: - $ref: '#/components/schemas/ProviderInfo' - '400': - $ref: '#/components/responses/BadRequest400' - description: Bad Request - '429': - $ref: '#/components/responses/TooManyRequests429' - description: Too Many Requests - '500': - $ref: '#/components/responses/InternalServerError500' - description: Internal Server Error - default: - $ref: '#/components/responses/DefaultError' - description: Default Response - '404': - description: Provider not found. - tags: - - Admin - summary: Get provider details - description: Get detailed information about a specific provider. - operationId: inspect_provider_v1alpha_admin_providers__provider_id__get - parameters: - - name: provider_id - in: path - required: true - schema: - type: string - description: The ID of the provider to inspect. - title: Provider Id - description: The ID of the provider to inspect. - /v1alpha/admin/inspect/routes: - get: - responses: - '200': - description: A list of route information objects. - content: - application/json: - schema: - $ref: '#/components/schemas/ListRoutesResponse' + '200': + description: A list of route information objects. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/ListRoutesResponse' '400': $ref: '#/components/responses/BadRequest400' description: Bad Request @@ -9515,408 +8792,87 @@ components: - error title: ViolationLevel description: Severity level of a safety violation. - AggregationFunctionType: - type: string - enum: - - average - - weighted_average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - description: Types of aggregation functions for scoring results. ArrayType: + description: Parameter type for array values. properties: type: - type: string title: Type + type: string enum: - array title: ArrayType - description: Parameter type for array values. - BasicScoringFnParams: - properties: - type: - type: string - title: Type - enum: - - basic - aggregation_functions: - items: - $ref: '#/components/schemas/AggregationFunctionType' - type: array - title: Aggregation Functions - description: Aggregation functions to apply to the scores of each row - title: BasicScoringFnParams - description: Parameters for basic scoring function configuration. BooleanType: + description: Parameter type for boolean values. properties: type: - type: string title: Type + type: string enum: - boolean title: BooleanType - description: Parameter type for boolean values. ChatCompletionInputType: + description: Parameter type for chat completion input. properties: type: - type: string title: Type + type: string enum: - chat_completion_input title: ChatCompletionInputType - description: Parameter type for chat completion input. CompletionInputType: + description: Parameter type for completion input. properties: type: - type: string title: Type + type: string enum: - completion_input title: CompletionInputType - description: Parameter type for completion input. JsonType: + description: Parameter type for JSON values. 
properties: type: - type: string title: Type + type: string enum: - json title: JsonType - description: Parameter type for JSON values. - LLMAsJudgeScoringFnParams: - properties: - type: - type: string - title: Type - enum: - - llm_as_judge - judge_model: - type: string - title: Judge Model - prompt_template: - anyOf: - - type: string - - type: 'null' - judge_score_regexes: - items: - type: string - type: array - title: Judge Score Regexes - description: Regexes to extract the answer from generated response - aggregation_functions: - items: - $ref: '#/components/schemas/AggregationFunctionType' - type: array - title: Aggregation Functions - description: Aggregation functions to apply to the scores of each row - required: - - judge_model - title: LLMAsJudgeScoringFnParams - description: Parameters for LLM-as-judge scoring function configuration. NumberType: + description: Parameter type for numeric values. properties: type: - type: string title: Type + type: string enum: - number title: NumberType - description: Parameter type for numeric values. ObjectType: - properties: - type: - type: string - title: Type - enum: - - object - title: ObjectType description: Parameter type for object values. - RegexParserScoringFnParams: properties: type: - type: string title: Type - enum: - - regex_parser - parsing_regexes: - items: - type: string - type: array - title: Parsing Regexes - description: Regex to extract the answer from generated response - aggregation_functions: - items: - $ref: '#/components/schemas/AggregationFunctionType' - type: array - title: Aggregation Functions - description: Aggregation functions to apply to the scores of each row - title: RegexParserScoringFnParams - description: Parameters for regex parser scoring function configuration. 
- ScoringFn: - properties: - identifier: type: string - title: Identifier - description: Unique identifier for this resource in llama stack - provider_resource_id: - anyOf: - - type: string - - type: 'null' - description: Unique identifier for this resource in the provider - provider_id: - type: string - title: Provider Id - description: ID of the provider that owns this resource - type: - type: string - title: Type enum: - - scoring_function - description: - anyOf: - - type: string - - type: 'null' - metadata: - additionalProperties: true - type: object - title: Metadata - description: Any additional metadata for this definition - return_type: - oneOf: - - $ref: '#/components/schemas/StringType' - title: StringType - - $ref: '#/components/schemas/NumberType' - title: NumberType - - $ref: '#/components/schemas/BooleanType' - title: BooleanType - - $ref: '#/components/schemas/ArrayType' - title: ArrayType - - $ref: '#/components/schemas/ObjectType' - title: ObjectType - - $ref: '#/components/schemas/JsonType' - title: JsonType - - $ref: '#/components/schemas/UnionType' - title: UnionType - - $ref: '#/components/schemas/ChatCompletionInputType' - title: ChatCompletionInputType - - $ref: '#/components/schemas/CompletionInputType' - title: CompletionInputType - title: StringType | ... 
(9 variants) - description: The return type of the deterministic function - discriminator: - propertyName: type - mapping: - array: '#/components/schemas/ArrayType' - boolean: '#/components/schemas/BooleanType' - chat_completion_input: '#/components/schemas/ChatCompletionInputType' - completion_input: '#/components/schemas/CompletionInputType' - json: '#/components/schemas/JsonType' - number: '#/components/schemas/NumberType' - object: '#/components/schemas/ObjectType' - string: '#/components/schemas/StringType' - union: '#/components/schemas/UnionType' - params: - anyOf: - - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - title: LLMAsJudgeScoringFnParams - - $ref: '#/components/schemas/RegexParserScoringFnParams' - title: RegexParserScoringFnParams - - $ref: '#/components/schemas/BasicScoringFnParams' - title: BasicScoringFnParams - discriminator: - propertyName: type - mapping: - basic: '#/components/schemas/BasicScoringFnParams' - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams - - type: 'null' - title: Params - description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval - required: - - identifier - - provider_id - - return_type - title: ScoringFn - description: A scoring function resource for evaluating model outputs. 
- ScoringFnParams: - discriminator: - mapping: - basic: '#/components/schemas/BasicScoringFnParams' - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - propertyName: type - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - title: LLMAsJudgeScoringFnParams - - $ref: '#/components/schemas/RegexParserScoringFnParams' - title: RegexParserScoringFnParams - - $ref: '#/components/schemas/BasicScoringFnParams' - title: BasicScoringFnParams - title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams - ScoringFnParamsType: - description: Types of scoring function parameter configurations. - enum: - - llm_as_judge - - regex_parser - - basic - title: ScoringFnParamsType - type: string + - object + title: ObjectType StringType: + description: Parameter type for string values. properties: type: - type: string title: Type + type: string enum: - string title: StringType - description: Parameter type for string values. UnionType: + description: Parameter type for union values. properties: type: - type: string title: Type + type: string enum: - union title: UnionType - description: Parameter type for union values. - ListScoringFunctionsResponse: - properties: - data: - items: - $ref: '#/components/schemas/ScoringFn' - type: array - title: Data - description: List of scoring function objects. - required: - - data - title: ListScoringFunctionsResponse - description: Response containing a list of scoring function objects. - ScoreRequest: - properties: - input_rows: - items: - additionalProperties: true - type: object - type: array - title: Input Rows - description: The rows to score. 
- scoring_functions: - additionalProperties: - anyOf: - - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - title: LLMAsJudgeScoringFnParams - - $ref: '#/components/schemas/RegexParserScoringFnParams' - title: RegexParserScoringFnParams - - $ref: '#/components/schemas/BasicScoringFnParams' - title: BasicScoringFnParams - discriminator: - propertyName: type - mapping: - basic: '#/components/schemas/BasicScoringFnParams' - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams - - type: 'null' - title: AdditionalpropertiesUnion - type: object - title: Scoring Functions - description: The scoring functions to use for the scoring. - required: - - input_rows - - scoring_functions - title: ScoreRequest - description: Request model for scoring a list of rows. - ScoreResponse: - properties: - results: - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - type: object - title: Results - description: A map of scoring function name to ScoringResult. - required: - - results - title: ScoreResponse - description: The response from scoring. - ScoringResult: - properties: - score_rows: - items: - additionalProperties: true - type: object - type: array - title: Score Rows - description: The scoring result for each row. Each row is a map of column name to value. - aggregated_results: - additionalProperties: true - type: object - title: Aggregated Results - description: Map of metric name to aggregated value - required: - - score_rows - - aggregated_results - title: ScoringResult - description: A scoring result for a single row. - ScoreBatchRequest: - properties: - dataset_id: - type: string - title: Dataset Id - description: The ID of the dataset to score. 
- scoring_functions: - additionalProperties: - anyOf: - - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - title: LLMAsJudgeScoringFnParams - - $ref: '#/components/schemas/RegexParserScoringFnParams' - title: RegexParserScoringFnParams - - $ref: '#/components/schemas/BasicScoringFnParams' - title: BasicScoringFnParams - discriminator: - propertyName: type - mapping: - basic: '#/components/schemas/BasicScoringFnParams' - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams - - type: 'null' - title: AdditionalpropertiesUnion - type: object - title: Scoring Functions - description: The scoring functions to use for the scoring. - save_results_dataset: - type: boolean - title: Save Results Dataset - description: Whether to save the results to a dataset. - default: false - required: - - dataset_id - - scoring_functions - title: ScoreBatchRequest - description: Request model for scoring a batch of rows from a dataset. - ScoreBatchResponse: - properties: - dataset_id: - anyOf: - - type: string - - type: 'null' - description: (Optional) The identifier of the dataset that was scored - results: - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - type: object - title: Results - description: A map of scoring function name to ScoringResult - required: - - results - title: ScoreBatchResponse - description: Response from batch scoring operations on datasets. Shield: properties: identifier: @@ -10969,264 +9925,48 @@ components: - version title: VersionInfo description: Version information for the service. - AppendRowsRequest: - properties: - rows: - items: - additionalProperties: true - type: object - type: array - title: Rows - description: The rows to append to the dataset. - required: - - rows - title: AppendRowsRequest - description: Request body for appending rows to a dataset. 
PaginatedResponse: + description: A generic paginated response that follows a simple format. properties: data: items: additionalProperties: true type: object - type: array title: Data + type: array has_more: - type: boolean title: Has More + type: boolean url: anyOf: - type: string - type: 'null' + nullable: true required: - data - has_more title: PaginatedResponse - description: A generic paginated response that follows a simple format. - Dataset: - properties: - identifier: - type: string - title: Identifier - description: Unique identifier for this resource in llama stack - provider_resource_id: - anyOf: - - type: string - - type: 'null' - description: Unique identifier for this resource in the provider - provider_id: - type: string - title: Provider Id - description: ID of the provider that owns this resource - type: - type: string - title: Type - description: Type of resource, always 'dataset' for datasets - enum: - - dataset - purpose: - $ref: '#/components/schemas/DatasetPurpose' - description: Purpose of the dataset indicating its intended use - source: - oneOf: - - $ref: '#/components/schemas/URIDataSource' - title: URIDataSource - - $ref: '#/components/schemas/RowsDataSource' - title: RowsDataSource - title: URIDataSource | RowsDataSource - description: Data source configuration for the dataset - discriminator: - propertyName: type - mapping: - rows: '#/components/schemas/RowsDataSource' - uri: '#/components/schemas/URIDataSource' - metadata: - additionalProperties: true - type: object - title: Metadata - description: Any additional metadata for this dataset - required: - - identifier - - provider_id - - purpose - - source - title: Dataset - description: Dataset resource for storing and accessing training or evaluation data. - RowsDataSource: - properties: - type: - type: string - title: Type - description: The type of data source. 
- enum: - - rows - rows: - items: - additionalProperties: true - type: object - type: array - title: Rows - description: 'The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]' - required: - - rows - title: RowsDataSource - description: A dataset stored in rows. - URIDataSource: - properties: - type: - type: string - title: Type - description: The type of data source. - enum: - - uri - uri: - type: string - title: Uri - description: The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}" - required: - - uri - title: URIDataSource - description: A dataset that can be obtained from a URI. - ListDatasetsResponse: - properties: - data: - items: - $ref: '#/components/schemas/Dataset' - type: array - title: Data - description: List of datasets - required: - - data - title: ListDatasetsResponse - description: Response from listing datasets. - Benchmark: - properties: - identifier: - type: string - title: Identifier - description: Unique identifier for this resource in llama stack - provider_resource_id: - anyOf: - - type: string - - type: 'null' - description: Unique identifier for this resource in the provider - provider_id: - type: string - title: Provider Id - description: ID of the provider that owns this resource - type: - type: string - title: Type - description: The resource type, always benchmark. - enum: - - benchmark - dataset_id: - type: string - title: Dataset Id - description: Identifier of the dataset to use for the benchmark evaluation. - scoring_functions: - items: - type: string - type: array - title: Scoring Functions - description: List of scoring function identifiers to apply during evaluation. - metadata: - additionalProperties: true - type: object - title: Metadata - description: Metadata for this evaluation task. 
- required: - - identifier - - provider_id - - dataset_id - - scoring_functions - title: Benchmark - description: A benchmark resource for evaluating model performance. - ListBenchmarksResponse: - properties: - data: - items: - $ref: '#/components/schemas/Benchmark' - type: array - title: Data - description: List of benchmark objects. - required: - - data - title: ListBenchmarksResponse - description: Response containing a list of benchmark objects. - BenchmarkConfig: - properties: - eval_candidate: - $ref: '#/components/schemas/ModelCandidate' - description: The candidate to evaluate - scoring_params: - additionalProperties: - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - title: LLMAsJudgeScoringFnParams - - $ref: '#/components/schemas/RegexParserScoringFnParams' - title: RegexParserScoringFnParams - - $ref: '#/components/schemas/BasicScoringFnParams' - title: BasicScoringFnParams - discriminator: - propertyName: type - mapping: - basic: '#/components/schemas/BasicScoringFnParams' - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams - type: object - title: Scoring Params - description: Map between scoring function id and parameters for each scoring function you want to run - num_examples: - anyOf: - - type: integer - minimum: 1.0 - - type: 'null' - description: Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated - required: - - eval_candidate - title: BenchmarkConfig - description: A benchmark configuration for evaluation. GreedySamplingStrategy: + description: Greedy sampling strategy that selects the highest probability token at each step. properties: type: - type: string - title: Type description: Must be 'greedy' to identify this sampling strategy. 
+ title: Type + type: string enum: - greedy title: GreedySamplingStrategy - description: Greedy sampling strategy that selects the highest probability token at each step. - ModelCandidate: - properties: - type: - type: string - title: Type - enum: - - model - model: - type: string - minLength: 1 - title: Model - description: The model ID to evaluate - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: The sampling parameters for the model - system_message: - anyOf: - - $ref: '#/components/schemas/SystemMessage' - title: SystemMessage - - type: 'null' - description: The system message providing instructions or context to the model - title: SystemMessage - required: - - model - - sampling_params - title: ModelCandidate - description: A model candidate for evaluation. SamplingParams: + description: Sampling parameters for text generation. properties: strategy: + description: The sampling strategy to use. + discriminator: + mapping: + greedy: '#/components/schemas/GreedySamplingStrategy' + top_k: '#/components/schemas/TopKSamplingStrategy' + top_p: '#/components/schemas/TopPSamplingStrategy' + propertyName: type oneOf: - $ref: '#/components/schemas/GreedySamplingStrategy' title: GreedySamplingStrategy @@ -11235,200 +9975,127 @@ components: - $ref: '#/components/schemas/TopKSamplingStrategy' title: TopKSamplingStrategy title: GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy - description: The sampling strategy to use. - discriminator: - propertyName: type - mapping: - greedy: '#/components/schemas/GreedySamplingStrategy' - top_k: '#/components/schemas/TopKSamplingStrategy' - top_p: '#/components/schemas/TopPSamplingStrategy' max_tokens: anyOf: - - type: integer - minimum: 1.0 + - minimum: 1 + type: integer - type: 'null' description: The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length. 
+ nullable: true repetition_penalty: anyOf: - - type: number - maximum: 2.0 + - maximum: 2.0 minimum: -2.0 + type: number - type: 'null' - description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far. default: 1.0 + description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far. stop: anyOf: - items: type: string - type: array maxItems: 4 + type: array - type: 'null' description: Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence. + nullable: true title: SamplingParams - description: Sampling parameters for text generation. SystemMessage: + description: A system message providing instructions or context to the model. properties: role: - type: string - title: Role description: Must be 'system' to identify this as a system message. + title: Role + type: string enum: - system content: anyOf: - type: string - - oneOf: - - $ref: '#/components/schemas/ImageContentItem-Input' - title: ImageContentItem-Input - - $ref: '#/components/schemas/TextContentItem' - title: TextContentItem - discriminator: - propertyName: type + - discriminator: mapping: - image: '#/components/schemas/ImageContentItem-Input' + image: '#/components/schemas/ImageContentItem' text: '#/components/schemas/TextContentItem' - title: ImageContentItem-Input | TextContentItem + propertyName: type + oneOf: + - $ref: '#/components/schemas/ImageContentItem' + title: ImageContentItem + - $ref: '#/components/schemas/TextContentItem' + title: TextContentItem + title: ImageContentItem | TextContentItem - items: - oneOf: - - $ref: '#/components/schemas/ImageContentItem-Input' - title: ImageContentItem-Input - - $ref: '#/components/schemas/TextContentItem' - title: TextContentItem discriminator: - propertyName: type mapping: - image: '#/components/schemas/ImageContentItem-Input' + image: 
'#/components/schemas/ImageContentItem' text: '#/components/schemas/TextContentItem' - title: ImageContentItem-Input | TextContentItem + propertyName: type + oneOf: + - $ref: '#/components/schemas/ImageContentItem' + title: ImageContentItem + - $ref: '#/components/schemas/TextContentItem' + title: TextContentItem + title: ImageContentItem | TextContentItem type: array - title: list[ImageContentItem-Input | TextContentItem] - title: string | list[ImageContentItem-Input | TextContentItem] + title: list[ImageContentItem | TextContentItem] description: The content of the 'system prompt'. If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages. + title: string | list[ImageContentItem | TextContentItem] required: - content title: SystemMessage - description: A system message providing instructions or context to the model. TopKSamplingStrategy: + description: Top-k sampling strategy that restricts sampling to the k most likely tokens. properties: type: - type: string - title: Type description: Must be 'top_k' to identify this sampling strategy. + title: Type + type: string enum: - top_k top_k: - type: integer - minimum: 1.0 - title: Top K description: Number of top tokens to consider for sampling. Must be at least 1. + minimum: 1 + title: Top K + type: integer required: - top_k title: TopKSamplingStrategy - description: Top-k sampling strategy that restricts sampling to the k most likely tokens. TopPSamplingStrategy: + description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p. properties: type: - type: string - title: Type description: Must be 'top_p' to identify this sampling strategy. + title: Type + type: string enum: - top_p temperature: - type: number + description: Controls randomness in sampling. Higher values increase randomness. maximum: 2.0 title: Temperature - description: Controls randomness in sampling. 
Higher values increase randomness. + type: number minimum: 0.0 top_p: - type: number + default: 0.95 + description: Cumulative probability threshold for nucleus sampling. maximum: 1.0 minimum: 0.0 title: Top P - description: Cumulative probability threshold for nucleus sampling. - default: 0.95 + type: number required: - temperature title: TopPSamplingStrategy - description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p. - EvaluateRowsRequest: - description: Request model for evaluating a list of rows on a benchmark. - properties: - benchmark_id: - description: The ID of the benchmark to run the evaluation on - minLength: 1 - title: Benchmark Id - type: string - input_rows: - description: The rows to evaluate - items: - additionalProperties: true - type: object - minItems: 1 - title: Input Rows - type: array - scoring_functions: - description: The scoring functions to use for the evaluation - items: - type: string - minItems: 1 - title: Scoring Functions - type: array - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark - required: - - benchmark_id - - input_rows - - scoring_functions - - benchmark_config - title: EvaluateRowsRequest - EvaluateResponse: - properties: - generations: - items: - additionalProperties: true - type: object - type: array - title: Generations - description: The generations from the evaluation - scores: - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - type: object - title: Scores - description: The scores from the evaluation. Each key in the dict is a scoring function name - required: - - generations - - scores - title: EvaluateResponse - description: The response from an evaluation. - RunEvalRequest: - description: Request model for running an evaluation on a benchmark. 
- properties: - benchmark_id: - description: The ID of the benchmark to run the evaluation on - minLength: 1 - title: Benchmark Id - type: string - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark - required: - - benchmark_id - - benchmark_config - title: RunEvalRequest Job: + description: A job execution instance with status tracking. properties: job_id: - type: string title: Job Id + type: string status: $ref: '#/components/schemas/JobStatus' required: - job_id - status title: Job - description: A job execution instance with status tracking. RerankRequest: properties: model: @@ -11573,85 +10240,6 @@ components: - $ref: '#/components/schemas/CompletionInputType' title: CompletionInputType title: StringType | ... (9 variants) - RegisterScoringFunctionRequest: - properties: - scoring_fn_id: - type: string - title: Scoring Fn Id - description: The ID of the scoring function to register. - description: - type: string - title: Description - description: The description of the scoring function. - return_type: - oneOf: - - $ref: '#/components/schemas/StringType' - title: StringType - - $ref: '#/components/schemas/NumberType' - title: NumberType - - $ref: '#/components/schemas/BooleanType' - title: BooleanType - - $ref: '#/components/schemas/ArrayType' - title: ArrayType - - $ref: '#/components/schemas/ObjectType' - title: ObjectType - - $ref: '#/components/schemas/JsonType' - title: JsonType - - $ref: '#/components/schemas/UnionType' - title: UnionType - - $ref: '#/components/schemas/ChatCompletionInputType' - title: ChatCompletionInputType - - $ref: '#/components/schemas/CompletionInputType' - title: CompletionInputType - title: StringType | ... (9 variants) - description: The return type of the scoring function. 
- discriminator: - propertyName: type - mapping: - array: '#/components/schemas/ArrayType' - boolean: '#/components/schemas/BooleanType' - chat_completion_input: '#/components/schemas/ChatCompletionInputType' - completion_input: '#/components/schemas/CompletionInputType' - json: '#/components/schemas/JsonType' - number: '#/components/schemas/NumberType' - object: '#/components/schemas/ObjectType' - string: '#/components/schemas/StringType' - union: '#/components/schemas/UnionType' - provider_scoring_fn_id: - anyOf: - - type: string - - type: 'null' - description: The ID of the provider scoring function to use for the scoring function. - provider_id: - anyOf: - - type: string - - type: 'null' - description: The ID of the provider to use for the scoring function. - params: - anyOf: - - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - title: LLMAsJudgeScoringFnParams - - $ref: '#/components/schemas/RegexParserScoringFnParams' - title: RegexParserScoringFnParams - - $ref: '#/components/schemas/BasicScoringFnParams' - title: BasicScoringFnParams - discriminator: - propertyName: type - mapping: - basic: '#/components/schemas/BasicScoringFnParams' - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams - - type: 'null' - title: Params - description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval. - required: - - scoring_fn_id - - description - - return_type - title: RegisterScoringFunctionRequest - description: Request model for registering a scoring function. RegisterShieldRequest: properties: shield_id: @@ -11678,90 +10266,6 @@ components: - shield_id title: RegisterShieldRequest description: Request model for registering a shield. 
- DataSource: - discriminator: - mapping: - rows: '#/components/schemas/RowsDataSource' - uri: '#/components/schemas/URIDataSource' - propertyName: type - oneOf: - - $ref: '#/components/schemas/URIDataSource' - title: URIDataSource - - $ref: '#/components/schemas/RowsDataSource' - title: RowsDataSource - title: URIDataSource | RowsDataSource - RegisterDatasetRequest: - properties: - purpose: - $ref: '#/components/schemas/DatasetPurpose' - description: The purpose of the dataset. - source: - oneOf: - - $ref: '#/components/schemas/URIDataSource' - title: URIDataSource - - $ref: '#/components/schemas/RowsDataSource' - title: RowsDataSource - title: URIDataSource | RowsDataSource - description: The data source of the dataset. - discriminator: - propertyName: type - mapping: - rows: '#/components/schemas/RowsDataSource' - uri: '#/components/schemas/URIDataSource' - metadata: - anyOf: - - additionalProperties: true - type: object - - type: 'null' - description: The metadata for the dataset. - dataset_id: - anyOf: - - type: string - - type: 'null' - description: The ID of the dataset. If not provided, an ID will be generated. - required: - - purpose - - source - title: RegisterDatasetRequest - description: Request model for registering a dataset. - RegisterBenchmarkRequest: - properties: - benchmark_id: - type: string - title: Benchmark Id - description: The ID of the benchmark to register. - dataset_id: - type: string - title: Dataset Id - description: The ID of the dataset to use for the benchmark. - scoring_functions: - items: - type: string - type: array - title: Scoring Functions - description: The scoring functions to use for the benchmark. - provider_benchmark_id: - anyOf: - - type: string - - type: 'null' - description: The ID of the provider benchmark to use for the benchmark. - provider_id: - anyOf: - - type: string - - type: 'null' - description: The ID of the provider to use for the benchmark. 
- metadata: - anyOf: - - additionalProperties: true - type: object - - type: 'null' - description: The metadata to use for the benchmark. - required: - - benchmark_id - - dataset_id - - scoring_functions - title: RegisterBenchmarkRequest - description: Request model for registering a benchmark. AllowedToolsFilter: properties: tool_names: @@ -12698,13 +11202,6 @@ components: - model title: CreateResponseRequest description: Request model for creating a response. - DatasetPurpose: - type: string - enum: - - eval/question-answer - - eval/messages-answer - title: DatasetPurpose - description: Purpose of the dataset. Each purpose has a required input data schema. EmbeddedChunk-Input: properties: content: @@ -12843,32 +11340,6 @@ components: - type: 'null' additionalProperties: true title: Errors - EvaluateRowsBodyRequest: - properties: - input_rows: - items: - additionalProperties: true - type: object - type: array - minItems: 1 - title: Input Rows - description: The rows to evaluate - scoring_functions: - items: - type: string - type: array - minItems: 1 - title: Scoring Functions - description: The scoring functions to use for the evaluation - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark - required: - - input_rows - - scoring_functions - - benchmark_config - title: EvaluateRowsBodyRequest - description: Request body model for evaluating rows (without path parameter). HealthStatus: type: string enum: @@ -12912,16 +11383,6 @@ components: required: - cached_tokens title: InputTokensDetails - JobStatus: - type: string - enum: - - completed - - in_progress - - failed - - scheduled - - cancelled - title: JobStatus - description: Status of a job execution. ListConnectorsResponse: properties: data: @@ -13896,15 +12357,6 @@ components: - disabled title: ResponseTruncation description: Controls how the service truncates input when it exceeds the model context window. 
- RunEvalBodyRequest: - properties: - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark - required: - - benchmark_config - title: RunEvalBodyRequest - description: Request body model for running an evaluation (without path parameter). SearchRankingOptions: properties: ranker: @@ -14292,50 +12744,6 @@ components: - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText' title: OpenAIResponseContentPartReasoningText title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText - ListBenchmarksRequest: - description: Request model for listing benchmarks. - properties: {} - title: ListBenchmarksRequest - GetBenchmarkRequest: - description: Request model for getting a benchmark. - properties: - benchmark_id: - description: The ID of the benchmark to get. - title: Benchmark Id - type: string - required: - - benchmark_id - title: GetBenchmarkRequest - UnregisterBenchmarkRequest: - description: Request model for unregistering a benchmark. - properties: - benchmark_id: - description: The ID of the benchmark to unregister. - title: Benchmark Id - type: string - required: - - benchmark_id - title: UnregisterBenchmarkRequest - GetDatasetRequest: - description: Request model for getting a dataset by ID. - properties: - dataset_id: - description: The ID of the dataset to get. - title: Dataset Id - type: string - required: - - dataset_id - title: GetDatasetRequest - UnregisterDatasetRequest: - description: Request model for unregistering a dataset. - properties: - dataset_id: - description: The ID of the dataset to unregister. - title: Dataset Id - type: string - required: - - dataset_id - title: UnregisterDatasetRequest ListModelsResponse: description: Response containing a list of model objects. 
properties: @@ -14368,39 +12776,6 @@ components: required: - model_id title: UnregisterModelRequest - DialogType: - description: Parameter type for dialog data with semantic output labels. - properties: - type: - title: Type - type: string - enum: - - dialog - title: DialogType - ListScoringFunctionsRequest: - description: Request model for listing scoring functions. - properties: {} - title: ListScoringFunctionsRequest - GetScoringFunctionRequest: - description: Request model for getting a scoring function. - properties: - scoring_fn_id: - description: The ID of the scoring function to get. - title: Scoring Fn Id - type: string - required: - - scoring_fn_id - title: GetScoringFunctionRequest - UnregisterScoringFunctionRequest: - description: Request model for unregistering a scoring function. - properties: - scoring_fn_id: - description: The ID of the scoring function to unregister. - title: Scoring Fn Id - type: string - required: - - scoring_fn_id - title: UnregisterScoringFunctionRequest GetShieldRequest: description: Request model for getting a shield by identifier. properties: @@ -14499,16 +12874,10 @@ components: - responses - batches - vector_io - - datasetio - - scoring - - eval - tool_runtime - models - shields - vector_stores - - datasets - - scoring_functions - - benchmarks - tool_groups - files - file_processors @@ -15309,6 +13678,25 @@ components: required: - batch_id title: CancelBatchRequest + JobStatus: + description: Status of a job execution. + enum: + - completed + - in_progress + - failed + - scheduled + - cancelled + title: JobStatus + type: string + DialogType: + description: Parameter type for dialog data with semantic output labels. 
+ properties: + type: + title: Type + type: string + enum: + - dialog + title: DialogType ConnectorInput: description: Input for creating a connector properties: @@ -15542,90 +13930,6 @@ components: - conversation_id - item_id title: DeleteItemRequest - IterRowsRequest: - description: Request model for iterating over rows in a dataset. - properties: - dataset_id: - description: The ID of the dataset to get the rows from. - title: Dataset Id - type: string - start_index: - anyOf: - - type: integer - - type: 'null' - description: Index into dataset for the first row to get. Get all rows if None. - nullable: true - limit: - anyOf: - - type: integer - - type: 'null' - description: The number of rows to get. - nullable: true - required: - - dataset_id - title: IterRowsRequest - BenchmarkIdRequest: - description: Request model containing benchmark_id path parameter. - properties: - benchmark_id: - description: The ID of the benchmark - minLength: 1 - title: Benchmark Id - type: string - required: - - benchmark_id - title: BenchmarkIdRequest - JobStatusRequest: - description: Request model for getting the status of a job. - properties: - benchmark_id: - description: The ID of the benchmark associated with the job - minLength: 1 - title: Benchmark Id - type: string - job_id: - description: The ID of the job to get the status of - minLength: 1 - title: Job Id - type: string - required: - - benchmark_id - - job_id - title: JobStatusRequest - JobCancelRequest: - description: Request model for canceling a job. - properties: - benchmark_id: - description: The ID of the benchmark associated with the job - minLength: 1 - title: Benchmark Id - type: string - job_id: - description: The ID of the job to cancel - minLength: 1 - title: Job Id - type: string - required: - - benchmark_id - - job_id - title: JobCancelRequest - JobResultRequest: - description: Request model for getting the result of a job. 
- properties: - benchmark_id: - description: The ID of the benchmark associated with the job - minLength: 1 - title: Benchmark Id - type: string - job_id: - description: The ID of the job to get the result of - minLength: 1 - title: Job Id - type: string - required: - - benchmark_id - - job_id - title: JobResultRequest ProcessFileRequest: description: |- Request model for file processing operation. diff --git a/docs/docs/advanced_apis/evaluation.md b/docs/docs/advanced_apis/evaluation.md deleted file mode 100644 index 085916da34..0000000000 --- a/docs/docs/advanced_apis/evaluation.md +++ /dev/null @@ -1,170 +0,0 @@ -# Evaluation - -## Evaluation Concepts - -The Llama Stack Evaluation flow allows you to run evaluations on your GenAI application datasets or pre-registered benchmarks. - -We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications: - -- `/datasetio` + `/datasets` API -- `/scoring` + `/scoring_functions` API -- `/eval` + `/benchmarks` API - -This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). - -The Evaluation APIs are associated with a set of Resources. Please visit the Resources section in our [Core Concepts](/docs/concepts/) guide for better high-level understanding. - -- **DatasetIO**: defines interface with datasets and data loaders. - - Associated with `Dataset` resource. -- **Scoring**: evaluate outputs of the system. - - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics. -- **Eval**: generate outputs (via Inference or Agents) and perform scoring. 
- - Associated with `Benchmark` resource. - -## Evaluation Providers - -Llama Stack provides multiple evaluation providers: - -- **Builtin** (`inline::builtin`) - Meta's reference implementation with multi-language support -- **NVIDIA** (`remote::nvidia`) - NVIDIA's evaluation platform integration - -### Builtin - -Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics. - -#### Configuration - -| Field | Type | Required | Default | Description | -|-------|------|----------|---------|-------------| -| `kvstore` | `RedisKVStoreConfig \| SqliteKVStoreConfig \| PostgresKVStoreConfig \| MongoDBKVStoreConfig` | No | sqlite | Key-value store configuration | - -#### Sample Configuration - -```yaml -kvstore: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/builtin_eval.db -``` - -#### Features - -- Multi-language evaluation support -- Comprehensive evaluation metrics -- Integration with various key-value stores (SQLite, Redis, PostgreSQL, MongoDB) -- Built-in support for popular benchmarks - -### NVIDIA - -NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform. - -#### Configuration - -| Field | Type | Required | Default | Description | -|-------|------|----------|---------|-------------| -| `evaluator_url` | `str` | No | | The url for accessing the evaluator service | - -#### Sample Configuration - -```yaml -evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331} -``` - -#### Features - -- Integration with NVIDIA's evaluation platform -- Remote evaluation capabilities -- Scalable evaluation processing - -## Open-benchmark Eval - -### List of open-benchmarks Llama Stack support - -Llama stack pre-registers several popular open-benchmarks to easily evaluate model performance via CLI. 
- -The list of open-benchmarks we currently support: - -- [MMLU-COT](https://arxiv.org/abs/2009.03300) (Measuring Massive Multitask Language Understanding): Benchmark designed to comprehensively evaluate the breadth and depth of a model's academic and professional understanding -- [GPQA-COT](https://arxiv.org/abs/2311.12022) (A Graduate-Level Google-Proof Q&A Benchmark): A challenging benchmark of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. -- [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to access models to answer short, fact-seeking questions. -- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models. - -You can follow this [contributing guide](/docs/references/evals_reference/#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack - -### Run evaluation on open-benchmarks via CLI - -We have built-in functionality to run the supported open-benchmarks using llama-stack-client CLI - -#### Spin up Llama Stack server - -Spin up llama stack server with 'open-benchmark' template - -```bash -llama stack run llama_stack/distributions/open-benchmark/config.yaml -``` - -#### Run eval CLI - -There are 3 necessary inputs to run a benchmark eval - -- `list of benchmark_ids`: The list of benchmark ids to run evaluation on -- `model-id`: The model id to evaluate on -- `output_dir`: Path to store the evaluate results - -```bash -llama-stack-client eval run-benchmark ... \ ---model_id \ ---output_dir -``` - -You can run - -```bash -llama-stack-client eval run-benchmark help -``` - -to see the description of all the flags that eval run-benchmark has - -In the output log, you can find the file path that has your evaluation results. Open that file and you can see you aggregate evaluation results over there. 
- -## Usage Example - -Here's a basic example of using the evaluation API: - -```python -from llama_stack_client import LlamaStackClient - -client = LlamaStackClient(base_url="http://localhost:8321") - -# Register a dataset for evaluation -client.datasets.register( - purpose="evaluation", - source={ - "type": "uri", - "uri": "huggingface://datasets/llamastack/evaluation_dataset", - }, - dataset_id="my_eval_dataset", -) - -# Run evaluation -eval_result = client.eval.run_evaluation( - dataset_id="my_eval_dataset", - scoring_functions=["accuracy", "bleu"], - model_id="my_model", -) - -print(f"Evaluation completed: {eval_result}") -``` - -## Best Practices - -- **Choose appropriate providers**: Use Builtin for comprehensive evaluation, NVIDIA for platform-specific needs -- **Configure storage properly**: Ensure your key-value store configuration matches your performance requirements -- **Monitor evaluation progress**: Large evaluations can take time - implement proper monitoring -- **Use appropriate scoring functions**: Select scoring metrics that align with your evaluation goals - -## What's Next? - -- Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP). -- Check out our [Building Applications - Evaluation](../building_applications/evals) guide for more details on how to use the Evaluation APIs to evaluate your applications. -- Check out our [Evaluation Reference](/docs/references/evals_reference/) for more details on the APIs. -- Explore the [Scoring](./scoring) documentation for available scoring functions. 
diff --git a/docs/docs/advanced_apis/scoring.md b/docs/docs/advanced_apis/scoring.md deleted file mode 100644 index df1e2460d4..0000000000 --- a/docs/docs/advanced_apis/scoring.md +++ /dev/null @@ -1,197 +0,0 @@ -# Scoring - -The Scoring API in Llama Stack allows you to evaluate outputs of your GenAI system using various scoring functions and metrics. This section covers all available scoring providers and their configuration. - -## Overview - -Llama Stack provides multiple scoring providers: - -- **Basic** (`inline::basic`) - Simple evaluation metrics and scoring functions -- **Braintrust** (`inline::braintrust`) - Advanced evaluation using the Braintrust platform -- **LLM-as-Judge** (`inline::llm-as-judge`) - Uses language models to evaluate responses - -The Scoring API is associated with `ScoringFunction` resources and provides a suite of out-of-the-box scoring functions. You can also add custom evaluators to meet specific evaluation needs. - -## Basic Scoring - -Basic scoring provider for simple evaluation metrics and scoring functions. This provider offers fundamental scoring capabilities without external dependencies. - -### Configuration - -No configuration required - this provider works out of the box. - -```yaml -{} -``` - -### Features - -- Simple evaluation metrics (accuracy, precision, recall, F1-score) -- String matching and similarity metrics -- Basic statistical scoring functions -- No external dependencies required -- Fast execution for standard metrics - -### Use Cases - -- Quick evaluation of basic accuracy metrics -- String similarity comparisons -- Statistical analysis of model outputs -- Development and testing scenarios - -## Braintrust - -Braintrust scoring provider for evaluation and scoring using the [Braintrust platform](https://braintrustdata.com/). Braintrust provides advanced evaluation capabilities and experiment tracking. 
- -### Configuration - -| Field | Type | Required | Default | Description | -|-------|------|----------|---------|-------------| -| `openai_api_key` | `str \| None` | No | | The OpenAI API Key for LLM-powered evaluations | - -### Sample Configuration - -```yaml -openai_api_key: ${env.OPENAI_API_KEY:=} -``` - -### Features - -- Advanced evaluation metrics -- Experiment tracking and comparison -- LLM-powered evaluation functions -- Integration with Braintrust's evaluation suite -- Detailed scoring analytics and insights - -### Use Cases - -- Production evaluation pipelines -- A/B testing of model versions -- Advanced scoring with custom metrics -- Detailed evaluation reporting and analysis - -## LLM-as-Judge - -LLM-as-judge scoring provider that uses language models to evaluate and score responses. This approach leverages the reasoning capabilities of large language models to assess quality, relevance, and other subjective metrics. - -### Configuration - -No configuration required - this provider works out of the box. 
- -```yaml -{} -``` - -### Features - -- Subjective quality evaluation using LLMs -- Flexible evaluation criteria definition -- Natural language evaluation explanations -- Support for complex evaluation scenarios -- Contextual understanding of responses - -### Use Cases - -- Evaluating response quality and relevance -- Assessing creativity and coherence -- Subjective metric evaluation -- Human-like judgment for complex tasks - -## Usage Examples - -### Basic Scoring Example - -```python -from llama_stack_client import LlamaStackClient - -client = LlamaStackClient(base_url="http://localhost:8321") - -# Register a basic accuracy scoring function -client.scoring_functions.register( - scoring_function_id="basic_accuracy", - provider_id="basic", - provider_scoring_function_id="accuracy", -) - -# Use the scoring function -result = client.scoring.score( - input_rows=[ - {"expected": "Paris", "actual": "Paris"}, - {"expected": "London", "actual": "Paris"}, - ], - scoring_function_id="basic_accuracy", -) -print(f"Accuracy: {result.results[0].score}") -``` - -### LLM-as-Judge Example - -```python -# Register an LLM-as-judge scoring function -client.scoring_functions.register( - scoring_function_id="quality_judge", - provider_id="llm_judge", - provider_scoring_function_id="response_quality", - params={ - "criteria": "Evaluate response quality, relevance, and helpfulness", - "scale": "1-10", - }, -) - -# Score responses using LLM judgment -result = client.scoring.score( - input_rows=[ - { - "query": "What is machine learning?", - "response": "Machine learning is a subset of AI that enables computers to learn patterns from data...", - } - ], - scoring_function_id="quality_judge", -) -``` - -### Braintrust Integration Example - -```python -# Register a Braintrust scoring function -client.scoring_functions.register( - scoring_function_id="braintrust_eval", - provider_id="braintrust", - provider_scoring_function_id="semantic_similarity", -) - -# Run evaluation with Braintrust 
-result = client.scoring.score( - input_rows=[ - { - "reference": "The capital of France is Paris", - "candidate": "Paris is the capital city of France", - } - ], - scoring_function_id="braintrust_eval", -) -``` - -## Best Practices - -- **Choose appropriate providers**: Use Basic for simple metrics, Braintrust for advanced analytics, LLM-as-Judge for subjective evaluation -- **Define clear criteria**: When using LLM-as-Judge, provide specific evaluation criteria and scales -- **Validate scoring functions**: Test your scoring functions with known examples before production use -- **Monitor performance**: Track scoring performance and adjust thresholds based on results -- **Combine multiple metrics**: Use different scoring providers together for comprehensive evaluation - -## Integration with Evaluation - -The Scoring API works closely with the [Evaluation](./evaluation) API to provide comprehensive evaluation workflows: - -1. **Datasets** are loaded via the DatasetIO API -2. **Evaluation** generates model outputs using the Eval API -3. **Scoring** evaluates the quality of outputs using various scoring functions -4. 
**Results** are aggregated and reported for analysis - -## Next Steps - -- Check out the [Evaluation](./evaluation) guide for running complete evaluations -- See the [Building Applications - Evaluation](../building_applications/evals) guide for application examples -- Review the [Evaluation Reference](../references/evals_reference/) for comprehensive scoring function usage -- Explore the [Evaluation Concepts](../concepts/evaluation_concepts) for detailed conceptual information diff --git a/docs/docs/distributions/remote_hosted_distro/oci.md b/docs/docs/distributions/remote_hosted_distro/oci.md index 93c1e35e2d..02cb69bab1 100644 --- a/docs/docs/distributions/remote_hosted_distro/oci.md +++ b/docs/docs/distributions/remote_hosted_distro/oci.md @@ -8,13 +8,10 @@ The `llamastack/distribution-oci` distribution consists of the following provide | API | Provider(s) | |-----|-------------| -| datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::builtin` | | files | `inline::localfs` | | inference | `remote::oci` | | responses | `inline::builtin` | | safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::file-search`, `remote::model-context-protocol` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/docs/distributions/self_hosted_distro/nvidia.md b/docs/docs/distributions/self_hosted_distro/nvidia.md index 3372b067d1..ae4cb8d5aa 100644 --- a/docs/docs/distributions/self_hosted_distro/nvidia.md +++ b/docs/docs/distributions/self_hosted_distro/nvidia.md @@ -8,13 +8,10 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov | API | Provider(s) | |-----|-------------| -| datasetio | `inline::localfs`, `remote::nvidia` | -| eval | `remote::nvidia` | | files | `inline::localfs` | | inference | `remote::nvidia` | | responses | `inline::builtin` | | safety | 
`remote::nvidia` | -| scoring | `inline::basic` | | tool_runtime | `inline::file-search` | | vector_io | `inline::faiss` | @@ -26,16 +23,10 @@ The following environment variables can be configured: - `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`) -- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`) - -- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`) - - `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`) - `NVIDIA_GUARDRAILS_CONFIG_ID`: NVIDIA Guardrail Configuration ID (default: `self-check`) -- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`) - - `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`) - `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`) diff --git a/docs/docusaurus.config.ts b/docs/docusaurus.config.ts index 249b7614fd..7e5e4e97ac 100644 --- a/docs/docusaurus.config.ts +++ b/docs/docusaurus.config.ts @@ -155,23 +155,44 @@ const config: Config = { ], }, footer: { - style: 'dark', + style: 'light', links: [ { - title: 'Docs', + title: 'Getting Started', items: [ { - label: 'Getting Started', + label: 'Quickstart', to: '/docs/getting_started/quickstart', }, { label: 'Concepts', to: '/docs/concepts', }, + { + label: 'Distributions', + to: '/docs/distributions/building_distro', + }, + { + label: 'Providers', + to: '/docs/providers', + }, + ], + }, + { + title: 'API', + items: [ { label: 'API Reference', to: '/docs/api-overview', }, + { + label: 'OpenAI Compatibility', + to: '/docs/api-openai', + }, + { + label: 'Blog', + to: '/blog', + }, ], }, { @@ -181,14 +202,22 @@ const config: Config = { label: 'Discord', href: 'https://discord.gg/llama-stack', }, + { + label: 'GitHub Discussions', + href: 'https://github.com/llamastack/llama-stack/discussions', + }, { label: 'Issues', href: 
'https://github.com/llamastack/llama-stack/issues', }, + { + label: 'Contributing', + to: '/docs/contributing', + }, ], }, { - title: 'More', + title: 'Resources', items: [ { label: 'GitHub', @@ -198,10 +227,14 @@ const config: Config = { label: 'PyPI', href: 'https://pypi.org/project/llama-stack/', }, + { + label: 'Releases', + href: 'https://github.com/llamastack/llama-stack/releases', + }, ], }, ], - copyright: `Copyright © ${new Date().getFullYear()} Meta Platforms, Inc. Built with Docusaurus.`, + copyright: `Copyright © ${new Date().getFullYear()} Meta Platforms, Inc.`, }, colorMode: { defaultMode: 'dark', diff --git a/docs/sidebars.ts b/docs/sidebars.ts index 8e623cf57b..8ccbcde8e9 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -197,10 +197,6 @@ const sidebars: SidebarsConfig = { label: 'Scoring', collapsed: false, items: [ - 'providers/scoring/index', - 'providers/scoring/inline_basic', - 'providers/scoring/inline_braintrust', - 'providers/scoring/inline_llm-as-judge' ], }, { @@ -276,8 +272,6 @@ const sidebars: SidebarsConfig = { label: 'Advanced APIs', collapsed: false, items: [ - 'advanced_apis/evaluation', - 'advanced_apis/scoring', ], }, { diff --git a/docs/src/css/custom.css b/docs/src/css/custom.css index 4bd5d61084..e92731bb21 100644 --- a/docs/src/css/custom.css +++ b/docs/src/css/custom.css @@ -93,6 +93,7 @@ html { font-feature-settings: 'cv02', 'cv03', 'cv04', 'cv11'; -webkit-font-smoothing: antialiased; -moz-osx-font-smoothing: grayscale; + text-rendering: optimizeLegibility; } h1, h2, h3, h4, h5, h6 { @@ -103,10 +104,19 @@ h1, h2, h3, h4, h5, h6 { h1 { font-size: 2.2rem; + line-height: 1.2; } h2 { font-size: 1.6rem; + line-height: 1.3; + margin-top: 2.5rem; +} + +h3 { + font-size: 1.25rem; + line-height: 1.35; + margin-top: 2rem; } .navbar__title { @@ -115,23 +125,150 @@ h2 { /* ========== NAVBAR ========== */ .navbar { - backdrop-filter: blur(12px); - -webkit-backdrop-filter: blur(12px); - border-bottom: 1px solid rgba(0, 0, 0, 0.05); + 
backdrop-filter: blur(16px); + -webkit-backdrop-filter: blur(16px); + border-bottom: 1px solid rgba(0, 0, 0, 0.06); + padding: 0.15rem 0; + transition: all 0.3s; +} + +/* Gradient accent line at top of page */ +.navbar::before { + content: ''; + position: absolute; + top: 0; + left: 0; + right: 0; + height: 2px; + background: linear-gradient(90deg, #6d28d9 0%, #2dd4bf 50%, #60a5fa 100%); + opacity: 0.8; } [data-theme='dark'] .navbar { - border-bottom: 1px solid rgba(255, 255, 255, 0.05); + border-bottom: 1px solid rgba(255, 255, 255, 0.06); +} + +/* Logo styling */ +.navbar__logo img { + border-radius: 6px; + transition: transform 0.2s; +} + +.navbar__brand:hover .navbar__logo img { + transform: scale(1.08); +} + +.navbar__brand { + margin-left: 0.75rem; } .navbar__title { font-weight: 700; font-size: 1.1rem; + background: linear-gradient(135deg, var(--ifm-font-color-base) 60%, var(--ifm-color-primary) 100%); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; } .navbar__link { font-weight: 500; - font-size: 0.9rem; + font-size: 0.88rem; + border-radius: 0.375rem; + padding: 0.4rem 0.75rem !important; + transition: all 0.15s; + position: relative; +} + +.navbar__link:hover { + background: rgba(109, 40, 217, 0.06); +} + +[data-theme='dark'] .navbar__link:hover { + background: rgba(167, 139, 250, 0.08); +} + +.navbar__link--active { + font-weight: 600; +} + +.navbar__link--active::after { + content: ''; + position: absolute; + bottom: -2px; + left: 50%; + transform: translateX(-50%); + width: 16px; + height: 2px; + border-radius: 1px; + background: var(--ifm-color-primary); +} + +/* GitHub link icon styling */ +.navbar__items--right .navbar__link[href*="github"] { + display: flex; + align-items: center; + gap: 0.4rem; + padding: 0.35rem 0.85rem !important; + border: 1px solid rgba(0, 0, 0, 0.1); + border-radius: 100px; + font-size: 0.82rem; + transition: all 0.2s; +} + +[data-theme='dark'] .navbar__items--right 
.navbar__link[href*="github"] { + border-color: rgba(255, 255, 255, 0.1); +} + +.navbar__items--right .navbar__link[href*="github"]:hover { + background: rgba(0, 0, 0, 0.06); + border-color: rgba(0, 0, 0, 0.2); + transform: translateY(-1px); +} + +[data-theme='dark'] .navbar__items--right .navbar__link[href*="github"]:hover { + background: rgba(255, 255, 255, 0.08); + border-color: rgba(255, 255, 255, 0.2); +} + +/* Dropdown menu styling */ +.dropdown__menu { + border-radius: 0.75rem; + border: 1px solid rgba(0, 0, 0, 0.08); + box-shadow: 0 12px 40px rgba(0, 0, 0, 0.1), 0 0 0 1px rgba(0, 0, 0, 0.04); + padding: 0.5rem; + min-width: 180px; +} + +[data-theme='dark'] .dropdown__menu { + border-color: rgba(255, 255, 255, 0.08); + box-shadow: 0 12px 40px rgba(0, 0, 0, 0.4), 0 0 0 1px rgba(255, 255, 255, 0.04); + background: #1a1a2e; +} + +.dropdown__link { + border-radius: 0.375rem; + padding: 0.5rem 0.75rem; + font-size: 0.85rem; + transition: all 0.15s; +} + +.dropdown__link:hover { + background: rgba(109, 40, 217, 0.06); +} + +[data-theme='dark'] .dropdown__link:hover { + background: rgba(167, 139, 250, 0.1); +} + +.dropdown__link--active { + background: rgba(109, 40, 217, 0.08); + font-weight: 600; + color: var(--ifm-color-primary); +} + +[data-theme='dark'] .dropdown__link--active { + background: rgba(167, 139, 250, 0.12); } /* ========== SIDEBAR ========== */ @@ -334,12 +471,94 @@ div[class*='expandButton'] { /* ========== CONTENT ========== */ .markdown { line-height: 1.75; + font-size: 0.95rem; + color: var(--ifm-font-color-base); } .markdown > p { margin-bottom: 1.25rem; } +/* Stronger visual hierarchy for headings in docs */ +.markdown h1 { + padding-bottom: 0.5rem; + border-bottom: 1px solid rgba(0, 0, 0, 0.06); + margin-bottom: 1.5rem; +} + +[data-theme='dark'] .markdown h1 { + border-bottom-color: rgba(255, 255, 255, 0.06); +} + +.markdown h2 { + padding-bottom: 0.35rem; + border-bottom: 1px solid rgba(0, 0, 0, 0.04); + margin-bottom: 1rem; +} + 
+[data-theme='dark'] .markdown h2 { + border-bottom-color: rgba(255, 255, 255, 0.04); +} + +/* Lists */ +.markdown ul, .markdown ol { +} + +.markdown li { + margin-bottom: 0.35rem; +} + +.markdown li > p { + margin-bottom: 0.5rem; +} + +/* Inline code in docs */ +.markdown code { + background: rgba(109, 40, 217, 0.06); + color: var(--ifm-color-primary-dark); + border: 1px solid rgba(109, 40, 217, 0.08); +} + +[data-theme='dark'] .markdown code { + background: rgba(167, 139, 250, 0.1); + color: var(--ifm-color-primary-light); + border-color: rgba(167, 139, 250, 0.12); +} + +/* Don't style code inside code blocks */ +.markdown pre code { + background: none; + color: inherit; + border: none; + padding: 0; +} + +/* Blockquotes */ +.markdown blockquote { + border-left: 3px solid var(--ifm-color-primary-lighter); + background: rgba(109, 40, 217, 0.03); + border-radius: 0 0.5rem 0.5rem 0; + padding: 0.75rem 1.25rem; + margin: 1.5rem 0; +} + +[data-theme='dark'] .markdown blockquote { + background: rgba(167, 139, 250, 0.04); + border-left-color: rgba(167, 139, 250, 0.3); +} + +/* Horizontal rules */ +.markdown hr { + border: none; + height: 1px; + background: linear-gradient(90deg, transparent 0%, rgba(109, 40, 217, 0.15) 50%, transparent 100%); + margin: 2.5rem 0; +} + +[data-theme='dark'] .markdown hr { + background: linear-gradient(90deg, transparent 0%, rgba(167, 139, 250, 0.15) 50%, transparent 100%); +} + /* Better table styling */ table { display: table; @@ -415,21 +634,417 @@ code { font-weight: 500; } -pre code { - font-size: 0.85rem; - line-height: 1.6; -} - +/* Flatten code block backgrounds to a single layer */ .theme-code-block { border: 1px solid rgba(0, 0, 0, 0.06); - border-radius: 1rem; + border-radius: 0.75rem; overflow: hidden; - box-shadow: 0 4px 24px rgba(0, 0, 0, 0.06); + box-shadow: 0 2px 12px rgba(0, 0, 0, 0.06); + background: var(--ifm-pre-background) !important; } [data-theme='dark'] .theme-code-block { border-color: rgba(255, 255, 255, 0.06); - 
box-shadow: 0 4px 24px rgba(0, 0, 0, 0.3); + box-shadow: 0 2px 12px rgba(0, 0, 0, 0.3); +} + +/* Title bar (filename label) */ +.theme-code-block div[class*='codeBlockTitle'] { + background: transparent !important; + border-bottom: 1px solid rgba(255, 255, 255, 0.06); + padding: 0.5rem 1.25rem; + font-size: 0.8rem; +} + +/* The pre inside code blocks */ +.theme-code-block pre { + background: transparent !important; + padding: 1.25rem 1.5rem !important; + margin: 0; + border-radius: 0; +} + +.theme-code-block pre code { + font-size: 0.85rem; + line-height: 1.6; + padding: 0 !important; + background: transparent !important; + border: none; +} + +/* Copy button */ +.theme-code-block button[class*='copyButton'] { + background: rgba(0, 0, 0, 0.04); + border: 1px solid rgba(0, 0, 0, 0.08); + border-radius: 0.375rem; + color: rgba(0, 0, 0, 0.4); + transition: all 0.15s; +} + +.theme-code-block button[class*='copyButton']:hover { + background: rgba(0, 0, 0, 0.08); + color: rgba(0, 0, 0, 0.7); +} + +[data-theme='dark'] .theme-code-block button[class*='copyButton'] { + background: rgba(255, 255, 255, 0.05); + border-color: rgba(255, 255, 255, 0.08); + color: rgba(255, 255, 255, 0.5); +} + +[data-theme='dark'] .theme-code-block button[class*='copyButton']:hover { + background: rgba(255, 255, 255, 0.1); + color: rgba(255, 255, 255, 0.8); +} + +/* Title bar light mode */ +.theme-code-block div[class*='codeBlockTitle'] { + color: #64748b; +} + +[data-theme='dark'] .theme-code-block div[class*='codeBlockTitle'] { + color: rgba(255, 255, 255, 0.5); +} + +/* ========== LIGHT CODE BLOCKS (global) ========== */ +/* Light mode: all code blocks get light background */ +.theme-code-block { + background: #f5f5f7 !important; + border-color: rgba(0, 0, 0, 0.08) !important; +} + +.theme-code-block pre { + background: transparent !important; +} + +/* Also catch pre elements used by OpenAPI plugin directly */ +pre { + background: #f5f5f7 !important; + color: #383a42 !important; +} + +/* 
Override Prism token colors for light background */ +.token.comment, +.token.prolog { color: #6a737d !important; } +.token.keyword { color: #a626a4 !important; } +.token.string, +.token.attr-value { color: #50a14f !important; } +.token.function { color: #4078f2 !important; } +.token.class-name { color: #c18401 !important; } +.token.number { color: #986801 !important; } +.token.operator { color: #0184bc !important; } +.token.punctuation { color: #383a42 !important; } +.token.property { color: #e45649 !important; } +.token.builtin { color: #c18401 !important; } +.token.boolean { color: #986801 !important; } +.token.plain { color: #383a42 !important; } +code { color: #383a42; } + +/* Dark mode: revert everything to oneDark defaults */ +[data-theme='dark'] .theme-code-block { + background: var(--ifm-pre-background) !important; + border-color: rgba(255, 255, 255, 0.06) !important; +} + +[data-theme='dark'] pre { + background: var(--ifm-pre-background) !important; + color: var(--ifm-pre-color) !important; +} + +[data-theme='dark'] code { + color: #cdd6f4; +} + +[data-theme='dark'] .token.comment, +[data-theme='dark'] .token.prolog, +[data-theme='dark'] .token.keyword, +[data-theme='dark'] .token.string, +[data-theme='dark'] .token.attr-value, +[data-theme='dark'] .token.function, +[data-theme='dark'] .token.class-name, +[data-theme='dark'] .token.number, +[data-theme='dark'] .token.operator, +[data-theme='dark'] .token.punctuation, +[data-theme='dark'] .token.property, +[data-theme='dark'] .token.builtin, +[data-theme='dark'] .token.boolean, +[data-theme='dark'] .token.plain { + color: unset !important; +} + +/* ========== OPENAPI DOC PAGES ========== */ + +/* Code samples container (Python/curl tabs + code) */ +.openapi-code__code-samples-container { + background: #f5f5f7 !important; + border: 1px solid rgba(0, 0, 0, 0.08) !important; + border-radius: 0.75rem !important; +} + +[data-theme='dark'] .openapi-code__code-samples-container { + background: 
var(--ifm-pre-background) !important; + border-color: rgba(255, 255, 255, 0.06) !important; +} + +/* Code block wrapper inside code samples and snippets */ +.openapi-explorer__code-block { + background: #f5f5f7 !important; +} + +[data-theme='dark'] .openapi-explorer__code-block { + background: var(--ifm-pre-background) !important; +} + +.openapi-explorer__code-block-content { + background: transparent !important; +} + +.openapi-explorer__code-block-title { + color: #64748b !important; + background: transparent !important; + border-bottom: 1px solid rgba(0, 0, 0, 0.06) !important; +} + +[data-theme='dark'] .openapi-explorer__code-block-title { + color: #94a3b8 !important; + border-bottom-color: rgba(255, 255, 255, 0.06) !important; +} + +/* Hide line numbers in API code samples */ +.openapi-explorer__code-block-code-line-number { + display: none !important; +} + +.openapi-explorer__code-block-code-line-content { + color: #383a42 !important; +} + +.openapi-explorer__code-block-code-line-content .token.keyword { color: #a626a4 !important; } +.openapi-explorer__code-block-code-line-content .token.string { color: #50a14f !important; } +.openapi-explorer__code-block-code-line-content .token.function { color: #4078f2 !important; } +.openapi-explorer__code-block-code-line-content .token.class-name { color: #c18401 !important; } +.openapi-explorer__code-block-code-line-content .token.operator { color: #0184bc !important; } +.openapi-explorer__code-block-code-line-content .token.punctuation { color: #383a42 !important; } +.openapi-explorer__code-block-code-line-content .token.builtin { color: #c18401 !important; } +.openapi-explorer__code-block-code-line-content .token.comment { color: #6a737d !important; } +.openapi-explorer__code-block-code-line-content .token.number { color: #986801 !important; } +.openapi-explorer__code-block-code-line-content .token.property { color: #e45649 !important; } + +[data-theme='dark'] .openapi-explorer__code-block-code-line-content { + color: 
#cdd6f4 !important; +} + +[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.keyword { color: #c678dd !important; } +[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.string { color: #98c379 !important; } +[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.function { color: #61afef !important; } +[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.class-name { color: #e5c07b !important; } +[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.operator { color: #56b6c2 !important; } +[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.punctuation { color: #abb2bf !important; } +[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.builtin { color: #e5c07b !important; } +[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.comment { color: #5c6370 !important; } +[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.number { color: #d19a66 !important; } +[data-theme='dark'] .openapi-explorer__code-block-code-line-content .token.property { color: #e06c75 !important; } + +/* Outermost code snippets wrapper */ +.openapi-tabs__code-container, +.openapi-tabs__code-container-inner, +.openapi-tabs__code-content, +.openapi-tabs__code-list-container { + background: #f5f5f7 !important; + color: #383a42 !important; +} + +[data-theme='dark'] .openapi-tabs__code-container, +[data-theme='dark'] .openapi-tabs__code-container-inner, +[data-theme='dark'] .openapi-tabs__code-content, +[data-theme='dark'] .openapi-tabs__code-list-container { + background: var(--ifm-pre-background) !important; + color: #cdd6f4 !important; +} + +/* Tab items (PYTHON, CURL labels) */ +.openapi-tabs__code-item--python, +.openapi-tabs__code-item--curl, +[class*='openapi-tabs__code-item'] { + color: #64748b !important; + border-color: rgba(0, 0, 0, 0.1) !important; +} + 
+[class*='openapi-tabs__code-item'][aria-selected='true'] { + border-color: var(--ifm-color-primary) !important; + color: #1e293b !important; +} + +[data-theme='dark'] [class*='openapi-tabs__code-item'] { + color: #94a3b8 !important; + border-color: rgba(255, 255, 255, 0.1) !important; +} + +[data-theme='dark'] [class*='openapi-tabs__code-item'][aria-selected='true'] { + color: #e2e8f0 !important; + border-color: var(--ifm-color-primary) !important; +} + +/* Variant tabs (OPENAI label) */ +[class*='openapi-tabs__code-item--variant'], +[class*='openapi-tabs__code-item--sample'] { + color: #64748b !important; + border-color: rgba(0, 0, 0, 0.1) !important; +} + +[data-theme='dark'] [class*='openapi-tabs__code-item--variant'], +[data-theme='dark'] [class*='openapi-tabs__code-item--sample'] { + color: #94a3b8 !important; + border-color: rgba(255, 255, 255, 0.1) !important; +} + +/* Request form panel */ +.openapi-explorer__request-form { + background: #f5f5f7 !important; + border: 1px solid rgba(0, 0, 0, 0.08) !important; + border-radius: 0.75rem !important; + color: #1e293b !important; +} + +[data-theme='dark'] .openapi-explorer__request-form { + background: var(--ifm-pre-background) !important; + border-color: rgba(255, 255, 255, 0.06) !important; + color: #e2e8f0 !important; +} + +/* Request/response headers */ +.openapi-explorer__request-title, +.openapi-explorer__response-title { + color: #1e293b !important; +} + +[data-theme='dark'] .openapi-explorer__request-title, +[data-theme='dark'] .openapi-explorer__response-title { + color: #e2e8f0 !important; +} + +.openapi-explorer__request-header-container, +.openapi-explorer__response-title-container { + background: rgba(0, 0, 0, 0.03) !important; + border-bottom: 1px solid rgba(0, 0, 0, 0.06) !important; +} + +[data-theme='dark'] .openapi-explorer__request-header-container, +[data-theme='dark'] .openapi-explorer__response-title-container { + background: rgba(255, 255, 255, 0.03) !important; + border-bottom-color: 
rgba(255, 255, 255, 0.06) !important; +} + +/* Response container */ +.openapi-explorer__response-container { + background: #f5f5f7 !important; + border: 1px solid rgba(0, 0, 0, 0.08) !important; + border-radius: 0.75rem !important; + color: #1e293b !important; +} + +[data-theme='dark'] .openapi-explorer__response-container { + background: var(--ifm-pre-background) !important; + border-color: rgba(255, 255, 255, 0.06) !important; + color: #e2e8f0 !important; +} + +/* Response placeholder text */ +.openapi-explorer__response-placeholder-message { + color: #64748b !important; +} + +[data-theme='dark'] .openapi-explorer__response-placeholder-message { + color: #94a3b8 !important; +} + +/* Server URL display */ +.openapi-explorer__server-url { + color: #6d28d9 !important; +} + +[data-theme='dark'] .openapi-explorer__server-url { + color: #a78bfa !important; +} + +/* Details/collapsible sections */ +.openapi-explorer__details-summary { + color: #1e293b !important; +} + +[data-theme='dark'] .openapi-explorer__details-summary { + color: #e2e8f0 !important; +} + +/* Form labels and inputs */ +.openapi-explorer__form-item-label { + color: #1e293b !important; +} + +[data-theme='dark'] .openapi-explorer__form-item-label { + color: #e2e8f0 !important; +} + +/* Send API request button */ +.openapi-explorer__request-btn { + border-radius: 0.5rem !important; +} + +/* Response clear button */ +.openapi-explorer__response-clear-btn { + color: #64748b !important; +} + +[data-theme='dark'] .openapi-explorer__response-clear-btn { + color: #94a3b8 !important; +} + +/* Tabs in API pages (operation tabs, schema tabs) */ +.openapi-tabs__container, +.openapi-tabs__operation-container, +.openapi-tabs__response-container, +.openapi-tabs__schema-container { + background: transparent !important; +} + +/* Method + URL box (GET http://...) 
*/ +.openapi__method-endpoint { + background: #f5f5f7 !important; + border: 1px solid rgba(0, 0, 0, 0.08) !important; + border-radius: 0.5rem !important; + display: flex !important; + align-items: center !important; + gap: 0.75rem !important; + padding: 0.6rem 1rem !important; + margin: 0 0 1rem !important; + overflow-x: auto; +} + +[data-theme='dark'] .openapi__method-endpoint { + background: var(--ifm-pre-background) !important; + border-color: rgba(255, 255, 255, 0.06) !important; +} + +.openapi__method-endpoint .badge { + flex-shrink: 0; + font-size: 0.7rem !important; + padding: 0.2rem 0.5rem !important; + line-height: 1.2 !important; +} + +.openapi__method-endpoint-path { + color: #383a42 !important; + font-size: 0.85rem !important; + font-weight: 500 !important; + margin: 0 !important; + padding: 0 !important; + border: none !important; + line-height: 1.4 !important; + white-space: nowrap; +} + +[data-theme='dark'] .openapi__method-endpoint-path { + color: #cdd6f4 !important; } /* ========== ADMONITIONS ========== */ @@ -446,27 +1061,107 @@ pre code { } /* ========== FOOTER ========== */ -.footer--dark { - background: #0c0c14; - --ifm-footer-link-color: #94a3b8; - --ifm-footer-title-color: #e2e8f0; +.footer { + position: relative; + padding-top: 4rem !important; + padding-bottom: 2rem !important; + background: #f8f8fa; } -.footer--dark .footer__link-item { - color: #94a3b8; - font-size: 0.875rem; +[data-theme='dark'] .footer { + background: #0a0a12; } -.footer--dark .footer__link-item:hover { - color: #e2e8f0; +/* Gradient separator line at top of footer */ +.footer::before { + content: ''; + position: absolute; + top: 0; + left: 0; + right: 0; + height: 1px; + background: linear-gradient(90deg, transparent 0%, rgba(109, 40, 217, 0.3) 20%, rgba(45, 212, 191, 0.2) 50%, rgba(96, 165, 250, 0.3) 80%, transparent 100%); +} + +/* Subtle glow behind footer top */ +.footer::after { + content: ''; + position: absolute; + top: 0; + left: 50%; + transform: 
translateX(-50%); + width: 60%; + height: 120px; + background: radial-gradient(ellipse at center top, rgba(109, 40, 217, 0.04) 0%, transparent 70%); + pointer-events: none; +} + +[data-theme='dark'] .footer::after { + background: radial-gradient(ellipse at center top, rgba(109, 40, 217, 0.06) 0%, transparent 70%); +} + +.footer .footer__link-item { + color: #64748b; + font-size: 0.85rem; + transition: all 0.15s; + padding: 0.2rem 0; + display: inline-block; } -.footer--dark .footer__title { +.footer .footer__link-item:hover { + color: #1e293b; + transform: translateX(2px); +} + +[data-theme='dark'] .footer .footer__link-item:hover { color: #e2e8f0; - font-weight: 600; - font-size: 0.8rem; +} + +.footer .footer__title { + color: #1e293b; + font-weight: 700; + font-size: 0.78rem; text-transform: uppercase; - letter-spacing: 0.05em; + letter-spacing: 0.08em; + margin-bottom: 1rem; + position: relative; + padding-bottom: 0.6rem; +} + +[data-theme='dark'] .footer .footer__title { + color: #f1f5f9; +} + +.footer .footer__title::after { + content: ''; + position: absolute; + bottom: 0; + left: 0; + width: 20px; + height: 2px; + border-radius: 1px; + background: linear-gradient(90deg, #6d28d9, #2dd4bf); +} + +.footer .footer__col { + padding: 0 1rem; +} + +/* Footer copyright area */ +.footer .footer__bottom { + margin-top: 3rem; + padding-top: 1.5rem; + border-top: 1px solid rgba(0, 0, 0, 0.06); +} + +[data-theme='dark'] .footer .footer__bottom { + border-top-color: rgba(255, 255, 255, 0.06); +} + +.footer .footer__copyright { + font-size: 0.8rem; + color: #94a3b8; + text-align: center; } /* ========== PAGINATION ========== */ @@ -744,4 +1439,8 @@ footer.row .col { border-bottom-color: rgba(255, 255, 255, 0.06); } -/* Emoji icons on doc cards are removed via swizzled DocCard component */ +/* ========== DOC CARDS (API listing pages) ========== */ +/* Reduce excessive internal padding */ +.card.padding--lg { + padding: 0 0.75rem !important; +} diff --git 
a/docs/src/pages/index.js b/docs/src/pages/index.js index f6ff05ff08..195e8e3bb0 100644 --- a/docs/src/pages/index.js +++ b/docs/src/pages/index.js @@ -26,14 +26,81 @@ const Icons = { ), + chat: ( + + + + ), + zap: ( + + + + ), + layers: ( + + + + ), + database: ( + + + + ), + file: ( + + + + ), + cpu: ( + + + + ), + shield: ( + + + + ), + message: ( + + + + ), + conversation: ( + + + + ), + plug: ( + + + + ), + stack: ( + + + + ), + arrow: ( + + + + ), }; +const FEATURE_ICONS = [Icons.chat, Icons.zap, Icons.layers, Icons.database, Icons.shield, Icons.message, Icons.conversation, Icons.plug, Icons.file, Icons.stack, Icons.cpu]; + const FEATURES = [ { label: 'Chat Completions', path: '/v1/chat/completions', desc: 'Standard OpenAI-compatible chat and completion endpoints' }, { label: 'Responses API', path: '/v1/responses', desc: 'Server-side agentic orchestration with tool calling and MCP' }, { label: 'Embeddings', path: '/v1/embeddings', desc: 'Text embeddings from any provider' }, { label: 'Vector Stores', path: '/v1/vector_stores', desc: 'Managed document storage and semantic search' }, - { label: 'Files & Batches', path: '/v1/files', desc: 'File upload, processing, and batch operations' }, + { label: 'Moderations', path: '/v1/moderations', desc: 'Content moderation and safety with configurable shields' }, + { label: 'Messages API', path: '/v1/messages', desc: 'Native Anthropic Messages API support' }, + { label: 'Conversations', path: '/v1/conversations', desc: 'Multi-turn conversation state management and history' }, + { label: 'Connectors', path: '/v1/connectors', desc: 'External connectors like MCP servers and tool integrations' }, + { label: 'Files', path: '/v1/files', desc: 'File upload, processing, and content extraction' }, + { label: 'Batches', path: '/v1/batches', desc: 'Async batch processing for large-scale workloads' }, { label: 'Models', path: '/v1/models', desc: 'Model discovery and management' }, ]; @@ -120,6 +187,9 @@ function CodeTabs() { const 
[active, setActive] = useState(0); return (
+
+ +
{CODE_EXAMPLES.map((ex, i) => (