From e814719cec26fdc101880e14d6b60e5cc4a00603 Mon Sep 17 00:00:00 2001 From: Ilya Matiach Date: Fri, 6 Feb 2026 17:51:57 -0500 Subject: [PATCH] Add evaluation test scenarios to azure-ai-projects-py and remove orphaned azure-ai-evaluation-py tests --- .../azure-ai-evaluation-py/scenarios.yaml | 270 --------- .../azure-ai-projects-py/scenarios.yaml | 514 ++++++++++++++++++ 2 files changed, 514 insertions(+), 270 deletions(-) delete mode 100644 tests/scenarios/azure-ai-evaluation-py/scenarios.yaml diff --git a/tests/scenarios/azure-ai-evaluation-py/scenarios.yaml b/tests/scenarios/azure-ai-evaluation-py/scenarios.yaml deleted file mode 100644 index ef85317..0000000 --- a/tests/scenarios/azure-ai-evaluation-py/scenarios.yaml +++ /dev/null @@ -1,270 +0,0 @@ -# Test scenarios for azure-ai-evaluation-py skill evaluation -# Each scenario tests a specific usage pattern against acceptance criteria - -config: - model: gpt-4 - max_tokens: 2000 - temperature: 0.3 - -scenarios: - - name: model_config_and_quality_evaluator - prompt: | - Create a groundedness evaluation example using the Azure AI Evaluation SDK. - Include model configuration with Azure OpenAI settings and run a groundedness evaluation. - expected_patterns: - - "from azure\\.ai\\.evaluation import GroundednessEvaluator" - - "azure_endpoint" - - "azure_deployment" - - "GroundednessEvaluator\\(" - forbidden_patterns: - - "deployment_name" - tags: - - quality - - basic - mock_response: | - import os - from azure.ai.evaluation import GroundednessEvaluator - - model_config = { - "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"], - "api_key": os.environ["AZURE_OPENAI_API_KEY"], - "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"], - } - - groundedness = GroundednessEvaluator(model_config) - result = groundedness( - query="What is Azure AI?", - context="Azure AI is Microsoft's AI platform.", - response="Azure AI provides AI services and tools." - ) - print(result["groundedness"]) - - - name: nlp_evaluators_metrics - prompt: | - Show how to use NLP-based evaluators (F1, ROUGE, BLEU) with ground truth data. - expected_patterns: - - "F1ScoreEvaluator" - - "RougeScoreEvaluator" - - "BleuScoreEvaluator" - - "ground_truth" - forbidden_patterns: - - "model_config" - tags: - - nlp - - quality - mock_response: | - from azure.ai.evaluation import F1ScoreEvaluator, RougeScoreEvaluator, BleuScoreEvaluator - - f1 = F1ScoreEvaluator() - rouge = RougeScoreEvaluator(rouge_type="rouge1") - bleu = BleuScoreEvaluator() - - f1_result = f1(response="Tokyo is the capital of Japan.", ground_truth="Tokyo is Japan's capital.") - rouge_result = rouge(response="Azure provides cloud services.", ground_truth="Azure is a cloud platform.") - bleu_result = bleu(response="The cat sat on the mat.", ground_truth="A cat is sitting on the mat.") - print(f1_result, rouge_result, bleu_result) - - - name: safety_evaluators_project_scope - prompt: | - Create a safety evaluation example using a project scope configuration. - Use the ViolenceEvaluator and run a sample evaluation. 
- expected_patterns: - - "ViolenceEvaluator" - - "azure_ai_project" - - "resource_group_name" - forbidden_patterns: - - "model_config" - tags: - - safety - mock_response: | - import os - from azure.ai.evaluation import ViolenceEvaluator - - azure_ai_project = { - "subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"], - "resource_group_name": os.environ["AZURE_RESOURCE_GROUP"], - "project_name": os.environ["AZURE_AI_PROJECT_NAME"], - } - - violence = ViolenceEvaluator(azure_ai_project=azure_ai_project) - result = violence(query="Tell me a story", response="Once upon a time in a peaceful village.") - print(result["violence_score"]) - - - name: custom_evaluator_decorator - prompt: | - Create a custom evaluator using the @evaluator decorator and run it in batch evaluation. - expected_patterns: - - "@evaluator" - - "word_count" - - "evaluate\\(" - forbidden_patterns: - - "return 0.5" - tags: - - custom - - batch - mock_response: | - from azure.ai.evaluation import evaluator, evaluate - - @evaluator - def word_count_evaluator(response: str) -> dict: - return {"word_count": len(response.split())} - - result = evaluate( - data="data.jsonl", - evaluators={"word_count": word_count_evaluator}, - evaluator_config={ - "default": { - "column_mapping": { - "response": "${data.response}" - } - } - } - ) - print(result["metrics"]) - - - name: batch_evaluate_with_mapping - prompt: | - Run batch evaluation with multiple evaluators and proper column mapping. - expected_patterns: - - "evaluate\\(" - - "evaluator_config" - - "column_mapping" - - "\\$\\{data\\.query\\}" - forbidden_patterns: - - "evaluators=[" - tags: - - batch - mock_response: | - import os - from azure.ai.evaluation import evaluate, GroundednessEvaluator, RelevanceEvaluator - - model_config = { - "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"], - "api_key": os.environ["AZURE_OPENAI_API_KEY"], - "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"], - } - - groundedness = GroundednessEvaluator(model_config) - relevance = RelevanceEvaluator(model_config) - - result = evaluate( - data="data.jsonl", - evaluators={ - "groundedness": groundedness, - "relevance": relevance, - }, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.query}", - "context": "${data.context}", - "response": "${data.response}", - } - } - } - ) - print(result["metrics"]) - - - name: composite_qa_evaluator - prompt: | - Use the QAEvaluator composite evaluator with model configuration and ground truth. - expected_patterns: - - "QAEvaluator" - - "ground_truth" - forbidden_patterns: - - "ContentSafetyEvaluator" - tags: - - composite - - quality - mock_response: | - import os - from azure.ai.evaluation import QAEvaluator - - model_config = { - "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"], - "api_key": os.environ["AZURE_OPENAI_API_KEY"], - "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"], - } - - qa = QAEvaluator(model_config) - result = qa( - query="What is Azure?", - context="Azure is Microsoft's cloud platform.", - response="Azure is a cloud computing platform by Microsoft.", - ground_truth="Azure is Microsoft's cloud computing platform." - ) - print(result["groundedness"]) - - - name: model_config_class_default_credential - prompt: | - Create a model configuration using AzureOpenAIModelConfiguration with DefaultAzureCredential. 
- expected_patterns: - - "AzureOpenAIModelConfiguration" - - "DefaultAzureCredential" - - "azure_deployment" - forbidden_patterns: - - "api_key" - tags: - - setup - - authentication - mock_response: | - import os - from azure.ai.evaluation import AzureOpenAIModelConfiguration, GroundednessEvaluator - from azure.identity import DefaultAzureCredential - - model_config = AzureOpenAIModelConfiguration( - azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], - credential=DefaultAzureCredential(), - azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT"], - api_version="2024-06-01", - ) - - groundedness = GroundednessEvaluator(model_config) - result = groundedness( - query="What is Azure AI?", - context="Azure AI provides services for vision, speech, and language.", - response="Azure AI offers vision and speech services." - ) - print(result["groundedness"]) - - - name: evaluate_target_callable - prompt: | - Evaluate a target callable using evaluate() with outputs column mapping. - expected_patterns: - - "target=" - - "\\$\\{outputs\\.context\\}" - - "\\$\\{outputs\\.response\\}" - forbidden_patterns: - - "outputs\\.response" - tags: - - target - - batch - mock_response: | - from azure.ai.evaluation import evaluate, GroundednessEvaluator - - def chat_app(query: str) -> dict: - return {"response": f"Answer: {query}", "context": "source text"} - - groundedness = GroundednessEvaluator( - { - "azure_endpoint": "https://example.openai.azure.com", - "api_key": "fake-key", - "azure_deployment": "gpt-4o-mini", - } - ) - - result = evaluate( - data="queries.jsonl", - target=chat_app, - evaluators={"groundedness": groundedness}, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.query}", - "context": "${outputs.context}", - "response": "${outputs.response}", - } - } - } - ) - print(result["metrics"]) diff --git a/tests/scenarios/azure-ai-projects-py/scenarios.yaml b/tests/scenarios/azure-ai-projects-py/scenarios.yaml index 4e19a51..fd92a54 100644 --- a/tests/scenarios/azure-ai-projects-py/scenarios.yaml +++ b/tests/scenarios/azure-ai-projects-py/scenarios.yaml @@ -727,3 +727,517 @@ scenarios: await client.agents.delete_agent(agent.id) asyncio.run(main()) + + # Evaluation with Inline Data and Run + - name: evaluation_with_inline_data + prompt: | + Create a complete evaluation workflow: define inline JSONL data, + create an evaluation with testing criteria, run the evaluation, + poll for completion, and retrieve results. 
+ expected_patterns: + - "CreateEvalJSONLRunDataSourceParam" + - "SourceFileContent" + - "SourceFileContentContent" + - "openai_client.evals.create" + - "openai_client.evals.runs.create" + - "evals.runs.retrieve" + - "evals.runs.output_items.list" + - "eval_id=" + - "run_id=" + - "run.status" + forbidden_patterns: + - "from azure.ai.evaluation import evaluate" # Deprecated SDK + - "from azure.ai.evaluation import" # Deprecated SDK + tags: + - evaluations + - runs + mock_response: | + import os + import time + from azure.ai.projects import AIProjectClient + from azure.identity import DefaultAzureCredential + from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent, + ) + from openai.types.eval_create_params import DataSourceConfigCustom + + with ( + DefaultAzureCredential() as credential, + AIProjectClient( + endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + credential=credential, + ) as project_client, + ): + openai_client = project_client.get_openai_client() + + # Prepare inline test data + data = [ + {"query": "What is Azure?", "response": "Azure is Microsoft's cloud."}, + {"query": "What is AI?", "response": "AI is artificial intelligence."}, + ] + + data_source = CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[ + SourceFileContentContent(item=item, sample={}) + for item in data + ], + ), + ) + + data_source_config = DataSourceConfigCustom( + type="custom", + item_schema={ + "type": "object", + "properties": { + "query": {"type": "string"}, + "response": {"type": "string"}, + }, + "required": ["query", "response"], + }, + include_sample_schema=False, + ) + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "coherence", + "evaluator_name": "builtin.coherence", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + }, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"}, + }, + ] + + # Create evaluation + eval_object = openai_client.evals.create( + name="Quality Evaluation", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + + # Run evaluation + run = openai_client.evals.runs.create( + eval_id=eval_object.id, + name="Run 1", + data_source=data_source, + ) + + # Poll for completion + while run.status not in ["completed", "failed", "cancelled"]: + time.sleep(5) + run = openai_client.evals.runs.retrieve( + eval_id=eval_object.id, + run_id=run.id, + ) + + # Retrieve results + output_items = list(openai_client.evals.runs.output_items.list( + eval_id=eval_object.id, + run_id=run.id, + )) + + for item in output_items: + for result in item.results: + print(f"{result.name}: {result.score}") + + # Agent Evaluation with Sample Mapping + - name: agent_evaluation_with_sample + prompt: | + Create an evaluation for an AI agent that uses sample mapping for agent outputs. + Include tool call data in the sample and use agent evaluators like intent_resolution + and tool_call_accuracy. 
+ expected_patterns: + - "SourceFileContentContent" + - "sample=" + - "output_text" + - "output_items" + - "include_sample_schema=True" + - "builtin.intent_resolution" + - "{{sample.output_text}}" + forbidden_patterns: + - "include_sample_schema=False" # Must be True for agent evals + - "from azure.ai.evaluation import" # Deprecated SDK + tags: + - evaluations + - agents + mock_response: | + import os + from azure.ai.projects import AIProjectClient + from azure.identity import DefaultAzureCredential + from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent, + ) + from openai.types.eval_create_params import DataSourceConfigCustom + + with ( + DefaultAzureCredential() as credential, + AIProjectClient( + endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + credential=credential, + ) as project_client, + ): + openai_client = project_client.get_openai_client() + + # Agent evaluation data with tool calls + data_source = CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[ + SourceFileContentContent( + item={"query": "What's the weather in Seattle?"}, + sample={ + "output_text": "It's 55°F and cloudy in Seattle.", + "output_items": [ + { + "type": "tool_call", + "name": "get_weather", + "arguments": {"location": "Seattle"}, + "result": {"temp": "55", "condition": "cloudy"}, + } + ], + }, + ), + ], + ), + ) + + data_source_config = DataSourceConfigCustom( + type="custom", + item_schema={ + "type": "object", + "properties": {"query": {"type": "string"}}, + "required": ["query"], + }, + include_sample_schema=True, # Required for agent evaluations + ) + + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "intent_resolution", + "evaluator_name": "builtin.intent_resolution", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{sample.output_text}}", + }, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"}, + }, + { + "type": "azure_ai_evaluator", + "name": "tool_call_accuracy", + "evaluator_name": "builtin.tool_call_accuracy", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{sample.output_items}}", + }, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"}, + }, + ] + + eval_object = openai_client.evals.create( + name="Agent Evaluation", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Created agent evaluation: {eval_object.id}") + + # Custom Code-Based Evaluator + - name: custom_code_evaluator + prompt: | + Create a custom code-based evaluator using CodeBasedEvaluatorDefinition. + The evaluator should count words in the response and check if it's concise. 
+ expected_patterns: + - "from azure.ai.projects.models import" + - "EvaluatorVersion" + - "EvaluatorType.CUSTOM" + - "CodeBasedEvaluatorDefinition" + - "EvaluatorMetric" + - "code_text=" + - "def grade" + - "project_client.evaluators.create_version" + - "data_schema" + - "metrics" + forbidden_patterns: + - "from azure.ai.evaluation import evaluator" # Deprecated decorator + - "@evaluator" # Deprecated decorator + tags: + - evaluations + - custom + mock_response: | + import os + from azure.ai.projects import AIProjectClient + from azure.ai.projects.models import ( + EvaluatorVersion, + EvaluatorCategory, + EvaluatorType, + CodeBasedEvaluatorDefinition, + EvaluatorMetric, + EvaluatorMetricType, + EvaluatorMetricDirection, + ) + from azure.identity import DefaultAzureCredential + + with ( + DefaultAzureCredential() as credential, + AIProjectClient( + endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + credential=credential, + ) as project_client, + ): + evaluator = project_client.evaluators.create_version( + name="word_count_evaluator", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Word Count", + description="Counts words in response", + definition=CodeBasedEvaluatorDefinition( + code_text='def grade(sample, item) -> dict:\n response = item.get("response", "")\n word_count = len(response.split())\n return {"word_count": word_count, "is_concise": word_count < 100}', + data_schema={ + "type": "object", + "properties": {"response": {"type": "string"}}, + "required": ["response"], + }, + metrics={ + "word_count": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + desirable_direction=EvaluatorMetricDirection.DECREASE, + min_value=0, + max_value=10000, + ), + "is_concise": EvaluatorMetric( + type=EvaluatorMetricType.BINARY, + ), + }, + ), + ), + ) + print(f"Created evaluator: {evaluator.name} (version {evaluator.version})") + + # Safety Evaluators + - name: safety_evaluators + prompt: | + Create an evaluation using safety evaluators like builtin.violence, + builtin.sexual, and builtin.hate_unfairness to check content safety. 
+ expected_patterns: + - "builtin.violence" + - "builtin.sexual" + - "builtin.hate_unfairness" + - "azure_ai_evaluator" + - "data_mapping" + - "testing_criteria" + forbidden_patterns: + - "ViolenceEvaluator" # Deprecated class-based evaluator + - "SexualEvaluator" # Deprecated class-based evaluator + - "from azure.ai.evaluation import" # Deprecated SDK + tags: + - evaluations + - safety + mock_response: | + import os + from azure.ai.projects import AIProjectClient + from azure.identity import DefaultAzureCredential + from openai.types.eval_create_params import DataSourceConfigCustom + + with ( + DefaultAzureCredential() as credential, + AIProjectClient( + endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + credential=credential, + ) as project_client, + ): + openai_client = project_client.get_openai_client() + + data_source_config = DataSourceConfigCustom( + type="custom", + item_schema={ + "type": "object", + "properties": { + "query": {"type": "string"}, + "response": {"type": "string"}, + }, + "required": ["query", "response"], + }, + include_sample_schema=False, + ) + + # Safety evaluators + testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "violence_check", + "evaluator_name": "builtin.violence", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + }, + }, + { + "type": "azure_ai_evaluator", + "name": "sexual_check", + "evaluator_name": "builtin.sexual", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + }, + }, + { + "type": "azure_ai_evaluator", + "name": "hate_check", + "evaluator_name": "builtin.hate_unfairness", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + }, + }, + ] + + eval_object = openai_client.evals.create( + name="Safety Evaluation", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Created safety evaluation: {eval_object.id}") + + # OpenAI Graders + - name: openai_graders + prompt: | + Create an evaluation using OpenAI graders: label_model for classification, + string_check for pattern matching, and text_similarity for semantic matching. 
+ expected_patterns: + - '"type": "label_model"' + - '"type": "string_check"' + - '"type": "text_similarity"' + - "labels" + - "passing_labels" + - "operation" + - "pass_threshold" + - "testing_criteria" + forbidden_patterns: + - "AzureOpenAILabelGrader" # Deprecated class + - "AzureOpenAIStringCheckGrader" # Deprecated class + - "from azure.ai.evaluation import" # Deprecated SDK + tags: + - evaluations + - graders + mock_response: | + import os + from azure.ai.projects import AIProjectClient + from azure.identity import DefaultAzureCredential + from openai.types.eval_create_params import DataSourceConfigCustom + + with ( + DefaultAzureCredential() as credential, + AIProjectClient( + endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + credential=credential, + ) as project_client, + ): + openai_client = project_client.get_openai_client() + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o-mini") + + data_source_config = DataSourceConfigCustom( + type="custom", + item_schema={ + "type": "object", + "properties": { + "response": {"type": "string"}, + "expected": {"type": "string"}, + }, + "required": ["response"], + }, + include_sample_schema=False, + ) + + # OpenAI graders + testing_criteria = [ + # Label grader for classification + { + "type": "label_model", + "name": "sentiment_classifier", + "model": deployment, + "input": [ + {"role": "user", "content": "Classify sentiment: {{item.response}}"} + ], + "labels": ["positive", "negative", "neutral"], + "passing_labels": ["positive", "neutral"], + }, + # String check grader for pattern matching + { + "type": "string_check", + "name": "has_disclaimer", + "input": "{{item.response}}", + "operation": "contains", + "reference": "Please consult a professional", + }, + # Text similarity grader for semantic matching + { + "type": "text_similarity", + "name": "matches_expected", + "input": "{{item.response}}", + "reference": "{{item.expected}}", + "evaluation_metric": "fuzzy_match", + "pass_threshold": 0.8, + }, + ] + + eval_object = openai_client.evals.create( + name="Graders Evaluation", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Created graders evaluation: {eval_object.id}") + + # List Built-in Evaluators + - name: list_builtin_evaluators + prompt: | + List all available built-in evaluators in the project and get details + about a specific evaluator including its data schema and metrics. 
+    expected_patterns:
+      - 'project_client.evaluators.list_latest_versions'
+      - 'type="builtin"'
+      - "evaluators.get_version"
+      - "definition.data_schema"
+      - "definition.metrics"
+    forbidden_patterns:
+      - "from azure.ai.evaluation import"  # Deprecated SDK
+    tags:
+      - evaluations
+      - discovery
+    mock_response: |
+      import os
+      from azure.ai.projects import AIProjectClient
+      from azure.identity import DefaultAzureCredential
+
+      with (
+          DefaultAzureCredential() as credential,
+          AIProjectClient(
+              endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"],
+              credential=credential,
+          ) as project_client,
+      ):
+          # List all built-in evaluators
+          print("Built-in Evaluators:")
+          evaluators = project_client.evaluators.list_latest_versions(type="builtin")
+          for e in evaluators:
+              print(f"  builtin.{e.name}: {e.description}")
+              print(f"    Categories: {[str(c) for c in e.categories]}")
+
+          # Get specific evaluator details
+          coherence = project_client.evaluators.get_version(
+              name="coherence",
+              version="latest"
+          )
+          print(f"\nCoherence Evaluator:")
+          print(f"  Data Schema: {coherence.definition.data_schema}")
+          print(f"  Metrics: {coherence.definition.metrics}")