diff --git a/.github/skills/azure-ai-evaluation-py/SKILL.md b/.github/skills/azure-ai-evaluation-py/SKILL.md deleted file mode 100644 index 627693f..0000000 --- a/.github/skills/azure-ai-evaluation-py/SKILL.md +++ /dev/null @@ -1,433 +0,0 @@ ---- -name: azure-ai-evaluation-py -description: | - Azure AI Evaluation SDK for Python. Use for evaluating generative AI applications with quality, safety, agent, and custom evaluators. - Triggers: "azure-ai-evaluation", "evaluators", "GroundednessEvaluator", "evaluate", "AI quality metrics", "RedTeam", "agent evaluation". -package: azure-ai-evaluation ---- - -# Azure AI Evaluation SDK for Python - -Assess generative AI application performance with built-in quality, safety, agent evaluators, Azure OpenAI graders, and custom evaluators. - -## Installation - -```bash -pip install azure-ai-evaluation - -# With red team support -pip install azure-ai-evaluation[redteam] -``` - -## Environment Variables - -```bash -# For AI-assisted evaluators -AZURE_OPENAI_ENDPOINT=https://.openai.azure.com -AZURE_OPENAI_API_KEY= -AZURE_OPENAI_DEPLOYMENT=gpt-4o-mini - -# For Foundry project integration -AIPROJECT_CONNECTION_STRING= -``` - -## Built-in Evaluators - -### Quality Evaluators (AI-Assisted) - -```python -from azure.ai.evaluation import ( - GroundednessEvaluator, - GroundednessProEvaluator, # Service-based groundedness - RelevanceEvaluator, - CoherenceEvaluator, - FluencyEvaluator, - SimilarityEvaluator, - RetrievalEvaluator -) - -# Initialize with Azure OpenAI model config -model_config = { - "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"], - "api_key": os.environ["AZURE_OPENAI_API_KEY"], - "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"] -} - -groundedness = GroundednessEvaluator(model_config) -relevance = RelevanceEvaluator(model_config) -coherence = CoherenceEvaluator(model_config) - -# For reasoning models (o1/o3), use is_reasoning_model parameter -groundedness_reasoning = GroundednessEvaluator(model_config, is_reasoning_model=True) -``` - -### Quality Evaluators (NLP-based) - -```python -from azure.ai.evaluation import ( - F1ScoreEvaluator, - RougeScoreEvaluator, - BleuScoreEvaluator, - GleuScoreEvaluator, - MeteorScoreEvaluator -) - -f1 = F1ScoreEvaluator() -rouge = RougeScoreEvaluator() -bleu = BleuScoreEvaluator() -``` - -### Safety Evaluators - -```python -from azure.ai.evaluation import ( - ViolenceEvaluator, - SexualEvaluator, - SelfHarmEvaluator, - HateUnfairnessEvaluator, - IndirectAttackEvaluator, - ProtectedMaterialEvaluator, - CodeVulnerabilityEvaluator, - UngroundedAttributesEvaluator -) - -# Project scope for safety evaluators -azure_ai_project = { - "subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"], - "resource_group_name": os.environ["AZURE_RESOURCE_GROUP"], - "project_name": os.environ["AZURE_AI_PROJECT_NAME"], -} - -violence = ViolenceEvaluator(azure_ai_project=azure_ai_project) -sexual = SexualEvaluator(azure_ai_project=azure_ai_project) -code_vuln = CodeVulnerabilityEvaluator(azure_ai_project=azure_ai_project) - -# Control whether queries are evaluated (default: False, only response evaluated) -violence_with_query = ViolenceEvaluator(azure_ai_project=azure_ai_project, evaluate_query=True) -``` - -### Agent Evaluators - -```python -from azure.ai.evaluation import ( - IntentResolutionEvaluator, - ResponseCompletenessEvaluator, - TaskAdherenceEvaluator, - ToolCallAccuracyEvaluator -) - -intent = IntentResolutionEvaluator(model_config) -completeness = ResponseCompletenessEvaluator(model_config) -task_adherence = 
TaskAdherenceEvaluator(model_config) -tool_accuracy = ToolCallAccuracyEvaluator(model_config) -``` - -## Single Row Evaluation - -```python -from azure.ai.evaluation import GroundednessEvaluator - -groundedness = GroundednessEvaluator(model_config) - -result = groundedness( - query="What is Azure AI?", - context="Azure AI is Microsoft's AI platform...", - response="Azure AI provides AI services and tools." -) - -print(f"Groundedness score: {result['groundedness']}") -print(f"Reason: {result['groundedness_reason']}") -``` - -## Batch Evaluation with evaluate() - -```python -from azure.ai.evaluation import evaluate - -result = evaluate( - data="test_data.jsonl", - evaluators={ - "groundedness": groundedness, - "relevance": relevance, - "coherence": coherence - }, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.query}", - "context": "${data.context}", - "response": "${data.response}" - } - } - }, - # Optional: Add tags for experiment tracking - tags={"experiment": "v1", "model": "gpt-4o"} -) - -print(result["metrics"]) -``` - -## Composite Evaluators - -```python -from azure.ai.evaluation import QAEvaluator, ContentSafetyEvaluator - -# All quality metrics in one -qa_evaluator = QAEvaluator(model_config) - -# All safety metrics in one -safety_evaluator = ContentSafetyEvaluator(azure_ai_project=azure_ai_project) - -result = evaluate( - data="data.jsonl", - evaluators={ - "qa": qa_evaluator, - "content_safety": safety_evaluator - } -) -``` - -## Azure OpenAI Graders - -Use grader classes for structured evaluation via Azure OpenAI's grading API: - -```python -from azure.ai.evaluation import ( - AzureOpenAILabelGrader, - AzureOpenAIStringCheckGrader, - AzureOpenAITextSimilarityGrader, - AzureOpenAIScoreModelGrader, - AzureOpenAIPythonGrader -) - -# Label grader for classification -label_grader = AzureOpenAILabelGrader( - model_config=model_config, - labels=["positive", "negative", "neutral"], - passing_labels=["positive"] -) - -# Score model grader with custom threshold -score_grader = AzureOpenAIScoreModelGrader( - model_config=model_config, - pass_threshold=0.7 -) - -# Use graders as evaluators in evaluate() -result = evaluate( - data="data.jsonl", - evaluators={ - "sentiment": label_grader, - "quality": score_grader - } -) -``` - -## Evaluate Application Target - -```python -from azure.ai.evaluation import evaluate -from my_app import chat_app # Your application - -result = evaluate( - data="queries.jsonl", - target=chat_app, # Callable that takes query, returns response - evaluators={ - "groundedness": groundedness - }, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.query}", - "context": "${outputs.context}", - "response": "${outputs.response}" - } - } - } -) -``` - -## Custom Evaluators - -### Code-Based - -```python -from azure.ai.evaluation import evaluator - -@evaluator -def word_count_evaluator(response: str) -> dict: - return {"word_count": len(response.split())} - -# Use in evaluate() -result = evaluate( - data="data.jsonl", - evaluators={"word_count": word_count_evaluator} -) -``` - -### Class-Based with Initialization - -```python -class DomainSpecificEvaluator: - def __init__(self, domain_terms: list[str], threshold: float = 0.5): - self.domain_terms = [t.lower() for t in domain_terms] - self.threshold = threshold - - def __call__(self, response: str) -> dict: - response_lower = response.lower() - matches = sum(1 for term in self.domain_terms if term in response_lower) - score = matches / len(self.domain_terms) if 
self.domain_terms else 0 - return { - "domain_relevance": score, - "passes_threshold": score >= self.threshold - } - -# Usage -domain_eval = DomainSpecificEvaluator(domain_terms=["azure", "cloud", "api"]) -``` - -### Prompt-Based with Azure OpenAI - -```python -from openai import AzureOpenAI -import json - -class PromptBasedEvaluator: - def __init__(self, model_config: dict): - self.client = AzureOpenAI( - azure_endpoint=model_config["azure_endpoint"], - api_key=model_config.get("api_key"), - api_version="2024-06-01" - ) - self.deployment = model_config["azure_deployment"] - - def __call__(self, query: str, response: str) -> dict: - prompt = f"Rate this response 1-5 for helpfulness. Query: {query}, Response: {response}. Return JSON: {{\"score\": }}" - completion = self.client.chat.completions.create( - model=self.deployment, - messages=[{"role": "user", "content": prompt}], - temperature=0, - response_format={"type": "json_object"} - ) - result = json.loads(completion.choices[0].message.content) - return {"helpfulness": result["score"]} -``` - -## Log to Foundry Project - -```python -from azure.ai.projects import AIProjectClient -from azure.identity import DefaultAzureCredential - -project = AIProjectClient.from_connection_string( - conn_str=os.environ["AIPROJECT_CONNECTION_STRING"], - credential=DefaultAzureCredential() -) - -result = evaluate( - data="data.jsonl", - evaluators={"groundedness": groundedness}, - azure_ai_project=project.scope, # Logs results to Foundry - tags={"version": "1.0", "experiment": "baseline"} -) - -print(f"View results: {result['studio_url']}") -``` - -## Red Team Adversarial Testing - -```python -from azure.ai.evaluation.red_team import RedTeam, AttackStrategy -from azure.identity import DefaultAzureCredential - -red_team = RedTeam( - azure_ai_project=azure_ai_project, - credential=DefaultAzureCredential() -) - -# Run adversarial scan against your application -result = await red_team.scan( - target=my_chat_app, # Your application callable - risk_categories=["violence", "hate_unfairness", "sexual", "self_harm"], - attack_strategies=[ - AttackStrategy.DIRECT, - AttackStrategy.MultiTurn, - AttackStrategy.Crescendo - ], - attack_success_thresholds={"violence": 3, "hate_unfairness": 3} -) - -print(f"Attack success rate: {result.attack_success_rate}") -``` - -## Multimodal Evaluation - -```python -from azure.ai.evaluation import ContentSafetyEvaluator - -safety = ContentSafetyEvaluator(azure_ai_project=azure_ai_project) - -# Evaluate conversations with images -conversation = { - "messages": [ - {"role": "user", "content": [ - {"type": "text", "text": "Describe this image"}, - {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} - ]}, - {"role": "assistant", "content": [ - {"type": "text", "text": "The image shows..."} - ]} - ] -} - -result = safety(conversation=conversation) -``` - -## Evaluator Reference - -| Evaluator | Type | Metrics | -|-----------|------|---------| -| `GroundednessEvaluator` | AI | groundedness (1-5) | -| `GroundednessProEvaluator` | Service | groundedness (1-5) | -| `RelevanceEvaluator` | AI | relevance (1-5) | -| `CoherenceEvaluator` | AI | coherence (1-5) | -| `FluencyEvaluator` | AI | fluency (1-5) | -| `SimilarityEvaluator` | AI | similarity (1-5) | -| `RetrievalEvaluator` | AI | retrieval (1-5) | -| `F1ScoreEvaluator` | NLP | f1_score (0-1) | -| `RougeScoreEvaluator` | NLP | rouge scores | -| `BleuScoreEvaluator` | NLP | bleu_score (0-1) | -| `IntentResolutionEvaluator` | Agent | intent_resolution (1-5) | -| 
`ResponseCompletenessEvaluator` | Agent | response_completeness (1-5) | -| `TaskAdherenceEvaluator` | Agent | task_adherence (1-5) | -| `ToolCallAccuracyEvaluator` | Agent | tool_call_accuracy (1-5) | -| `ViolenceEvaluator` | Safety | violence (0-7) | -| `SexualEvaluator` | Safety | sexual (0-7) | -| `SelfHarmEvaluator` | Safety | self_harm (0-7) | -| `HateUnfairnessEvaluator` | Safety | hate_unfairness (0-7) | -| `CodeVulnerabilityEvaluator` | Safety | code vulnerabilities | -| `UngroundedAttributesEvaluator` | Safety | ungrounded attributes | -| `QAEvaluator` | Composite | All quality metrics | -| `ContentSafetyEvaluator` | Composite | All safety metrics | - -## Best Practices - -1. **Use composite evaluators** for comprehensive assessment -2. **Map columns correctly** — mismatched columns cause silent failures -3. **Log to Foundry** for tracking and comparison across runs with `tags` -4. **Create custom evaluators** for domain-specific metrics -5. **Use NLP evaluators** when you have ground truth answers -6. **Safety evaluators require** Azure AI project scope -7. **Batch evaluation** is more efficient than single-row loops -8. **Use graders** for structured evaluation with Azure OpenAI's grading API -9. **Agent evaluators** for AI agents with tool calls -10. **RedTeam scanning** for adversarial safety testing before deployment -11. **Use `is_reasoning_model=True`** when evaluating with o1/o3 models - -## Reference Files - -| File | Contents | -|------|----------| -| [references/built-in-evaluators.md](references/built-in-evaluators.md) | Detailed patterns for AI-assisted, NLP-based, Safety, and Agent evaluators with configuration tables | -| [references/custom-evaluators.md](references/custom-evaluators.md) | Creating code-based and prompt-based custom evaluators, testing patterns | -| [scripts/run_batch_evaluation.py](scripts/run_batch_evaluation.py) | CLI tool for running batch evaluations with quality, safety, agent, and custom evaluators | diff --git a/.github/skills/azure-ai-evaluation-py/references/acceptance-criteria.md b/.github/skills/azure-ai-evaluation-py/references/acceptance-criteria.md deleted file mode 100644 index 61f4bc0..0000000 --- a/.github/skills/azure-ai-evaluation-py/references/acceptance-criteria.md +++ /dev/null @@ -1,352 +0,0 @@ -# Azure AI Evaluation SDK Acceptance Criteria - -**SDK**: `azure-ai-evaluation` -**Repository**: https://github.com/Azure/azure-sdk-for-python -**Commit**: `main` -**Purpose**: Skill testing acceptance criteria for validating generated code correctness - ---- - -## 1. 
Imports - -### 1.1 ✅ CORRECT: Core SDK Imports -```python -from azure.ai.evaluation import ( - # Core - evaluate, - AzureOpenAIModelConfiguration, - - # Quality Evaluators - GroundednessEvaluator, - GroundednessProEvaluator, - RelevanceEvaluator, - CoherenceEvaluator, - FluencyEvaluator, - SimilarityEvaluator, - RetrievalEvaluator, - - # NLP Evaluators - F1ScoreEvaluator, - RougeScoreEvaluator, - GleuScoreEvaluator, - BleuScoreEvaluator, - MeteorScoreEvaluator, - - # Safety Evaluators - ViolenceEvaluator, - SexualEvaluator, - SelfHarmEvaluator, - HateUnfairnessEvaluator, - IndirectAttackEvaluator, - ProtectedMaterialEvaluator, - CodeVulnerabilityEvaluator, - UngroundedAttributesEvaluator, - - # Agent Evaluators - IntentResolutionEvaluator, - ResponseCompletenessEvaluator, - TaskAdherenceEvaluator, - ToolCallAccuracyEvaluator, - - # Composite Evaluators - QAEvaluator, - ContentSafetyEvaluator, - - # Graders - AzureOpenAILabelGrader, - AzureOpenAIStringCheckGrader, - AzureOpenAITextSimilarityGrader, - AzureOpenAIScoreModelGrader, - AzureOpenAIPythonGrader, - - # Custom evaluator decorator - evaluator, -) -``` - -### 1.2 ✅ CORRECT: Authentication Imports -```python -from azure.identity import DefaultAzureCredential -``` - -### 1.3 ❌ INCORRECT: Wrong Import Paths -```python -# WRONG - evaluators are not in a submodule -from azure.ai.evaluation.evaluators import GroundednessEvaluator - -# WRONG - model configuration is not under models -from azure.ai.evaluation.models import AzureOpenAIModelConfiguration - -# WRONG - non-existent imports -from azure.ai.evaluation import Evaluator -from azure.ai.evaluation import PromptChatTarget # Does not exist -``` - ---- - -## 2. Evaluator setup - -### 2.1 ✅ CORRECT: Dict Model Configuration (API key) -```python -model_config = { - "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"], - "api_key": os.environ["AZURE_OPENAI_API_KEY"], - "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"], -} -``` - -### 2.2 ✅ CORRECT: AzureOpenAIModelConfiguration (Managed Identity) -```python -from azure.ai.evaluation import AzureOpenAIModelConfiguration -from azure.identity import DefaultAzureCredential - -model_config = AzureOpenAIModelConfiguration( - azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], - credential=DefaultAzureCredential(), - azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT"], - api_version="2024-06-01", -) -``` - -### 2.3 ✅ CORRECT: Azure AI Project for Safety Evaluators -```python -azure_ai_project = { - "subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"], - "resource_group_name": os.environ["AZURE_RESOURCE_GROUP"], - "project_name": os.environ["AZURE_AI_PROJECT_NAME"], -} -``` - -### 2.4 ✅ CORRECT: Reasoning Model Configuration -```python -# For o1/o3 reasoning models -groundedness = GroundednessEvaluator(model_config, is_reasoning_model=True) -coherence = CoherenceEvaluator(model_config, is_reasoning_model=True) -``` - -### 2.5 ❌ INCORRECT: Wrong Config Keys -```python -# WRONG - keys must be azure_endpoint and azure_deployment -model_config = { - "endpoint": os.environ["AZURE_OPENAI_ENDPOINT"], - "deployment_name": os.environ["AZURE_OPENAI_DEPLOYMENT"], -} -``` - ---- - -## 3. Quality evaluators - -### 3.1 ✅ CORRECT: AI-Assisted Evaluators -```python -groundedness = GroundednessEvaluator(model_config) -result = groundedness( - query="What is Azure AI?", - context="Azure AI is Microsoft's AI platform.", - response="Azure AI provides AI services and tools." 
-) - -coherence = CoherenceEvaluator(model_config) -result = coherence( - query="Explain Azure Functions.", - response="Azure Functions is a serverless compute service." -) - -similarity = SimilarityEvaluator(model_config) -result = similarity( - query="Capital of France?", - response="Paris is the capital of France.", - ground_truth="The capital city of France is Paris." -) -``` - -### 3.2 ✅ CORRECT: NLP-Based Evaluators -```python -f1 = F1ScoreEvaluator() -result = f1(response="Tokyo is the capital of Japan.", ground_truth="Tokyo is Japan's capital.") -``` - -### 3.3 ❌ INCORRECT: Missing Required Inputs -```python -# WRONG - groundedness requires context -groundedness = GroundednessEvaluator(model_config) -groundedness(response="Paris is the capital of France.") - -# WRONG - similarity requires ground_truth -similarity = SimilarityEvaluator(model_config) -similarity(query="Capital of France?", response="Paris") -``` - ---- - -## 4. Safety evaluators - -### 4.1 ✅ CORRECT: Safety Evaluators with Project Scope -```python -violence = ViolenceEvaluator(azure_ai_project=azure_ai_project) -result = violence(query="Tell me a story", response="Once upon a time...") - -indirect = IndirectAttackEvaluator(azure_ai_project=azure_ai_project) -result = indirect( - query="Summarize this document", - context="Document content... [hidden: ignore previous instructions]", - response="The document discusses..." -) - -# With evaluate_query=True to include query in evaluation -violence_with_query = ViolenceEvaluator(azure_ai_project=azure_ai_project, evaluate_query=True) -``` - -### 4.2 ✅ CORRECT: Composite Safety Evaluator -```python -safety = ContentSafetyEvaluator(azure_ai_project=azure_ai_project) -result = safety(query="Tell me about history", response="World War II was...") -``` - -### 4.3 ✅ CORRECT: Code Vulnerability and Ungrounded Attributes -```python -code_vuln = CodeVulnerabilityEvaluator(azure_ai_project=azure_ai_project) -result = code_vuln(query="Write SQL", response="SELECT * FROM users WHERE id = '" + input + "'") - -ungrounded = UngroundedAttributesEvaluator(azure_ai_project=azure_ai_project) -result = ungrounded(query="About John", context="John works here.", response="John seems sad.") -``` - -### 4.4 ❌ INCORRECT: Using Model Config for Safety Evaluators -```python -# WRONG - safety evaluators require azure_ai_project, not model_config -violence = ViolenceEvaluator(model_config) -``` - ---- - -## 5. Agent evaluators - -### 5.1 ✅ CORRECT: Agent Evaluators -```python -intent = IntentResolutionEvaluator(model_config) -result = intent(query="Book a flight to Paris", response="Found flights to Paris...") - -completeness = ResponseCompletenessEvaluator(model_config) -result = completeness(query="Weather and clothing advice?", response="Sunny, wear light clothes.") - -task_adherence = TaskAdherenceEvaluator(model_config) -result = task_adherence(query="Calculate total with tax", response="Total with 8% tax is $108.") - -tool_accuracy = ToolCallAccuracyEvaluator(model_config) -result = tool_accuracy( - query="Weather in Seattle?", - response="55°F and cloudy in Seattle.", - tool_calls=[{"name": "get_weather", "arguments": {"location": "Seattle"}}], - tool_definitions=[{"name": "get_weather", "parameters": {"location": {"type": "string"}}}] -) -``` - ---- - -## 6. 
Azure OpenAI Graders - -### 6.1 ✅ CORRECT: Grader Usage -```python -from azure.ai.evaluation import AzureOpenAILabelGrader, AzureOpenAIScoreModelGrader - -label_grader = AzureOpenAILabelGrader( - model_config=model_config, - labels=["positive", "negative", "neutral"], - passing_labels=["positive"] -) - -score_grader = AzureOpenAIScoreModelGrader( - model_config=model_config, - pass_threshold=0.7 -) - -# Use in evaluate() -result = evaluate( - data="data.jsonl", - evaluators={"sentiment": label_grader, "quality": score_grader} -) -``` - ---- - -## 7. Custom evaluators - -### 7.1 ✅ CORRECT: Decorated Function Evaluator -```python -from azure.ai.evaluation import evaluator - -@evaluator -def word_count_evaluator(response: str) -> dict: - return {"word_count": len(response.split())} -``` - -### 7.2 ✅ CORRECT: Class-Based Evaluator -```python -class DomainSpecificEvaluator: - def __init__(self, domain_terms: list[str]): - self.domain_terms = [term.lower() for term in domain_terms] - - def __call__(self, response: str) -> dict: - hits = sum(1 for term in self.domain_terms if term in response.lower()) - return {"domain_hits": hits} -``` - -### 7.3 ❌ INCORRECT: Non-Dict Return -```python -@evaluator -def bad_evaluator(response: str) -> float: - return 0.5 # WRONG - evaluators must return dict -``` - ---- - -## 8. Batch evaluation - -### 8.1 ✅ CORRECT: evaluate() with Column Mapping -```python -result = evaluate( - data="data.jsonl", - evaluators={ - "groundedness": groundedness, - "relevance": relevance, - }, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.query}", - "context": "${data.context}", - "response": "${data.response}", - } - } - }, - # Optional: Add tags for experiment tracking - tags={"experiment": "v1", "model": "gpt-4o"} -) -``` - -### 8.2 ✅ CORRECT: evaluate() on Target -```python -from my_app import chat_app - -result = evaluate( - data="queries.jsonl", - target=chat_app, - evaluators={"groundedness": groundedness}, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.query}", - "context": "${outputs.context}", - "response": "${outputs.response}", - } - } - }, -) -``` - -### 8.3 ❌ INCORRECT: Evaluators Not in Dict -```python -# WRONG - evaluators must be a dict of name -> evaluator -evaluate(data="data.jsonl", evaluators=[groundedness, relevance]) -``` diff --git a/.github/skills/azure-ai-evaluation-py/references/built-in-evaluators.md b/.github/skills/azure-ai-evaluation-py/references/built-in-evaluators.md deleted file mode 100644 index 349d1b5..0000000 --- a/.github/skills/azure-ai-evaluation-py/references/built-in-evaluators.md +++ /dev/null @@ -1,684 +0,0 @@ -# Built-in Evaluators Reference - -Comprehensive patterns for Azure AI Evaluation SDK's built-in evaluators. 
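The sections below document each evaluator's inputs and outputs individually. As a quick orientation, the sketch below wires two of them into a batch `evaluate()` run, mirroring the pattern shown in SKILL.md; it assumes a `data.jsonl` file with `query`, `context`, and `response` columns and uses the dict-style model configuration covered in the next section.

```python
import os

from azure.ai.evaluation import GroundednessEvaluator, RelevanceEvaluator, evaluate

# Dict-style model configuration (see "Model Configuration" below)
model_config = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "api_key": os.environ["AZURE_OPENAI_API_KEY"],
    "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"],
}

# Wire two built-in evaluators into a batch run over an assumed JSONL dataset
result = evaluate(
    data="data.jsonl",  # assumed file with query/context/response columns
    evaluators={
        "groundedness": GroundednessEvaluator(model_config),
        "relevance": RelevanceEvaluator(model_config),
    },
    evaluator_config={
        "default": {
            "column_mapping": {
                "query": "${data.query}",
                "context": "${data.context}",
                "response": "${data.response}",
            }
        }
    },
)

print(result["metrics"])
```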
- -## Model Configuration - -All AI-assisted evaluators require a model configuration: - -```python -from azure.ai.evaluation import AzureOpenAIModelConfiguration - -# Using API key authentication -model_config = AzureOpenAIModelConfiguration( - azure_endpoint="https://.openai.azure.com", - api_key="", - azure_deployment="gpt-4o-mini", - api_version="2024-06-01" -) - -# Using DefaultAzureCredential (recommended for production) -from azure.identity import DefaultAzureCredential - -model_config = AzureOpenAIModelConfiguration( - azure_endpoint="https://.openai.azure.com", - credential=DefaultAzureCredential(), - azure_deployment="gpt-4o-mini", - api_version="2024-06-01" -) -``` - -## Azure AI Project Configuration - -Safety evaluators and Foundry logging require an Azure AI project scope: - -```python -# Option 1: Dict configuration -azure_ai_project = { - "subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"], - "resource_group_name": os.environ["AZURE_RESOURCE_GROUP"], - "project_name": os.environ["AZURE_AI_PROJECT_NAME"], -} - -# Option 2: From AIProjectClient -from azure.ai.projects import AIProjectClient -from azure.identity import DefaultAzureCredential - -project = AIProjectClient.from_connection_string( - conn_str="", - credential=DefaultAzureCredential() -) -azure_ai_project = project.scope -``` - -## AI-Assisted Quality Evaluators - -### GroundednessEvaluator - -Measures whether the response is factually grounded in the provided context. - -```python -from azure.ai.evaluation import GroundednessEvaluator - -groundedness = GroundednessEvaluator(model_config) - -result = groundedness( - query="What services does Azure AI provide?", - context="Azure AI provides cognitive services including vision, speech, " - "language understanding, and decision-making APIs.", - response="Azure AI offers vision and speech services." -) - -# Returns: -# { -# "groundedness": 5, # Score 1-5 -# "gpt_groundedness": 5, # Raw GPT score -# "groundedness_reason": "...", # Explanation -# "groundedness_result": "pass", # pass/fail based on threshold -# "groundedness_threshold": 3, -# "groundedness_prompt_tokens": ..., -# "groundedness_completion_tokens": ..., -# "groundedness_model": "gpt-4o-mini" -# } - -# For reasoning models (o1/o3) -groundedness_reasoning = GroundednessEvaluator(model_config, is_reasoning_model=True) -``` - -**Input Requirements:** -- `query`: The user's question -- `context`: Source documents/information -- `response`: The model's response to evaluate - -### GroundednessProEvaluator - -Service-based groundedness evaluation (no model config needed). - -```python -from azure.ai.evaluation import GroundednessProEvaluator - -groundedness_pro = GroundednessProEvaluator(azure_ai_project=azure_ai_project) - -result = groundedness_pro( - query="What is Azure?", - context="Azure is Microsoft's cloud platform...", - response="Azure provides cloud services." -) -``` - -### RelevanceEvaluator - -Measures how well the response addresses the query. - -```python -from azure.ai.evaluation import RelevanceEvaluator - -relevance = RelevanceEvaluator(model_config) - -result = relevance( - query="How do I authenticate with Azure?", - context="Azure supports multiple authentication methods...", - response="Use DefaultAzureCredential for automatic credential discovery." -) - -# Score 1-5: 5 = directly addresses query, 1 = completely irrelevant -``` - -### CoherenceEvaluator - -Measures logical flow and consistency of the response. 
- -```python -from azure.ai.evaluation import CoherenceEvaluator - -coherence = CoherenceEvaluator(model_config) - -# Note: CoherenceEvaluator only needs query and response -result = coherence( - query="Explain how Azure Functions work.", - response="Azure Functions is a serverless compute service. " - "It triggers based on events. You write code that runs on demand." -) - -# Score 1-5: 5 = logically coherent, 1 = disjointed/contradictory -``` - -### FluencyEvaluator - -Measures grammatical correctness and natural language quality. - -```python -from azure.ai.evaluation import FluencyEvaluator - -fluency = FluencyEvaluator(model_config) - -result = fluency( - query="What is Azure?", - response="Azure is Microsoft's cloud computing platform that provides " - "a wide range of services for building and deploying applications." -) - -# Score 1-5: 5 = perfectly fluent, 1 = poor grammar/unnatural -``` - -### SimilarityEvaluator - -Measures semantic similarity between response and ground truth. - -```python -from azure.ai.evaluation import SimilarityEvaluator - -similarity = SimilarityEvaluator(model_config) - -result = similarity( - query="What is the capital of France?", - response="Paris is the capital of France.", - ground_truth="The capital city of France is Paris." -) - -# Score 1-5: 5 = semantically identical, 1 = completely different -``` - -### RetrievalEvaluator - -Measures quality of retrieved documents for RAG scenarios. - -```python -from azure.ai.evaluation import RetrievalEvaluator - -retrieval = RetrievalEvaluator(model_config) - -result = retrieval( - query="How to configure Azure Storage?", - context="Azure Storage can be configured through the Azure Portal. " - "You can set replication, access tiers, and networking options." -) - -# Score 1-5: 5 = highly relevant retrieval, 1 = irrelevant documents -``` - -## NLP-Based Evaluators - -These evaluators use traditional NLP metrics and don't require a model. - -### F1ScoreEvaluator - -Token-level F1 score between response and ground truth. - -```python -from azure.ai.evaluation import F1ScoreEvaluator - -f1 = F1ScoreEvaluator() - -result = f1( - response="The quick brown fox jumps over the lazy dog", - ground_truth="A quick brown fox jumped over a lazy dog" -) - -# Returns: -# { -# "f1_score": 0.7272... # Score 0-1 -# } -``` - -### RougeScoreEvaluator - -ROUGE scores for summarization quality. - -```python -from azure.ai.evaluation import RougeScoreEvaluator - -rouge = RougeScoreEvaluator(rouge_type="rouge1") # rouge1, rouge2, rougeL, rougeLsum - -result = rouge( - response="Azure provides cloud computing services.", - ground_truth="Azure is Microsoft's cloud computing platform." -) - -# Returns: -# { -# "rouge1_precision": 0.5, -# "rouge1_recall": 0.5, -# "rouge1_fmeasure": 0.5 -# } -``` - -**ROUGE Types:** -- `rouge1`: Unigram overlap -- `rouge2`: Bigram overlap -- `rougeL`: Longest common subsequence -- `rougeLsum`: Summary-level LCS - -### BleuScoreEvaluator - -BLEU score for translation/generation quality. - -```python -from azure.ai.evaluation import BleuScoreEvaluator - -bleu = BleuScoreEvaluator() - -result = bleu( - response="The cat sat on the mat.", - ground_truth="A cat is sitting on the mat." -) - -# Returns: -# { -# "bleu_score": 0.3... # Score 0-1 -# } -``` - -### GleuScoreEvaluator - -GLEU (Google-BLEU) variant optimized for sentence-level evaluation. 
- -```python -from azure.ai.evaluation import GleuScoreEvaluator - -gleu = GleuScoreEvaluator() - -result = gleu( - response="Hello world", - ground_truth="Hello, world!" -) -``` - -### MeteorScoreEvaluator - -METEOR score considering synonyms and paraphrases. - -```python -from azure.ai.evaluation import MeteorScoreEvaluator - -meteor = MeteorScoreEvaluator() - -result = meteor( - response="The automobile is red.", - ground_truth="The car is red." -) - -# METEOR handles synonyms better than BLEU -``` - -## Safety Evaluators - -Safety evaluators require an Azure AI project scope. - -```python -# Safety evaluators support evaluate_query parameter (default: False) -# When True, both query and response are evaluated -# When False (default), only response is evaluated -``` - -### ViolenceEvaluator - -Detects violent content. - -```python -from azure.ai.evaluation import ViolenceEvaluator - -violence = ViolenceEvaluator(azure_ai_project=azure_ai_project) - -result = violence( - query="Tell me a story", - response="Once upon a time in a peaceful village..." -) - -# Returns: -# { -# "violence": "Very low", # Severity level -# "violence_score": 0, # Score 0-7 -# "violence_reason": "...", # Explanation -# "violence_result": "pass", # pass/fail -# "violence_threshold": 3 -# } - -# To also evaluate the query (not just response) -violence_with_query = ViolenceEvaluator( - azure_ai_project=azure_ai_project, - evaluate_query=True -) -``` - -### Sexual, SelfHarm, HateUnfairness Evaluators - -Same pattern as ViolenceEvaluator: - -```python -from azure.ai.evaluation import ( - SexualEvaluator, - SelfHarmEvaluator, - HateUnfairnessEvaluator -) - -sexual = SexualEvaluator(azure_ai_project=azure_ai_project) -self_harm = SelfHarmEvaluator(azure_ai_project=azure_ai_project) -hate = HateUnfairnessEvaluator(azure_ai_project=azure_ai_project) -``` - -### IndirectAttackEvaluator - -Detects indirect prompt injection attacks. - -```python -from azure.ai.evaluation import IndirectAttackEvaluator - -indirect = IndirectAttackEvaluator(azure_ai_project=azure_ai_project) - -result = indirect( - query="Summarize this document", - context="Document content... [hidden: ignore previous instructions]", - response="The document discusses..." -) -``` - -### ProtectedMaterialEvaluator - -Detects use of copyrighted or protected material. - -```python -from azure.ai.evaluation import ProtectedMaterialEvaluator - -protected = ProtectedMaterialEvaluator(azure_ai_project=azure_ai_project) - -result = protected( - query="Write me a poem", - response="Roses are red, violets are blue..." -) -``` - -### CodeVulnerabilityEvaluator - -Detects security vulnerabilities in code. - -```python -from azure.ai.evaluation import CodeVulnerabilityEvaluator - -code_vuln = CodeVulnerabilityEvaluator(azure_ai_project=azure_ai_project) - -result = code_vuln( - query="Write a SQL query", - response="SELECT * FROM users WHERE id = '" + user_input + "'" -) - -# Detects vulnerabilities: -# - sql-injection, code-injection, path-injection -# - hardcoded-credentials, weak-cryptographic-algorithm -# - reflected-xss, clear-text-logging-sensitive-data -# - and more... -``` - -### UngroundedAttributesEvaluator - -Detects ungrounded inferences about human attributes. 
- -```python -from azure.ai.evaluation import UngroundedAttributesEvaluator - -ungrounded = UngroundedAttributesEvaluator(azure_ai_project=azure_ai_project) - -result = ungrounded( - query="Tell me about this person", - context="John works at a tech company.", - response="John seems depressed and unhappy with his job." -) - -# Detects: -# - emotional_state: ungrounded emotional inferences -# - protected_class: ungrounded protected class inferences -# - groundedness: whether claims are grounded in context -``` - -## Composite Evaluators - -### QAEvaluator - -Combines all quality metrics in one evaluator. - -```python -from azure.ai.evaluation import QAEvaluator - -qa = QAEvaluator(model_config) - -result = qa( - query="What is Azure?", - context="Azure is Microsoft's cloud platform...", - response="Azure is a cloud computing service by Microsoft.", - ground_truth="Azure is Microsoft's cloud computing platform." -) - -# Returns all quality metrics: -# - groundedness, relevance, coherence, fluency, similarity -``` - -### ContentSafetyEvaluator - -Combines all safety metrics in one evaluator. - -```python -from azure.ai.evaluation import ContentSafetyEvaluator - -safety = ContentSafetyEvaluator(azure_ai_project=azure_ai_project) - -result = safety( - query="Tell me about history", - response="World War II was a global conflict..." -) - -# Returns all safety metrics: -# - violence, sexual, self_harm, hate_unfairness -``` - -## Agent Evaluators - -Evaluators for AI agents with tool calling capabilities. - -### IntentResolutionEvaluator - -Evaluates whether the agent correctly understood and resolved user intent. - -```python -from azure.ai.evaluation import IntentResolutionEvaluator - -intent = IntentResolutionEvaluator(model_config) - -result = intent( - query="Book a flight to Paris for next Monday", - response="I've found several flights to Paris for Monday..." -) - -# Returns: -# { -# "intent_resolution": 4, # Score 1-5 -# "intent_resolution_reason": "...", -# "intent_resolution_result": "pass" -# } -``` - -### ResponseCompletenessEvaluator - -Evaluates whether the agent's response fully addresses the query. - -```python -from azure.ai.evaluation import ResponseCompletenessEvaluator - -completeness = ResponseCompletenessEvaluator(model_config) - -result = completeness( - query="What's the weather and what should I wear?", - response="The weather is sunny and 75°F. I recommend light clothing." -) -``` - -### TaskAdherenceEvaluator - -Evaluates whether the agent adhered to the assigned task. - -```python -from azure.ai.evaluation import TaskAdherenceEvaluator - -task_adherence = TaskAdherenceEvaluator(model_config) - -result = task_adherence( - query="Calculate the total cost including tax", - response="The total with 8% tax is $108." -) -``` - -### ToolCallAccuracyEvaluator - -Evaluates the accuracy of tool calls made by an agent. - -```python -from azure.ai.evaluation import ToolCallAccuracyEvaluator - -tool_accuracy = ToolCallAccuracyEvaluator(model_config) - -# Evaluate agent response with tool calls -result = tool_accuracy( - query="What's the weather in Seattle?", - response="The weather in Seattle is 55°F and cloudy.", - tool_calls=[ - { - "name": "get_weather", - "arguments": {"location": "Seattle"} - } - ], - tool_definitions=[ - { - "name": "get_weather", - "description": "Get weather for a location", - "parameters": {"location": {"type": "string"}} - } - ] -) -``` - -## Azure OpenAI Graders - -Grader classes for structured evaluation using Azure OpenAI's grading API. 
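Graders are typically passed to `evaluate()` alongside other evaluators, as shown in SKILL.md. A minimal sketch (the labels and `data.jsonl` file are illustrative assumptions, and `model_config` is the configuration from the Model Configuration section above):

```python
from azure.ai.evaluation import AzureOpenAILabelGrader, evaluate

# Classification grader; construction mirrors the subsection below
label_grader = AzureOpenAILabelGrader(
    model_config=model_config,  # assumed: defined as in "Model Configuration"
    labels=["positive", "negative", "neutral"],
    passing_labels=["positive"],
)

# Graders slot into evaluate() exactly like built-in evaluators
result = evaluate(
    data="data.jsonl",  # assumed JSONL dataset with a response column
    evaluators={"sentiment": label_grader},
)
print(result["metrics"])
```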
- -### AzureOpenAILabelGrader - -Classification-based grading with predefined labels. - -```python -from azure.ai.evaluation import AzureOpenAILabelGrader - -label_grader = AzureOpenAILabelGrader( - model_config=model_config, - labels=["positive", "negative", "neutral"], - passing_labels=["positive"] -) - -result = label_grader( - response="This product is amazing!" -) -``` - -### AzureOpenAIScoreModelGrader - -Numeric scoring with customizable thresholds. - -```python -from azure.ai.evaluation import AzureOpenAIScoreModelGrader - -score_grader = AzureOpenAIScoreModelGrader( - model_config=model_config, - pass_threshold=0.7 -) - -result = score_grader( - query="Explain photosynthesis", - response="Plants convert sunlight into energy..." -) -``` - -### AzureOpenAIStringCheckGrader - -String matching and validation. - -```python -from azure.ai.evaluation import AzureOpenAIStringCheckGrader - -string_grader = AzureOpenAIStringCheckGrader( - model_config=model_config, - expected_strings=["Azure", "cloud"] -) -``` - -### AzureOpenAITextSimilarityGrader - -Semantic similarity evaluation. - -```python -from azure.ai.evaluation import AzureOpenAITextSimilarityGrader - -similarity_grader = AzureOpenAITextSimilarityGrader( - model_config=model_config -) - -result = similarity_grader( - response="Paris is France's capital", - ground_truth="The capital of France is Paris" -) -``` - -## Evaluator Configuration Table - -| Evaluator | Type | Required Inputs | Score Range | -|-----------|------|-----------------|-------------| -| `GroundednessEvaluator` | AI | query, context, response | 1-5 | -| `GroundednessProEvaluator` | Service | query, context, response | 1-5 | -| `RelevanceEvaluator` | AI | query, context, response | 1-5 | -| `CoherenceEvaluator` | AI | query, response | 1-5 | -| `FluencyEvaluator` | AI | query, response | 1-5 | -| `SimilarityEvaluator` | AI | query, response, ground_truth | 1-5 | -| `RetrievalEvaluator` | AI | query, context | 1-5 | -| `F1ScoreEvaluator` | NLP | response, ground_truth | 0-1 | -| `RougeScoreEvaluator` | NLP | response, ground_truth | 0-1 | -| `BleuScoreEvaluator` | NLP | response, ground_truth | 0-1 | -| `IntentResolutionEvaluator` | Agent | query, response | 1-5 | -| `ResponseCompletenessEvaluator` | Agent | query, response | 1-5 | -| `TaskAdherenceEvaluator` | Agent | query, response | 1-5 | -| `ToolCallAccuracyEvaluator` | Agent | query, response, tool_calls | 1-5 | -| `ViolenceEvaluator` | Safety | query, response | 0-7 | -| `SexualEvaluator` | Safety | query, response | 0-7 | -| `SelfHarmEvaluator` | Safety | query, response | 0-7 | -| `HateUnfairnessEvaluator` | Safety | query, response | 0-7 | -| `CodeVulnerabilityEvaluator` | Safety | query, response | binary | -| `UngroundedAttributesEvaluator` | Safety | query, context, response | binary | - -## Async Evaluation - -All evaluators support async execution: - -```python -import asyncio -from azure.ai.evaluation import GroundednessEvaluator - -async def evaluate_async(): - groundedness = GroundednessEvaluator(model_config) - - result = await groundedness( - query="What is Azure?", - context="Azure is Microsoft's cloud...", - response="Azure is a cloud platform." - ) - return result - -result = asyncio.run(evaluate_async()) -``` - -## Best Practices - -1. **Choose appropriate evaluators** - Use NLP evaluators when you have ground truth, AI evaluators for subjective quality -2. **Batch evaluation** - Use `evaluate()` function for datasets rather than looping -3. 
**Safety first** - Always include safety evaluators for user-facing applications -4. **Log to Foundry** - Track evaluations over time with `azure_ai_project` parameter and `tags` -5. **Threshold configuration** - Set appropriate pass/fail thresholds for your use case -6. **Use `is_reasoning_model=True`** - When evaluating with o1/o3 reasoning models -7. **Agent evaluators** - Use IntentResolution, TaskAdherence, and ToolCallAccuracy for AI agents -8. **Graders for structured eval** - Use AzureOpenAI graders for classification and scoring tasks -9. **`evaluate_query` parameter** - Control whether queries are included in safety evaluation diff --git a/.github/skills/azure-ai-evaluation-py/references/custom-evaluators.md b/.github/skills/azure-ai-evaluation-py/references/custom-evaluators.md deleted file mode 100644 index 5216095..0000000 --- a/.github/skills/azure-ai-evaluation-py/references/custom-evaluators.md +++ /dev/null @@ -1,426 +0,0 @@ -# Custom Evaluators Reference - -Patterns for creating custom evaluators with Azure AI Evaluation SDK. - -## Code-Based Evaluators - -### Simple Function Evaluator - -Use the `@evaluator` decorator for simple metrics: - -```python -from azure.ai.evaluation import evaluator - -@evaluator -def word_count_evaluator(response: str) -> dict: - """Count words in response.""" - return {"word_count": len(response.split())} - -@evaluator -def response_length_evaluator(response: str) -> dict: - """Measure response length in characters.""" - return { - "char_count": len(response), - "is_concise": len(response) < 500 - } - -# Usage -result = word_count_evaluator(response="Hello world") -# {"word_count": 2} -``` - -### Multi-Input Evaluator - -Evaluators can accept multiple inputs: - -```python -from azure.ai.evaluation import evaluator - -@evaluator -def keyword_coverage_evaluator( - query: str, - response: str, - required_keywords: list[str] | None = None -) -> dict: - """Check if response covers required keywords from query.""" - if required_keywords is None: - # Extract keywords from query - required_keywords = [w.lower() for w in query.split() if len(w) > 3] - - response_lower = response.lower() - covered = [kw for kw in required_keywords if kw in response_lower] - - coverage = len(covered) / len(required_keywords) if required_keywords else 1.0 - - return { - "keyword_coverage": coverage, - "keywords_found": covered, - "keywords_missing": [kw for kw in required_keywords if kw not in response_lower] - } -``` - -### Class-Based Evaluator - -For evaluators needing initialization or state: - -```python -from azure.ai.evaluation import evaluator - -class DomainSpecificEvaluator: - """Evaluator with domain-specific vocabulary.""" - - def __init__(self, domain_terms: list[str], threshold: float = 0.5): - self.domain_terms = [t.lower() for t in domain_terms] - self.threshold = threshold - - def __call__(self, response: str) -> dict: - response_lower = response.lower() - matches = sum(1 for term in self.domain_terms if term in response_lower) - score = matches / len(self.domain_terms) if self.domain_terms else 0 - - return { - "domain_relevance": score, - "domain_terms_found": matches, - "passes_threshold": score >= self.threshold - } - -# Usage -azure_evaluator = DomainSpecificEvaluator( - domain_terms=["azure", "cloud", "microsoft", "deployment", "resource"], - threshold=0.4 -) - -result = azure_evaluator(response="Deploy your app to Azure cloud resources.") -``` - -### Async Evaluator - -For evaluators that need async operations: - -```python -import asyncio -from 
azure.ai.evaluation import evaluator - -@evaluator -async def async_validation_evaluator(response: str, context: str) -> dict: - """Async evaluator for external validation.""" - # Simulate async validation (e.g., external API call) - await asyncio.sleep(0.1) - - # Check factual consistency - context_words = set(context.lower().split()) - response_words = set(response.lower().split()) - overlap = len(context_words & response_words) - - return { - "context_overlap": overlap, - "validation_status": "valid" if overlap > 5 else "needs_review" - } -``` - -## Prompt-Based Evaluators - -### Using Azure OpenAI Client - -Create evaluators that use LLM judgment: - -```python -from azure.ai.evaluation import AzureOpenAIModelConfiguration - -class PromptBasedEvaluator: - """LLM-based evaluator using custom prompts.""" - - EVALUATION_PROMPT = """You are an expert evaluator. Rate the following response. - -Query: {query} -Response: {response} - -Rate the response on a scale of 1-5 for: -1. Accuracy: Is the information correct? -2. Completeness: Does it fully answer the query? -3. Clarity: Is it easy to understand? - -Return ONLY a JSON object with keys: accuracy, completeness, clarity (integers 1-5). -""" - - def __init__(self, model_config: dict): - from openai import AzureOpenAI - - self.client = AzureOpenAI( - azure_endpoint=model_config["azure_endpoint"], - api_key=model_config.get("api_key"), - api_version=model_config.get("api_version", "2024-06-01") - ) - self.deployment = model_config["azure_deployment"] - - def __call__(self, query: str, response: str) -> dict: - import json - - prompt = self.EVALUATION_PROMPT.format(query=query, response=response) - - completion = self.client.chat.completions.create( - model=self.deployment, - messages=[{"role": "user", "content": prompt}], - temperature=0, - response_format={"type": "json_object"} - ) - - result = json.loads(completion.choices[0].message.content) - - # Add aggregate score - result["overall_score"] = ( - result["accuracy"] + result["completeness"] + result["clarity"] - ) / 3 - - return result -``` - -### Multi-Criteria Prompt Evaluator - -```python -class MultiCriteriaEvaluator: - """Evaluate against multiple criteria with detailed feedback.""" - - CRITERIA = { - "technical_accuracy": "Is the technical information correct and precise?", - "best_practices": "Does it follow industry best practices?", - "security": "Are security considerations addressed?", - "performance": "Are performance implications considered?" - } - - PROMPT_TEMPLATE = """Evaluate this response against the criterion. - -Query: {query} -Response: {response} -Context: {context} - -Criterion: {criterion_name} -Definition: {criterion_definition} - -Provide: -1. Score (1-5): 1=poor, 5=excellent -2. 
Reason: Brief explanation (1-2 sentences) - -Return JSON: {{"score": , "reason": ""}} -""" - - def __init__(self, model_config: dict, criteria: dict | None = None): - from openai import AzureOpenAI - - self.client = AzureOpenAI( - azure_endpoint=model_config["azure_endpoint"], - api_key=model_config.get("api_key"), - api_version=model_config.get("api_version", "2024-06-01") - ) - self.deployment = model_config["azure_deployment"] - self.criteria = criteria or self.CRITERIA - - def __call__( - self, - query: str, - response: str, - context: str = "" - ) -> dict: - import json - - results = {} - scores = [] - - for name, definition in self.criteria.items(): - prompt = self.PROMPT_TEMPLATE.format( - query=query, - response=response, - context=context, - criterion_name=name, - criterion_definition=definition - ) - - completion = self.client.chat.completions.create( - model=self.deployment, - messages=[{"role": "user", "content": prompt}], - temperature=0, - response_format={"type": "json_object"} - ) - - criterion_result = json.loads(completion.choices[0].message.content) - results[f"{name}_score"] = criterion_result["score"] - results[f"{name}_reason"] = criterion_result["reason"] - scores.append(criterion_result["score"]) - - results["aggregate_score"] = sum(scores) / len(scores) - return results -``` - -## Composite Custom Evaluators - -### Combining Multiple Evaluators - -```python -from azure.ai.evaluation import ( - GroundednessEvaluator, - RelevanceEvaluator, - evaluate -) - -class ComprehensiveEvaluator: - """Combine built-in and custom evaluators.""" - - def __init__(self, model_config: dict): - self.groundedness = GroundednessEvaluator(model_config) - self.relevance = RelevanceEvaluator(model_config) - self.custom_domain = DomainSpecificEvaluator( - domain_terms=["azure", "cloud", "api"] - ) - - def __call__( - self, - query: str, - context: str, - response: str - ) -> dict: - results = {} - - # Run built-in evaluators - ground_result = self.groundedness( - query=query, context=context, response=response - ) - rel_result = self.relevance( - query=query, context=context, response=response - ) - - # Run custom evaluator - domain_result = self.custom_domain(response=response) - - # Combine results - results.update(ground_result) - results.update(rel_result) - results.update(domain_result) - - # Calculate weighted score - results["composite_score"] = ( - ground_result.get("groundedness", 0) * 0.4 + - rel_result.get("relevance", 0) * 0.4 + - domain_result.get("domain_relevance", 0) * 5 * 0.2 # Scale to 1-5 - ) - - return results -``` - -## Using Custom Evaluators in Batch Evaluation - -### With evaluate() Function - -```python -from azure.ai.evaluation import evaluate - -# Define custom evaluators -@evaluator -def format_checker(response: str) -> dict: - has_code = "```" in response - has_list = any(line.strip().startswith(("-", "*", "1.")) - for line in response.split("\n")) - return { - "has_code_blocks": has_code, - "has_lists": has_list, - "is_structured": has_code or has_list - } - -domain_eval = DomainSpecificEvaluator(["python", "azure", "sdk"]) - -# Run batch evaluation -result = evaluate( - data="test_data.jsonl", - evaluators={ - "format": format_checker, - "domain": domain_eval, - "groundedness": GroundednessEvaluator(model_config) - }, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.question}", - "context": "${data.context}", - "response": "${data.answer}" - } - } - } -) - -print(result["metrics"]) -``` - -### Column Mapping for Custom Evaluators - 
-```python -result = evaluate( - data="data.jsonl", - evaluators={ - "keyword_coverage": keyword_coverage_evaluator - }, - evaluator_config={ - "keyword_coverage": { - "column_mapping": { - "query": "${data.user_query}", - "response": "${data.model_response}", - "required_keywords": "${data.expected_keywords}" - } - } - } -) -``` - -## Evaluator Testing Patterns - -### Unit Testing Custom Evaluators - -```python -import pytest -from my_evaluators import word_count_evaluator, DomainSpecificEvaluator - -class TestWordCountEvaluator: - def test_empty_response(self): - result = word_count_evaluator(response="") - assert result["word_count"] == 0 - - def test_simple_response(self): - result = word_count_evaluator(response="Hello world") - assert result["word_count"] == 2 - - def test_multiline_response(self): - result = word_count_evaluator(response="Hello\nworld\ntest") - assert result["word_count"] == 3 - -class TestDomainSpecificEvaluator: - @pytest.fixture - def evaluator(self): - return DomainSpecificEvaluator( - domain_terms=["azure", "cloud"], - threshold=0.5 - ) - - def test_full_coverage(self, evaluator): - result = evaluator(response="Azure cloud services") - assert result["domain_relevance"] == 1.0 - assert result["passes_threshold"] is True - - def test_partial_coverage(self, evaluator): - result = evaluator(response="Deploy to Azure") - assert result["domain_relevance"] == 0.5 - assert result["passes_threshold"] is True - - def test_no_coverage(self, evaluator): - result = evaluator(response="Hello world") - assert result["domain_relevance"] == 0.0 - assert result["passes_threshold"] is False -``` - -## Best Practices - -1. **Return dictionaries** - All evaluators must return `dict` with metric names as keys -2. **Use descriptive metric names** - Include evaluator context in key names (e.g., `domain_relevance` not just `score`) -3. **Handle edge cases** - Empty inputs, missing fields, None values -4. **Keep evaluators focused** - One evaluator = one concept (combine with composite evaluators) -5. **Document input requirements** - Clear docstrings explaining expected inputs -6. **Test thoroughly** - Unit tests for all custom evaluators before batch evaluation -7. **Consider async** - Use async for evaluators with I/O operations -8. **Normalize scores** - Keep scores in consistent ranges (0-1 or 1-5) diff --git a/.github/skills/azure-ai-evaluation-py/scripts/run_batch_evaluation.py b/.github/skills/azure-ai-evaluation-py/scripts/run_batch_evaluation.py deleted file mode 100644 index 7c549d7..0000000 --- a/.github/skills/azure-ai-evaluation-py/scripts/run_batch_evaluation.py +++ /dev/null @@ -1,400 +0,0 @@ -#!/usr/bin/env python3 -""" -Batch Evaluation CLI Tool - -Run batch evaluations on test datasets using Azure AI Evaluation SDK. -Supports quality, safety, agent, and custom evaluators with Foundry integration. 
- -Usage: - python run_batch_evaluation.py --data test_data.jsonl --evaluators groundedness relevance - python run_batch_evaluation.py --data test_data.jsonl --evaluators qa --output results.json - python run_batch_evaluation.py --data test_data.jsonl --safety --log-to-foundry - python run_batch_evaluation.py --data test_data.jsonl --agent --evaluators intent_resolution task_adherence - python run_batch_evaluation.py --data test_data.jsonl --tags experiment=v1 model=gpt-4o - -Environment Variables: - AZURE_OPENAI_ENDPOINT - Azure OpenAI endpoint URL - AZURE_OPENAI_API_KEY - Azure OpenAI API key (optional if using DefaultAzureCredential) - AZURE_OPENAI_DEPLOYMENT - Model deployment name (default: gpt-4o-mini) - AZURE_SUBSCRIPTION_ID - Azure subscription ID (for safety evaluators) - AZURE_RESOURCE_GROUP - Azure resource group (for safety evaluators) - AZURE_AI_PROJECT_NAME - Azure AI project name (for safety evaluators) -""" - -import argparse -import json -import os -import sys -from pathlib import Path -from typing import Any - -from azure.identity import DefaultAzureCredential - - -# Available evaluators by category -QUALITY_EVALUATORS = [ - "groundedness", - "groundedness_pro", - "relevance", - "coherence", - "fluency", - "similarity", - "retrieval", -] -NLP_EVALUATORS = ["f1", "rouge", "bleu", "gleu", "meteor"] -SAFETY_EVALUATORS = [ - "violence", - "sexual", - "self_harm", - "hate_unfairness", - "code_vulnerability", - "ungrounded_attributes", -] -AGENT_EVALUATORS = [ - "intent_resolution", - "response_completeness", - "task_adherence", - "tool_call_accuracy", -] -COMPOSITE_EVALUATORS = ["qa", "content_safety"] - - -def get_model_config() -> dict[str, Any]: - """Build model configuration from environment variables.""" - endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT") - if not endpoint: - raise ValueError("AZURE_OPENAI_ENDPOINT environment variable required") - - api_key = os.environ.get("AZURE_OPENAI_API_KEY") - deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini") - - config = { - "azure_endpoint": endpoint, - "azure_deployment": deployment, - "api_version": "2024-06-01", - } - - if api_key: - config["api_key"] = api_key - else: - config["credential"] = DefaultAzureCredential() - - return config - - -def get_project_scope() -> dict[str, str] | None: - """Get Azure AI project scope for safety evaluators.""" - subscription_id = os.environ.get("AZURE_SUBSCRIPTION_ID") - resource_group = os.environ.get("AZURE_RESOURCE_GROUP") - project_name = os.environ.get("AZURE_AI_PROJECT_NAME") - - if not all([subscription_id, resource_group, project_name]): - return None - - return { - "subscription_id": subscription_id, - "resource_group_name": resource_group, - "project_name": project_name, - } - - -def build_evaluators( - evaluator_names: list[str], - model_config: dict[str, Any], - project_scope: dict[str, str] | None, - is_reasoning_model: bool = False, -) -> dict[str, Any]: - """Build evaluator instances from names.""" - from azure.ai.evaluation import ( - GroundednessEvaluator, - GroundednessProEvaluator, - RelevanceEvaluator, - CoherenceEvaluator, - FluencyEvaluator, - SimilarityEvaluator, - RetrievalEvaluator, - F1ScoreEvaluator, - RougeScoreEvaluator, - BleuScoreEvaluator, - GleuScoreEvaluator, - MeteorScoreEvaluator, - QAEvaluator, - IntentResolutionEvaluator, - ResponseCompletenessEvaluator, - TaskAdherenceEvaluator, - ToolCallAccuracyEvaluator, - ) - - evaluators = {} - - # Quality evaluators (AI-assisted) - quality_map = { - "groundedness": GroundednessEvaluator, - 
"relevance": RelevanceEvaluator, - "coherence": CoherenceEvaluator, - "fluency": FluencyEvaluator, - "similarity": SimilarityEvaluator, - "retrieval": RetrievalEvaluator, - } - - # Agent evaluators - agent_map = { - "intent_resolution": IntentResolutionEvaluator, - "response_completeness": ResponseCompletenessEvaluator, - "task_adherence": TaskAdherenceEvaluator, - "tool_call_accuracy": ToolCallAccuracyEvaluator, - } - - # NLP evaluators - nlp_map = { - "f1": F1ScoreEvaluator, - "rouge": RougeScoreEvaluator, - "bleu": BleuScoreEvaluator, - "gleu": GleuScoreEvaluator, - "meteor": MeteorScoreEvaluator, - } - - for name in evaluator_names: - if name in quality_map: - if is_reasoning_model: - evaluators[name] = quality_map[name](model_config, is_reasoning_model=True) - else: - evaluators[name] = quality_map[name](model_config) - elif name == "groundedness_pro": - if not project_scope: - print(f"Warning: Skipping {name} - requires Azure AI project config") - continue - evaluators[name] = GroundednessProEvaluator(azure_ai_project=project_scope) - elif name in agent_map: - evaluators[name] = agent_map[name](model_config) - elif name in nlp_map: - evaluators[name] = nlp_map[name]() - elif name == "qa": - evaluators[name] = QAEvaluator(model_config) - elif name in SAFETY_EVALUATORS or name == "content_safety": - if not project_scope: - print(f"Warning: Skipping {name} - requires Azure AI project config") - continue - evaluators[name] = build_safety_evaluator(name, project_scope) - else: - print(f"Warning: Unknown evaluator '{name}', skipping") - - return evaluators - - -def build_safety_evaluator(name: str, project_scope: dict[str, str]) -> Any: - """Build safety evaluator instance.""" - from azure.ai.evaluation import ( - ViolenceEvaluator, - SexualEvaluator, - SelfHarmEvaluator, - HateUnfairnessEvaluator, - ContentSafetyEvaluator, - CodeVulnerabilityEvaluator, - UngroundedAttributesEvaluator, - ) - - safety_map = { - "violence": ViolenceEvaluator, - "sexual": SexualEvaluator, - "self_harm": SelfHarmEvaluator, - "hate_unfairness": HateUnfairnessEvaluator, - "content_safety": ContentSafetyEvaluator, - "code_vulnerability": CodeVulnerabilityEvaluator, - "ungrounded_attributes": UngroundedAttributesEvaluator, - } - - return safety_map[name](azure_ai_project=project_scope) - - -def run_evaluation( - data_path: str, - evaluators: dict[str, Any], - column_mapping: dict[str, str], - project_scope: dict[str, str] | None = None, - log_to_foundry: bool = False, - tags: dict[str, str] | None = None, -) -> dict[str, Any]: - """Run batch evaluation.""" - from azure.ai.evaluation import evaluate - - eval_config = {"default": {"column_mapping": column_mapping}} - - kwargs = { - "data": data_path, - "evaluators": evaluators, - "evaluator_config": eval_config, - } - - if log_to_foundry and project_scope: - kwargs["azure_ai_project"] = project_scope - - if tags: - kwargs["tags"] = tags - - return evaluate(**kwargs) - - -def main(): - parser = argparse.ArgumentParser( - description="Run batch evaluation on test datasets", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__, - ) - - parser.add_argument("--data", "-d", required=True, help="Path to JSONL data file") - parser.add_argument( - "--evaluators", - "-e", - nargs="+", - default=["groundedness", "relevance", "coherence"], - help=f"Evaluators to run. 
Quality: {QUALITY_EVALUATORS}, " - f"NLP: {NLP_EVALUATORS}, Agent: {AGENT_EVALUATORS}, Composite: {COMPOSITE_EVALUATORS}", - ) - parser.add_argument( - "--safety", action="store_true", help="Include all safety evaluators" - ) - parser.add_argument( - "--agent", action="store_true", help="Include all agent evaluators" - ) - parser.add_argument( - "--reasoning-model", - action="store_true", - help="Use reasoning model configuration (for o1/o3 models)", - ) - parser.add_argument("--output", "-o", help="Output file for results (JSON)") - parser.add_argument( - "--log-to-foundry", action="store_true", help="Log results to Foundry project" - ) - parser.add_argument( - "--tags", - nargs="*", - help="Tags for experiment tracking (format: key=value)", - ) - parser.add_argument( - "--query-column", - default="query", - help="Column name for query in data (default: query)", - ) - parser.add_argument( - "--context-column", - default="context", - help="Column name for context in data (default: context)", - ) - parser.add_argument( - "--response-column", - default="response", - help="Column name for response in data (default: response)", - ) - parser.add_argument( - "--ground-truth-column", - default="ground_truth", - help="Column name for ground truth in data (default: ground_truth)", - ) - - args = parser.parse_args() - - # Validate data file - data_path = Path(args.data) - if not data_path.exists(): - print(f"Error: Data file not found: {args.data}") - sys.exit(1) - - # Build column mapping - column_mapping = { - "query": f"${{data.{args.query_column}}}", - "context": f"${{data.{args.context_column}}}", - "response": f"${{data.{args.response_column}}}", - "ground_truth": f"${{data.{args.ground_truth_column}}}", - } - - # Get configurations - try: - model_config = get_model_config() - except ValueError as e: - print(f"Error: {e}") - sys.exit(1) - - project_scope = get_project_scope() - - # Parse tags - tags = None - if args.tags: - tags = {} - for tag in args.tags: - if "=" in tag: - key, value = tag.split("=", 1) - tags[key] = value - - # Build evaluator list - evaluator_names = list(args.evaluators) - if args.safety: - evaluator_names.extend(SAFETY_EVALUATORS) - if args.agent: - evaluator_names.extend(AGENT_EVALUATORS) - - # Build evaluators - evaluators = build_evaluators( - evaluator_names, - model_config, - project_scope, - is_reasoning_model=args.reasoning_model, - ) - - if not evaluators: - print("Error: No valid evaluators configured") - sys.exit(1) - - print(f"Running evaluation with: {list(evaluators.keys())}") - print(f"Data file: {args.data}") - if tags: - print(f"Tags: {tags}") - - # Run evaluation - try: - result = run_evaluation( - data_path=str(data_path), - evaluators=evaluators, - column_mapping=column_mapping, - project_scope=project_scope, - log_to_foundry=args.log_to_foundry, - tags=tags, - ) - except Exception as e: - print(f"Error during evaluation: {e}") - sys.exit(1) - - # Output results - metrics = result.get("metrics", {}) - - print("\n=== Evaluation Results ===") - for metric, value in sorted(metrics.items()): - if isinstance(value, float): - print(f" {metric}: {value:.4f}") - else: - print(f" {metric}: {value}") - - if "studio_url" in result: - print(f"\nView in Foundry: {result['studio_url']}") - - # Save to file if requested - if args.output: - output_path = Path(args.output) - with open(output_path, "w", encoding="utf-8") as f: - json.dump( - { - "metrics": metrics, - "studio_url": result.get("studio_url"), - "rows": result.get("rows", []), - }, - f, - indent=2, - 
default=str, - ) - print(f"\nResults saved to: {args.output}") - - print("\nEvaluation complete!") - - -if __name__ == "__main__": - main() diff --git a/.github/skills/azure-ai-projects-py/SKILL.md b/.github/skills/azure-ai-projects-py/SKILL.md index 74dd7b1..8965b17 100644 --- a/.github/skills/azure-ai-projects-py/SKILL.md +++ b/.github/skills/azure-ai-projects-py/SKILL.md @@ -6,7 +6,7 @@ package: azure-ai-projects # Azure AI Projects Python SDK (Foundry SDK) -Build AI applications on Azure AI Foundry using the `azure-ai-projects` SDK. +Build AI applications on Microsoft Foundry using the `azure-ai-projects` SDK. ## Installation @@ -284,9 +284,12 @@ agent = client.agents.create_agent( - [references/agents.md](references/agents.md): Agent operations with PromptAgentDefinition - [references/tools.md](references/tools.md): All agent tools with examples -- [references/evaluation.md](references/evaluation.md): Evaluation operations and built-in evaluators +- [references/evaluation.md](references/evaluation.md): Evaluation operations overview +- [references/built-in-evaluators.md](references/built-in-evaluators.md): Complete built-in evaluator reference +- [references/custom-evaluators.md](references/custom-evaluators.md): Code and prompt-based evaluator patterns - [references/connections.md](references/connections.md): Connection operations - [references/deployments.md](references/deployments.md): Deployment enumeration - [references/datasets-indexes.md](references/datasets-indexes.md): Dataset and index operations - [references/async-patterns.md](references/async-patterns.md): Async client usage - [references/api-reference.md](references/api-reference.md): Complete API reference for all 373 SDK exports (v2.0.0b4) +- [scripts/run_batch_evaluation.py](scripts/run_batch_evaluation.py): CLI tool for batch evaluations diff --git a/.github/skills/azure-ai-projects-py/references/built-in-evaluators.md b/.github/skills/azure-ai-projects-py/references/built-in-evaluators.md new file mode 100644 index 0000000..a8158fe --- /dev/null +++ b/.github/skills/azure-ai-projects-py/references/built-in-evaluators.md @@ -0,0 +1,427 @@ +# Built-in Evaluators Reference + +Complete reference for Microsoft Foundry's built-in evaluators using the `azure-ai-projects` SDK. 
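+
+The snippets in this reference assume an authenticated project client and `import os` for reading environment variables. A minimal setup sketch, mirroring the pattern used in [evaluation.md](evaluation.md):
+
+```python
+import os
+
+from azure.ai.projects import AIProjectClient
+from azure.identity import DefaultAzureCredential
+
+# Endpoint variable name follows the other samples in this skill.
+project_client = AIProjectClient(
+    endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"],
+    credential=DefaultAzureCredential(),
+)
+```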
+ +## Discovering Evaluators + +### List All Built-in Evaluators + +```python +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient + +endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"] + +with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, +): + evaluators = project_client.evaluators.list_latest_versions(type="builtin") + for e in evaluators: + print(f"{e.name}: {e.description}") + print(f" Categories: {e.categories}") +``` + +### Get Evaluator Schema + +Before using an evaluator, query its schema to discover required inputs: + +```python +evaluator = project_client.evaluators.get_version( + name="builtin.task_adherence", + version="latest" +) +print(f"Init Parameters: {evaluator.definition.init_parameters}") +print(f"Data Schema: {evaluator.definition.data_schema}") +print(f"Metrics: {evaluator.definition.metrics}") +``` + +## Using Built-in Evaluators + +All built-in evaluators use the `azure_ai_evaluator` type with `builtin.` prefix: + +```python +testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "my_coherence_check", # Your custom name for results + "evaluator_name": "builtin.coherence", # The actual evaluator + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}" + }, + "initialization_parameters": { + "deployment_name": "gpt-4o-mini" # Required for LLM-based evaluators + } + } +] +``` + +## Quality Evaluators + +### builtin.coherence + +Measures logical flow and consistency of the response. + +```python +{ + "type": "azure_ai_evaluator", + "name": "coherence", + "evaluator_name": "builtin.coherence", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Inputs:** query, response +**Output:** Score 1-5 (5 = highly coherent) + +### builtin.fluency + +Measures grammatical correctness and natural language quality. + +```python +{ + "type": "azure_ai_evaluator", + "name": "fluency", + "evaluator_name": "builtin.fluency", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Inputs:** query, response +**Output:** Score 1-5 (5 = perfectly fluent) + +### builtin.relevance + +Measures how well the response addresses the query given context. + +```python +{ + "type": "azure_ai_evaluator", + "name": "relevance", + "evaluator_name": "builtin.relevance", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "context": "{{item.context}}" + }, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Inputs:** query, response, context +**Output:** Score 1-5 (5 = highly relevant) + +### builtin.groundedness + +Measures whether the response is factually grounded in the provided context. + +```python +{ + "type": "azure_ai_evaluator", + "name": "groundedness", + "evaluator_name": "builtin.groundedness", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "context": "{{item.context}}" + }, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Inputs:** query, response, context +**Output:** Score 1-5 (5 = fully grounded) + +### builtin.response_completeness + +Measures whether the response fully addresses all aspects of the query. 
+ +```python +{ + "type": "azure_ai_evaluator", + "name": "response_completeness", + "evaluator_name": "builtin.response_completeness", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Inputs:** query, response +**Output:** Score 1-5 + +## Safety Evaluators + +Safety evaluators detect harmful content. They don't require `deployment_name`. + +### builtin.violence + +Detects violent content. + +```python +{ + "type": "azure_ai_evaluator", + "name": "violence", + "evaluator_name": "builtin.violence", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"} +} +``` + +**Inputs:** query, response +**Output:** pass/fail with severity score + +### builtin.sexual + +Detects inappropriate sexual content. + +```python +{ + "type": "azure_ai_evaluator", + "name": "sexual", + "evaluator_name": "builtin.sexual", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"} +} +``` + +### builtin.self_harm + +Detects content promoting or describing self-harm. + +```python +{ + "type": "azure_ai_evaluator", + "name": "self_harm", + "evaluator_name": "builtin.self_harm", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"} +} +``` + +### builtin.hate_unfairness + +Detects biased or hateful content. + +```python +{ + "type": "azure_ai_evaluator", + "name": "hate_unfairness", + "evaluator_name": "builtin.hate_unfairness", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"} +} +``` + +## Agent Evaluators + +Agent evaluators assess AI agent behavior and tool usage. + +### builtin.task_adherence + +Evaluates whether the agent follows its system instructions. + +```python +{ + "type": "azure_ai_evaluator", + "name": "task_adherence", + "evaluator_name": "builtin.task_adherence", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_items}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Note:** Use `{{sample.output_items}}` for agent responses to include tool call information. + +### builtin.intent_resolution + +Evaluates whether the agent correctly understood user intent. + +```python +{ + "type": "azure_ai_evaluator", + "name": "intent_resolution", + "evaluator_name": "builtin.intent_resolution", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_text}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +### builtin.task_completion + +Evaluates whether the agent completed the task end-to-end. + +```python +{ + "type": "azure_ai_evaluator", + "name": "task_completion", + "evaluator_name": "builtin.task_completion", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_items}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +### builtin.tool_call_accuracy + +Evaluates whether tool calls are correct (selection + parameters). + +```python +{ + "type": "azure_ai_evaluator", + "name": "tool_call_accuracy", + "evaluator_name": "builtin.tool_call_accuracy", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_items}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +### builtin.tool_call_success + +Evaluates whether tool calls executed without failures. 
+ +```python +{ + "type": "azure_ai_evaluator", + "name": "tool_call_success", + "evaluator_name": "builtin.tool_call_success", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_items}}"} +} +``` + +### builtin.tool_selection + +Evaluates whether the correct tools were selected. + +```python +{ + "type": "azure_ai_evaluator", + "name": "tool_selection", + "evaluator_name": "builtin.tool_selection", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_items}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +## NLP Evaluators + +NLP evaluators compare responses to ground truth without requiring an LLM. + +### builtin.f1_score + +Token-level F1 score between response and ground truth. + +```python +{ + "type": "azure_ai_evaluator", + "name": "f1", + "evaluator_name": "builtin.f1_score", + "data_mapping": {"response": "{{item.response}}", "ground_truth": "{{item.ground_truth}}"} +} +``` + +**Output:** Score 0-1 + +### builtin.bleu_score + +BLEU score for generation quality. + +```python +{ + "type": "azure_ai_evaluator", + "name": "bleu", + "evaluator_name": "builtin.bleu_score", + "data_mapping": {"response": "{{item.response}}", "ground_truth": "{{item.ground_truth}}"} +} +``` + +### builtin.rouge_score + +ROUGE score for summarization quality. + +```python +{ + "type": "azure_ai_evaluator", + "name": "rouge", + "evaluator_name": "builtin.rouge_score", + "data_mapping": {"response": "{{item.response}}", "ground_truth": "{{item.ground_truth}}"} +} +``` + +### builtin.similarity + +Semantic similarity between response and ground truth. + +```python +{ + "type": "azure_ai_evaluator", + "name": "similarity", + "evaluator_name": "builtin.similarity", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "ground_truth": "{{item.ground_truth}}" + }, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +## Evaluator Sets by Use Case + +### Quick Health Check + +```python +testing_criteria = [ + {"type": "azure_ai_evaluator", "name": "coherence", "evaluator_name": "builtin.coherence", ...}, + {"type": "azure_ai_evaluator", "name": "fluency", "evaluator_name": "builtin.fluency", ...}, + {"type": "azure_ai_evaluator", "name": "violence", "evaluator_name": "builtin.violence", ...}, +] +``` + +### Safety Audit + +```python +testing_criteria = [ + {"type": "azure_ai_evaluator", "name": "violence", "evaluator_name": "builtin.violence", ...}, + {"type": "azure_ai_evaluator", "name": "sexual", "evaluator_name": "builtin.sexual", ...}, + {"type": "azure_ai_evaluator", "name": "self_harm", "evaluator_name": "builtin.self_harm", ...}, + {"type": "azure_ai_evaluator", "name": "hate_unfairness", "evaluator_name": "builtin.hate_unfairness", ...}, +] +``` + +### Agent Evaluation + +```python +testing_criteria = [ + {"type": "azure_ai_evaluator", "name": "task_adherence", "evaluator_name": "builtin.task_adherence", ...}, + {"type": "azure_ai_evaluator", "name": "intent_resolution", "evaluator_name": "builtin.intent_resolution", ...}, + {"type": "azure_ai_evaluator", "name": "tool_call_accuracy", "evaluator_name": "builtin.tool_call_accuracy", ...}, +] +``` + +### RAG Evaluation + +```python +testing_criteria = [ + {"type": "azure_ai_evaluator", "name": "groundedness", "evaluator_name": "builtin.groundedness", ...}, + {"type": "azure_ai_evaluator", "name": "relevance", "evaluator_name": "builtin.relevance", ...}, + {"type": "azure_ai_evaluator", "name": "response_completeness", 
"evaluator_name": "builtin.response_completeness", ...}, +] +``` + +## Data Mapping Reference + +| Data Source | Response Mapping | Use Case | +|-------------|------------------|----------| +| JSONL dataset | `{{item.response}}` | Pre-recorded query/response pairs | +| Agent target | `{{sample.output_text}}` | Plain text response | +| Agent target | `{{sample.output_items}}` | Structured JSON with tool calls | + +**When to use `sample.output_items`:** +- Tool-related evaluators (tool_call_accuracy, tool_selection, etc.) +- Task adherence evaluator +- Any evaluator needing tool call context + +## Related Documentation + +- [Azure AI Projects Samples](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-projects/samples/evaluations) +- [Agent Evaluators](https://learn.microsoft.com/en-us/azure/ai-foundry/concepts/evaluation-evaluators/agent-evaluators) +- [RAG Evaluators](https://learn.microsoft.com/en-us/azure/ai-foundry/concepts/evaluation-evaluators/rag-evaluators) +- [Risk and Safety Evaluators](https://learn.microsoft.com/en-us/azure/ai-foundry/concepts/evaluation-evaluators/risk-safety-evaluators) diff --git a/.github/skills/azure-ai-projects-py/references/custom-evaluators.md b/.github/skills/azure-ai-projects-py/references/custom-evaluators.md new file mode 100644 index 0000000..03957c1 --- /dev/null +++ b/.github/skills/azure-ai-projects-py/references/custom-evaluators.md @@ -0,0 +1,450 @@ +# Custom Evaluators Reference + +Create custom evaluators when built-in evaluators don't meet your needs using the `azure-ai-projects` SDK. + +## Evaluator Types + +| Type | Best For | Requires LLM | +|------|----------|--------------| +| **Code-based** | Pattern matching, format validation, deterministic rules | No | +| **Prompt-based** | Subjective judgment, semantic analysis, nuanced evaluation | Yes | + +## Code-Based Evaluators + +Use Python code for deterministic evaluation logic. 
+ +### Basic Code Evaluator + +```python +from azure.ai.projects import AIProjectClient +from azure.ai.projects.models import ( + EvaluatorVersion, + EvaluatorCategory, + EvaluatorType, + CodeBasedEvaluatorDefinition, + EvaluatorMetric, + EvaluatorMetricType, + EvaluatorMetricDirection, +) +from azure.identity import DefaultAzureCredential +import os + +endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"] + +with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, +): + evaluator = project_client.evaluators.create_version( + name="word_count_evaluator", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Word Count", + description="Counts words in response and checks for conciseness", + definition=CodeBasedEvaluatorDefinition( + code_text=''' +def grade(sample, item) -> dict: + response = item.get("response", "") + word_count = len(response.split()) + return { + "word_count": word_count, + "is_concise": word_count < 100 + } +''', + data_schema={ + "type": "object", + "properties": { + "response": {"type": "string"} + }, + "required": ["response"] + }, + metrics={ + "word_count": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + desirable_direction=EvaluatorMetricDirection.DECREASE, + min_value=0, + max_value=10000, + ), + "is_concise": EvaluatorMetric( + type=EvaluatorMetricType.BINARY, + ), + }, + ), + ), + ) + print(f"Created evaluator: {evaluator.name} (version {evaluator.version})") +``` + +### Code Evaluator: Keyword Checker + +```python +evaluator = project_client.evaluators.create_version( + name="disclaimer_checker", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Disclaimer Checker", + description="Verifies required disclaimers are present in response", + definition=CodeBasedEvaluatorDefinition( + code_text=''' +def grade(sample, item) -> dict: + response = item.get("response", "").lower() + required_keywords = ["disclaimer", "not financial advice", "consult a professional"] + + found = [kw for kw in required_keywords if kw in response] + missing = [kw for kw in required_keywords if kw not in response] + + score = len(found) / len(required_keywords) if required_keywords else 1.0 + + return { + "compliance_score": score, + "missing_disclaimers": ", ".join(missing) if missing else "none", + "passes": score >= 0.8 + } +''', + data_schema={ + "type": "object", + "properties": {"response": {"type": "string"}}, + "required": ["response"] + }, + metrics={ + "compliance_score": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + desirable_direction=EvaluatorMetricDirection.INCREASE, + min_value=0.0, + max_value=1.0, + ), + "passes": EvaluatorMetric(type=EvaluatorMetricType.BINARY), + }, + ), + ), +) +``` + +### Code Evaluator: JSON Format Validator + +```python +evaluator = project_client.evaluators.create_version( + name="json_format_checker", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="JSON Format Validator", + description="Checks if response is valid JSON with required fields", + definition=CodeBasedEvaluatorDefinition( + code_text=''' +import json + +def grade(sample, item) -> dict: + response = item.get("response", "") + required_fields = item.get("required_fields", []) + + try: + parsed = json.loads(response) + is_valid_json = True + + if 
required_fields: + missing = [f for f in required_fields if f not in parsed] + has_required_fields = len(missing) == 0 + else: + has_required_fields = True + missing = [] + + except json.JSONDecodeError: + is_valid_json = False + has_required_fields = False + missing = required_fields + + return { + "is_valid_json": is_valid_json, + "has_required_fields": has_required_fields, + "missing_fields": ", ".join(missing) if missing else "none" + } +''', + data_schema={ + "type": "object", + "properties": { + "response": {"type": "string"}, + "required_fields": {"type": "array", "items": {"type": "string"}} + }, + "required": ["response"] + }, + metrics={ + "is_valid_json": EvaluatorMetric(type=EvaluatorMetricType.BINARY), + "has_required_fields": EvaluatorMetric(type=EvaluatorMetricType.BINARY), + }, + ), + ), +) +``` + +## Prompt-Based Evaluators + +Use LLM judgment for subjective evaluation. + +### Basic Prompt Evaluator + +```python +from azure.ai.projects.models import PromptBasedEvaluatorDefinition + +evaluator = project_client.evaluators.create_version( + name="helpfulness_evaluator", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Helpfulness Evaluator", + description="Evaluates how helpful the response is to the user", + definition=PromptBasedEvaluatorDefinition( + prompt_text=''' +You are an expert evaluator. Rate the helpfulness of the AI assistant's response. + +Query: {query} +Response: {response} + +Scoring (1-5): +1 = Not helpful at all, doesn't address the query +2 = Slightly helpful, partially addresses the query +3 = Moderately helpful, addresses most of the query +4 = Very helpful, fully addresses the query +5 = Extremely helpful, exceeds expectations + +Return ONLY valid JSON: {"score": <1-5>, "reason": ""} +''', + init_parameters={ + "type": "object", + "properties": { + "deployment_name": {"type": "string", "description": "Model deployment name"} + }, + "required": ["deployment_name"] + }, + data_schema={ + "type": "object", + "properties": { + "query": {"type": "string"}, + "response": {"type": "string"} + }, + "required": ["query", "response"] + }, + metrics={ + "score": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + desirable_direction=EvaluatorMetricDirection.INCREASE, + min_value=1, + max_value=5, + ), + }, + ), + ), +) +``` + +### Prompt Evaluator: Brand Tone Checker + +```python +evaluator = project_client.evaluators.create_version( + name="brand_tone_checker", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Brand Tone Checker", + description="Evaluates if response matches company brand voice guidelines", + definition=PromptBasedEvaluatorDefinition( + prompt_text=''' +You are evaluating if an AI assistant's response matches brand voice guidelines. 
+ +Brand Guidelines: +- Professional but friendly +- Avoid jargon, use simple language +- Always offer next steps or additional help +- Never use negative language about competitors +- End with a helpful call-to-action + +Response to evaluate: +{response} + +Score the response from 1-5: +5 = Perfectly matches brand voice +4 = Mostly matches, minor issues +3 = Partially matches +2 = Significant tone issues +1 = Does not match brand voice + +Return ONLY valid JSON: {"score": <1-5>, "reason": "", "suggestions": ""} +''', + init_parameters={ + "type": "object", + "properties": {"deployment_name": {"type": "string"}}, + "required": ["deployment_name"] + }, + data_schema={ + "type": "object", + "properties": {"response": {"type": "string"}}, + "required": ["response"] + }, + metrics={ + "score": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + min_value=1, + max_value=5, + ), + }, + ), + ), +) +``` + +### Prompt Evaluator: Factual Accuracy + +```python +evaluator = project_client.evaluators.create_version( + name="factual_accuracy_checker", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Factual Accuracy", + description="Checks if response claims are supported by context", + definition=PromptBasedEvaluatorDefinition( + prompt_text=''' +Evaluate whether the response contains only facts supported by the provided context. + +Context (source of truth): +{context} + +Response to evaluate: +{response} + +Analysis steps: +1. Identify each factual claim in the response +2. Check if each claim is supported by the context +3. Note any unsupported or fabricated claims + +Scoring (1-5): +1 = Mostly fabricated or incorrect +2 = Many unsupported claims +3 = Mixed: some facts but notable errors +4 = Mostly factual, minor issues +5 = Fully factual, no unsupported claims + +Return ONLY valid JSON: {"score": <1-5>, "reason": "", "unsupported_claims": [""]} +''', + init_parameters={ + "type": "object", + "properties": {"deployment_name": {"type": "string"}}, + "required": ["deployment_name"] + }, + data_schema={ + "type": "object", + "properties": { + "context": {"type": "string"}, + "response": {"type": "string"} + }, + "required": ["context", "response"] + }, + metrics={ + "score": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + min_value=1, + max_value=5, + ), + }, + ), + ), +) +``` + +## Using Custom Evaluators + +### In Testing Criteria + +```python +testing_criteria = [ + # Built-in evaluator + { + "type": "azure_ai_evaluator", + "name": "coherence", + "evaluator_name": "builtin.coherence", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} + }, + # Custom code-based evaluator + { + "type": "azure_ai_evaluator", + "name": "word_count", + "evaluator_name": "word_count_evaluator", + "data_mapping": {"response": "{{item.response}}"} + }, + # Custom prompt-based evaluator + { + "type": "azure_ai_evaluator", + "name": "helpfulness", + "evaluator_name": "helpfulness_evaluator", + "initialization_parameters": {"deployment_name": "gpt-4o-mini"}, + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"} + }, +] + +eval_object = openai_client.evals.create( + name="Mixed Evaluators Test", + data_source_config=data_source_config, + testing_criteria=testing_criteria, +) +``` + +## Managing Custom Evaluators + +### List Custom Evaluators + +```python +evaluators = 
project_client.evaluators.list_latest_versions(type="custom") +for e in evaluators: + print(f"{e.name} (v{e.version}): {e.display_name}") +``` + +### Get Evaluator Details + +```python +evaluator = project_client.evaluators.get_version( + name="helpfulness_evaluator", + version="latest" +) +print(f"Data Schema: {evaluator.definition.data_schema}") +print(f"Metrics: {evaluator.definition.metrics}") +``` + +### Update Evaluator + +```python +updated = project_client.evaluators.update_version( + name="word_count_evaluator", + version="1", + evaluator_version={ + "description": "Updated description", + "display_name": "Word Count v2", + } +) +``` + +### Delete Evaluator + +```python +project_client.evaluators.delete_version( + name="word_count_evaluator", + version="1" +) +``` + +## Best Practices + +1. **Use code-based for deterministic logic** - Pattern matching, format validation, keyword checking +2. **Use prompt-based for subjective judgment** - Quality assessment, tone evaluation, semantic analysis +3. **Always define data_schema** - Ensures correct data mapping +4. **Define meaningful metrics** - Use appropriate types (ORDINAL, BINARY) +5. **Test before production** - Run evaluator on sample data first +6. **Version your evaluators** - Create new versions instead of modifying existing ones + +## Related Documentation + +- [Custom Evaluators](https://learn.microsoft.com/en-us/azure/ai-foundry/concepts/evaluation-evaluators/custom-evaluators) +- [Code-based evaluator sample](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_catalog_code_based_evaluators.py) +- [Prompt-based evaluator sample](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_catalog_prompt_based_evaluators.py) diff --git a/.github/skills/azure-ai-projects-py/references/evaluation.md b/.github/skills/azure-ai-projects-py/references/evaluation.md index 8de55e4..89003b9 100644 --- a/.github/skills/azure-ai-projects-py/references/evaluation.md +++ b/.github/skills/azure-ai-projects-py/references/evaluation.md @@ -1,309 +1,349 @@ # Evaluation Operations Reference -## Overview - -Evaluations in Azure AI Foundry use the OpenAI client's evals API to test agent quality. +Evaluate AI agents and models using Microsoft Foundry's cloud evaluation service. 
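+
+The walkthroughs below operate on simple records with `query`/`response` fields (plus `context` or `ground_truth` where an evaluator needs them). If your test cases live in a JSONL file instead of inline literals, a minimal loader sketch (the file name is illustrative) is:
+
+```python
+import json
+
+# Hypothetical dataset: one JSON object per line, e.g.
+# {"query": "What is Azure?", "response": "Azure is Microsoft's cloud platform."}
+with open("test_data.jsonl", encoding="utf-8") as f:
+    data = [json.loads(line) for line in f if line.strip()]
+```
+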
## Setup ```python +import os from azure.ai.projects import AIProjectClient -from azure.ai.projects.models import PromptAgentDefinition, DataSourceConfigCustom from azure.identity import DefaultAzureCredential -project_client = AIProjectClient( - endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], - credential=DefaultAzureCredential(), -) +endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"] +deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o-mini") -# Get OpenAI client for evals -openai_client = project_client.get_openai_client() +with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, +): + openai_client = project_client.get_openai_client() + # Use openai_client.evals.* ``` -## Create Evaluation - -### Define Data Source Configuration +## Quick Start: Run a Basic Evaluation ```python -from azure.ai.projects.models import DataSourceConfigCustom +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent, +) +from openai.types.eval_create_params import DataSourceConfigCustom + +# 1. Prepare test data +data = [ + {"query": "What is Azure?", "response": "Azure is Microsoft's cloud platform."}, + {"query": "What is AI?", "response": "AI is artificial intelligence."}, +] + +# 2. Create data source +data_source = CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[SourceFileContentContent(item=item, sample={}) for item in data], + ), +) +# 3. Configure schema data_source_config = DataSourceConfigCustom( type="custom", item_schema={ "type": "object", "properties": { "query": {"type": "string"}, - "expected_response": {"type": "string"}, + "response": {"type": "string"}, }, - "required": ["query"], + "required": ["query", "response"], }, - include_sample_schema=True, + include_sample_schema=False, ) -``` - -### Define Testing Criteria (Evaluators) -```python -# Built-in evaluators +# 4. Define evaluators testing_criteria = [ { "type": "azure_ai_evaluator", - "name": "violence_detection", - "evaluator_name": "builtin.violence", - "data_mapping": { - "query": "{{item.query}}", - "response": "{{item.response}}", - }, + "name": "coherence", + "evaluator_name": "builtin.coherence", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": deployment}, }, { "type": "azure_ai_evaluator", - "name": "fluency_check", - "evaluator_name": "builtin.fluency", - "data_mapping": { - "query": "{{item.query}}", - "response": "{{item.response}}", - }, - }, - { - "type": "azure_ai_evaluator", - "name": "task_adherence", - "evaluator_name": "builtin.task_adherence", - "data_mapping": { - "query": "{{item.query}}", - "response": "{{item.response}}", - }, + "name": "relevance", + "evaluator_name": "builtin.relevance", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": deployment}, }, ] -``` -### Create Evaluation Object - -```python +# 5. 
Create and run evaluation eval_object = openai_client.evals.create( - name="Agent Quality Evaluation", + name="Quality Evaluation", data_source_config=data_source_config, testing_criteria=testing_criteria, ) -print(f"Created evaluation: {eval_object.id}") -``` -## Run Evaluation - -### Define Test Data - -```python -# Inline test data -data_source = { - "type": "azure_ai_target_completions", - "source": { - "type": "file_content", - "content": [ - {"item": {"query": "What is the capital of France?"}}, - {"item": {"query": "How do I reverse a string in Python?"}}, - {"item": {"query": "Explain machine learning in simple terms."}}, - ], - }, - "input_messages": { - "type": "template", - "template": [ - { - "type": "message", - "role": "user", - "content": {"type": "input_text", "text": "{{item.query}}"}, - } - ], - }, - "target": { - "type": "azure_ai_agent", - "name": agent.name, - "version": agent.version, - }, -} -``` - -### Execute Evaluation Run - -```python -eval_run = openai_client.evals.runs.create( +run = openai_client.evals.runs.create( eval_id=eval_object.id, - name=f"Evaluation Run for Agent {agent.name}", + name="Run 1", data_source=data_source, ) -print(f"Evaluation run created: {eval_run.id}") + +# 6. Poll for completion +import time +while run.status not in ["completed", "failed", "cancelled"]: + time.sleep(5) + run = openai_client.evals.runs.retrieve(eval_id=eval_object.id, run_id=run.id) + print(f"Status: {run.status}") + +# 7. Retrieve results +output_items = list(openai_client.evals.runs.output_items.list( + eval_id=eval_object.id, run_id=run.id +)) + +for item in output_items: + for result in item.results: + print(f"{result.name}: {result.score}") ``` ## Built-in Evaluators -| Evaluator | Description | Data Mapping | -|-----------|-------------|--------------| -| `builtin.violence` | Detects violent content | query, response | -| `builtin.fluency` | Measures response fluency | query, response | -| `builtin.task_adherence` | Checks if response follows instructions | query, response | -| `builtin.groundedness` | Checks factual grounding | query, response, context | -| `builtin.relevance` | Measures response relevance | query, response | -| `builtin.coherence` | Checks logical coherence | query, response | -| `builtin.similarity` | Compares to expected response | response, expected_response | +Use the `builtin.` prefix for all built-in evaluators: -## Full Evaluation Example +### Quality Evaluators -```python -import os -from azure.ai.projects import AIProjectClient -from azure.ai.projects.models import PromptAgentDefinition, DataSourceConfigCustom -from azure.identity import DefaultAzureCredential +| Evaluator | Data Mapping | Use Case | +|-----------|--------------|----------| +| `builtin.coherence` | query, response | Logical flow and consistency | +| `builtin.relevance` | query, response | Response addresses the query | +| `builtin.fluency` | query, response | Language quality and readability | +| `builtin.groundedness` | query, context, response | Factual alignment with context | -# Setup -project_client = AIProjectClient( - endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], - credential=DefaultAzureCredential(), -) -openai_client = project_client.get_openai_client() - -# Create agent -agent = project_client.agents.create_version( - agent_name="eval-test-agent", - definition=PromptAgentDefinition( - model=os.environ["AZURE_AI_MODEL_DEPLOYMENT_NAME"], - instructions="You are a helpful assistant that answers questions concisely.", +### Safety Evaluators + +| Evaluator | 
Data Mapping | Use Case | +|-----------|--------------|----------| +| `builtin.violence` | query, response | Violent content detection | +| `builtin.sexual` | query, response | Sexual content detection | +| `builtin.self_harm` | query, response | Self-harm content detection | +| `builtin.hate_unfairness` | query, response | Hate/bias detection | + +### Agent Evaluators + +| Evaluator | Data Mapping | Use Case | +|-----------|--------------|----------| +| `builtin.intent_resolution` | query, response | Did agent understand intent? | +| `builtin.response_completeness` | query, response | Did agent answer fully? | +| `builtin.task_adherence` | query, response | Did agent follow instructions? | +| `builtin.tool_call_accuracy` | query, response (JSON) | Were tool calls correct? | + +See [built-in-evaluators.md](built-in-evaluators.md) for complete evaluator reference. + +## Agent Evaluation + +For evaluating AI agents with tool calls, use `sample` mapping: + +```python +# Data with agent outputs +data_source = CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[ + SourceFileContentContent( + item={"query": "Weather in Seattle?"}, + sample={ + "output_text": "It's 55°F and cloudy in Seattle.", + "output_items": [ + { + "type": "tool_call", + "name": "get_weather", + "arguments": {"location": "Seattle"}, + "result": {"temp": "55", "condition": "cloudy"}, + } + ], + }, + ) + ], ), ) -# Configure evaluation data_source_config = DataSourceConfigCustom( type="custom", - item_schema={ - "type": "object", - "properties": {"query": {"type": "string"}}, - "required": ["query"], - }, - include_sample_schema=True, + item_schema={"type": "object", "properties": {"query": {"type": "string"}}}, + include_sample_schema=True, # Required for agent evaluations ) testing_criteria = [ { "type": "azure_ai_evaluator", - "name": "fluency", - "evaluator_name": "builtin.fluency", + "name": "intent_resolution", + "evaluator_name": "builtin.intent_resolution", "data_mapping": { "query": "{{item.query}}", - "response": "{{item.response}}", + "response": "{{sample.output_text}}", # Use sample for agent outputs }, + "initialization_parameters": {"deployment_name": deployment}, }, { "type": "azure_ai_evaluator", - "name": "relevance", - "evaluator_name": "builtin.relevance", + "name": "tool_call_accuracy", + "evaluator_name": "builtin.tool_call_accuracy", "data_mapping": { "query": "{{item.query}}", - "response": "{{item.response}}", + "response": "{{sample.output_items}}", # JSON with tool calls }, + "initialization_parameters": {"deployment_name": deployment}, }, ] +``` -# Create evaluation -eval_object = openai_client.evals.create( - name="Quality Check", - data_source_config=data_source_config, - testing_criteria=testing_criteria, -) +## OpenAI Graders -# Run evaluation -data_source = { - "type": "azure_ai_target_completions", - "source": { - "type": "file_content", - "content": [ - {"item": {"query": "What is 2+2?"}}, - {"item": {"query": "Who wrote Romeo and Juliet?"}}, - ], +For simpler evaluation patterns, use OpenAI graders: + +```python +testing_criteria = [ + # Label grader (classification) + { + "type": "label_model", + "name": "sentiment", + "model": deployment, + "input": [{"role": "user", "content": "Classify sentiment: {{item.response}}"}], + "labels": ["positive", "negative", "neutral"], + "passing_labels": ["positive", "neutral"], }, - "input_messages": { - "type": "template", - "template": [ - { - "type": "message", - "role": "user", - "content": 
{"type": "input_text", "text": "{{item.query}}"}, - } - ], + # String check grader + { + "type": "string_check", + "name": "has_disclaimer", + "input": "{{item.response}}", + "operation": "contains", + "reference": "Please consult", }, - "target": { - "type": "azure_ai_agent", - "name": agent.name, - "version": agent.version, + # Text similarity grader + { + "type": "text_similarity", + "name": "matches_expected", + "input": "{{item.response}}", + "reference": "{{item.expected}}", + "evaluation_metric": "fuzzy_match", + "pass_threshold": 0.8, }, -} - -eval_run = openai_client.evals.runs.create( - eval_id=eval_object.id, - name="Test Run", - data_source=data_source, -) - -print(f"Evaluation run created: {eval_run.id}") -print(f"Status: {eval_run.status}") +] ``` ## Custom Evaluators +Create custom evaluators for domain-specific needs. + +### Code-Based Evaluator + ```python -# Define custom evaluator with specific criteria -custom_evaluator = { - "type": "azure_ai_evaluator", - "name": "custom_length_check", - "evaluator_name": "builtin.fluency", # Base on existing evaluator - "data_mapping": { - "query": "{{item.query}}", - "response": "{{item.response}}", - }, - "threshold": 0.8, # Pass threshold -} +from azure.ai.projects.models import ( + EvaluatorVersion, EvaluatorCategory, EvaluatorType, + CodeBasedEvaluatorDefinition, EvaluatorMetric, EvaluatorMetricType, +) -testing_criteria = [custom_evaluator] +evaluator = project_client.evaluators.create_version( + name="word_count", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Word Count", + definition=CodeBasedEvaluatorDefinition( + code_text=''' +def grade(sample, item) -> dict: + return {"word_count": len(item.get("response", "").split())} +''', + data_schema={ + "type": "object", + "properties": {"response": {"type": "string"}}, + "required": ["response"], + }, + metrics={ + "word_count": EvaluatorMetric(type=EvaluatorMetricType.ORDINAL), + }, + ), + ), +) ``` -## Evaluation with Ground Truth +### Prompt-Based Evaluator ```python -data_source_config = DataSourceConfigCustom( - type="custom", - item_schema={ - "type": "object", - "properties": { - "query": {"type": "string"}, - "expected_response": {"type": "string"}, - }, - "required": ["query", "expected_response"], - }, - include_sample_schema=True, +from azure.ai.projects.models import PromptBasedEvaluatorDefinition + +evaluator = project_client.evaluators.create_version( + name="helpfulness", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Helpfulness", + definition=PromptBasedEvaluatorDefinition( + prompt_text=''' +Rate the helpfulness of the response (1-5): +Query: {query} +Response: {response} +Return JSON: {"score": <1-5>, "reason": ""} +''', + init_parameters={ + "type": "object", + "properties": {"deployment_name": {"type": "string"}}, + "required": ["deployment_name"], + }, + data_schema={ + "type": "object", + "properties": {"query": {"type": "string"}, "response": {"type": "string"}}, + "required": ["query", "response"], + }, + metrics={"score": EvaluatorMetric(type=EvaluatorMetricType.ORDINAL)}, + ), + ), ) +``` -testing_criteria = [ - { - "type": "azure_ai_evaluator", - "name": "similarity", - "evaluator_name": "builtin.similarity", - "data_mapping": { - "response": "{{item.response}}", - "expected_response": "{{item.expected_response}}", - }, - }, -] +See [custom-evaluators.md](custom-evaluators.md) 
for complete custom evaluator reference. -# Test data with ground truth -data_source = { - "type": "azure_ai_target_completions", - "source": { - "type": "file_content", - "content": [ - { - "item": { - "query": "What is the capital of France?", - "expected_response": "Paris", - } - }, - ], - }, - # ... -} +## Discover Available Evaluators + +```python +# List built-in evaluators +evaluators = project_client.evaluators.list_latest_versions(type="builtin") +for e in evaluators: + print(f"builtin.{e.name}: {e.description}") + +# List custom evaluators +custom = project_client.evaluators.list_latest_versions(type="custom") +for e in custom: + print(f"{e.name}: {e.description}") ``` + +## Data Mapping Reference + +| Pattern | Source | Use Case | +|---------|--------|----------| +| `{{item.field}}` | Your JSONL data | Standard evaluation data | +| `{{sample.output_text}}` | Agent response (text) | Agent text outputs | +| `{{sample.output_items}}` | Agent response (JSON) | Tool calls, structured data | + +## CLI Tool + +A batch evaluation script is available at `scripts/run_batch_evaluation.py`: + +```bash +python run_batch_evaluation.py --data test_data.jsonl --evaluators coherence relevance +python run_batch_evaluation.py --data test_data.jsonl --safety +python run_batch_evaluation.py --data test_data.jsonl --agent --evaluators intent_resolution +``` + +## Related Reference Files + +- [built-in-evaluators.md](built-in-evaluators.md): Complete built-in evaluator reference +- [custom-evaluators.md](custom-evaluators.md): Code and prompt-based evaluator patterns + +## Related Documentation + +- [Azure AI Projects Evaluation Samples](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-projects/samples/evaluations) +- [Cloud Evaluation Documentation](https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/cloud-evaluation) diff --git a/.github/skills/azure-ai-projects-py/scripts/run_batch_evaluation.py b/.github/skills/azure-ai-projects-py/scripts/run_batch_evaluation.py new file mode 100644 index 0000000..98bc430 --- /dev/null +++ b/.github/skills/azure-ai-projects-py/scripts/run_batch_evaluation.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +""" +Batch Evaluation CLI Tool + +Run batch evaluations on test datasets using Azure AI Projects SDK. +Supports quality, safety, agent evaluators, and OpenAI graders. 
+ +Usage: + python run_batch_evaluation.py --data test_data.jsonl --evaluators coherence relevance + python run_batch_evaluation.py --data test_data.jsonl --evaluators coherence --output results.json + python run_batch_evaluation.py --data test_data.jsonl --safety + python run_batch_evaluation.py --data test_data.jsonl --agent --evaluators intent_resolution task_adherence + +Environment Variables: + AZURE_AI_PROJECT_ENDPOINT - Azure AI project endpoint (required) + AZURE_AI_MODEL_DEPLOYMENT_NAME - Model deployment name (default: gpt-4o-mini) +""" + +import argparse +import json +import os +import sys +import time +from pathlib import Path +from typing import Any + +from azure.ai.projects import AIProjectClient +from azure.identity import DefaultAzureCredential +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent, +) +from openai.types.eval_create_params import DataSourceConfigCustom + + +# Built-in evaluators by category +QUALITY_EVALUATORS = [ + "coherence", + "relevance", + "fluency", + "groundedness", +] +SAFETY_EVALUATORS = [ + "violence", + "sexual", + "self_harm", + "hate_unfairness", +] +AGENT_EVALUATORS = [ + "intent_resolution", + "response_completeness", + "task_adherence", + "tool_call_accuracy", +] +NLP_EVALUATORS = ["f1", "rouge", "bleu", "gleu", "meteor"] + + +def load_jsonl(path: str) -> list[dict]: + """Load JSONL file into list of dicts.""" + data = [] + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + data.append(json.loads(line)) + return data + + +def build_data_source( + data: list[dict], + is_agent: bool = False, +) -> CreateEvalJSONLRunDataSourceParam: + """Build data source from loaded data.""" + content = [] + for item in data: + if is_agent: + # Agent data: extract sample fields from item + sample = { + "output_text": item.pop("output_text", item.get("response", "")), + } + if "output_items" in item: + sample["output_items"] = item.pop("output_items") + content.append(SourceFileContentContent(item=item, sample=sample)) + else: + content.append(SourceFileContentContent(item=item, sample={})) + + return CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent(type="file_content", content=content), + ) + + +def build_data_source_config( + data: list[dict], + is_agent: bool = False, +) -> DataSourceConfigCustom: + """Build data source config based on data schema.""" + # Infer schema from first item + if not data: + raise ValueError("Data is empty") + + first_item = data[0] + properties = {} + required = [] + + for key in first_item: + if key not in ["output_text", "output_items"]: # Agent fields go in sample + properties[key] = {"type": "string"} + required.append(key) + + return DataSourceConfigCustom( + type="custom", + item_schema={ + "type": "object", + "properties": properties, + "required": required, + }, + include_sample_schema=is_agent, + ) + + +def build_testing_criteria( + evaluator_names: list[str], + deployment_name: str, + is_agent: bool = False, +) -> list[dict]: + """Build testing criteria for the specified evaluators.""" + criteria = [] + + for name in evaluator_names: + # Determine data mapping based on evaluator type + if name in QUALITY_EVALUATORS: + if name == "groundedness": + data_mapping = { + "query": "{{item.query}}", + "context": "{{item.context}}", + "response": "{{item.response}}", + } + else: + data_mapping = { + "query": "{{item.query}}", + "response": 
"{{item.response}}", + } + needs_model = True + + elif name in SAFETY_EVALUATORS: + data_mapping = { + "query": "{{item.query}}", + "response": "{{item.response}}", + } + needs_model = False # Safety evaluators may not need deployment + + elif name in AGENT_EVALUATORS: + if is_agent: + if name == "tool_call_accuracy": + data_mapping = { + "query": "{{item.query}}", + "response": "{{sample.output_items}}", + } + else: + data_mapping = { + "query": "{{item.query}}", + "response": "{{sample.output_text}}", + } + else: + data_mapping = { + "query": "{{item.query}}", + "response": "{{item.response}}", + } + needs_model = True + + elif name in NLP_EVALUATORS: + data_mapping = { + "response": "{{item.response}}", + "ground_truth": "{{item.ground_truth}}", + } + needs_model = False + + else: + print(f"Warning: Unknown evaluator '{name}', skipping") + continue + + criterion = { + "type": "azure_ai_evaluator", + "name": name, + "evaluator_name": f"builtin.{name}", + "data_mapping": data_mapping, + } + + if needs_model: + criterion["initialization_parameters"] = {"deployment_name": deployment_name} + + criteria.append(criterion) + + return criteria + + +def run_evaluation( + endpoint: str, + data_path: str, + evaluator_names: list[str], + deployment_name: str, + is_agent: bool = False, +) -> dict[str, Any]: + """Run batch evaluation using Azure AI Projects SDK.""" + # Load data + data = load_jsonl(data_path) + print(f"Loaded {len(data)} items from {data_path}") + + # Build data source and config + data_source = build_data_source(data, is_agent=is_agent) + data_source_config = build_data_source_config(data, is_agent=is_agent) + + # Build testing criteria + testing_criteria = build_testing_criteria( + evaluator_names, + deployment_name, + is_agent=is_agent, + ) + + if not testing_criteria: + raise ValueError("No valid testing criteria configured") + + print(f"Configured {len(testing_criteria)} evaluators") + + # Create client and run evaluation + with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, + ): + openai_client = project_client.get_openai_client() + + # Create evaluation definition + eval_object = openai_client.evals.create( + name=f"Batch Evaluation - {Path(data_path).stem}", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Created evaluation: {eval_object.id}") + + # Create and run evaluation + run = openai_client.evals.runs.create( + eval_id=eval_object.id, + name="CLI Run", + data_source=data_source, + ) + print(f"Started run: {run.id}") + + # Poll for completion + while run.status not in ["completed", "failed", "cancelled"]: + print(f"Status: {run.status}...") + time.sleep(5) + run = openai_client.evals.runs.retrieve( + eval_id=eval_object.id, + run_id=run.id, + ) + + if run.status != "completed": + raise RuntimeError(f"Evaluation run {run.status}: {getattr(run, 'error', 'Unknown error')}") + + print(f"Run completed: {run.status}") + + # Retrieve results + output_items = list( + openai_client.evals.runs.output_items.list( + eval_id=eval_object.id, + run_id=run.id, + ) + ) + + # Aggregate metrics + metrics: dict[str, list[float]] = {} + rows = [] + + for output_item in output_items: + row_results = {} + for result in output_item.results: + if result.score is not None: + if result.name not in metrics: + metrics[result.name] = [] + metrics[result.name].append(result.score) + row_results[result.name] = result.score + rows.append(row_results) + + # Calculate averages + 
avg_metrics = {} + for name, scores in metrics.items(): + avg_metrics[name] = sum(scores) / len(scores) if scores else 0.0 + + return { + "eval_id": eval_object.id, + "run_id": run.id, + "status": run.status, + "metrics": avg_metrics, + "rows": rows, + "total_items": len(output_items), + } + + +def main(): + parser = argparse.ArgumentParser( + description="Run batch evaluation on test datasets using Azure AI Projects SDK", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + parser.add_argument("--data", "-d", required=True, help="Path to JSONL data file") + parser.add_argument( + "--evaluators", + "-e", + nargs="+", + default=["coherence", "relevance"], + help=f"Evaluators to run. Quality: {QUALITY_EVALUATORS}, " + f"Safety: {SAFETY_EVALUATORS}, Agent: {AGENT_EVALUATORS}, NLP: {NLP_EVALUATORS}", + ) + parser.add_argument( + "--safety", + action="store_true", + help="Include all safety evaluators", + ) + parser.add_argument( + "--agent", + action="store_true", + help="Include all agent evaluators (uses sample.output_text for response)", + ) + parser.add_argument( + "--output", + "-o", + help="Output file for results (JSON)", + ) + parser.add_argument( + "--deployment", + default=None, + help="Model deployment name (overrides AZURE_AI_MODEL_DEPLOYMENT_NAME)", + ) + + args = parser.parse_args() + + # Validate environment + endpoint = os.environ.get("AZURE_AI_PROJECT_ENDPOINT") + if not endpoint: + print("Error: AZURE_AI_PROJECT_ENDPOINT environment variable required") + sys.exit(1) + + deployment = args.deployment or os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o-mini") + + # Validate data file + data_path = Path(args.data) + if not data_path.exists(): + print(f"Error: Data file not found: {args.data}") + sys.exit(1) + + # Build evaluator list + evaluator_names = list(args.evaluators) + if args.safety: + evaluator_names.extend(SAFETY_EVALUATORS) + if args.agent: + evaluator_names.extend(AGENT_EVALUATORS) + + # Remove duplicates while preserving order + seen = set() + unique_evaluators = [] + for e in evaluator_names: + if e not in seen: + seen.add(e) + unique_evaluators.append(e) + evaluator_names = unique_evaluators + + print(f"Running evaluation with: {evaluator_names}") + print(f"Data file: {args.data}") + print(f"Deployment: {deployment}") + print(f"Agent mode: {args.agent}") + + # Run evaluation + try: + result = run_evaluation( + endpoint=endpoint, + data_path=str(data_path), + evaluator_names=evaluator_names, + deployment_name=deployment, + is_agent=args.agent, + ) + except Exception as e: + print(f"Error during evaluation: {e}") + sys.exit(1) + + # Output results + print("\n=== Evaluation Results ===") + print(f"Eval ID: {result['eval_id']}") + print(f"Run ID: {result['run_id']}") + print(f"Status: {result['status']}") + print(f"Total Items: {result['total_items']}") + print("\nMetrics:") + for metric, value in sorted(result["metrics"].items()): + print(f" {metric}: {value:.4f}") + + # Save to file if requested + if args.output: + output_path = Path(args.output) + with open(output_path, "w", encoding="utf-8") as f: + json.dump(result, f, indent=2, default=str) + print(f"\nResults saved to: {args.output}") + + print("\nEvaluation complete!") + + +if __name__ == "__main__": + main()