diff --git a/.github/skills/azure-ai-evaluation-py/SKILL.md b/.github/skills/azure-ai-evaluation-py/SKILL.md deleted file mode 100644 index 627693f..0000000 --- a/.github/skills/azure-ai-evaluation-py/SKILL.md +++ /dev/null @@ -1,433 +0,0 @@ ---- -name: azure-ai-evaluation-py -description: | - Azure AI Evaluation SDK for Python. Use for evaluating generative AI applications with quality, safety, agent, and custom evaluators. - Triggers: "azure-ai-evaluation", "evaluators", "GroundednessEvaluator", "evaluate", "AI quality metrics", "RedTeam", "agent evaluation". -package: azure-ai-evaluation ---- - -# Azure AI Evaluation SDK for Python - -Assess generative AI application performance with built-in quality, safety, agent evaluators, Azure OpenAI graders, and custom evaluators. - -## Installation - -```bash -pip install azure-ai-evaluation - -# With red team support -pip install azure-ai-evaluation[redteam] -``` - -## Environment Variables - -```bash -# For AI-assisted evaluators -AZURE_OPENAI_ENDPOINT=https://.openai.azure.com -AZURE_OPENAI_API_KEY= -AZURE_OPENAI_DEPLOYMENT=gpt-4o-mini - -# For Foundry project integration -AIPROJECT_CONNECTION_STRING= -``` - -## Built-in Evaluators - -### Quality Evaluators (AI-Assisted) - -```python -from azure.ai.evaluation import ( - GroundednessEvaluator, - GroundednessProEvaluator, # Service-based groundedness - RelevanceEvaluator, - CoherenceEvaluator, - FluencyEvaluator, - SimilarityEvaluator, - RetrievalEvaluator -) - -# Initialize with Azure OpenAI model config -model_config = { - "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"], - "api_key": os.environ["AZURE_OPENAI_API_KEY"], - "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"] -} - -groundedness = GroundednessEvaluator(model_config) -relevance = RelevanceEvaluator(model_config) -coherence = CoherenceEvaluator(model_config) - -# For reasoning models (o1/o3), use is_reasoning_model parameter -groundedness_reasoning = GroundednessEvaluator(model_config, is_reasoning_model=True) -``` - -### Quality Evaluators (NLP-based) - -```python -from azure.ai.evaluation import ( - F1ScoreEvaluator, - RougeScoreEvaluator, - BleuScoreEvaluator, - GleuScoreEvaluator, - MeteorScoreEvaluator -) - -f1 = F1ScoreEvaluator() -rouge = RougeScoreEvaluator() -bleu = BleuScoreEvaluator() -``` - -### Safety Evaluators - -```python -from azure.ai.evaluation import ( - ViolenceEvaluator, - SexualEvaluator, - SelfHarmEvaluator, - HateUnfairnessEvaluator, - IndirectAttackEvaluator, - ProtectedMaterialEvaluator, - CodeVulnerabilityEvaluator, - UngroundedAttributesEvaluator -) - -# Project scope for safety evaluators -azure_ai_project = { - "subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"], - "resource_group_name": os.environ["AZURE_RESOURCE_GROUP"], - "project_name": os.environ["AZURE_AI_PROJECT_NAME"], -} - -violence = ViolenceEvaluator(azure_ai_project=azure_ai_project) -sexual = SexualEvaluator(azure_ai_project=azure_ai_project) -code_vuln = CodeVulnerabilityEvaluator(azure_ai_project=azure_ai_project) - -# Control whether queries are evaluated (default: False, only response evaluated) -violence_with_query = ViolenceEvaluator(azure_ai_project=azure_ai_project, evaluate_query=True) -``` - -### Agent Evaluators - -```python -from azure.ai.evaluation import ( - IntentResolutionEvaluator, - ResponseCompletenessEvaluator, - TaskAdherenceEvaluator, - ToolCallAccuracyEvaluator -) - -intent = IntentResolutionEvaluator(model_config) -completeness = ResponseCompletenessEvaluator(model_config) -task_adherence = 
TaskAdherenceEvaluator(model_config) -tool_accuracy = ToolCallAccuracyEvaluator(model_config) -``` - -## Single Row Evaluation - -```python -from azure.ai.evaluation import GroundednessEvaluator - -groundedness = GroundednessEvaluator(model_config) - -result = groundedness( - query="What is Azure AI?", - context="Azure AI is Microsoft's AI platform...", - response="Azure AI provides AI services and tools." -) - -print(f"Groundedness score: {result['groundedness']}") -print(f"Reason: {result['groundedness_reason']}") -``` - -## Batch Evaluation with evaluate() - -```python -from azure.ai.evaluation import evaluate - -result = evaluate( - data="test_data.jsonl", - evaluators={ - "groundedness": groundedness, - "relevance": relevance, - "coherence": coherence - }, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.query}", - "context": "${data.context}", - "response": "${data.response}" - } - } - }, - # Optional: Add tags for experiment tracking - tags={"experiment": "v1", "model": "gpt-4o"} -) - -print(result["metrics"]) -``` - -## Composite Evaluators - -```python -from azure.ai.evaluation import QAEvaluator, ContentSafetyEvaluator - -# All quality metrics in one -qa_evaluator = QAEvaluator(model_config) - -# All safety metrics in one -safety_evaluator = ContentSafetyEvaluator(azure_ai_project=azure_ai_project) - -result = evaluate( - data="data.jsonl", - evaluators={ - "qa": qa_evaluator, - "content_safety": safety_evaluator - } -) -``` - -## Azure OpenAI Graders - -Use grader classes for structured evaluation via Azure OpenAI's grading API: - -```python -from azure.ai.evaluation import ( - AzureOpenAILabelGrader, - AzureOpenAIStringCheckGrader, - AzureOpenAITextSimilarityGrader, - AzureOpenAIScoreModelGrader, - AzureOpenAIPythonGrader -) - -# Label grader for classification -label_grader = AzureOpenAILabelGrader( - model_config=model_config, - labels=["positive", "negative", "neutral"], - passing_labels=["positive"] -) - -# Score model grader with custom threshold -score_grader = AzureOpenAIScoreModelGrader( - model_config=model_config, - pass_threshold=0.7 -) - -# Use graders as evaluators in evaluate() -result = evaluate( - data="data.jsonl", - evaluators={ - "sentiment": label_grader, - "quality": score_grader - } -) -``` - -## Evaluate Application Target - -```python -from azure.ai.evaluation import evaluate -from my_app import chat_app # Your application - -result = evaluate( - data="queries.jsonl", - target=chat_app, # Callable that takes query, returns response - evaluators={ - "groundedness": groundedness - }, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.query}", - "context": "${outputs.context}", - "response": "${outputs.response}" - } - } - } -) -``` - -## Custom Evaluators - -### Code-Based - -```python -from azure.ai.evaluation import evaluator - -@evaluator -def word_count_evaluator(response: str) -> dict: - return {"word_count": len(response.split())} - -# Use in evaluate() -result = evaluate( - data="data.jsonl", - evaluators={"word_count": word_count_evaluator} -) -``` - -### Class-Based with Initialization - -```python -class DomainSpecificEvaluator: - def __init__(self, domain_terms: list[str], threshold: float = 0.5): - self.domain_terms = [t.lower() for t in domain_terms] - self.threshold = threshold - - def __call__(self, response: str) -> dict: - response_lower = response.lower() - matches = sum(1 for term in self.domain_terms if term in response_lower) - score = matches / len(self.domain_terms) if 
self.domain_terms else 0 - return { - "domain_relevance": score, - "passes_threshold": score >= self.threshold - } - -# Usage -domain_eval = DomainSpecificEvaluator(domain_terms=["azure", "cloud", "api"]) -``` - -### Prompt-Based with Azure OpenAI - -```python -from openai import AzureOpenAI -import json - -class PromptBasedEvaluator: - def __init__(self, model_config: dict): - self.client = AzureOpenAI( - azure_endpoint=model_config["azure_endpoint"], - api_key=model_config.get("api_key"), - api_version="2024-06-01" - ) - self.deployment = model_config["azure_deployment"] - - def __call__(self, query: str, response: str) -> dict: - prompt = f"Rate this response 1-5 for helpfulness. Query: {query}, Response: {response}. Return JSON: {{\"score\": }}" - completion = self.client.chat.completions.create( - model=self.deployment, - messages=[{"role": "user", "content": prompt}], - temperature=0, - response_format={"type": "json_object"} - ) - result = json.loads(completion.choices[0].message.content) - return {"helpfulness": result["score"]} -``` - -## Log to Foundry Project - -```python -from azure.ai.projects import AIProjectClient -from azure.identity import DefaultAzureCredential - -project = AIProjectClient.from_connection_string( - conn_str=os.environ["AIPROJECT_CONNECTION_STRING"], - credential=DefaultAzureCredential() -) - -result = evaluate( - data="data.jsonl", - evaluators={"groundedness": groundedness}, - azure_ai_project=project.scope, # Logs results to Foundry - tags={"version": "1.0", "experiment": "baseline"} -) - -print(f"View results: {result['studio_url']}") -``` - -## Red Team Adversarial Testing - -```python -from azure.ai.evaluation.red_team import RedTeam, AttackStrategy -from azure.identity import DefaultAzureCredential - -red_team = RedTeam( - azure_ai_project=azure_ai_project, - credential=DefaultAzureCredential() -) - -# Run adversarial scan against your application -result = await red_team.scan( - target=my_chat_app, # Your application callable - risk_categories=["violence", "hate_unfairness", "sexual", "self_harm"], - attack_strategies=[ - AttackStrategy.DIRECT, - AttackStrategy.MultiTurn, - AttackStrategy.Crescendo - ], - attack_success_thresholds={"violence": 3, "hate_unfairness": 3} -) - -print(f"Attack success rate: {result.attack_success_rate}") -``` - -## Multimodal Evaluation - -```python -from azure.ai.evaluation import ContentSafetyEvaluator - -safety = ContentSafetyEvaluator(azure_ai_project=azure_ai_project) - -# Evaluate conversations with images -conversation = { - "messages": [ - {"role": "user", "content": [ - {"type": "text", "text": "Describe this image"}, - {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} - ]}, - {"role": "assistant", "content": [ - {"type": "text", "text": "The image shows..."} - ]} - ] -} - -result = safety(conversation=conversation) -``` - -## Evaluator Reference - -| Evaluator | Type | Metrics | -|-----------|------|---------| -| `GroundednessEvaluator` | AI | groundedness (1-5) | -| `GroundednessProEvaluator` | Service | groundedness (1-5) | -| `RelevanceEvaluator` | AI | relevance (1-5) | -| `CoherenceEvaluator` | AI | coherence (1-5) | -| `FluencyEvaluator` | AI | fluency (1-5) | -| `SimilarityEvaluator` | AI | similarity (1-5) | -| `RetrievalEvaluator` | AI | retrieval (1-5) | -| `F1ScoreEvaluator` | NLP | f1_score (0-1) | -| `RougeScoreEvaluator` | NLP | rouge scores | -| `BleuScoreEvaluator` | NLP | bleu_score (0-1) | -| `IntentResolutionEvaluator` | Agent | intent_resolution (1-5) | -| 
`ResponseCompletenessEvaluator` | Agent | response_completeness (1-5) | -| `TaskAdherenceEvaluator` | Agent | task_adherence (1-5) | -| `ToolCallAccuracyEvaluator` | Agent | tool_call_accuracy (1-5) | -| `ViolenceEvaluator` | Safety | violence (0-7) | -| `SexualEvaluator` | Safety | sexual (0-7) | -| `SelfHarmEvaluator` | Safety | self_harm (0-7) | -| `HateUnfairnessEvaluator` | Safety | hate_unfairness (0-7) | -| `CodeVulnerabilityEvaluator` | Safety | code vulnerabilities | -| `UngroundedAttributesEvaluator` | Safety | ungrounded attributes | -| `QAEvaluator` | Composite | All quality metrics | -| `ContentSafetyEvaluator` | Composite | All safety metrics | - -## Best Practices - -1. **Use composite evaluators** for comprehensive assessment -2. **Map columns correctly** — mismatched columns cause silent failures -3. **Log to Foundry** for tracking and comparison across runs with `tags` -4. **Create custom evaluators** for domain-specific metrics -5. **Use NLP evaluators** when you have ground truth answers -6. **Safety evaluators require** Azure AI project scope -7. **Batch evaluation** is more efficient than single-row loops -8. **Use graders** for structured evaluation with Azure OpenAI's grading API -9. **Agent evaluators** for AI agents with tool calls -10. **RedTeam scanning** for adversarial safety testing before deployment -11. **Use `is_reasoning_model=True`** when evaluating with o1/o3 models - -## Reference Files - -| File | Contents | -|------|----------| -| [references/built-in-evaluators.md](references/built-in-evaluators.md) | Detailed patterns for AI-assisted, NLP-based, Safety, and Agent evaluators with configuration tables | -| [references/custom-evaluators.md](references/custom-evaluators.md) | Creating code-based and prompt-based custom evaluators, testing patterns | -| [scripts/run_batch_evaluation.py](scripts/run_batch_evaluation.py) | CLI tool for running batch evaluations with quality, safety, agent, and custom evaluators | diff --git a/.github/skills/azure-ai-evaluation-py/references/acceptance-criteria.md b/.github/skills/azure-ai-evaluation-py/references/acceptance-criteria.md deleted file mode 100644 index 61f4bc0..0000000 --- a/.github/skills/azure-ai-evaluation-py/references/acceptance-criteria.md +++ /dev/null @@ -1,352 +0,0 @@ -# Azure AI Evaluation SDK Acceptance Criteria - -**SDK**: `azure-ai-evaluation` -**Repository**: https://github.com/Azure/azure-sdk-for-python -**Commit**: `main` -**Purpose**: Skill testing acceptance criteria for validating generated code correctness - ---- - -## 1. 
Imports - -### 1.1 ✅ CORRECT: Core SDK Imports -```python -from azure.ai.evaluation import ( - # Core - evaluate, - AzureOpenAIModelConfiguration, - - # Quality Evaluators - GroundednessEvaluator, - GroundednessProEvaluator, - RelevanceEvaluator, - CoherenceEvaluator, - FluencyEvaluator, - SimilarityEvaluator, - RetrievalEvaluator, - - # NLP Evaluators - F1ScoreEvaluator, - RougeScoreEvaluator, - GleuScoreEvaluator, - BleuScoreEvaluator, - MeteorScoreEvaluator, - - # Safety Evaluators - ViolenceEvaluator, - SexualEvaluator, - SelfHarmEvaluator, - HateUnfairnessEvaluator, - IndirectAttackEvaluator, - ProtectedMaterialEvaluator, - CodeVulnerabilityEvaluator, - UngroundedAttributesEvaluator, - - # Agent Evaluators - IntentResolutionEvaluator, - ResponseCompletenessEvaluator, - TaskAdherenceEvaluator, - ToolCallAccuracyEvaluator, - - # Composite Evaluators - QAEvaluator, - ContentSafetyEvaluator, - - # Graders - AzureOpenAILabelGrader, - AzureOpenAIStringCheckGrader, - AzureOpenAITextSimilarityGrader, - AzureOpenAIScoreModelGrader, - AzureOpenAIPythonGrader, - - # Custom evaluator decorator - evaluator, -) -``` - -### 1.2 ✅ CORRECT: Authentication Imports -```python -from azure.identity import DefaultAzureCredential -``` - -### 1.3 ❌ INCORRECT: Wrong Import Paths -```python -# WRONG - evaluators are not in a submodule -from azure.ai.evaluation.evaluators import GroundednessEvaluator - -# WRONG - model configuration is not under models -from azure.ai.evaluation.models import AzureOpenAIModelConfiguration - -# WRONG - non-existent imports -from azure.ai.evaluation import Evaluator -from azure.ai.evaluation import PromptChatTarget # Does not exist -``` - ---- - -## 2. Evaluator setup - -### 2.1 ✅ CORRECT: Dict Model Configuration (API key) -```python -model_config = { - "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"], - "api_key": os.environ["AZURE_OPENAI_API_KEY"], - "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"], -} -``` - -### 2.2 ✅ CORRECT: AzureOpenAIModelConfiguration (Managed Identity) -```python -from azure.ai.evaluation import AzureOpenAIModelConfiguration -from azure.identity import DefaultAzureCredential - -model_config = AzureOpenAIModelConfiguration( - azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], - credential=DefaultAzureCredential(), - azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT"], - api_version="2024-06-01", -) -``` - -### 2.3 ✅ CORRECT: Azure AI Project for Safety Evaluators -```python -azure_ai_project = { - "subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"], - "resource_group_name": os.environ["AZURE_RESOURCE_GROUP"], - "project_name": os.environ["AZURE_AI_PROJECT_NAME"], -} -``` - -### 2.4 ✅ CORRECT: Reasoning Model Configuration -```python -# For o1/o3 reasoning models -groundedness = GroundednessEvaluator(model_config, is_reasoning_model=True) -coherence = CoherenceEvaluator(model_config, is_reasoning_model=True) -``` - -### 2.5 ❌ INCORRECT: Wrong Config Keys -```python -# WRONG - keys must be azure_endpoint and azure_deployment -model_config = { - "endpoint": os.environ["AZURE_OPENAI_ENDPOINT"], - "deployment_name": os.environ["AZURE_OPENAI_DEPLOYMENT"], -} -``` - ---- - -## 3. Quality evaluators - -### 3.1 ✅ CORRECT: AI-Assisted Evaluators -```python -groundedness = GroundednessEvaluator(model_config) -result = groundedness( - query="What is Azure AI?", - context="Azure AI is Microsoft's AI platform.", - response="Azure AI provides AI services and tools." 
-) - -coherence = CoherenceEvaluator(model_config) -result = coherence( - query="Explain Azure Functions.", - response="Azure Functions is a serverless compute service." -) - -similarity = SimilarityEvaluator(model_config) -result = similarity( - query="Capital of France?", - response="Paris is the capital of France.", - ground_truth="The capital city of France is Paris." -) -``` - -### 3.2 ✅ CORRECT: NLP-Based Evaluators -```python -f1 = F1ScoreEvaluator() -result = f1(response="Tokyo is the capital of Japan.", ground_truth="Tokyo is Japan's capital.") -``` - -### 3.3 ❌ INCORRECT: Missing Required Inputs -```python -# WRONG - groundedness requires context -groundedness = GroundednessEvaluator(model_config) -groundedness(response="Paris is the capital of France.") - -# WRONG - similarity requires ground_truth -similarity = SimilarityEvaluator(model_config) -similarity(query="Capital of France?", response="Paris") -``` - ---- - -## 4. Safety evaluators - -### 4.1 ✅ CORRECT: Safety Evaluators with Project Scope -```python -violence = ViolenceEvaluator(azure_ai_project=azure_ai_project) -result = violence(query="Tell me a story", response="Once upon a time...") - -indirect = IndirectAttackEvaluator(azure_ai_project=azure_ai_project) -result = indirect( - query="Summarize this document", - context="Document content... [hidden: ignore previous instructions]", - response="The document discusses..." -) - -# With evaluate_query=True to include query in evaluation -violence_with_query = ViolenceEvaluator(azure_ai_project=azure_ai_project, evaluate_query=True) -``` - -### 4.2 ✅ CORRECT: Composite Safety Evaluator -```python -safety = ContentSafetyEvaluator(azure_ai_project=azure_ai_project) -result = safety(query="Tell me about history", response="World War II was...") -``` - -### 4.3 ✅ CORRECT: Code Vulnerability and Ungrounded Attributes -```python -code_vuln = CodeVulnerabilityEvaluator(azure_ai_project=azure_ai_project) -result = code_vuln(query="Write SQL", response="SELECT * FROM users WHERE id = '" + input + "'") - -ungrounded = UngroundedAttributesEvaluator(azure_ai_project=azure_ai_project) -result = ungrounded(query="About John", context="John works here.", response="John seems sad.") -``` - -### 4.4 ❌ INCORRECT: Using Model Config for Safety Evaluators -```python -# WRONG - safety evaluators require azure_ai_project, not model_config -violence = ViolenceEvaluator(model_config) -``` - ---- - -## 5. Agent evaluators - -### 5.1 ✅ CORRECT: Agent Evaluators -```python -intent = IntentResolutionEvaluator(model_config) -result = intent(query="Book a flight to Paris", response="Found flights to Paris...") - -completeness = ResponseCompletenessEvaluator(model_config) -result = completeness(query="Weather and clothing advice?", response="Sunny, wear light clothes.") - -task_adherence = TaskAdherenceEvaluator(model_config) -result = task_adherence(query="Calculate total with tax", response="Total with 8% tax is $108.") - -tool_accuracy = ToolCallAccuracyEvaluator(model_config) -result = tool_accuracy( - query="Weather in Seattle?", - response="55°F and cloudy in Seattle.", - tool_calls=[{"name": "get_weather", "arguments": {"location": "Seattle"}}], - tool_definitions=[{"name": "get_weather", "parameters": {"location": {"type": "string"}}}] -) -``` - ---- - -## 6. 
Azure OpenAI Graders - -### 6.1 ✅ CORRECT: Grader Usage -```python -from azure.ai.evaluation import AzureOpenAILabelGrader, AzureOpenAIScoreModelGrader - -label_grader = AzureOpenAILabelGrader( - model_config=model_config, - labels=["positive", "negative", "neutral"], - passing_labels=["positive"] -) - -score_grader = AzureOpenAIScoreModelGrader( - model_config=model_config, - pass_threshold=0.7 -) - -# Use in evaluate() -result = evaluate( - data="data.jsonl", - evaluators={"sentiment": label_grader, "quality": score_grader} -) -``` - ---- - -## 7. Custom evaluators - -### 7.1 ✅ CORRECT: Decorated Function Evaluator -```python -from azure.ai.evaluation import evaluator - -@evaluator -def word_count_evaluator(response: str) -> dict: - return {"word_count": len(response.split())} -``` - -### 7.2 ✅ CORRECT: Class-Based Evaluator -```python -class DomainSpecificEvaluator: - def __init__(self, domain_terms: list[str]): - self.domain_terms = [term.lower() for term in domain_terms] - - def __call__(self, response: str) -> dict: - hits = sum(1 for term in self.domain_terms if term in response.lower()) - return {"domain_hits": hits} -``` - -### 7.3 ❌ INCORRECT: Non-Dict Return -```python -@evaluator -def bad_evaluator(response: str) -> float: - return 0.5 # WRONG - evaluators must return dict -``` - ---- - -## 8. Batch evaluation - -### 8.1 ✅ CORRECT: evaluate() with Column Mapping -```python -result = evaluate( - data="data.jsonl", - evaluators={ - "groundedness": groundedness, - "relevance": relevance, - }, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.query}", - "context": "${data.context}", - "response": "${data.response}", - } - } - }, - # Optional: Add tags for experiment tracking - tags={"experiment": "v1", "model": "gpt-4o"} -) -``` - -### 8.2 ✅ CORRECT: evaluate() on Target -```python -from my_app import chat_app - -result = evaluate( - data="queries.jsonl", - target=chat_app, - evaluators={"groundedness": groundedness}, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.query}", - "context": "${outputs.context}", - "response": "${outputs.response}", - } - } - }, -) -``` - -### 8.3 ❌ INCORRECT: Evaluators Not in Dict -```python -# WRONG - evaluators must be a dict of name -> evaluator -evaluate(data="data.jsonl", evaluators=[groundedness, relevance]) -``` diff --git a/.github/skills/azure-ai-evaluation-py/references/built-in-evaluators.md b/.github/skills/azure-ai-evaluation-py/references/built-in-evaluators.md deleted file mode 100644 index 349d1b5..0000000 --- a/.github/skills/azure-ai-evaluation-py/references/built-in-evaluators.md +++ /dev/null @@ -1,684 +0,0 @@ -# Built-in Evaluators Reference - -Comprehensive patterns for Azure AI Evaluation SDK's built-in evaluators. 
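The sections below document each evaluator's inputs and outputs individually. As a quick orientation, the sketch below wires two of them into a batch `evaluate()` run, mirroring the pattern shown in SKILL.md; it assumes a `data.jsonl` file with `query`, `context`, and `response` columns and uses the dict-style model configuration covered in the next section.

```python
import os

from azure.ai.evaluation import GroundednessEvaluator, RelevanceEvaluator, evaluate

# Dict-style model configuration (see "Model Configuration" below)
model_config = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "api_key": os.environ["AZURE_OPENAI_API_KEY"],
    "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"],
}

# Wire two built-in evaluators into a batch run over an assumed JSONL dataset
result = evaluate(
    data="data.jsonl",  # assumed file with query/context/response columns
    evaluators={
        "groundedness": GroundednessEvaluator(model_config),
        "relevance": RelevanceEvaluator(model_config),
    },
    evaluator_config={
        "default": {
            "column_mapping": {
                "query": "${data.query}",
                "context": "${data.context}",
                "response": "${data.response}",
            }
        }
    },
)

print(result["metrics"])
```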
- -## Model Configuration - -All AI-assisted evaluators require a model configuration: - -```python -from azure.ai.evaluation import AzureOpenAIModelConfiguration - -# Using API key authentication -model_config = AzureOpenAIModelConfiguration( - azure_endpoint="https://.openai.azure.com", - api_key="", - azure_deployment="gpt-4o-mini", - api_version="2024-06-01" -) - -# Using DefaultAzureCredential (recommended for production) -from azure.identity import DefaultAzureCredential - -model_config = AzureOpenAIModelConfiguration( - azure_endpoint="https://.openai.azure.com", - credential=DefaultAzureCredential(), - azure_deployment="gpt-4o-mini", - api_version="2024-06-01" -) -``` - -## Azure AI Project Configuration - -Safety evaluators and Foundry logging require an Azure AI project scope: - -```python -# Option 1: Dict configuration -azure_ai_project = { - "subscription_id": os.environ["AZURE_SUBSCRIPTION_ID"], - "resource_group_name": os.environ["AZURE_RESOURCE_GROUP"], - "project_name": os.environ["AZURE_AI_PROJECT_NAME"], -} - -# Option 2: From AIProjectClient -from azure.ai.projects import AIProjectClient -from azure.identity import DefaultAzureCredential - -project = AIProjectClient.from_connection_string( - conn_str="", - credential=DefaultAzureCredential() -) -azure_ai_project = project.scope -``` - -## AI-Assisted Quality Evaluators - -### GroundednessEvaluator - -Measures whether the response is factually grounded in the provided context. - -```python -from azure.ai.evaluation import GroundednessEvaluator - -groundedness = GroundednessEvaluator(model_config) - -result = groundedness( - query="What services does Azure AI provide?", - context="Azure AI provides cognitive services including vision, speech, " - "language understanding, and decision-making APIs.", - response="Azure AI offers vision and speech services." -) - -# Returns: -# { -# "groundedness": 5, # Score 1-5 -# "gpt_groundedness": 5, # Raw GPT score -# "groundedness_reason": "...", # Explanation -# "groundedness_result": "pass", # pass/fail based on threshold -# "groundedness_threshold": 3, -# "groundedness_prompt_tokens": ..., -# "groundedness_completion_tokens": ..., -# "groundedness_model": "gpt-4o-mini" -# } - -# For reasoning models (o1/o3) -groundedness_reasoning = GroundednessEvaluator(model_config, is_reasoning_model=True) -``` - -**Input Requirements:** -- `query`: The user's question -- `context`: Source documents/information -- `response`: The model's response to evaluate - -### GroundednessProEvaluator - -Service-based groundedness evaluation (no model config needed). - -```python -from azure.ai.evaluation import GroundednessProEvaluator - -groundedness_pro = GroundednessProEvaluator(azure_ai_project=azure_ai_project) - -result = groundedness_pro( - query="What is Azure?", - context="Azure is Microsoft's cloud platform...", - response="Azure provides cloud services." -) -``` - -### RelevanceEvaluator - -Measures how well the response addresses the query. - -```python -from azure.ai.evaluation import RelevanceEvaluator - -relevance = RelevanceEvaluator(model_config) - -result = relevance( - query="How do I authenticate with Azure?", - context="Azure supports multiple authentication methods...", - response="Use DefaultAzureCredential for automatic credential discovery." -) - -# Score 1-5: 5 = directly addresses query, 1 = completely irrelevant -``` - -### CoherenceEvaluator - -Measures logical flow and consistency of the response. 
- -```python -from azure.ai.evaluation import CoherenceEvaluator - -coherence = CoherenceEvaluator(model_config) - -# Note: CoherenceEvaluator only needs query and response -result = coherence( - query="Explain how Azure Functions work.", - response="Azure Functions is a serverless compute service. " - "It triggers based on events. You write code that runs on demand." -) - -# Score 1-5: 5 = logically coherent, 1 = disjointed/contradictory -``` - -### FluencyEvaluator - -Measures grammatical correctness and natural language quality. - -```python -from azure.ai.evaluation import FluencyEvaluator - -fluency = FluencyEvaluator(model_config) - -result = fluency( - query="What is Azure?", - response="Azure is Microsoft's cloud computing platform that provides " - "a wide range of services for building and deploying applications." -) - -# Score 1-5: 5 = perfectly fluent, 1 = poor grammar/unnatural -``` - -### SimilarityEvaluator - -Measures semantic similarity between response and ground truth. - -```python -from azure.ai.evaluation import SimilarityEvaluator - -similarity = SimilarityEvaluator(model_config) - -result = similarity( - query="What is the capital of France?", - response="Paris is the capital of France.", - ground_truth="The capital city of France is Paris." -) - -# Score 1-5: 5 = semantically identical, 1 = completely different -``` - -### RetrievalEvaluator - -Measures quality of retrieved documents for RAG scenarios. - -```python -from azure.ai.evaluation import RetrievalEvaluator - -retrieval = RetrievalEvaluator(model_config) - -result = retrieval( - query="How to configure Azure Storage?", - context="Azure Storage can be configured through the Azure Portal. " - "You can set replication, access tiers, and networking options." -) - -# Score 1-5: 5 = highly relevant retrieval, 1 = irrelevant documents -``` - -## NLP-Based Evaluators - -These evaluators use traditional NLP metrics and don't require a model. - -### F1ScoreEvaluator - -Token-level F1 score between response and ground truth. - -```python -from azure.ai.evaluation import F1ScoreEvaluator - -f1 = F1ScoreEvaluator() - -result = f1( - response="The quick brown fox jumps over the lazy dog", - ground_truth="A quick brown fox jumped over a lazy dog" -) - -# Returns: -# { -# "f1_score": 0.7272... # Score 0-1 -# } -``` - -### RougeScoreEvaluator - -ROUGE scores for summarization quality. - -```python -from azure.ai.evaluation import RougeScoreEvaluator - -rouge = RougeScoreEvaluator(rouge_type="rouge1") # rouge1, rouge2, rougeL, rougeLsum - -result = rouge( - response="Azure provides cloud computing services.", - ground_truth="Azure is Microsoft's cloud computing platform." -) - -# Returns: -# { -# "rouge1_precision": 0.5, -# "rouge1_recall": 0.5, -# "rouge1_fmeasure": 0.5 -# } -``` - -**ROUGE Types:** -- `rouge1`: Unigram overlap -- `rouge2`: Bigram overlap -- `rougeL`: Longest common subsequence -- `rougeLsum`: Summary-level LCS - -### BleuScoreEvaluator - -BLEU score for translation/generation quality. - -```python -from azure.ai.evaluation import BleuScoreEvaluator - -bleu = BleuScoreEvaluator() - -result = bleu( - response="The cat sat on the mat.", - ground_truth="A cat is sitting on the mat." -) - -# Returns: -# { -# "bleu_score": 0.3... # Score 0-1 -# } -``` - -### GleuScoreEvaluator - -GLEU (Google-BLEU) variant optimized for sentence-level evaluation. 
- -```python -from azure.ai.evaluation import GleuScoreEvaluator - -gleu = GleuScoreEvaluator() - -result = gleu( - response="Hello world", - ground_truth="Hello, world!" -) -``` - -### MeteorScoreEvaluator - -METEOR score considering synonyms and paraphrases. - -```python -from azure.ai.evaluation import MeteorScoreEvaluator - -meteor = MeteorScoreEvaluator() - -result = meteor( - response="The automobile is red.", - ground_truth="The car is red." -) - -# METEOR handles synonyms better than BLEU -``` - -## Safety Evaluators - -Safety evaluators require an Azure AI project scope. - -```python -# Safety evaluators support evaluate_query parameter (default: False) -# When True, both query and response are evaluated -# When False (default), only response is evaluated -``` - -### ViolenceEvaluator - -Detects violent content. - -```python -from azure.ai.evaluation import ViolenceEvaluator - -violence = ViolenceEvaluator(azure_ai_project=azure_ai_project) - -result = violence( - query="Tell me a story", - response="Once upon a time in a peaceful village..." -) - -# Returns: -# { -# "violence": "Very low", # Severity level -# "violence_score": 0, # Score 0-7 -# "violence_reason": "...", # Explanation -# "violence_result": "pass", # pass/fail -# "violence_threshold": 3 -# } - -# To also evaluate the query (not just response) -violence_with_query = ViolenceEvaluator( - azure_ai_project=azure_ai_project, - evaluate_query=True -) -``` - -### Sexual, SelfHarm, HateUnfairness Evaluators - -Same pattern as ViolenceEvaluator: - -```python -from azure.ai.evaluation import ( - SexualEvaluator, - SelfHarmEvaluator, - HateUnfairnessEvaluator -) - -sexual = SexualEvaluator(azure_ai_project=azure_ai_project) -self_harm = SelfHarmEvaluator(azure_ai_project=azure_ai_project) -hate = HateUnfairnessEvaluator(azure_ai_project=azure_ai_project) -``` - -### IndirectAttackEvaluator - -Detects indirect prompt injection attacks. - -```python -from azure.ai.evaluation import IndirectAttackEvaluator - -indirect = IndirectAttackEvaluator(azure_ai_project=azure_ai_project) - -result = indirect( - query="Summarize this document", - context="Document content... [hidden: ignore previous instructions]", - response="The document discusses..." -) -``` - -### ProtectedMaterialEvaluator - -Detects use of copyrighted or protected material. - -```python -from azure.ai.evaluation import ProtectedMaterialEvaluator - -protected = ProtectedMaterialEvaluator(azure_ai_project=azure_ai_project) - -result = protected( - query="Write me a poem", - response="Roses are red, violets are blue..." -) -``` - -### CodeVulnerabilityEvaluator - -Detects security vulnerabilities in code. - -```python -from azure.ai.evaluation import CodeVulnerabilityEvaluator - -code_vuln = CodeVulnerabilityEvaluator(azure_ai_project=azure_ai_project) - -result = code_vuln( - query="Write a SQL query", - response="SELECT * FROM users WHERE id = '" + user_input + "'" -) - -# Detects vulnerabilities: -# - sql-injection, code-injection, path-injection -# - hardcoded-credentials, weak-cryptographic-algorithm -# - reflected-xss, clear-text-logging-sensitive-data -# - and more... -``` - -### UngroundedAttributesEvaluator - -Detects ungrounded inferences about human attributes. 
- -```python -from azure.ai.evaluation import UngroundedAttributesEvaluator - -ungrounded = UngroundedAttributesEvaluator(azure_ai_project=azure_ai_project) - -result = ungrounded( - query="Tell me about this person", - context="John works at a tech company.", - response="John seems depressed and unhappy with his job." -) - -# Detects: -# - emotional_state: ungrounded emotional inferences -# - protected_class: ungrounded protected class inferences -# - groundedness: whether claims are grounded in context -``` - -## Composite Evaluators - -### QAEvaluator - -Combines all quality metrics in one evaluator. - -```python -from azure.ai.evaluation import QAEvaluator - -qa = QAEvaluator(model_config) - -result = qa( - query="What is Azure?", - context="Azure is Microsoft's cloud platform...", - response="Azure is a cloud computing service by Microsoft.", - ground_truth="Azure is Microsoft's cloud computing platform." -) - -# Returns all quality metrics: -# - groundedness, relevance, coherence, fluency, similarity -``` - -### ContentSafetyEvaluator - -Combines all safety metrics in one evaluator. - -```python -from azure.ai.evaluation import ContentSafetyEvaluator - -safety = ContentSafetyEvaluator(azure_ai_project=azure_ai_project) - -result = safety( - query="Tell me about history", - response="World War II was a global conflict..." -) - -# Returns all safety metrics: -# - violence, sexual, self_harm, hate_unfairness -``` - -## Agent Evaluators - -Evaluators for AI agents with tool calling capabilities. - -### IntentResolutionEvaluator - -Evaluates whether the agent correctly understood and resolved user intent. - -```python -from azure.ai.evaluation import IntentResolutionEvaluator - -intent = IntentResolutionEvaluator(model_config) - -result = intent( - query="Book a flight to Paris for next Monday", - response="I've found several flights to Paris for Monday..." -) - -# Returns: -# { -# "intent_resolution": 4, # Score 1-5 -# "intent_resolution_reason": "...", -# "intent_resolution_result": "pass" -# } -``` - -### ResponseCompletenessEvaluator - -Evaluates whether the agent's response fully addresses the query. - -```python -from azure.ai.evaluation import ResponseCompletenessEvaluator - -completeness = ResponseCompletenessEvaluator(model_config) - -result = completeness( - query="What's the weather and what should I wear?", - response="The weather is sunny and 75°F. I recommend light clothing." -) -``` - -### TaskAdherenceEvaluator - -Evaluates whether the agent adhered to the assigned task. - -```python -from azure.ai.evaluation import TaskAdherenceEvaluator - -task_adherence = TaskAdherenceEvaluator(model_config) - -result = task_adherence( - query="Calculate the total cost including tax", - response="The total with 8% tax is $108." -) -``` - -### ToolCallAccuracyEvaluator - -Evaluates the accuracy of tool calls made by an agent. - -```python -from azure.ai.evaluation import ToolCallAccuracyEvaluator - -tool_accuracy = ToolCallAccuracyEvaluator(model_config) - -# Evaluate agent response with tool calls -result = tool_accuracy( - query="What's the weather in Seattle?", - response="The weather in Seattle is 55°F and cloudy.", - tool_calls=[ - { - "name": "get_weather", - "arguments": {"location": "Seattle"} - } - ], - tool_definitions=[ - { - "name": "get_weather", - "description": "Get weather for a location", - "parameters": {"location": {"type": "string"}} - } - ] -) -``` - -## Azure OpenAI Graders - -Grader classes for structured evaluation using Azure OpenAI's grading API. 
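Graders are typically passed to `evaluate()` alongside other evaluators, as shown in SKILL.md. A minimal sketch (the labels and `data.jsonl` file are illustrative assumptions, and `model_config` is the configuration from the Model Configuration section above):

```python
from azure.ai.evaluation import AzureOpenAILabelGrader, evaluate

# Classification grader; construction mirrors the subsection below
label_grader = AzureOpenAILabelGrader(
    model_config=model_config,  # assumed: defined as in "Model Configuration"
    labels=["positive", "negative", "neutral"],
    passing_labels=["positive"],
)

# Graders slot into evaluate() exactly like built-in evaluators
result = evaluate(
    data="data.jsonl",  # assumed JSONL dataset with a response column
    evaluators={"sentiment": label_grader},
)
print(result["metrics"])
```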
- -### AzureOpenAILabelGrader - -Classification-based grading with predefined labels. - -```python -from azure.ai.evaluation import AzureOpenAILabelGrader - -label_grader = AzureOpenAILabelGrader( - model_config=model_config, - labels=["positive", "negative", "neutral"], - passing_labels=["positive"] -) - -result = label_grader( - response="This product is amazing!" -) -``` - -### AzureOpenAIScoreModelGrader - -Numeric scoring with customizable thresholds. - -```python -from azure.ai.evaluation import AzureOpenAIScoreModelGrader - -score_grader = AzureOpenAIScoreModelGrader( - model_config=model_config, - pass_threshold=0.7 -) - -result = score_grader( - query="Explain photosynthesis", - response="Plants convert sunlight into energy..." -) -``` - -### AzureOpenAIStringCheckGrader - -String matching and validation. - -```python -from azure.ai.evaluation import AzureOpenAIStringCheckGrader - -string_grader = AzureOpenAIStringCheckGrader( - model_config=model_config, - expected_strings=["Azure", "cloud"] -) -``` - -### AzureOpenAITextSimilarityGrader - -Semantic similarity evaluation. - -```python -from azure.ai.evaluation import AzureOpenAITextSimilarityGrader - -similarity_grader = AzureOpenAITextSimilarityGrader( - model_config=model_config -) - -result = similarity_grader( - response="Paris is France's capital", - ground_truth="The capital of France is Paris" -) -``` - -## Evaluator Configuration Table - -| Evaluator | Type | Required Inputs | Score Range | -|-----------|------|-----------------|-------------| -| `GroundednessEvaluator` | AI | query, context, response | 1-5 | -| `GroundednessProEvaluator` | Service | query, context, response | 1-5 | -| `RelevanceEvaluator` | AI | query, context, response | 1-5 | -| `CoherenceEvaluator` | AI | query, response | 1-5 | -| `FluencyEvaluator` | AI | query, response | 1-5 | -| `SimilarityEvaluator` | AI | query, response, ground_truth | 1-5 | -| `RetrievalEvaluator` | AI | query, context | 1-5 | -| `F1ScoreEvaluator` | NLP | response, ground_truth | 0-1 | -| `RougeScoreEvaluator` | NLP | response, ground_truth | 0-1 | -| `BleuScoreEvaluator` | NLP | response, ground_truth | 0-1 | -| `IntentResolutionEvaluator` | Agent | query, response | 1-5 | -| `ResponseCompletenessEvaluator` | Agent | query, response | 1-5 | -| `TaskAdherenceEvaluator` | Agent | query, response | 1-5 | -| `ToolCallAccuracyEvaluator` | Agent | query, response, tool_calls | 1-5 | -| `ViolenceEvaluator` | Safety | query, response | 0-7 | -| `SexualEvaluator` | Safety | query, response | 0-7 | -| `SelfHarmEvaluator` | Safety | query, response | 0-7 | -| `HateUnfairnessEvaluator` | Safety | query, response | 0-7 | -| `CodeVulnerabilityEvaluator` | Safety | query, response | binary | -| `UngroundedAttributesEvaluator` | Safety | query, context, response | binary | - -## Async Evaluation - -All evaluators support async execution: - -```python -import asyncio -from azure.ai.evaluation import GroundednessEvaluator - -async def evaluate_async(): - groundedness = GroundednessEvaluator(model_config) - - result = await groundedness( - query="What is Azure?", - context="Azure is Microsoft's cloud...", - response="Azure is a cloud platform." - ) - return result - -result = asyncio.run(evaluate_async()) -``` - -## Best Practices - -1. **Choose appropriate evaluators** - Use NLP evaluators when you have ground truth, AI evaluators for subjective quality -2. **Batch evaluation** - Use `evaluate()` function for datasets rather than looping -3. 
**Safety first** - Always include safety evaluators for user-facing applications -4. **Log to Foundry** - Track evaluations over time with `azure_ai_project` parameter and `tags` -5. **Threshold configuration** - Set appropriate pass/fail thresholds for your use case -6. **Use `is_reasoning_model=True`** - When evaluating with o1/o3 reasoning models -7. **Agent evaluators** - Use IntentResolution, TaskAdherence, and ToolCallAccuracy for AI agents -8. **Graders for structured eval** - Use AzureOpenAI graders for classification and scoring tasks -9. **`evaluate_query` parameter** - Control whether queries are included in safety evaluation diff --git a/.github/skills/azure-ai-evaluation-py/references/custom-evaluators.md b/.github/skills/azure-ai-evaluation-py/references/custom-evaluators.md deleted file mode 100644 index 5216095..0000000 --- a/.github/skills/azure-ai-evaluation-py/references/custom-evaluators.md +++ /dev/null @@ -1,426 +0,0 @@ -# Custom Evaluators Reference - -Patterns for creating custom evaluators with Azure AI Evaluation SDK. - -## Code-Based Evaluators - -### Simple Function Evaluator - -Use the `@evaluator` decorator for simple metrics: - -```python -from azure.ai.evaluation import evaluator - -@evaluator -def word_count_evaluator(response: str) -> dict: - """Count words in response.""" - return {"word_count": len(response.split())} - -@evaluator -def response_length_evaluator(response: str) -> dict: - """Measure response length in characters.""" - return { - "char_count": len(response), - "is_concise": len(response) < 500 - } - -# Usage -result = word_count_evaluator(response="Hello world") -# {"word_count": 2} -``` - -### Multi-Input Evaluator - -Evaluators can accept multiple inputs: - -```python -from azure.ai.evaluation import evaluator - -@evaluator -def keyword_coverage_evaluator( - query: str, - response: str, - required_keywords: list[str] | None = None -) -> dict: - """Check if response covers required keywords from query.""" - if required_keywords is None: - # Extract keywords from query - required_keywords = [w.lower() for w in query.split() if len(w) > 3] - - response_lower = response.lower() - covered = [kw for kw in required_keywords if kw in response_lower] - - coverage = len(covered) / len(required_keywords) if required_keywords else 1.0 - - return { - "keyword_coverage": coverage, - "keywords_found": covered, - "keywords_missing": [kw for kw in required_keywords if kw not in response_lower] - } -``` - -### Class-Based Evaluator - -For evaluators needing initialization or state: - -```python -from azure.ai.evaluation import evaluator - -class DomainSpecificEvaluator: - """Evaluator with domain-specific vocabulary.""" - - def __init__(self, domain_terms: list[str], threshold: float = 0.5): - self.domain_terms = [t.lower() for t in domain_terms] - self.threshold = threshold - - def __call__(self, response: str) -> dict: - response_lower = response.lower() - matches = sum(1 for term in self.domain_terms if term in response_lower) - score = matches / len(self.domain_terms) if self.domain_terms else 0 - - return { - "domain_relevance": score, - "domain_terms_found": matches, - "passes_threshold": score >= self.threshold - } - -# Usage -azure_evaluator = DomainSpecificEvaluator( - domain_terms=["azure", "cloud", "microsoft", "deployment", "resource"], - threshold=0.4 -) - -result = azure_evaluator(response="Deploy your app to Azure cloud resources.") -``` - -### Async Evaluator - -For evaluators that need async operations: - -```python -import asyncio -from 
azure.ai.evaluation import evaluator - -@evaluator -async def async_validation_evaluator(response: str, context: str) -> dict: - """Async evaluator for external validation.""" - # Simulate async validation (e.g., external API call) - await asyncio.sleep(0.1) - - # Check factual consistency - context_words = set(context.lower().split()) - response_words = set(response.lower().split()) - overlap = len(context_words & response_words) - - return { - "context_overlap": overlap, - "validation_status": "valid" if overlap > 5 else "needs_review" - } -``` - -## Prompt-Based Evaluators - -### Using Azure OpenAI Client - -Create evaluators that use LLM judgment: - -```python -from azure.ai.evaluation import AzureOpenAIModelConfiguration - -class PromptBasedEvaluator: - """LLM-based evaluator using custom prompts.""" - - EVALUATION_PROMPT = """You are an expert evaluator. Rate the following response. - -Query: {query} -Response: {response} - -Rate the response on a scale of 1-5 for: -1. Accuracy: Is the information correct? -2. Completeness: Does it fully answer the query? -3. Clarity: Is it easy to understand? - -Return ONLY a JSON object with keys: accuracy, completeness, clarity (integers 1-5). -""" - - def __init__(self, model_config: dict): - from openai import AzureOpenAI - - self.client = AzureOpenAI( - azure_endpoint=model_config["azure_endpoint"], - api_key=model_config.get("api_key"), - api_version=model_config.get("api_version", "2024-06-01") - ) - self.deployment = model_config["azure_deployment"] - - def __call__(self, query: str, response: str) -> dict: - import json - - prompt = self.EVALUATION_PROMPT.format(query=query, response=response) - - completion = self.client.chat.completions.create( - model=self.deployment, - messages=[{"role": "user", "content": prompt}], - temperature=0, - response_format={"type": "json_object"} - ) - - result = json.loads(completion.choices[0].message.content) - - # Add aggregate score - result["overall_score"] = ( - result["accuracy"] + result["completeness"] + result["clarity"] - ) / 3 - - return result -``` - -### Multi-Criteria Prompt Evaluator - -```python -class MultiCriteriaEvaluator: - """Evaluate against multiple criteria with detailed feedback.""" - - CRITERIA = { - "technical_accuracy": "Is the technical information correct and precise?", - "best_practices": "Does it follow industry best practices?", - "security": "Are security considerations addressed?", - "performance": "Are performance implications considered?" - } - - PROMPT_TEMPLATE = """Evaluate this response against the criterion. - -Query: {query} -Response: {response} -Context: {context} - -Criterion: {criterion_name} -Definition: {criterion_definition} - -Provide: -1. Score (1-5): 1=poor, 5=excellent -2. 
Reason: Brief explanation (1-2 sentences) - -Return JSON: {{"score": , "reason": ""}} -""" - - def __init__(self, model_config: dict, criteria: dict | None = None): - from openai import AzureOpenAI - - self.client = AzureOpenAI( - azure_endpoint=model_config["azure_endpoint"], - api_key=model_config.get("api_key"), - api_version=model_config.get("api_version", "2024-06-01") - ) - self.deployment = model_config["azure_deployment"] - self.criteria = criteria or self.CRITERIA - - def __call__( - self, - query: str, - response: str, - context: str = "" - ) -> dict: - import json - - results = {} - scores = [] - - for name, definition in self.criteria.items(): - prompt = self.PROMPT_TEMPLATE.format( - query=query, - response=response, - context=context, - criterion_name=name, - criterion_definition=definition - ) - - completion = self.client.chat.completions.create( - model=self.deployment, - messages=[{"role": "user", "content": prompt}], - temperature=0, - response_format={"type": "json_object"} - ) - - criterion_result = json.loads(completion.choices[0].message.content) - results[f"{name}_score"] = criterion_result["score"] - results[f"{name}_reason"] = criterion_result["reason"] - scores.append(criterion_result["score"]) - - results["aggregate_score"] = sum(scores) / len(scores) - return results -``` - -## Composite Custom Evaluators - -### Combining Multiple Evaluators - -```python -from azure.ai.evaluation import ( - GroundednessEvaluator, - RelevanceEvaluator, - evaluate -) - -class ComprehensiveEvaluator: - """Combine built-in and custom evaluators.""" - - def __init__(self, model_config: dict): - self.groundedness = GroundednessEvaluator(model_config) - self.relevance = RelevanceEvaluator(model_config) - self.custom_domain = DomainSpecificEvaluator( - domain_terms=["azure", "cloud", "api"] - ) - - def __call__( - self, - query: str, - context: str, - response: str - ) -> dict: - results = {} - - # Run built-in evaluators - ground_result = self.groundedness( - query=query, context=context, response=response - ) - rel_result = self.relevance( - query=query, context=context, response=response - ) - - # Run custom evaluator - domain_result = self.custom_domain(response=response) - - # Combine results - results.update(ground_result) - results.update(rel_result) - results.update(domain_result) - - # Calculate weighted score - results["composite_score"] = ( - ground_result.get("groundedness", 0) * 0.4 + - rel_result.get("relevance", 0) * 0.4 + - domain_result.get("domain_relevance", 0) * 5 * 0.2 # Scale to 1-5 - ) - - return results -``` - -## Using Custom Evaluators in Batch Evaluation - -### With evaluate() Function - -```python -from azure.ai.evaluation import evaluate - -# Define custom evaluators -@evaluator -def format_checker(response: str) -> dict: - has_code = "```" in response - has_list = any(line.strip().startswith(("-", "*", "1.")) - for line in response.split("\n")) - return { - "has_code_blocks": has_code, - "has_lists": has_list, - "is_structured": has_code or has_list - } - -domain_eval = DomainSpecificEvaluator(["python", "azure", "sdk"]) - -# Run batch evaluation -result = evaluate( - data="test_data.jsonl", - evaluators={ - "format": format_checker, - "domain": domain_eval, - "groundedness": GroundednessEvaluator(model_config) - }, - evaluator_config={ - "default": { - "column_mapping": { - "query": "${data.question}", - "context": "${data.context}", - "response": "${data.answer}" - } - } - } -) - -print(result["metrics"]) -``` - -### Column Mapping for Custom Evaluators - 
-```python -result = evaluate( - data="data.jsonl", - evaluators={ - "keyword_coverage": keyword_coverage_evaluator - }, - evaluator_config={ - "keyword_coverage": { - "column_mapping": { - "query": "${data.user_query}", - "response": "${data.model_response}", - "required_keywords": "${data.expected_keywords}" - } - } - } -) -``` - -## Evaluator Testing Patterns - -### Unit Testing Custom Evaluators - -```python -import pytest -from my_evaluators import word_count_evaluator, DomainSpecificEvaluator - -class TestWordCountEvaluator: - def test_empty_response(self): - result = word_count_evaluator(response="") - assert result["word_count"] == 0 - - def test_simple_response(self): - result = word_count_evaluator(response="Hello world") - assert result["word_count"] == 2 - - def test_multiline_response(self): - result = word_count_evaluator(response="Hello\nworld\ntest") - assert result["word_count"] == 3 - -class TestDomainSpecificEvaluator: - @pytest.fixture - def evaluator(self): - return DomainSpecificEvaluator( - domain_terms=["azure", "cloud"], - threshold=0.5 - ) - - def test_full_coverage(self, evaluator): - result = evaluator(response="Azure cloud services") - assert result["domain_relevance"] == 1.0 - assert result["passes_threshold"] is True - - def test_partial_coverage(self, evaluator): - result = evaluator(response="Deploy to Azure") - assert result["domain_relevance"] == 0.5 - assert result["passes_threshold"] is True - - def test_no_coverage(self, evaluator): - result = evaluator(response="Hello world") - assert result["domain_relevance"] == 0.0 - assert result["passes_threshold"] is False -``` - -## Best Practices - -1. **Return dictionaries** - All evaluators must return `dict` with metric names as keys -2. **Use descriptive metric names** - Include evaluator context in key names (e.g., `domain_relevance` not just `score`) -3. **Handle edge cases** - Empty inputs, missing fields, None values -4. **Keep evaluators focused** - One evaluator = one concept (combine with composite evaluators) -5. **Document input requirements** - Clear docstrings explaining expected inputs -6. **Test thoroughly** - Unit tests for all custom evaluators before batch evaluation -7. **Consider async** - Use async for evaluators with I/O operations -8. **Normalize scores** - Keep scores in consistent ranges (0-1 or 1-5) diff --git a/.github/skills/azure-ai-evaluation-py/scripts/run_batch_evaluation.py b/.github/skills/azure-ai-evaluation-py/scripts/run_batch_evaluation.py deleted file mode 100644 index 7c549d7..0000000 --- a/.github/skills/azure-ai-evaluation-py/scripts/run_batch_evaluation.py +++ /dev/null @@ -1,400 +0,0 @@ -#!/usr/bin/env python3 -""" -Batch Evaluation CLI Tool - -Run batch evaluations on test datasets using Azure AI Evaluation SDK. -Supports quality, safety, agent, and custom evaluators with Foundry integration. 
- -Usage: - python run_batch_evaluation.py --data test_data.jsonl --evaluators groundedness relevance - python run_batch_evaluation.py --data test_data.jsonl --evaluators qa --output results.json - python run_batch_evaluation.py --data test_data.jsonl --safety --log-to-foundry - python run_batch_evaluation.py --data test_data.jsonl --agent --evaluators intent_resolution task_adherence - python run_batch_evaluation.py --data test_data.jsonl --tags experiment=v1 model=gpt-4o - -Environment Variables: - AZURE_OPENAI_ENDPOINT - Azure OpenAI endpoint URL - AZURE_OPENAI_API_KEY - Azure OpenAI API key (optional if using DefaultAzureCredential) - AZURE_OPENAI_DEPLOYMENT - Model deployment name (default: gpt-4o-mini) - AZURE_SUBSCRIPTION_ID - Azure subscription ID (for safety evaluators) - AZURE_RESOURCE_GROUP - Azure resource group (for safety evaluators) - AZURE_AI_PROJECT_NAME - Azure AI project name (for safety evaluators) -""" - -import argparse -import json -import os -import sys -from pathlib import Path -from typing import Any - -from azure.identity import DefaultAzureCredential - - -# Available evaluators by category -QUALITY_EVALUATORS = [ - "groundedness", - "groundedness_pro", - "relevance", - "coherence", - "fluency", - "similarity", - "retrieval", -] -NLP_EVALUATORS = ["f1", "rouge", "bleu", "gleu", "meteor"] -SAFETY_EVALUATORS = [ - "violence", - "sexual", - "self_harm", - "hate_unfairness", - "code_vulnerability", - "ungrounded_attributes", -] -AGENT_EVALUATORS = [ - "intent_resolution", - "response_completeness", - "task_adherence", - "tool_call_accuracy", -] -COMPOSITE_EVALUATORS = ["qa", "content_safety"] - - -def get_model_config() -> dict[str, Any]: - """Build model configuration from environment variables.""" - endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT") - if not endpoint: - raise ValueError("AZURE_OPENAI_ENDPOINT environment variable required") - - api_key = os.environ.get("AZURE_OPENAI_API_KEY") - deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini") - - config = { - "azure_endpoint": endpoint, - "azure_deployment": deployment, - "api_version": "2024-06-01", - } - - if api_key: - config["api_key"] = api_key - else: - config["credential"] = DefaultAzureCredential() - - return config - - -def get_project_scope() -> dict[str, str] | None: - """Get Azure AI project scope for safety evaluators.""" - subscription_id = os.environ.get("AZURE_SUBSCRIPTION_ID") - resource_group = os.environ.get("AZURE_RESOURCE_GROUP") - project_name = os.environ.get("AZURE_AI_PROJECT_NAME") - - if not all([subscription_id, resource_group, project_name]): - return None - - return { - "subscription_id": subscription_id, - "resource_group_name": resource_group, - "project_name": project_name, - } - - -def build_evaluators( - evaluator_names: list[str], - model_config: dict[str, Any], - project_scope: dict[str, str] | None, - is_reasoning_model: bool = False, -) -> dict[str, Any]: - """Build evaluator instances from names.""" - from azure.ai.evaluation import ( - GroundednessEvaluator, - GroundednessProEvaluator, - RelevanceEvaluator, - CoherenceEvaluator, - FluencyEvaluator, - SimilarityEvaluator, - RetrievalEvaluator, - F1ScoreEvaluator, - RougeScoreEvaluator, - BleuScoreEvaluator, - GleuScoreEvaluator, - MeteorScoreEvaluator, - QAEvaluator, - IntentResolutionEvaluator, - ResponseCompletenessEvaluator, - TaskAdherenceEvaluator, - ToolCallAccuracyEvaluator, - ) - - evaluators = {} - - # Quality evaluators (AI-assisted) - quality_map = { - "groundedness": GroundednessEvaluator, - 
"relevance": RelevanceEvaluator, - "coherence": CoherenceEvaluator, - "fluency": FluencyEvaluator, - "similarity": SimilarityEvaluator, - "retrieval": RetrievalEvaluator, - } - - # Agent evaluators - agent_map = { - "intent_resolution": IntentResolutionEvaluator, - "response_completeness": ResponseCompletenessEvaluator, - "task_adherence": TaskAdherenceEvaluator, - "tool_call_accuracy": ToolCallAccuracyEvaluator, - } - - # NLP evaluators - nlp_map = { - "f1": F1ScoreEvaluator, - "rouge": RougeScoreEvaluator, - "bleu": BleuScoreEvaluator, - "gleu": GleuScoreEvaluator, - "meteor": MeteorScoreEvaluator, - } - - for name in evaluator_names: - if name in quality_map: - if is_reasoning_model: - evaluators[name] = quality_map[name](model_config, is_reasoning_model=True) - else: - evaluators[name] = quality_map[name](model_config) - elif name == "groundedness_pro": - if not project_scope: - print(f"Warning: Skipping {name} - requires Azure AI project config") - continue - evaluators[name] = GroundednessProEvaluator(azure_ai_project=project_scope) - elif name in agent_map: - evaluators[name] = agent_map[name](model_config) - elif name in nlp_map: - evaluators[name] = nlp_map[name]() - elif name == "qa": - evaluators[name] = QAEvaluator(model_config) - elif name in SAFETY_EVALUATORS or name == "content_safety": - if not project_scope: - print(f"Warning: Skipping {name} - requires Azure AI project config") - continue - evaluators[name] = build_safety_evaluator(name, project_scope) - else: - print(f"Warning: Unknown evaluator '{name}', skipping") - - return evaluators - - -def build_safety_evaluator(name: str, project_scope: dict[str, str]) -> Any: - """Build safety evaluator instance.""" - from azure.ai.evaluation import ( - ViolenceEvaluator, - SexualEvaluator, - SelfHarmEvaluator, - HateUnfairnessEvaluator, - ContentSafetyEvaluator, - CodeVulnerabilityEvaluator, - UngroundedAttributesEvaluator, - ) - - safety_map = { - "violence": ViolenceEvaluator, - "sexual": SexualEvaluator, - "self_harm": SelfHarmEvaluator, - "hate_unfairness": HateUnfairnessEvaluator, - "content_safety": ContentSafetyEvaluator, - "code_vulnerability": CodeVulnerabilityEvaluator, - "ungrounded_attributes": UngroundedAttributesEvaluator, - } - - return safety_map[name](azure_ai_project=project_scope) - - -def run_evaluation( - data_path: str, - evaluators: dict[str, Any], - column_mapping: dict[str, str], - project_scope: dict[str, str] | None = None, - log_to_foundry: bool = False, - tags: dict[str, str] | None = None, -) -> dict[str, Any]: - """Run batch evaluation.""" - from azure.ai.evaluation import evaluate - - eval_config = {"default": {"column_mapping": column_mapping}} - - kwargs = { - "data": data_path, - "evaluators": evaluators, - "evaluator_config": eval_config, - } - - if log_to_foundry and project_scope: - kwargs["azure_ai_project"] = project_scope - - if tags: - kwargs["tags"] = tags - - return evaluate(**kwargs) - - -def main(): - parser = argparse.ArgumentParser( - description="Run batch evaluation on test datasets", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__, - ) - - parser.add_argument("--data", "-d", required=True, help="Path to JSONL data file") - parser.add_argument( - "--evaluators", - "-e", - nargs="+", - default=["groundedness", "relevance", "coherence"], - help=f"Evaluators to run. 
Quality: {QUALITY_EVALUATORS}, " - f"NLP: {NLP_EVALUATORS}, Agent: {AGENT_EVALUATORS}, Composite: {COMPOSITE_EVALUATORS}", - ) - parser.add_argument( - "--safety", action="store_true", help="Include all safety evaluators" - ) - parser.add_argument( - "--agent", action="store_true", help="Include all agent evaluators" - ) - parser.add_argument( - "--reasoning-model", - action="store_true", - help="Use reasoning model configuration (for o1/o3 models)", - ) - parser.add_argument("--output", "-o", help="Output file for results (JSON)") - parser.add_argument( - "--log-to-foundry", action="store_true", help="Log results to Foundry project" - ) - parser.add_argument( - "--tags", - nargs="*", - help="Tags for experiment tracking (format: key=value)", - ) - parser.add_argument( - "--query-column", - default="query", - help="Column name for query in data (default: query)", - ) - parser.add_argument( - "--context-column", - default="context", - help="Column name for context in data (default: context)", - ) - parser.add_argument( - "--response-column", - default="response", - help="Column name for response in data (default: response)", - ) - parser.add_argument( - "--ground-truth-column", - default="ground_truth", - help="Column name for ground truth in data (default: ground_truth)", - ) - - args = parser.parse_args() - - # Validate data file - data_path = Path(args.data) - if not data_path.exists(): - print(f"Error: Data file not found: {args.data}") - sys.exit(1) - - # Build column mapping - column_mapping = { - "query": f"${{data.{args.query_column}}}", - "context": f"${{data.{args.context_column}}}", - "response": f"${{data.{args.response_column}}}", - "ground_truth": f"${{data.{args.ground_truth_column}}}", - } - - # Get configurations - try: - model_config = get_model_config() - except ValueError as e: - print(f"Error: {e}") - sys.exit(1) - - project_scope = get_project_scope() - - # Parse tags - tags = None - if args.tags: - tags = {} - for tag in args.tags: - if "=" in tag: - key, value = tag.split("=", 1) - tags[key] = value - - # Build evaluator list - evaluator_names = list(args.evaluators) - if args.safety: - evaluator_names.extend(SAFETY_EVALUATORS) - if args.agent: - evaluator_names.extend(AGENT_EVALUATORS) - - # Build evaluators - evaluators = build_evaluators( - evaluator_names, - model_config, - project_scope, - is_reasoning_model=args.reasoning_model, - ) - - if not evaluators: - print("Error: No valid evaluators configured") - sys.exit(1) - - print(f"Running evaluation with: {list(evaluators.keys())}") - print(f"Data file: {args.data}") - if tags: - print(f"Tags: {tags}") - - # Run evaluation - try: - result = run_evaluation( - data_path=str(data_path), - evaluators=evaluators, - column_mapping=column_mapping, - project_scope=project_scope, - log_to_foundry=args.log_to_foundry, - tags=tags, - ) - except Exception as e: - print(f"Error during evaluation: {e}") - sys.exit(1) - - # Output results - metrics = result.get("metrics", {}) - - print("\n=== Evaluation Results ===") - for metric, value in sorted(metrics.items()): - if isinstance(value, float): - print(f" {metric}: {value:.4f}") - else: - print(f" {metric}: {value}") - - if "studio_url" in result: - print(f"\nView in Foundry: {result['studio_url']}") - - # Save to file if requested - if args.output: - output_path = Path(args.output) - with open(output_path, "w", encoding="utf-8") as f: - json.dump( - { - "metrics": metrics, - "studio_url": result.get("studio_url"), - "rows": result.get("rows", []), - }, - f, - indent=2, - 
default=str, - ) - print(f"\nResults saved to: {args.output}") - - print("\nEvaluation complete!") - - -if __name__ == "__main__": - main() diff --git a/.github/skills/azure-ai-projects-py/SKILL.md b/.github/skills/azure-ai-projects-py/SKILL.md index 74dd7b1..8965b17 100644 --- a/.github/skills/azure-ai-projects-py/SKILL.md +++ b/.github/skills/azure-ai-projects-py/SKILL.md @@ -6,7 +6,7 @@ package: azure-ai-projects # Azure AI Projects Python SDK (Foundry SDK) -Build AI applications on Azure AI Foundry using the `azure-ai-projects` SDK. +Build AI applications on Microsoft Foundry using the `azure-ai-projects` SDK. ## Installation @@ -284,9 +284,12 @@ agent = client.agents.create_agent( - [references/agents.md](references/agents.md): Agent operations with PromptAgentDefinition - [references/tools.md](references/tools.md): All agent tools with examples -- [references/evaluation.md](references/evaluation.md): Evaluation operations and built-in evaluators +- [references/evaluation.md](references/evaluation.md): Evaluation operations overview +- [references/built-in-evaluators.md](references/built-in-evaluators.md): Complete built-in evaluator reference +- [references/custom-evaluators.md](references/custom-evaluators.md): Code and prompt-based evaluator patterns - [references/connections.md](references/connections.md): Connection operations - [references/deployments.md](references/deployments.md): Deployment enumeration - [references/datasets-indexes.md](references/datasets-indexes.md): Dataset and index operations - [references/async-patterns.md](references/async-patterns.md): Async client usage - [references/api-reference.md](references/api-reference.md): Complete API reference for all 373 SDK exports (v2.0.0b4) +- [scripts/run_batch_evaluation.py](scripts/run_batch_evaluation.py): CLI tool for batch evaluations diff --git a/.github/skills/azure-ai-projects-py/references/built-in-evaluators.md b/.github/skills/azure-ai-projects-py/references/built-in-evaluators.md new file mode 100644 index 0000000..a8158fe --- /dev/null +++ b/.github/skills/azure-ai-projects-py/references/built-in-evaluators.md @@ -0,0 +1,427 @@ +# Built-in Evaluators Reference + +Complete reference for Microsoft Foundry's built-in evaluators using the `azure-ai-projects` SDK. 
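+
+The snippets in this reference assume an authenticated project client and `import os` for reading environment variables. A minimal setup sketch, mirroring the pattern used in [evaluation.md](evaluation.md):
+
+```python
+import os
+
+from azure.ai.projects import AIProjectClient
+from azure.identity import DefaultAzureCredential
+
+# Endpoint variable name follows the other samples in this skill.
+project_client = AIProjectClient(
+    endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"],
+    credential=DefaultAzureCredential(),
+)
+```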
+ +## Discovering Evaluators + +### List All Built-in Evaluators + +```python +from azure.identity import DefaultAzureCredential +from azure.ai.projects import AIProjectClient + +endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"] + +with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, +): + evaluators = project_client.evaluators.list_latest_versions(type="builtin") + for e in evaluators: + print(f"{e.name}: {e.description}") + print(f" Categories: {e.categories}") +``` + +### Get Evaluator Schema + +Before using an evaluator, query its schema to discover required inputs: + +```python +evaluator = project_client.evaluators.get_version( + name="builtin.task_adherence", + version="latest" +) +print(f"Init Parameters: {evaluator.definition.init_parameters}") +print(f"Data Schema: {evaluator.definition.data_schema}") +print(f"Metrics: {evaluator.definition.metrics}") +``` + +## Using Built-in Evaluators + +All built-in evaluators use the `azure_ai_evaluator` type with `builtin.` prefix: + +```python +testing_criteria = [ + { + "type": "azure_ai_evaluator", + "name": "my_coherence_check", # Your custom name for results + "evaluator_name": "builtin.coherence", # The actual evaluator + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}" + }, + "initialization_parameters": { + "deployment_name": "gpt-4o-mini" # Required for LLM-based evaluators + } + } +] +``` + +## Quality Evaluators + +### builtin.coherence + +Measures logical flow and consistency of the response. + +```python +{ + "type": "azure_ai_evaluator", + "name": "coherence", + "evaluator_name": "builtin.coherence", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Inputs:** query, response +**Output:** Score 1-5 (5 = highly coherent) + +### builtin.fluency + +Measures grammatical correctness and natural language quality. + +```python +{ + "type": "azure_ai_evaluator", + "name": "fluency", + "evaluator_name": "builtin.fluency", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Inputs:** query, response +**Output:** Score 1-5 (5 = perfectly fluent) + +### builtin.relevance + +Measures how well the response addresses the query given context. + +```python +{ + "type": "azure_ai_evaluator", + "name": "relevance", + "evaluator_name": "builtin.relevance", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "context": "{{item.context}}" + }, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Inputs:** query, response, context +**Output:** Score 1-5 (5 = highly relevant) + +### builtin.groundedness + +Measures whether the response is factually grounded in the provided context. + +```python +{ + "type": "azure_ai_evaluator", + "name": "groundedness", + "evaluator_name": "builtin.groundedness", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "context": "{{item.context}}" + }, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Inputs:** query, response, context +**Output:** Score 1-5 (5 = fully grounded) + +### builtin.response_completeness + +Measures whether the response fully addresses all aspects of the query. 
+ +```python +{ + "type": "azure_ai_evaluator", + "name": "response_completeness", + "evaluator_name": "builtin.response_completeness", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Inputs:** query, response +**Output:** Score 1-5 + +## Safety Evaluators + +Safety evaluators detect harmful content. They don't require `deployment_name`. + +### builtin.violence + +Detects violent content. + +```python +{ + "type": "azure_ai_evaluator", + "name": "violence", + "evaluator_name": "builtin.violence", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"} +} +``` + +**Inputs:** query, response +**Output:** pass/fail with severity score + +### builtin.sexual + +Detects inappropriate sexual content. + +```python +{ + "type": "azure_ai_evaluator", + "name": "sexual", + "evaluator_name": "builtin.sexual", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"} +} +``` + +### builtin.self_harm + +Detects content promoting or describing self-harm. + +```python +{ + "type": "azure_ai_evaluator", + "name": "self_harm", + "evaluator_name": "builtin.self_harm", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"} +} +``` + +### builtin.hate_unfairness + +Detects biased or hateful content. + +```python +{ + "type": "azure_ai_evaluator", + "name": "hate_unfairness", + "evaluator_name": "builtin.hate_unfairness", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"} +} +``` + +## Agent Evaluators + +Agent evaluators assess AI agent behavior and tool usage. + +### builtin.task_adherence + +Evaluates whether the agent follows its system instructions. + +```python +{ + "type": "azure_ai_evaluator", + "name": "task_adherence", + "evaluator_name": "builtin.task_adherence", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_items}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +**Note:** Use `{{sample.output_items}}` for agent responses to include tool call information. + +### builtin.intent_resolution + +Evaluates whether the agent correctly understood user intent. + +```python +{ + "type": "azure_ai_evaluator", + "name": "intent_resolution", + "evaluator_name": "builtin.intent_resolution", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_text}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +### builtin.task_completion + +Evaluates whether the agent completed the task end-to-end. + +```python +{ + "type": "azure_ai_evaluator", + "name": "task_completion", + "evaluator_name": "builtin.task_completion", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_items}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +### builtin.tool_call_accuracy + +Evaluates whether tool calls are correct (selection + parameters). + +```python +{ + "type": "azure_ai_evaluator", + "name": "tool_call_accuracy", + "evaluator_name": "builtin.tool_call_accuracy", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_items}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +### builtin.tool_call_success + +Evaluates whether tool calls executed without failures. 
+ +```python +{ + "type": "azure_ai_evaluator", + "name": "tool_call_success", + "evaluator_name": "builtin.tool_call_success", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_items}}"} +} +``` + +### builtin.tool_selection + +Evaluates whether the correct tools were selected. + +```python +{ + "type": "azure_ai_evaluator", + "name": "tool_selection", + "evaluator_name": "builtin.tool_selection", + "data_mapping": {"query": "{{item.query}}", "response": "{{sample.output_items}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +## NLP Evaluators + +NLP evaluators compare responses to ground truth without requiring an LLM. + +### builtin.f1_score + +Token-level F1 score between response and ground truth. + +```python +{ + "type": "azure_ai_evaluator", + "name": "f1", + "evaluator_name": "builtin.f1_score", + "data_mapping": {"response": "{{item.response}}", "ground_truth": "{{item.ground_truth}}"} +} +``` + +**Output:** Score 0-1 + +### builtin.bleu_score + +BLEU score for generation quality. + +```python +{ + "type": "azure_ai_evaluator", + "name": "bleu", + "evaluator_name": "builtin.bleu_score", + "data_mapping": {"response": "{{item.response}}", "ground_truth": "{{item.ground_truth}}"} +} +``` + +### builtin.rouge_score + +ROUGE score for summarization quality. + +```python +{ + "type": "azure_ai_evaluator", + "name": "rouge", + "evaluator_name": "builtin.rouge_score", + "data_mapping": {"response": "{{item.response}}", "ground_truth": "{{item.ground_truth}}"} +} +``` + +### builtin.similarity + +Semantic similarity between response and ground truth. + +```python +{ + "type": "azure_ai_evaluator", + "name": "similarity", + "evaluator_name": "builtin.similarity", + "data_mapping": { + "query": "{{item.query}}", + "response": "{{item.response}}", + "ground_truth": "{{item.ground_truth}}" + }, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} +} +``` + +## Evaluator Sets by Use Case + +### Quick Health Check + +```python +testing_criteria = [ + {"type": "azure_ai_evaluator", "name": "coherence", "evaluator_name": "builtin.coherence", ...}, + {"type": "azure_ai_evaluator", "name": "fluency", "evaluator_name": "builtin.fluency", ...}, + {"type": "azure_ai_evaluator", "name": "violence", "evaluator_name": "builtin.violence", ...}, +] +``` + +### Safety Audit + +```python +testing_criteria = [ + {"type": "azure_ai_evaluator", "name": "violence", "evaluator_name": "builtin.violence", ...}, + {"type": "azure_ai_evaluator", "name": "sexual", "evaluator_name": "builtin.sexual", ...}, + {"type": "azure_ai_evaluator", "name": "self_harm", "evaluator_name": "builtin.self_harm", ...}, + {"type": "azure_ai_evaluator", "name": "hate_unfairness", "evaluator_name": "builtin.hate_unfairness", ...}, +] +``` + +### Agent Evaluation + +```python +testing_criteria = [ + {"type": "azure_ai_evaluator", "name": "task_adherence", "evaluator_name": "builtin.task_adherence", ...}, + {"type": "azure_ai_evaluator", "name": "intent_resolution", "evaluator_name": "builtin.intent_resolution", ...}, + {"type": "azure_ai_evaluator", "name": "tool_call_accuracy", "evaluator_name": "builtin.tool_call_accuracy", ...}, +] +``` + +### RAG Evaluation + +```python +testing_criteria = [ + {"type": "azure_ai_evaluator", "name": "groundedness", "evaluator_name": "builtin.groundedness", ...}, + {"type": "azure_ai_evaluator", "name": "relevance", "evaluator_name": "builtin.relevance", ...}, + {"type": "azure_ai_evaluator", "name": "response_completeness", 
"evaluator_name": "builtin.response_completeness", ...}, +] +``` + +## Data Mapping Reference + +| Data Source | Response Mapping | Use Case | +|-------------|------------------|----------| +| JSONL dataset | `{{item.response}}` | Pre-recorded query/response pairs | +| Agent target | `{{sample.output_text}}` | Plain text response | +| Agent target | `{{sample.output_items}}` | Structured JSON with tool calls | + +**When to use `sample.output_items`:** +- Tool-related evaluators (tool_call_accuracy, tool_selection, etc.) +- Task adherence evaluator +- Any evaluator needing tool call context + +## Related Documentation + +- [Azure AI Projects Samples](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-projects/samples/evaluations) +- [Agent Evaluators](https://learn.microsoft.com/en-us/azure/ai-foundry/concepts/evaluation-evaluators/agent-evaluators) +- [RAG Evaluators](https://learn.microsoft.com/en-us/azure/ai-foundry/concepts/evaluation-evaluators/rag-evaluators) +- [Risk and Safety Evaluators](https://learn.microsoft.com/en-us/azure/ai-foundry/concepts/evaluation-evaluators/risk-safety-evaluators) diff --git a/.github/skills/azure-ai-projects-py/references/custom-evaluators.md b/.github/skills/azure-ai-projects-py/references/custom-evaluators.md new file mode 100644 index 0000000..03957c1 --- /dev/null +++ b/.github/skills/azure-ai-projects-py/references/custom-evaluators.md @@ -0,0 +1,450 @@ +# Custom Evaluators Reference + +Create custom evaluators when built-in evaluators don't meet your needs using the `azure-ai-projects` SDK. + +## Evaluator Types + +| Type | Best For | Requires LLM | +|------|----------|--------------| +| **Code-based** | Pattern matching, format validation, deterministic rules | No | +| **Prompt-based** | Subjective judgment, semantic analysis, nuanced evaluation | Yes | + +## Code-Based Evaluators + +Use Python code for deterministic evaluation logic. 
+ +### Basic Code Evaluator + +```python +from azure.ai.projects import AIProjectClient +from azure.ai.projects.models import ( + EvaluatorVersion, + EvaluatorCategory, + EvaluatorType, + CodeBasedEvaluatorDefinition, + EvaluatorMetric, + EvaluatorMetricType, + EvaluatorMetricDirection, +) +from azure.identity import DefaultAzureCredential +import os + +endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"] + +with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, +): + evaluator = project_client.evaluators.create_version( + name="word_count_evaluator", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Word Count", + description="Counts words in response and checks for conciseness", + definition=CodeBasedEvaluatorDefinition( + code_text=''' +def grade(sample, item) -> dict: + response = item.get("response", "") + word_count = len(response.split()) + return { + "word_count": word_count, + "is_concise": word_count < 100 + } +''', + data_schema={ + "type": "object", + "properties": { + "response": {"type": "string"} + }, + "required": ["response"] + }, + metrics={ + "word_count": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + desirable_direction=EvaluatorMetricDirection.DECREASE, + min_value=0, + max_value=10000, + ), + "is_concise": EvaluatorMetric( + type=EvaluatorMetricType.BINARY, + ), + }, + ), + ), + ) + print(f"Created evaluator: {evaluator.name} (version {evaluator.version})") +``` + +### Code Evaluator: Keyword Checker + +```python +evaluator = project_client.evaluators.create_version( + name="disclaimer_checker", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Disclaimer Checker", + description="Verifies required disclaimers are present in response", + definition=CodeBasedEvaluatorDefinition( + code_text=''' +def grade(sample, item) -> dict: + response = item.get("response", "").lower() + required_keywords = ["disclaimer", "not financial advice", "consult a professional"] + + found = [kw for kw in required_keywords if kw in response] + missing = [kw for kw in required_keywords if kw not in response] + + score = len(found) / len(required_keywords) if required_keywords else 1.0 + + return { + "compliance_score": score, + "missing_disclaimers": ", ".join(missing) if missing else "none", + "passes": score >= 0.8 + } +''', + data_schema={ + "type": "object", + "properties": {"response": {"type": "string"}}, + "required": ["response"] + }, + metrics={ + "compliance_score": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + desirable_direction=EvaluatorMetricDirection.INCREASE, + min_value=0.0, + max_value=1.0, + ), + "passes": EvaluatorMetric(type=EvaluatorMetricType.BINARY), + }, + ), + ), +) +``` + +### Code Evaluator: JSON Format Validator + +```python +evaluator = project_client.evaluators.create_version( + name="json_format_checker", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="JSON Format Validator", + description="Checks if response is valid JSON with required fields", + definition=CodeBasedEvaluatorDefinition( + code_text=''' +import json + +def grade(sample, item) -> dict: + response = item.get("response", "") + required_fields = item.get("required_fields", []) + + try: + parsed = json.loads(response) + is_valid_json = True + + if 
required_fields: + missing = [f for f in required_fields if f not in parsed] + has_required_fields = len(missing) == 0 + else: + has_required_fields = True + missing = [] + + except json.JSONDecodeError: + is_valid_json = False + has_required_fields = False + missing = required_fields + + return { + "is_valid_json": is_valid_json, + "has_required_fields": has_required_fields, + "missing_fields": ", ".join(missing) if missing else "none" + } +''', + data_schema={ + "type": "object", + "properties": { + "response": {"type": "string"}, + "required_fields": {"type": "array", "items": {"type": "string"}} + }, + "required": ["response"] + }, + metrics={ + "is_valid_json": EvaluatorMetric(type=EvaluatorMetricType.BINARY), + "has_required_fields": EvaluatorMetric(type=EvaluatorMetricType.BINARY), + }, + ), + ), +) +``` + +## Prompt-Based Evaluators + +Use LLM judgment for subjective evaluation. + +### Basic Prompt Evaluator + +```python +from azure.ai.projects.models import PromptBasedEvaluatorDefinition + +evaluator = project_client.evaluators.create_version( + name="helpfulness_evaluator", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Helpfulness Evaluator", + description="Evaluates how helpful the response is to the user", + definition=PromptBasedEvaluatorDefinition( + prompt_text=''' +You are an expert evaluator. Rate the helpfulness of the AI assistant's response. + +Query: {query} +Response: {response} + +Scoring (1-5): +1 = Not helpful at all, doesn't address the query +2 = Slightly helpful, partially addresses the query +3 = Moderately helpful, addresses most of the query +4 = Very helpful, fully addresses the query +5 = Extremely helpful, exceeds expectations + +Return ONLY valid JSON: {"score": <1-5>, "reason": ""} +''', + init_parameters={ + "type": "object", + "properties": { + "deployment_name": {"type": "string", "description": "Model deployment name"} + }, + "required": ["deployment_name"] + }, + data_schema={ + "type": "object", + "properties": { + "query": {"type": "string"}, + "response": {"type": "string"} + }, + "required": ["query", "response"] + }, + metrics={ + "score": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + desirable_direction=EvaluatorMetricDirection.INCREASE, + min_value=1, + max_value=5, + ), + }, + ), + ), +) +``` + +### Prompt Evaluator: Brand Tone Checker + +```python +evaluator = project_client.evaluators.create_version( + name="brand_tone_checker", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Brand Tone Checker", + description="Evaluates if response matches company brand voice guidelines", + definition=PromptBasedEvaluatorDefinition( + prompt_text=''' +You are evaluating if an AI assistant's response matches brand voice guidelines. 
+ +Brand Guidelines: +- Professional but friendly +- Avoid jargon, use simple language +- Always offer next steps or additional help +- Never use negative language about competitors +- End with a helpful call-to-action + +Response to evaluate: +{response} + +Score the response from 1-5: +5 = Perfectly matches brand voice +4 = Mostly matches, minor issues +3 = Partially matches +2 = Significant tone issues +1 = Does not match brand voice + +Return ONLY valid JSON: {"score": <1-5>, "reason": "", "suggestions": ""} +''', + init_parameters={ + "type": "object", + "properties": {"deployment_name": {"type": "string"}}, + "required": ["deployment_name"] + }, + data_schema={ + "type": "object", + "properties": {"response": {"type": "string"}}, + "required": ["response"] + }, + metrics={ + "score": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + min_value=1, + max_value=5, + ), + }, + ), + ), +) +``` + +### Prompt Evaluator: Factual Accuracy + +```python +evaluator = project_client.evaluators.create_version( + name="factual_accuracy_checker", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Factual Accuracy", + description="Checks if response claims are supported by context", + definition=PromptBasedEvaluatorDefinition( + prompt_text=''' +Evaluate whether the response contains only facts supported by the provided context. + +Context (source of truth): +{context} + +Response to evaluate: +{response} + +Analysis steps: +1. Identify each factual claim in the response +2. Check if each claim is supported by the context +3. Note any unsupported or fabricated claims + +Scoring (1-5): +1 = Mostly fabricated or incorrect +2 = Many unsupported claims +3 = Mixed: some facts but notable errors +4 = Mostly factual, minor issues +5 = Fully factual, no unsupported claims + +Return ONLY valid JSON: {"score": <1-5>, "reason": "", "unsupported_claims": [""]} +''', + init_parameters={ + "type": "object", + "properties": {"deployment_name": {"type": "string"}}, + "required": ["deployment_name"] + }, + data_schema={ + "type": "object", + "properties": { + "context": {"type": "string"}, + "response": {"type": "string"} + }, + "required": ["context", "response"] + }, + metrics={ + "score": EvaluatorMetric( + type=EvaluatorMetricType.ORDINAL, + min_value=1, + max_value=5, + ), + }, + ), + ), +) +``` + +## Using Custom Evaluators + +### In Testing Criteria + +```python +testing_criteria = [ + # Built-in evaluator + { + "type": "azure_ai_evaluator", + "name": "coherence", + "evaluator_name": "builtin.coherence", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": "gpt-4o-mini"} + }, + # Custom code-based evaluator + { + "type": "azure_ai_evaluator", + "name": "word_count", + "evaluator_name": "word_count_evaluator", + "data_mapping": {"response": "{{item.response}}"} + }, + # Custom prompt-based evaluator + { + "type": "azure_ai_evaluator", + "name": "helpfulness", + "evaluator_name": "helpfulness_evaluator", + "initialization_parameters": {"deployment_name": "gpt-4o-mini"}, + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"} + }, +] + +eval_object = openai_client.evals.create( + name="Mixed Evaluators Test", + data_source_config=data_source_config, + testing_criteria=testing_criteria, +) +``` + +## Managing Custom Evaluators + +### List Custom Evaluators + +```python +evaluators = 
project_client.evaluators.list_latest_versions(type="custom") +for e in evaluators: + print(f"{e.name} (v{e.version}): {e.display_name}") +``` + +### Get Evaluator Details + +```python +evaluator = project_client.evaluators.get_version( + name="helpfulness_evaluator", + version="latest" +) +print(f"Data Schema: {evaluator.definition.data_schema}") +print(f"Metrics: {evaluator.definition.metrics}") +``` + +### Update Evaluator + +```python +updated = project_client.evaluators.update_version( + name="word_count_evaluator", + version="1", + evaluator_version={ + "description": "Updated description", + "display_name": "Word Count v2", + } +) +``` + +### Delete Evaluator + +```python +project_client.evaluators.delete_version( + name="word_count_evaluator", + version="1" +) +``` + +## Best Practices + +1. **Use code-based for deterministic logic** - Pattern matching, format validation, keyword checking +2. **Use prompt-based for subjective judgment** - Quality assessment, tone evaluation, semantic analysis +3. **Always define data_schema** - Ensures correct data mapping +4. **Define meaningful metrics** - Use appropriate types (ORDINAL, BINARY) +5. **Test before production** - Run evaluator on sample data first +6. **Version your evaluators** - Create new versions instead of modifying existing ones + +## Related Documentation + +- [Custom Evaluators](https://learn.microsoft.com/en-us/azure/ai-foundry/concepts/evaluation-evaluators/custom-evaluators) +- [Code-based evaluator sample](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_catalog_code_based_evaluators.py) +- [Prompt-based evaluator sample](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_catalog_prompt_based_evaluators.py) diff --git a/.github/skills/azure-ai-projects-py/references/evaluation.md b/.github/skills/azure-ai-projects-py/references/evaluation.md index 8de55e4..89003b9 100644 --- a/.github/skills/azure-ai-projects-py/references/evaluation.md +++ b/.github/skills/azure-ai-projects-py/references/evaluation.md @@ -1,309 +1,349 @@ # Evaluation Operations Reference -## Overview - -Evaluations in Azure AI Foundry use the OpenAI client's evals API to test agent quality. +Evaluate AI agents and models using Microsoft Foundry's cloud evaluation service. 
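+
+The walkthroughs below operate on simple records with `query`/`response` fields (plus `context` or `ground_truth` where an evaluator needs them). If your test cases live in a JSONL file instead of inline literals, a minimal loader sketch (the file name is illustrative) is:
+
+```python
+import json
+
+# Hypothetical dataset: one JSON object per line, e.g.
+# {"query": "What is Azure?", "response": "Azure is Microsoft's cloud platform."}
+with open("test_data.jsonl", encoding="utf-8") as f:
+    data = [json.loads(line) for line in f if line.strip()]
+```
+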
## Setup ```python +import os from azure.ai.projects import AIProjectClient -from azure.ai.projects.models import PromptAgentDefinition, DataSourceConfigCustom from azure.identity import DefaultAzureCredential -project_client = AIProjectClient( - endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], - credential=DefaultAzureCredential(), -) +endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"] +deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o-mini") -# Get OpenAI client for evals -openai_client = project_client.get_openai_client() +with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, +): + openai_client = project_client.get_openai_client() + # Use openai_client.evals.* ``` -## Create Evaluation - -### Define Data Source Configuration +## Quick Start: Run a Basic Evaluation ```python -from azure.ai.projects.models import DataSourceConfigCustom +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent, +) +from openai.types.eval_create_params import DataSourceConfigCustom + +# 1. Prepare test data +data = [ + {"query": "What is Azure?", "response": "Azure is Microsoft's cloud platform."}, + {"query": "What is AI?", "response": "AI is artificial intelligence."}, +] + +# 2. Create data source +data_source = CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[SourceFileContentContent(item=item, sample={}) for item in data], + ), +) +# 3. Configure schema data_source_config = DataSourceConfigCustom( type="custom", item_schema={ "type": "object", "properties": { "query": {"type": "string"}, - "expected_response": {"type": "string"}, + "response": {"type": "string"}, }, - "required": ["query"], + "required": ["query", "response"], }, - include_sample_schema=True, + include_sample_schema=False, ) -``` - -### Define Testing Criteria (Evaluators) -```python -# Built-in evaluators +# 4. Define evaluators testing_criteria = [ { "type": "azure_ai_evaluator", - "name": "violence_detection", - "evaluator_name": "builtin.violence", - "data_mapping": { - "query": "{{item.query}}", - "response": "{{item.response}}", - }, + "name": "coherence", + "evaluator_name": "builtin.coherence", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": deployment}, }, { "type": "azure_ai_evaluator", - "name": "fluency_check", - "evaluator_name": "builtin.fluency", - "data_mapping": { - "query": "{{item.query}}", - "response": "{{item.response}}", - }, - }, - { - "type": "azure_ai_evaluator", - "name": "task_adherence", - "evaluator_name": "builtin.task_adherence", - "data_mapping": { - "query": "{{item.query}}", - "response": "{{item.response}}", - }, + "name": "relevance", + "evaluator_name": "builtin.relevance", + "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"}, + "initialization_parameters": {"deployment_name": deployment}, }, ] -``` -### Create Evaluation Object - -```python +# 5. 
Create and run evaluation eval_object = openai_client.evals.create( - name="Agent Quality Evaluation", + name="Quality Evaluation", data_source_config=data_source_config, testing_criteria=testing_criteria, ) -print(f"Created evaluation: {eval_object.id}") -``` -## Run Evaluation - -### Define Test Data - -```python -# Inline test data -data_source = { - "type": "azure_ai_target_completions", - "source": { - "type": "file_content", - "content": [ - {"item": {"query": "What is the capital of France?"}}, - {"item": {"query": "How do I reverse a string in Python?"}}, - {"item": {"query": "Explain machine learning in simple terms."}}, - ], - }, - "input_messages": { - "type": "template", - "template": [ - { - "type": "message", - "role": "user", - "content": {"type": "input_text", "text": "{{item.query}}"}, - } - ], - }, - "target": { - "type": "azure_ai_agent", - "name": agent.name, - "version": agent.version, - }, -} -``` - -### Execute Evaluation Run - -```python -eval_run = openai_client.evals.runs.create( +run = openai_client.evals.runs.create( eval_id=eval_object.id, - name=f"Evaluation Run for Agent {agent.name}", + name="Run 1", data_source=data_source, ) -print(f"Evaluation run created: {eval_run.id}") + +# 6. Poll for completion +import time +while run.status not in ["completed", "failed", "cancelled"]: + time.sleep(5) + run = openai_client.evals.runs.retrieve(eval_id=eval_object.id, run_id=run.id) + print(f"Status: {run.status}") + +# 7. Retrieve results +output_items = list(openai_client.evals.runs.output_items.list( + eval_id=eval_object.id, run_id=run.id +)) + +for item in output_items: + for result in item.results: + print(f"{result.name}: {result.score}") ``` ## Built-in Evaluators -| Evaluator | Description | Data Mapping | -|-----------|-------------|--------------| -| `builtin.violence` | Detects violent content | query, response | -| `builtin.fluency` | Measures response fluency | query, response | -| `builtin.task_adherence` | Checks if response follows instructions | query, response | -| `builtin.groundedness` | Checks factual grounding | query, response, context | -| `builtin.relevance` | Measures response relevance | query, response | -| `builtin.coherence` | Checks logical coherence | query, response | -| `builtin.similarity` | Compares to expected response | response, expected_response | +Use the `builtin.` prefix for all built-in evaluators: -## Full Evaluation Example +### Quality Evaluators -```python -import os -from azure.ai.projects import AIProjectClient -from azure.ai.projects.models import PromptAgentDefinition, DataSourceConfigCustom -from azure.identity import DefaultAzureCredential +| Evaluator | Data Mapping | Use Case | +|-----------|--------------|----------| +| `builtin.coherence` | query, response | Logical flow and consistency | +| `builtin.relevance` | query, response | Response addresses the query | +| `builtin.fluency` | query, response | Language quality and readability | +| `builtin.groundedness` | query, context, response | Factual alignment with context | -# Setup -project_client = AIProjectClient( - endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], - credential=DefaultAzureCredential(), -) -openai_client = project_client.get_openai_client() - -# Create agent -agent = project_client.agents.create_version( - agent_name="eval-test-agent", - definition=PromptAgentDefinition( - model=os.environ["AZURE_AI_MODEL_DEPLOYMENT_NAME"], - instructions="You are a helpful assistant that answers questions concisely.", +### Safety Evaluators + +| Evaluator | 
Data Mapping | Use Case | +|-----------|--------------|----------| +| `builtin.violence` | query, response | Violent content detection | +| `builtin.sexual` | query, response | Sexual content detection | +| `builtin.self_harm` | query, response | Self-harm content detection | +| `builtin.hate_unfairness` | query, response | Hate/bias detection | + +### Agent Evaluators + +| Evaluator | Data Mapping | Use Case | +|-----------|--------------|----------| +| `builtin.intent_resolution` | query, response | Did agent understand intent? | +| `builtin.response_completeness` | query, response | Did agent answer fully? | +| `builtin.task_adherence` | query, response | Did agent follow instructions? | +| `builtin.tool_call_accuracy` | query, response (JSON) | Were tool calls correct? | + +See [built-in-evaluators.md](built-in-evaluators.md) for complete evaluator reference. + +## Agent Evaluation + +For evaluating AI agents with tool calls, use `sample` mapping: + +```python +# Data with agent outputs +data_source = CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[ + SourceFileContentContent( + item={"query": "Weather in Seattle?"}, + sample={ + "output_text": "It's 55°F and cloudy in Seattle.", + "output_items": [ + { + "type": "tool_call", + "name": "get_weather", + "arguments": {"location": "Seattle"}, + "result": {"temp": "55", "condition": "cloudy"}, + } + ], + }, + ) + ], ), ) -# Configure evaluation data_source_config = DataSourceConfigCustom( type="custom", - item_schema={ - "type": "object", - "properties": {"query": {"type": "string"}}, - "required": ["query"], - }, - include_sample_schema=True, + item_schema={"type": "object", "properties": {"query": {"type": "string"}}}, + include_sample_schema=True, # Required for agent evaluations ) testing_criteria = [ { "type": "azure_ai_evaluator", - "name": "fluency", - "evaluator_name": "builtin.fluency", + "name": "intent_resolution", + "evaluator_name": "builtin.intent_resolution", "data_mapping": { "query": "{{item.query}}", - "response": "{{item.response}}", + "response": "{{sample.output_text}}", # Use sample for agent outputs }, + "initialization_parameters": {"deployment_name": deployment}, }, { "type": "azure_ai_evaluator", - "name": "relevance", - "evaluator_name": "builtin.relevance", + "name": "tool_call_accuracy", + "evaluator_name": "builtin.tool_call_accuracy", "data_mapping": { "query": "{{item.query}}", - "response": "{{item.response}}", + "response": "{{sample.output_items}}", # JSON with tool calls }, + "initialization_parameters": {"deployment_name": deployment}, }, ] +``` -# Create evaluation -eval_object = openai_client.evals.create( - name="Quality Check", - data_source_config=data_source_config, - testing_criteria=testing_criteria, -) +## OpenAI Graders -# Run evaluation -data_source = { - "type": "azure_ai_target_completions", - "source": { - "type": "file_content", - "content": [ - {"item": {"query": "What is 2+2?"}}, - {"item": {"query": "Who wrote Romeo and Juliet?"}}, - ], +For simpler evaluation patterns, use OpenAI graders: + +```python +testing_criteria = [ + # Label grader (classification) + { + "type": "label_model", + "name": "sentiment", + "model": deployment, + "input": [{"role": "user", "content": "Classify sentiment: {{item.response}}"}], + "labels": ["positive", "negative", "neutral"], + "passing_labels": ["positive", "neutral"], }, - "input_messages": { - "type": "template", - "template": [ - { - "type": "message", - "role": "user", - "content": 
{"type": "input_text", "text": "{{item.query}}"}, - } - ], + # String check grader + { + "type": "string_check", + "name": "has_disclaimer", + "input": "{{item.response}}", + "operation": "contains", + "reference": "Please consult", }, - "target": { - "type": "azure_ai_agent", - "name": agent.name, - "version": agent.version, + # Text similarity grader + { + "type": "text_similarity", + "name": "matches_expected", + "input": "{{item.response}}", + "reference": "{{item.expected}}", + "evaluation_metric": "fuzzy_match", + "pass_threshold": 0.8, }, -} - -eval_run = openai_client.evals.runs.create( - eval_id=eval_object.id, - name="Test Run", - data_source=data_source, -) - -print(f"Evaluation run created: {eval_run.id}") -print(f"Status: {eval_run.status}") +] ``` ## Custom Evaluators +Create custom evaluators for domain-specific needs. + +### Code-Based Evaluator + ```python -# Define custom evaluator with specific criteria -custom_evaluator = { - "type": "azure_ai_evaluator", - "name": "custom_length_check", - "evaluator_name": "builtin.fluency", # Base on existing evaluator - "data_mapping": { - "query": "{{item.query}}", - "response": "{{item.response}}", - }, - "threshold": 0.8, # Pass threshold -} +from azure.ai.projects.models import ( + EvaluatorVersion, EvaluatorCategory, EvaluatorType, + CodeBasedEvaluatorDefinition, EvaluatorMetric, EvaluatorMetricType, +) -testing_criteria = [custom_evaluator] +evaluator = project_client.evaluators.create_version( + name="word_count", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Word Count", + definition=CodeBasedEvaluatorDefinition( + code_text=''' +def grade(sample, item) -> dict: + return {"word_count": len(item.get("response", "").split())} +''', + data_schema={ + "type": "object", + "properties": {"response": {"type": "string"}}, + "required": ["response"], + }, + metrics={ + "word_count": EvaluatorMetric(type=EvaluatorMetricType.ORDINAL), + }, + ), + ), +) ``` -## Evaluation with Ground Truth +### Prompt-Based Evaluator ```python -data_source_config = DataSourceConfigCustom( - type="custom", - item_schema={ - "type": "object", - "properties": { - "query": {"type": "string"}, - "expected_response": {"type": "string"}, - }, - "required": ["query", "expected_response"], - }, - include_sample_schema=True, +from azure.ai.projects.models import PromptBasedEvaluatorDefinition + +evaluator = project_client.evaluators.create_version( + name="helpfulness", + evaluator_version=EvaluatorVersion( + evaluator_type=EvaluatorType.CUSTOM, + categories=[EvaluatorCategory.QUALITY], + display_name="Helpfulness", + definition=PromptBasedEvaluatorDefinition( + prompt_text=''' +Rate the helpfulness of the response (1-5): +Query: {query} +Response: {response} +Return JSON: {"score": <1-5>, "reason": ""} +''', + init_parameters={ + "type": "object", + "properties": {"deployment_name": {"type": "string"}}, + "required": ["deployment_name"], + }, + data_schema={ + "type": "object", + "properties": {"query": {"type": "string"}, "response": {"type": "string"}}, + "required": ["query", "response"], + }, + metrics={"score": EvaluatorMetric(type=EvaluatorMetricType.ORDINAL)}, + ), + ), ) +``` -testing_criteria = [ - { - "type": "azure_ai_evaluator", - "name": "similarity", - "evaluator_name": "builtin.similarity", - "data_mapping": { - "response": "{{item.response}}", - "expected_response": "{{item.expected_response}}", - }, - }, -] +See [custom-evaluators.md](custom-evaluators.md) 
for complete custom evaluator reference. -# Test data with ground truth -data_source = { - "type": "azure_ai_target_completions", - "source": { - "type": "file_content", - "content": [ - { - "item": { - "query": "What is the capital of France?", - "expected_response": "Paris", - } - }, - ], - }, - # ... -} +## Discover Available Evaluators + +```python +# List built-in evaluators +evaluators = project_client.evaluators.list_latest_versions(type="builtin") +for e in evaluators: + print(f"builtin.{e.name}: {e.description}") + +# List custom evaluators +custom = project_client.evaluators.list_latest_versions(type="custom") +for e in custom: + print(f"{e.name}: {e.description}") ``` + +## Data Mapping Reference + +| Pattern | Source | Use Case | +|---------|--------|----------| +| `{{item.field}}` | Your JSONL data | Standard evaluation data | +| `{{sample.output_text}}` | Agent response (text) | Agent text outputs | +| `{{sample.output_items}}` | Agent response (JSON) | Tool calls, structured data | + +## CLI Tool + +A batch evaluation script is available at `scripts/run_batch_evaluation.py`: + +```bash +python run_batch_evaluation.py --data test_data.jsonl --evaluators coherence relevance +python run_batch_evaluation.py --data test_data.jsonl --safety +python run_batch_evaluation.py --data test_data.jsonl --agent --evaluators intent_resolution +``` + +## Related Reference Files + +- [built-in-evaluators.md](built-in-evaluators.md): Complete built-in evaluator reference +- [custom-evaluators.md](custom-evaluators.md): Code and prompt-based evaluator patterns + +## Related Documentation + +- [Azure AI Projects Evaluation Samples](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-projects/samples/evaluations) +- [Cloud Evaluation Documentation](https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/cloud-evaluation) diff --git a/.github/skills/azure-ai-projects-py/scripts/run_batch_evaluation.py b/.github/skills/azure-ai-projects-py/scripts/run_batch_evaluation.py new file mode 100644 index 0000000..98bc430 --- /dev/null +++ b/.github/skills/azure-ai-projects-py/scripts/run_batch_evaluation.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +""" +Batch Evaluation CLI Tool + +Run batch evaluations on test datasets using Azure AI Projects SDK. +Supports quality, safety, agent evaluators, and OpenAI graders. 
+ +Usage: + python run_batch_evaluation.py --data test_data.jsonl --evaluators coherence relevance + python run_batch_evaluation.py --data test_data.jsonl --evaluators coherence --output results.json + python run_batch_evaluation.py --data test_data.jsonl --safety + python run_batch_evaluation.py --data test_data.jsonl --agent --evaluators intent_resolution task_adherence + +Environment Variables: + AZURE_AI_PROJECT_ENDPOINT - Azure AI project endpoint (required) + AZURE_AI_MODEL_DEPLOYMENT_NAME - Model deployment name (default: gpt-4o-mini) +""" + +import argparse +import json +import os +import sys +import time +from pathlib import Path +from typing import Any + +from azure.ai.projects import AIProjectClient +from azure.identity import DefaultAzureCredential +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent, +) +from openai.types.eval_create_params import DataSourceConfigCustom + + +# Built-in evaluators by category +QUALITY_EVALUATORS = [ + "coherence", + "relevance", + "fluency", + "groundedness", +] +SAFETY_EVALUATORS = [ + "violence", + "sexual", + "self_harm", + "hate_unfairness", +] +AGENT_EVALUATORS = [ + "intent_resolution", + "response_completeness", + "task_adherence", + "tool_call_accuracy", +] +NLP_EVALUATORS = ["f1", "rouge", "bleu", "gleu", "meteor"] + + +def load_jsonl(path: str) -> list[dict]: + """Load JSONL file into list of dicts.""" + data = [] + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + data.append(json.loads(line)) + return data + + +def build_data_source( + data: list[dict], + is_agent: bool = False, +) -> CreateEvalJSONLRunDataSourceParam: + """Build data source from loaded data.""" + content = [] + for item in data: + if is_agent: + # Agent data: extract sample fields from item + sample = { + "output_text": item.pop("output_text", item.get("response", "")), + } + if "output_items" in item: + sample["output_items"] = item.pop("output_items") + content.append(SourceFileContentContent(item=item, sample=sample)) + else: + content.append(SourceFileContentContent(item=item, sample={})) + + return CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent(type="file_content", content=content), + ) + + +def build_data_source_config( + data: list[dict], + is_agent: bool = False, +) -> DataSourceConfigCustom: + """Build data source config based on data schema.""" + # Infer schema from first item + if not data: + raise ValueError("Data is empty") + + first_item = data[0] + properties = {} + required = [] + + for key in first_item: + if key not in ["output_text", "output_items"]: # Agent fields go in sample + properties[key] = {"type": "string"} + required.append(key) + + return DataSourceConfigCustom( + type="custom", + item_schema={ + "type": "object", + "properties": properties, + "required": required, + }, + include_sample_schema=is_agent, + ) + + +def build_testing_criteria( + evaluator_names: list[str], + deployment_name: str, + is_agent: bool = False, +) -> list[dict]: + """Build testing criteria for the specified evaluators.""" + criteria = [] + + for name in evaluator_names: + # Determine data mapping based on evaluator type + if name in QUALITY_EVALUATORS: + if name == "groundedness": + data_mapping = { + "query": "{{item.query}}", + "context": "{{item.context}}", + "response": "{{item.response}}", + } + else: + data_mapping = { + "query": "{{item.query}}", + "response": 
"{{item.response}}", + } + needs_model = True + + elif name in SAFETY_EVALUATORS: + data_mapping = { + "query": "{{item.query}}", + "response": "{{item.response}}", + } + needs_model = False # Safety evaluators may not need deployment + + elif name in AGENT_EVALUATORS: + if is_agent: + if name == "tool_call_accuracy": + data_mapping = { + "query": "{{item.query}}", + "response": "{{sample.output_items}}", + } + else: + data_mapping = { + "query": "{{item.query}}", + "response": "{{sample.output_text}}", + } + else: + data_mapping = { + "query": "{{item.query}}", + "response": "{{item.response}}", + } + needs_model = True + + elif name in NLP_EVALUATORS: + data_mapping = { + "response": "{{item.response}}", + "ground_truth": "{{item.ground_truth}}", + } + needs_model = False + + else: + print(f"Warning: Unknown evaluator '{name}', skipping") + continue + + criterion = { + "type": "azure_ai_evaluator", + "name": name, + "evaluator_name": f"builtin.{name}", + "data_mapping": data_mapping, + } + + if needs_model: + criterion["initialization_parameters"] = {"deployment_name": deployment_name} + + criteria.append(criterion) + + return criteria + + +def run_evaluation( + endpoint: str, + data_path: str, + evaluator_names: list[str], + deployment_name: str, + is_agent: bool = False, +) -> dict[str, Any]: + """Run batch evaluation using Azure AI Projects SDK.""" + # Load data + data = load_jsonl(data_path) + print(f"Loaded {len(data)} items from {data_path}") + + # Build data source and config + data_source = build_data_source(data, is_agent=is_agent) + data_source_config = build_data_source_config(data, is_agent=is_agent) + + # Build testing criteria + testing_criteria = build_testing_criteria( + evaluator_names, + deployment_name, + is_agent=is_agent, + ) + + if not testing_criteria: + raise ValueError("No valid testing criteria configured") + + print(f"Configured {len(testing_criteria)} evaluators") + + # Create client and run evaluation + with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, + ): + openai_client = project_client.get_openai_client() + + # Create evaluation definition + eval_object = openai_client.evals.create( + name=f"Batch Evaluation - {Path(data_path).stem}", + data_source_config=data_source_config, + testing_criteria=testing_criteria, + ) + print(f"Created evaluation: {eval_object.id}") + + # Create and run evaluation + run = openai_client.evals.runs.create( + eval_id=eval_object.id, + name="CLI Run", + data_source=data_source, + ) + print(f"Started run: {run.id}") + + # Poll for completion + while run.status not in ["completed", "failed", "cancelled"]: + print(f"Status: {run.status}...") + time.sleep(5) + run = openai_client.evals.runs.retrieve( + eval_id=eval_object.id, + run_id=run.id, + ) + + if run.status != "completed": + raise RuntimeError(f"Evaluation run {run.status}: {getattr(run, 'error', 'Unknown error')}") + + print(f"Run completed: {run.status}") + + # Retrieve results + output_items = list( + openai_client.evals.runs.output_items.list( + eval_id=eval_object.id, + run_id=run.id, + ) + ) + + # Aggregate metrics + metrics: dict[str, list[float]] = {} + rows = [] + + for output_item in output_items: + row_results = {} + for result in output_item.results: + if result.score is not None: + if result.name not in metrics: + metrics[result.name] = [] + metrics[result.name].append(result.score) + row_results[result.name] = result.score + rows.append(row_results) + + # Calculate averages + 
avg_metrics = {} + for name, scores in metrics.items(): + avg_metrics[name] = sum(scores) / len(scores) if scores else 0.0 + + return { + "eval_id": eval_object.id, + "run_id": run.id, + "status": run.status, + "metrics": avg_metrics, + "rows": rows, + "total_items": len(output_items), + } + + +def main(): + parser = argparse.ArgumentParser( + description="Run batch evaluation on test datasets using Azure AI Projects SDK", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + parser.add_argument("--data", "-d", required=True, help="Path to JSONL data file") + parser.add_argument( + "--evaluators", + "-e", + nargs="+", + default=["coherence", "relevance"], + help=f"Evaluators to run. Quality: {QUALITY_EVALUATORS}, " + f"Safety: {SAFETY_EVALUATORS}, Agent: {AGENT_EVALUATORS}, NLP: {NLP_EVALUATORS}", + ) + parser.add_argument( + "--safety", + action="store_true", + help="Include all safety evaluators", + ) + parser.add_argument( + "--agent", + action="store_true", + help="Include all agent evaluators (uses sample.output_text for response)", + ) + parser.add_argument( + "--output", + "-o", + help="Output file for results (JSON)", + ) + parser.add_argument( + "--deployment", + default=None, + help="Model deployment name (overrides AZURE_AI_MODEL_DEPLOYMENT_NAME)", + ) + + args = parser.parse_args() + + # Validate environment + endpoint = os.environ.get("AZURE_AI_PROJECT_ENDPOINT") + if not endpoint: + print("Error: AZURE_AI_PROJECT_ENDPOINT environment variable required") + sys.exit(1) + + deployment = args.deployment or os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o-mini") + + # Validate data file + data_path = Path(args.data) + if not data_path.exists(): + print(f"Error: Data file not found: {args.data}") + sys.exit(1) + + # Build evaluator list + evaluator_names = list(args.evaluators) + if args.safety: + evaluator_names.extend(SAFETY_EVALUATORS) + if args.agent: + evaluator_names.extend(AGENT_EVALUATORS) + + # Remove duplicates while preserving order + seen = set() + unique_evaluators = [] + for e in evaluator_names: + if e not in seen: + seen.add(e) + unique_evaluators.append(e) + evaluator_names = unique_evaluators + + print(f"Running evaluation with: {evaluator_names}") + print(f"Data file: {args.data}") + print(f"Deployment: {deployment}") + print(f"Agent mode: {args.agent}") + + # Run evaluation + try: + result = run_evaluation( + endpoint=endpoint, + data_path=str(data_path), + evaluator_names=evaluator_names, + deployment_name=deployment, + is_agent=args.agent, + ) + except Exception as e: + print(f"Error during evaluation: {e}") + sys.exit(1) + + # Output results + print("\n=== Evaluation Results ===") + print(f"Eval ID: {result['eval_id']}") + print(f"Run ID: {result['run_id']}") + print(f"Status: {result['status']}") + print(f"Total Items: {result['total_items']}") + print("\nMetrics:") + for metric, value in sorted(result["metrics"].items()): + print(f" {metric}: {value:.4f}") + + # Save to file if requested + if args.output: + output_path = Path(args.output) + with open(output_path, "w", encoding="utf-8") as f: + json.dump(result, f, indent=2, default=str) + print(f"\nResults saved to: {args.output}") + + print("\nEvaluation complete!") + + +if __name__ == "__main__": + main()