diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json new file mode 100644 index 00000000..7ae3138b --- /dev/null +++ b/.claude-plugin/marketplace.json @@ -0,0 +1,21 @@ +{ + "name": "observability-stack", + "owner": { + "name": "OpenSearch Project", + "email": "anirudha@nyu.edu" + }, + "metadata": { + "description": "Observability plugins for the OpenSearch stack" + }, + "plugins": [ + { + "name": "observability", + "source": "./claude-code-observability-plugin", + "description": "Query and investigate traces, logs, and metrics from an OpenSearch-based observability stack using PPL and PromQL", + "version": "1.0.0", + "author": { + "name": "OpenSearch Project" + } + } + ] +} diff --git a/.github/workflows/claude-code-plugin-release.yml b/.github/workflows/claude-code-plugin-release.yml new file mode 100644 index 00000000..7bd9cd83 --- /dev/null +++ b/.github/workflows/claude-code-plugin-release.yml @@ -0,0 +1,47 @@ +name: Claude Code Plugin Release + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + build-plugin-zips: + name: Build Plugin ZIPs + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + + - name: Build skill ZIP files + run: | + PLUGIN_DIR=claude-code-observability-plugin + DIST_DIR=$PLUGIN_DIR/dist + mkdir -p "$DIST_DIR" + + for skill_dir in "$PLUGIN_DIR"/skills/*/; do + skill_name=$(basename "$skill_dir") + if [ -f "$skill_dir/SKILL.md" ]; then + zip -j "$DIST_DIR/${skill_name}.zip" "$skill_dir/SKILL.md" + echo "Built $DIST_DIR/${skill_name}.zip" + fi + done + + ls -la "$DIST_DIR" + + - name: Upload ZIPs as artifacts + uses: actions/upload-artifact@v4 + with: + name: claude-code-plugin-skills + path: claude-code-observability-plugin/dist/*.zip + + - name: Attach ZIPs to release + if: github.event_name == 'release' + env: + GH_TOKEN: ${{ github.token }} + run: | + for zip in claude-code-observability-plugin/dist/*.zip; do + gh release upload "${{ 
github.event.release.tag_name }}" "$zip" --clobber + echo "Uploaded $(basename $zip) to release ${{ github.event.release.tag_name }}" + done diff --git a/claude-code-observability-plugin/.claude-plugin/plugin.json b/claude-code-observability-plugin/.claude-plugin/plugin.json new file mode 100644 index 00000000..580a7f71 --- /dev/null +++ b/claude-code-observability-plugin/.claude-plugin/plugin.json @@ -0,0 +1,13 @@ +{ + "name": "observability", + "version": "1.0.0", + "description": "Query and investigate traces, logs, and metrics from an OpenSearch-based observability stack using PPL and PromQL", + "author": { + "name": "OpenSearch Project", + "url": "https://github.com/opensearch-project/observability-stack" + }, + "homepage": "https://observability.opensearch.org/docs/claude-code/", + "repository": "https://github.com/opensearch-project/observability-stack", + "license": "Apache-2.0", + "keywords": ["observability", "opensearch", "traces", "logs", "metrics", "ppl", "promql", "opentelemetry"] +} diff --git a/claude-code-observability-plugin/CLAUDE.md b/claude-code-observability-plugin/CLAUDE.md new file mode 100644 index 00000000..aa60d665 --- /dev/null +++ b/claude-code-observability-plugin/CLAUDE.md @@ -0,0 +1,103 @@ +# OpenSearch Observability Plugin for Claude Code + +This plugin teaches Claude Code how to query and investigate traces, logs, and metrics from an OpenSearch-based observability stack. It provides nine skill files containing PPL (Piped Processing Language) query templates for OpenSearch, PromQL query templates for Prometheus, and curl-based commands — all ready to execute against a running stack. 
+ +## Skill Routing Table + +Load the appropriate skill file based on the user's intent: + +| Skill | When to Use | +|---|---| +| `skills/traces/SKILL.md` | Use when investigating agent invocations, tool executions, slow spans, error spans, token usage, or trace correlation | +| `skills/logs/SKILL.md` | Use when searching logs by severity, correlating logs with traces, identifying error patterns, or analyzing log volume | +| `skills/metrics/SKILL.md` | Use when querying HTTP request rates, latency percentiles, error rates, active connections, or GenAI metrics | +| `skills/stack-health/SKILL.md` | Use when checking stack component health, troubleshooting data flow issues, or verifying service status | +| `skills/ppl-reference/SKILL.md` | Use when constructing novel PPL queries, looking up PPL syntax, or understanding PPL functions | +| `skills/correlation/SKILL.md` | Use when performing cross-signal correlation between traces, logs, and metrics | +| `skills/apm-red/SKILL.md` | Use when analyzing RED metrics (Rate, Errors, Duration) for service-level monitoring | +| `skills/slo-sli/SKILL.md` | Use when defining SLOs/SLIs, calculating error budgets, or setting up burn rate alerts | +| `skills/osd-config/SKILL.md` | Use when discovering index patterns, workspaces, saved objects, APM configs, or field mappings from OpenSearch Dashboards or OpenSearch APIs | + +## Configuration + +### Environment Variables + +Set these environment variables to override default endpoints: + +- `$OPENSEARCH_ENDPOINT` — OpenSearch base URL (default: `https://localhost:9200`) +- `$PROMETHEUS_ENDPOINT` — Prometheus base URL (default: `http://localhost:9090`) + +### Connection Profiles + +#### Local Stack (Default) + +| Service | Endpoint | Auth | +|---|---|---| +| OpenSearch | `https://localhost:9200` | `-u admin:'My_password_123!@#' -k` (HTTPS + basic auth, skip TLS verify) | +| Prometheus | `http://localhost:9090` | None (HTTP, no auth) | + +Example OpenSearch curl: + +```bash +curl -sk -u 
admin:'My_password_123!@#' \ + -X POST https://localhost:9200/_plugins/_ppl \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | head 10"}' +``` + +Example Prometheus curl: + +```bash +curl -s 'http://localhost:9090/api/v1/query' \ + --data-urlencode 'query=up' +``` + +#### AWS Managed Services + +##### Amazon OpenSearch Service + +- Endpoint format: `https://DOMAIN-ID.REGION.es.amazonaws.com` +- Auth: AWS Signature Version 4 + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:es" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + -X POST https://DOMAIN-ID.REGION.es.amazonaws.com/_plugins/_ppl \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | head 10"}' +``` + +##### Amazon Managed Service for Prometheus (AMP) + +- Endpoint format: `https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query` +- Auth: AWS Signature Version 4 + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:aps" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + 'https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query' \ + --data-urlencode 'query=up' +``` + +> **Note:** PPL and PromQL query syntax is identical across local and AWS managed profiles. Only the endpoint URL and authentication method differ. 
+ +## Port Reference + +| Component | Port | Protocol | +|---|---|---| +| OpenSearch | 9200 | HTTPS | +| OTel Collector (gRPC) | 4317 | gRPC | +| OTel Collector (HTTP) | 4318 | HTTP | +| Data Prepper | 21890 | HTTP | +| Prometheus | 9090 | HTTP | +| OpenSearch Dashboards | 5601 | HTTP | + +## Index Patterns + +| Signal | Index Pattern | Key Fields | +|---|---|---| +| Traces | `otel-v1-apm-span-*` | `traceId`, `spanId`, `serviceName`, `name`, `durationInNanos`, `status.code`, `attributes.gen_ai.*` | +| Logs | `logs-otel-v1-*` | `traceId`, `spanId`, `severityText`, `body`, `resource.attributes.service.name`, `@timestamp` | +| Service Maps | `otel-v2-apm-service-map-*` | `sourceNode`, `targetNode`, `sourceOperation`, `targetOperation` | + +> **Note:** The log index uses `resource.attributes.service.name` (backtick-quoted in PPL) instead of `serviceName`. The trace span index has a top-level `serviceName` field. diff --git a/claude-code-observability-plugin/docs/INSTALL.md b/claude-code-observability-plugin/docs/INSTALL.md new file mode 100644 index 00000000..5a77e3d1 --- /dev/null +++ b/claude-code-observability-plugin/docs/INSTALL.md @@ -0,0 +1,151 @@ +# Installation Guide + +## Prerequisites + +1. **Claude Code CLI** — Install from [claude.ai/claude-code](https://claude.ai/claude-code) +2. 
**Running Observability Stack** — The plugin queries a local OpenSearch + Prometheus stack + +### Start the Observability Stack + +```bash +git clone https://github.com/opensearch-project/observability-stack.git +cd observability-stack +docker compose up -d +``` + +Verify services are running: + +```bash +# OpenSearch (should return cluster health JSON) +curl -sk -u 'admin:My_password_123!@#' https://localhost:9200/_cluster/health?pretty + +# Prometheus (should return "Prometheus Server is Healthy.") +curl -s http://localhost:9090/-/healthy +``` + +## Install the Plugin + +From the `observability-stack` repository root: + +```bash +claude install-plugin ./claude-code-observability-plugin +``` + +Or install directly from GitHub: + +```bash +claude install-plugin https://github.com/opensearch-project/observability-stack/tree/main/claude-code-observability-plugin +``` + +## Verify Installation + +Start Claude Code and try a query: + +``` +claude +> Show me the top 10 services by trace span count +``` + +Claude should execute a PPL query against OpenSearch and return results. You can also try: + +``` +> Check the health of the observability stack +> Show me error logs from the last hour +> What is the p95 latency for all services? +``` + +## Configuration + +### Default Endpoints + +| Service | Endpoint | Auth | +|---|---|---| +| OpenSearch | `https://localhost:9200` | `admin` / `My_password_123!@#` (HTTPS, `-k` flag) | +| Prometheus | `http://localhost:9090` | None | + +### Custom Endpoints + +Override defaults with environment variables: + +```bash +export OPENSEARCH_ENDPOINT=https://my-opensearch:9200 +export PROMETHEUS_ENDPOINT=http://my-prometheus:9090 +``` + +### AWS Managed Services + +The plugin supports Amazon OpenSearch Service and Amazon Managed Service for Prometheus. Queries use AWS SigV4 authentication instead of basic auth. See the skill files for AWS-specific curl examples. 
+ +## Available Skills + +| Skill | Description | +|---|---| +| `traces` | Query trace spans — agent invocations, tool executions, latency, errors | +| `logs` | Search and analyze logs — severity filtering, body search, error patterns | +| `metrics` | Query Prometheus metrics — HTTP rates, latency percentiles, GenAI tokens | +| `stack-health` | Check component health, verify data ingestion, troubleshoot issues | +| `ppl-reference` | Comprehensive PPL syntax reference with observability examples | +| `correlation` | Cross-signal correlation between traces, logs, and metrics | +| `apm-red` | RED metrics (Rate, Errors, Duration) for service monitoring | +| `slo-sli` | SLO/SLI definitions, error budgets, and burn rate alerting | +| `osd-config` | Discover index patterns, workspaces, saved objects, APM configs, and field mappings | + +## Running Tests + +```bash +cd claude-code-observability-plugin/tests +pip install -r requirements.txt + +# All tests (requires running stack) +pytest -v + +# Property tests only (no stack needed) +pytest test_properties.py -v + +# Filter by skill +pytest -m traces +pytest -m logs +pytest -m metrics +``` + +## Troubleshooting + +### "Observability stack is not running" + +Tests and skills require OpenSearch and Prometheus to be running locally. Start them with: + +```bash +docker compose up -d opensearch prometheus +``` + +### OpenSearch returns "Unauthorized" + +Check the password in `.env` matches what you're using. Default: `My_password_123!@#` + +### No trace/log data + +The observability stack includes example services (canary, weather-agent, travel-planner) that generate telemetry data automatically. Ensure they're running: + +```bash +docker compose ps | grep -E "canary|weather|travel" +``` + +If not running, check that `INCLUDE_COMPOSE_EXAMPLES=docker-compose.examples.yml` is set in `.env`. + +### Prometheus OOM / crash-looping + +If Prometheus is crash-looping (exit code 137), its WAL may be corrupted. 
Clear the volume and restart: + +```bash +docker compose stop prometheus +docker compose rm -f prometheus +docker volume rm observability-stack_prometheus-data +docker compose up -d prometheus +``` + +## Index Reference + +| Signal | Index Pattern | Key Fields | +|---|---|---| +| Traces | `otel-v1-apm-span-*` | `traceId`, `spanId`, `serviceName`, `name`, `durationInNanos`, `status.code` | +| Logs | `logs-otel-v1-*` | `traceId`, `spanId`, `severityText`, `body`, `resource.attributes.service.name` | +| Service Maps | `otel-v2-apm-service-map-*` | `sourceNode`, `targetNode`, `sourceOperation`, `targetOperation` | diff --git a/claude-code-observability-plugin/docs/USAGE.md b/claude-code-observability-plugin/docs/USAGE.md new file mode 100644 index 00000000..08552f3b --- /dev/null +++ b/claude-code-observability-plugin/docs/USAGE.md @@ -0,0 +1,326 @@ +# Feature Guide & Sample Questions + +This guide shows how to use the Claude Code Observability Plugin through natural language questions. Each section demonstrates a skill with real example questions and what Claude Code does behind the scenes. + +## Traces — Investigate Agent & Service Behavior + +The traces skill lets you query distributed trace data to understand how requests flow through your services and AI agents. + +### Sample Questions + +**Service overview:** +``` +> Which services have the most trace spans? +> Show me the top 10 services by span count +> How many distinct operations does each service have? +``` + +**GenAI agent analysis:** +``` +> How many times was each AI agent invoked? +> What is the average response time for the Travel Planner agent? +> Show me token usage by model — which model consumes the most tokens? +> Compare input vs output token counts across all LLM models +> Find the slowest agent invocations in the last hour +``` + +**Error investigation:** +``` +> Show me all error spans from the checkout service +> Which services have the most errors? 
+> Find failed tool executions — what tools are failing? +> Show me the trace tree for a specific traceId +``` + +**Latency analysis:** +``` +> Find all spans taking longer than 5 seconds +> What is the p95 duration for each service? +> Show me the slowest operations in the frontend service +``` + +### What Claude Does + +When you ask "Show me token usage by model", Claude runs: + +``` +source=otel-v1-apm-span-* +| where `attributes.gen_ai.usage.input_tokens` > 0 +| stats sum(`attributes.gen_ai.usage.input_tokens`) as total_input, + sum(`attributes.gen_ai.usage.output_tokens`) as total_output + by `attributes.gen_ai.request.model` +``` + +Example output: + +| Model | Input Tokens | Output Tokens | +|---|---|---| +| astronomy-llm | 2,599,194 | 453,805 | +| claude-sonnet-4.5 | 388,299 | 102,521 | +| claude-haiku | 371,422 | 93,288 | +| gpt-4.1-mini | 361,127 | 93,199 | + +--- + +## Logs — Search & Analyze Log Data + +The logs skill lets you search, filter, and analyze log entries across all services. + +### Sample Questions + +**Severity filtering:** +``` +> Show me all ERROR logs +> How many errors does each service have? +> Show me WARN and ERROR logs from the last hour +> What are the most common error messages? +``` + +**Full-text search:** +``` +> Find all logs mentioning "timeout" +> Search for logs containing "connection refused" +> Find logs about rate limiting +``` + +**Error analysis:** +``` +> Which service has the most error logs? +> Show me the error log breakdown by service and severity +> Find error patterns — group errors by message +``` + +**Log volume:** +``` +> Show me log volume over time in hourly buckets +> How many logs are generated per service? 
+> Show the error rate trend over the last 24 hours +``` + +### What Claude Does + +When you ask "Which service has the most error logs?", Claude runs: + +``` +source=logs-otel-v1-* +| where severityText = 'ERROR' +| stats count() as errors by `resource.attributes.service.name` +| sort - errors +``` + +Example output: + +| Service | Error Count | +|---|---| +| weather-agent | 663 | +| load-generator | 30 | +| kafka | 8 | +| product-reviews | 7 | + +--- + +## Metrics — Query Prometheus with PromQL + +The metrics skill lets you query HTTP rates, latency percentiles, and GenAI-specific metrics from Prometheus. + +### Sample Questions + +**HTTP performance:** +``` +> What is the current request rate for each service? +> Show me p95 and p99 latency for all services +> What is the 5xx error rate by service? +> How many active connections does each service have? +``` + +**GenAI metrics:** +``` +> Show me GenAI token usage rate by model +> What is the average operation duration for GenAI calls? +> Compare token consumption across different agent types +``` + +**Capacity planning:** +``` +> Show me the request rate trend over the last hour +> Which services have the highest error rates? +> What is the overall system throughput? +``` + +### What Claude Does + +When you ask "What is the p95 latency for all services?", Claude runs: + +``` +histogram_quantile(0.95, + sum(rate(http_server_duration_seconds_bucket[5m])) by (le, service_name) +) +``` + +--- + +## Stack Health — Verify & Troubleshoot + +The stack-health skill helps you check component health, verify data ingestion, and troubleshoot common issues. + +### Sample Questions + +``` +> Is the observability stack healthy? +> Check if OpenSearch is running +> How many trace spans and logs are in the system? +> List all OpenSearch indices +> Are there any services not sending data? 
+> Check the Prometheus scrape targets +> Show me the OTel Collector configuration +``` + +### What Claude Does + +Claude checks multiple endpoints: OpenSearch cluster health, Prometheus health, OTel Collector metrics, and verifies data exists in the expected indices. + +--- + +## Correlation — Cross-Signal Investigation + +The correlation skill connects traces, logs, and metrics to give you a complete picture of an incident. + +### Sample Questions + +**Trace-to-log:** +``` +> Find all logs for trace ID abc123def456 +> Show me error logs that have trace context +> Correlate this trace with its associated logs +``` + +**Log-to-trace:** +``` +> I see an error log — find the full trace for it +> Which traces are associated with "connection refused" errors? +> Find the span that produced this error log +``` + +**Cross-signal:** +``` +> Compare span counts vs log counts for each service +> Find services with high error rates in both traces and logs +> Show me exemplars — which Prometheus metrics have trace context? +``` + +### Real-World Workflow + +**"I see high error rates — what's happening?"** + +1. Claude checks Prometheus: `rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])` +2. Finds `weather-agent` has elevated errors +3. Queries error logs: `source=logs-otel-v1-* | where severityText = 'ERROR' AND resource.attributes.service.name = 'weather-agent'` +4. Extracts traceId from error logs +5. Reconstructs the full trace: `source=otel-v1-apm-span-* | where traceId = '' | sort startTime` +6. Shows you the complete timeline from metric spike to root cause + +--- + +## APM RED — Rate, Errors, Duration + +The APM RED skill provides service-level monitoring using the RED methodology. + +### Sample Questions + +``` +> Show me RED metrics for all services +> What is the request rate, error rate, and p95 latency for the checkout service? +> Which service has the highest error rate? 
+> Compare RED metrics between frontend and backend services +> Show me GenAI-specific RED metrics — rate, errors, and duration for agent invocations +``` + +### What Claude Does + +Claude runs three PromQL queries (Rate, Errors, Duration) and optionally enriches with PPL span data: + +- **Rate:** `sum(rate(http_server_duration_seconds_count[5m])) by (service_name)` +- **Errors:** `sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])) by (service_name)` +- **Duration:** `histogram_quantile(0.95, sum(rate(http_server_duration_seconds_bucket[5m])) by (le, service_name))` + +--- + +## SLO/SLI — Service Reliability + +The SLO/SLI skill helps you define, measure, and alert on service level objectives. + +### Sample Questions + +``` +> What is the current availability SLI for all services? +> What percentage of requests complete within 500ms? +> How much error budget do we have remaining for a 99.9% SLO? +> What is the current burn rate? Are we consuming error budget too fast? +> Help me set up SLO recording rules for Prometheus +> Calculate a multi-window burn rate alert for our checkout service +``` + +### What Claude Does + +When you ask "How much error budget do we have remaining?", Claude calculates: + +``` +1 - ((1 - availability_sli) / (1 - 0.999)) +``` + +Where `availability_sli` = ratio of non-5xx requests to total requests. + +--- + +## PPL Reference — Build Custom Queries + +The PPL reference skill is Claude's built-in guide for constructing novel PPL queries. Use it when you need queries beyond the standard templates. + +### Sample Questions + +``` +> How do I write a PPL query to join traces with logs? +> Show me the PPL syntax for regex field extraction +> How do I use the timechart command to visualize error trends? +> What PPL commands can I use for log pattern discovery? 
+> Help me write a PPL query to find the top 10 slowest operations per service +``` + +--- + +## Power User Tips + +### Combining Skills + +Ask questions that span multiple skills — Claude automatically routes to the right ones: + +``` +> The checkout service is slow. Show me its p95 latency, recent error logs, and the slowest traces. +> Compare the error rate in Prometheus with actual error spans in OpenSearch +> An agent is failing — show me the traces, associated logs, and token usage +``` + +### Iterative Investigation + +Claude remembers context within a conversation, so you can drill down: + +``` +> Show me services with error spans + (Claude shows: weather-agent has 150 errors) +> Show me the error spans from weather-agent + (Claude shows: most errors are "External API returned 503") +> Find the traces for those errors and show me the associated logs + (Claude correlates traces → logs) +> What was the error rate trend for weather-agent over the last 6 hours? + (Claude queries Prometheus for the time series) +``` + +### Custom Time Ranges + +Specify time ranges naturally: + +``` +> Show me error logs from the last 30 minutes +> What was the p99 latency yesterday between 2-4pm? +> Compare this week's error rate with last week +``` diff --git a/claude-code-observability-plugin/skills/.gitkeep b/claude-code-observability-plugin/skills/.gitkeep new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/claude-code-observability-plugin/skills/.gitkeep @@ -0,0 +1 @@ + diff --git a/claude-code-observability-plugin/skills/apm-red/SKILL.md b/claude-code-observability-plugin/skills/apm-red/SKILL.md new file mode 100644 index 00000000..489de3df --- /dev/null +++ b/claude-code-observability-plugin/skills/apm-red/SKILL.md @@ -0,0 +1,527 @@ +--- +name: apm-red +description: APM RED metrics (Rate, Errors, Duration) for service-level monitoring using PromQL and PPL queries. 
+allowed-tools: + - Bash + - curl +--- + +# APM RED Metrics + +## Overview + +This skill provides query templates for the RED methodology — the three golden signals for service-level monitoring: + +| Signal | What it measures | Key question | +|---|---|---| +| **Rate** | Requests per second | How much traffic is the service handling? | +| **Errors** | Failed requests as a ratio of total | What percentage of requests are failing? | +| **Duration** | Latency distribution (p50, p95, p99) | How long do requests take? | + +RED metrics give you a complete picture of service health at a glance. Every service should be monitored on all three signals. This skill covers both PromQL queries against Prometheus and PPL queries against OpenSearch trace spans as an alternative. + +All Prometheus queries use the HTTP API at `http://localhost:9090/api/v1/query`. All OpenSearch queries use the PPL API at `https://localhost:9200/_plugins/_ppl` with HTTPS and basic authentication. Credentials are read from the `.env` file (default: `admin` / `My_password_123!@#`). + +## Connection Defaults + +| Variable | Default | Description | +|---|---|---| +| `OPENSEARCH_ENDPOINT` | `https://localhost:9200` | OpenSearch base URL | +| `OPENSEARCH_USER` | `admin` | OpenSearch username | +| `OPENSEARCH_PASSWORD` | `My_password_123!@#` | OpenSearch password | +| `PROMETHEUS_ENDPOINT` | `http://localhost:9090` | Prometheus base URL | + + +## Metric Discovery + +Different OTel SDK versions and languages emit HTTP metrics under different names. 
Before querying, discover which metric names are active in your stack: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/label/__name__/values" | python3 -c " +import json, sys +for m in json.load(sys.stdin).get('data', []): + if any(k in m for k in ['http_server', 'gen_ai', 'db_client']): + print(m)" +``` + +**Common HTTP metric name variants:** + +| Metric Name | Unit | Emitted By | +|---|---|---| +| `http_server_duration_milliseconds` | milliseconds | Python OTel SDK (older semconv) | +| `http_server_duration_seconds` | seconds | .NET, Java OTel SDKs | +| `http_server_request_duration_seconds` | seconds | Stable HTTP semconv (newer SDKs) | + +> **Important:** Replace the metric name in the PromQL queries below with whichever variant is active in your stack. For millisecond-unit metrics, adjust latency thresholds accordingly (e.g., `le="250"` instead of `le="0.25"`). + +## Rate Queries + +### Per-Service Request Rate (PromQL) + +Calculate the per-second HTTP request rate over a 5-minute window, grouped by service: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count[5m])) by (service_name)' +``` + +### Per-Endpoint Request Rate (PromQL) + +Break down request rate by service and HTTP route to identify hot endpoints: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count[5m])) by (service_name, http_route)' +``` + +### Request Rate from Trace Spans (PPL) + +Calculate request rate from trace spans as an alternative to PromQL. 
This counts spans per 5-minute bucket grouped by service: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() as request_count by span(startTime, 5m), serviceName"}' +``` + + +## Error Queries + +### Error Rate Ratio (PromQL) + +Calculate the ratio of 5xx error responses to total requests by service. A value of 0.01 means 1% of requests are failing: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])) by (service_name) / sum(rate(http_server_duration_seconds_count[5m])) by (service_name)' +``` + +### Error Count (PromQL) + +Calculate the per-second rate of 5xx errors by service (useful for alerting on absolute error volume): + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])) by (service_name)' +``` + +### Error Count from Trace Spans (PPL) + +Count error spans (status code 2 = Error in OTel) grouped by service: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `status.code` = 2 | stats count() as error_count by serviceName"}' +``` + + +## Duration Queries + +### Latency Percentiles (PromQL) + +#### p50 (Median) Latency by Service + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.50, sum(rate(http_server_duration_seconds_bucket[5m])) by (le, service_name))' +``` + +#### p95 Latency by Service + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(http_server_duration_seconds_bucket[5m])) by (le, service_name))' 
+``` + +#### p99 Latency by Service + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.99, sum(rate(http_server_duration_seconds_bucket[5m])) by (le, service_name))' +``` + +### Latency Percentiles from Trace Spans (PPL) + +Calculate p50, p95, and p99 latency directly from trace span durations. Values are in nanoseconds — divide by 1,000,000 for milliseconds: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats percentile(durationInNanos, 50) as p50, percentile(durationInNanos, 95) as p95, percentile(durationInNanos, 99) as p99 by serviceName"}' +``` + + +## Combined RED Dashboard + +Run all three RED signals for every service in a single investigation. Execute these queries together to get a complete service health snapshot. + +### Rate — Requests per second by service: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count[5m])) by (service_name)' +``` + +### Errors — Error ratio by service: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])) by (service_name) / sum(rate(http_server_duration_seconds_count[5m])) by (service_name)' +``` + +### Duration — p95 latency by service: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(http_server_duration_seconds_bucket[5m])) by (le, service_name))' +``` + +### Combined RED via PPL (Trace Spans) + +Get all three RED signals from trace spans in a single PPL query: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats 
count() as total_requests, sum(case(`status.code` = 2, 1 else 0)) as error_count, percentile(durationInNanos, 50) as p50, percentile(durationInNanos, 95) as p95, percentile(durationInNanos, 99) as p99 by serviceName"}' +``` + + +## Data Prepper APM Metrics + +Data Prepper's APM service map processor generates its own RED metrics from trace spans and writes them to Prometheus. These are the metrics that power the OpenSearch Dashboards APM UI. Unlike OTel SDK histogram metrics (which use `rate()` on counters), Data Prepper APM metrics are **gauges** — instantaneous snapshot values that should be queried directly without `rate()`. + +### Data Prepper APM Metric Reference + +| Metric | Type | Description | +|---|---|---| +| `request` | gauge | Total request count per service/operation edge | +| `error` | gauge | Error count (server-side errors, status code 2) | +| `fault` | gauge | Fault count (client-side errors) | +| `latency_seconds_seconds_bucket` | histogram | Latency distribution with `le` buckets (note: double `_seconds` suffix from unit handling) | + +Common labels on all Data Prepper APM metrics: + +| Label | Description | +|---|---| +| `service` | Source service name | +| `operation` | Source operation (e.g., `GET /api/cart`) | +| `remoteService` | Destination service name | +| `remoteOperation` | Destination operation | +| `environment` | Deployment environment (e.g., `generic:default`) | +| `namespace` | Always `span_derived` for Data Prepper APM metrics | + +> **Important:** These metrics use `service` (not `service_name`) as the label for service names, unlike OTel SDK metrics which use `service_name`. + +### Request Count by Service (Data Prepper) + +Query total request count per service. 
This is a gauge — no `rate()` needed: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(request{namespace="span_derived"}) by (service)' +``` + +### Request Count by Service and Operation (Data Prepper) + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=request{namespace="span_derived", service="frontend"}' +``` + +### Error Count by Service (Data Prepper) + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(error{namespace="span_derived"}) by (service)' +``` + +### Fault Count by Service (Data Prepper) + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(fault{namespace="span_derived"}) by (service)' +``` + +### Error Rate by Service (Data Prepper) + +Calculate the error ratio using safe division to avoid NaN when request count is zero: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(error{namespace="span_derived"}) by (service) / (sum(request{namespace="span_derived"}) by (service) > 0)' +``` + +### Latency Percentiles (Data Prepper) + +#### p50 (Median) Latency by Service + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.50, sum(latency_seconds_seconds_bucket{namespace="span_derived"}) by (le, service))' +``` + +#### p95 Latency by Service + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.95, sum(latency_seconds_seconds_bucket{namespace="span_derived"}) by (le, service))' +``` + +#### p99 Latency by Service + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.99, sum(latency_seconds_seconds_bucket{namespace="span_derived"}) by (le, service))' +``` + +### p99 Latency for a Specific Service (Data Prepper) + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.99, 
sum(latency_seconds_seconds_bucket{namespace="span_derived", service="frontend"}) by (le))' +``` + +### Top-K Services by Error Rate (Data Prepper) + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=topk(5, sum(error{namespace="span_derived"}) by (service) / (sum(request{namespace="span_derived"}) by (service) > 0))' +``` + + +## GenAI-Specific RED Metrics + +Apply the RED methodology to GenAI operations using the `gen_ai_client_operation_duration_seconds` histogram. + +### GenAI Rate — Operations per second by operation and model: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(gen_ai_client_operation_duration_seconds_count[5m])) by (gen_ai_operation_name, gen_ai_request_model)' +``` + +### GenAI Errors — Error ratio by operation and model: + +GenAI operations that result in errors (e.g., model timeouts, rate limits) are tracked via span status. Use trace spans to calculate GenAI error rates: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where isnotnull(`attributes.gen_ai.operation.name`) | stats count() as total, sum(case(`status.code` = 2, 1 else 0)) as errors by `attributes.gen_ai.operation.name`, `attributes.gen_ai.request.model`"}' +``` + +### GenAI Duration — p50/p95/p99 by operation and model: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.50, sum(rate(gen_ai_client_operation_duration_seconds_bucket[5m])) by (le, gen_ai_operation_name, gen_ai_request_model))' +``` + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(gen_ai_client_operation_duration_seconds_bucket[5m])) by (le, gen_ai_operation_name, gen_ai_request_model))' +``` + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 
'query=histogram_quantile(0.99, sum(rate(gen_ai_client_operation_duration_seconds_bucket[5m])) by (le, gen_ai_operation_name, gen_ai_request_model))' +``` + + +## OTel HTTP Semantic Convention Metrics Reference + +The RED queries in this skill use metrics defined by the [OpenTelemetry HTTP semantic conventions](https://opentelemetry.io/docs/specs/semconv/http/http-metrics/). The OTel SDK instruments HTTP servers and clients using these standard metric names, which Prometheus exports with underscores replacing dots. + +| OTel Metric Name | Prometheus Metric Name(s) | Type | Description | +|---|---|---|---| +| `http.server.request.duration` | `http_server_duration_seconds`, `http_server_duration_milliseconds`, `http_server_request_duration_seconds` | histogram | Duration of HTTP server requests (unit varies by SDK) | +| `http.server.active_requests` | `http_server_active_requests` | gauge | Number of active HTTP server requests | + +> **Note:** The exact Prometheus metric name depends on the OTel SDK version and language. Python SDKs with older semconv emit `http_server_duration_milliseconds`; .NET/Java SDKs emit `http_server_duration_seconds`; newer stable semconv uses `http_server_request_duration_seconds`. Use the [Metric Discovery](#metric-discovery) section to check which name is active. + +Common labels on HTTP server duration metrics: + +| Label | Description | +|---|---| +| `service_name` | Service that handled the request | +| `http_response_status_code` | HTTP response status code (200, 404, 500, etc.) | +| `http_route` | HTTP route pattern (e.g., `/api/v1/users`) | +| `http_request_method` | HTTP method (GET, POST, PUT, DELETE) | + +> **Note on status code labels:** The label name varies by OTel SDK version. Older semconv uses `http_status_code`; newer stable semconv uses `http_response_status_code`. Use the [Metric Discovery](#metric-discovery) section to check which label is present, or query both variants. 
+ +> **Note:** Prometheus replaces dots in OTel metric and label names with underscores. The OTel metric `http.server.request.duration` becomes a Prometheus metric with a unit suffix added by the OTel exporter. The exact name varies by SDK — see the table above. + + +## OTel Collector `spanmetrics` Connector + +The OTel Collector `spanmetrics` connector auto-generates RED metrics from trace spans without requiring application-level metric instrumentation. It processes incoming spans and produces metrics for request count, error count, and duration histograms. + +### How It Works + +The `spanmetrics` connector sits between the traces pipeline and the metrics pipeline in the OTel Collector configuration: + +```yaml +connectors: + spanmetrics: + histogram: + explicit: + buckets: [2ms, 4ms, 6ms, 8ms, 10ms, 50ms, 100ms, 200ms, 400ms, 800ms, 1s, 1400ms, 2s, 5s, 10s, 15s] + dimensions: + - name: service.name + - name: http.route + - name: http.request.method + - name: http.response.status_code + exemplars: + enabled: true + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp/opensearch, spanmetrics] + metrics: + receivers: [otlp, spanmetrics] + processors: [batch] + exporters: [otlphttp/prometheus] +``` + +### Generated Metrics + +The `spanmetrics` connector produces these metrics from trace spans: + +| Metric | Type | Description | +|---|---|---| +| `traces_spanmetrics_calls_total` | counter | Total number of span calls (Rate) | +| `traces_spanmetrics_duration_seconds` | histogram | Span duration distribution (Duration) | + +Error counts are derived by filtering `traces_spanmetrics_calls_total` on `status_code="STATUS_CODE_ERROR"`. 
+ +### Querying spanmetrics-Generated RED Metrics + +Rate from spanmetrics: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(traces_spanmetrics_calls_total[5m])) by (service_name)' +``` + +Error rate from spanmetrics: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(traces_spanmetrics_calls_total{status_code="STATUS_CODE_ERROR"}[5m])) by (service_name) / sum(rate(traces_spanmetrics_calls_total[5m])) by (service_name)' +``` + +Duration p95 from spanmetrics: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(traces_spanmetrics_duration_seconds_bucket[5m])) by (le, service_name))' +``` + +> **Note:** This stack currently routes traces to OpenSearch via Data Prepper and metrics to Prometheus via OTLP. The `spanmetrics` connector is not enabled by default but can be added to `docker-compose/otel-collector/config.yaml` to auto-generate RED metrics from traces. This is useful when application-level HTTP metrics are not available. 
+ + +## Advanced PromQL Patterns + +### Safe Division (Avoid NaN/Inf) + +When dividing metrics (e.g., error rate = errors/total), use `clamp_min()` to avoid division-by-zero which produces NaN or Inf: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])) by (service_name) / clamp_min(sum(rate(http_server_duration_seconds_count[5m])) by (service_name), 1) * 100' +``` + +### Top-K Services by Fault Rate + +Find the top 5 services with the highest fault rate using `topk()`: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=topk(5, sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])) by (service_name) / clamp_min(sum(rate(http_server_duration_seconds_count[5m])) by (service_name), 1) * 100)' +``` + +### Top-K Operations by Fault Rate for a Service + +Drill into a specific service to find its worst-performing operations: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=topk(5, sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5..", service_name="frontend"}[5m])) by (http_route) / clamp_min(sum(rate(http_server_duration_seconds_count{service_name="frontend"}[5m])) by (http_route), 1) * 100)' +``` + +### Service Availability + +Calculate availability as the inverse of fault rate (percentage of non-5xx responses): + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=(1 - sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])) by (service_name) / clamp_min(sum(rate(http_server_duration_seconds_count[5m])) by (service_name), 1)) * 100' +``` + +### Bottom-K Services by Availability + +Find the 5 services with the lowest availability (most errors): + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=bottomk(5, (1 - 
sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])) by (service_name) / clamp_min(sum(rate(http_server_duration_seconds_count[5m])) by (service_name), 1)) * 100)' +``` + +### Per-Operation RED Metrics for a Service + +Get latency, request rate, and error rate per operation for a specific service: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(http_server_duration_seconds_bucket{service_name="checkout"}[5m])) by (le, http_route))' +``` + +## References + +- [PPL Language Reference](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.md) — Official PPL syntax documentation. Fetch this if queries fail due to OpenSearch version differences or new syntax. +- [Prometheus Querying Basics](https://prometheus.io/docs/prometheus/latest/querying/basics/) — PromQL syntax reference. + +## AWS Managed Service Variants + +### Amazon OpenSearch Service (SigV4) + +Replace the local OpenSearch endpoint and authentication with AWS SigV4 for all PPL queries in this skill: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:es" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + -X POST https://DOMAIN-ID.REGION.es.amazonaws.com/_plugins/_ppl \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() as request_count by span(startTime, 5m), serviceName"}' +``` + +- Endpoint format: `https://DOMAIN-ID.REGION.es.amazonaws.com` +- Auth: `--aws-sigv4 "aws:amz:REGION:es"` with `--user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY"` +- The PPL API endpoint (`/_plugins/_ppl`) and query syntax are identical to the local stack +- No `-k` flag needed — AWS managed endpoints use valid TLS certificates + +### Amazon Managed Service for Prometheus (AMP) (SigV4) + +Replace the local Prometheus endpoint and authentication with AWS SigV4 for all PromQL queries: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:aps" \ + --user 
"$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + 'https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query' \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count[5m])) by (service_name)' +``` + +- Endpoint format: `https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query` +- Auth: `--aws-sigv4 "aws:amz:REGION:aps"` with `--user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY"` +- PromQL query syntax is identical between local Prometheus and Amazon Managed Prometheus; only the endpoint and authentication differ diff --git a/claude-code-observability-plugin/skills/correlation/SKILL.md b/claude-code-observability-plugin/skills/correlation/SKILL.md new file mode 100644 index 00000000..56e013be --- /dev/null +++ b/claude-code-observability-plugin/skills/correlation/SKILL.md @@ -0,0 +1,580 @@ +--- +name: correlation +description: Cross-signal correlation between traces, logs, and metrics using OTel semantic convention fields for end-to-end observability investigations. +allowed-tools: + - Bash + - curl +--- + +# Cross-Signal Correlation + +## Overview + +This skill teaches how to correlate traces, logs, and metrics across all three telemetry signals using shared OTel semantic convention fields. Correlation enables end-to-end investigations: start from a metric spike, trace it to a specific request, and find the associated logs — or start from an error log and reconstruct the full trace that produced it. + +All OpenSearch queries use the PPL API at `/_plugins/_ppl` with HTTPS and basic authentication. Prometheus queries use the HTTP API at `localhost:9090`. Credentials are read from the `.env` file (default: `admin` / `My_password_123!@#`). 
+
+## Connection Defaults
+
+| Variable | Default | Description |
+|---|---|---|
+| `OPENSEARCH_ENDPOINT` | `https://localhost:9200` | OpenSearch base URL |
+| `OPENSEARCH_USER` | `admin` | OpenSearch username |
+| `OPENSEARCH_PASSWORD` | `My_password_123!@#` | OpenSearch password |
+| `PROMETHEUS_ENDPOINT` | `http://localhost:9090` | Prometheus base URL |
+
+## OTel Correlation Fields Reference
+
+### Trace Context Correlation
+
+Traces and logs share `traceId` and `spanId` fields. When an application emits a log within an active span, the OTel SDK automatically injects the current trace context into the log record. This creates a direct link between log entries and the spans that produced them.
+
+| Field | Signal | Type | Description |
+|---|---|---|---|
+| `traceId` | Traces, Logs | keyword | Hex-encoded 128-bit trace identifier shared between spans and log records |
+| `spanId` | Traces, Logs | keyword | Hex-encoded 64-bit span identifier shared between spans and log records |
+| `traceFlags` | Logs | integer | W3C trace flags (e.g., 01 = sampled) carried on log records |
+
+- In the Trace_Index (`otel-v1-apm-span-*`): `traceId` and `spanId` identify each span
+- In the Log_Index (`logs-otel-v1-*`): `traceId` and `spanId` link the log to the span that was active when the log was emitted
+
+### Metric-to-Trace Correlation (Prometheus Exemplars)
+
+Prometheus exemplars attach trace context to individual metric samples. When the OTel SDK records a metric observation inside an active span, it can attach the `trace_id` and `span_id` as exemplar labels. This links a specific metric data point back to the trace that produced it. 
+ +Exemplar data model: + +| Field | Description | +|---|---| +| `trace_id` | Hex-encoded trace identifier from the span active during metric recording | +| `span_id` | Hex-encoded span identifier from the span active during metric recording | +| `filtered_attributes` | Additional key-value pairs attached to the exemplar | +| `timestamp` | Time when the exemplar was recorded | +| `value` | The metric sample value associated with this exemplar | + +### Resource-Level Correlation + +All three signals (traces, logs, metrics) share resource attributes that identify the originating service. These attributes are set by the OTel SDK and propagated through the pipeline: + +| Resource Attribute | Traces Field | Logs Field | Prometheus Label | Description | +|---|---|---|---|---| +| `service.name` | `serviceName` | `resource.attributes.service.name` | `service_name` | Service that produced the telemetry | +| `service.namespace` | `resource.service.namespace` | `resource.attributes.service.namespace` | `service_namespace` | Namespace grouping related services | +| `service.version` | `resource.service.version` | `resource.attributes.service.version` | `service_version` | Service version string | +| `service.instance.id` | `resource.service.instance.id` | `resource.attributes.service.instance.id` | `service_instance_id` | Unique instance identifier | +| `deployment.environment.name` | `resource.deployment.environment.name` | `resource.attributes.deployment.environment.name` | `deployment_environment_name` | Deployment environment (e.g., production, staging) | + +The OTel Collector's `resourcedetection` processor enriches telemetry with environment context, and the Prometheus `promote_resource_attributes` configuration (in `docker-compose/prometheus/prometheus.yml`) promotes these resource attributes to metric labels so they are queryable in PromQL. 
+ +### GenAI Resource Attributes in Prometheus + +The following GenAI resource attributes are promoted to Prometheus metric labels via the `promote_resource_attributes` configuration, enabling metric queries filtered by agent or model: + +| Resource Attribute | Prometheus Label | Description | +|---|---|---| +| `gen_ai.agent.id` | `gen_ai_agent_id` | Agent identifier | +| `gen_ai.agent.name` | `gen_ai_agent_name` | Human-readable agent name (only available if SDK sets this as a resource attribute; most SDKs set it as a span attribute instead) | +| `gen_ai.provider.name` | `gen_ai_provider_name` | LLM provider (e.g., bedrock, openai) | +| `gen_ai.request.model` | `gen_ai_request_model` | Model requested for the operation | +| `gen_ai.response.model` | `gen_ai_response_model` | Model that actually served the response | + + +## Trace-to-Log Correlation (PPL) + +### Find Logs by traceId + +Given a trace ID, find all log entries emitted during that trace: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where traceId = '\'''\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort `@timestamp`"}' +``` + +### Find Logs by spanId + +Given a span ID, find all log entries emitted during that specific span: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where spanId = '\'''\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort `@timestamp`"}' +``` + +### Join Spans and Logs by traceId + +Use PPL `join` to combine trace spans with their correlated logs in a single query: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 
'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | join left=s right=l ON s.traceId = l.traceId logs-otel-v1-* | fields s.spanId, s.name, s.serviceName, s.durationInNanos, l.severityText, l.body, l.`@timestamp`"}' +``` + +### Full Timeline Reconstruction + +Reconstruct the complete request timeline by interleaving spans and logs sorted by timestamp. Run both queries and merge results by time: + +Step 1 — Get all spans for the trace: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | eval signal = '\''span'\'' | fields traceId, spanId, serviceName, name, startTime, endTime, durationInNanos, `status.code`, signal | sort startTime"}' +``` + +Step 2 — Get all logs for the trace: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where traceId = '\'''\'' | eval signal = '\''log'\'' | fields traceId, spanId, `resource.attributes.service.name`, severityText, body, `@timestamp`, signal | sort `@timestamp`"}' +``` + +Merge both result sets by timestamp to see the full chronological sequence of spans and log entries for the request. 
+ + +## Log-to-Trace Correlation (PPL) + +### Find Originating Trace from an Error Log + +When you find an error log, extract its `traceId` and query the Trace_Index to reconstruct the full trace: + +Step 1 — Find error logs and get their traceId: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''ERROR'\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort - `@timestamp` | head 10"}' +``` + +Step 2 — Query the Trace_Index with the extracted traceId to get all spans: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | fields traceId, spanId, parentSpanId, serviceName, name, startTime, endTime, durationInNanos, `status.code` | sort startTime"}' +``` + +### Find Specific Span from a Log Entry + +When a log entry has a `spanId`, query the Trace_Index to find the exact span that was active when the log was emitted: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where spanId = '\'''\'' | fields traceId, spanId, parentSpanId, serviceName, name, startTime, endTime, durationInNanos, `status.code`, `attributes.gen_ai.operation.name`"}' +``` + + +## Metric-to-Trace Correlation (Prometheus Exemplars) + +### Query Exemplars from Prometheus + +Use the Prometheus exemplars API to retrieve trace context attached to metric samples. 
This links a metric observation back to the specific trace that produced it: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query_exemplars" \ + --data-urlencode 'query=http_server_duration_seconds_bucket' \ + --data-urlencode 'start=2024-01-01T00:00:00Z' \ + --data-urlencode 'end=2024-01-02T00:00:00Z' +``` + +The response contains exemplar objects with `trace_id` and `span_id` in the `labels` field: + +```json +{ + "status": "success", + "data": [ + { + "seriesLabels": { "service_name": "my-agent", "__name__": "http_server_duration_seconds_bucket" }, + "exemplars": [ + { + "labels": { "trace_id": "abc123...", "span_id": "def456..." }, + "value": "0.25", + "timestamp": 1704067200.000 + } + ] + } + ] +} +``` + +### Query Exemplars for GenAI Metrics + +Query exemplars for GenAI operation duration, filtered by agent name: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query_exemplars" \ + --data-urlencode 'query=gen_ai_client_operation_duration_seconds_bucket{gen_ai_operation_name="invoke_agent"}' \ + --data-urlencode 'start=2024-01-01T00:00:00Z' \ + --data-urlencode 'end=2024-01-02T00:00:00Z' +``` + +### Extract trace_id and Query Trace_Index + +After extracting a `trace_id` from an exemplar response, query the Trace_Index for the full trace: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | fields traceId, spanId, parentSpanId, serviceName, name, startTime, endTime, durationInNanos, `status.code` | sort startTime"}' +``` + +### PromQL Queries with GenAI Resource Labels + +Filter metrics by GenAI resource labels before correlating to traces via exemplars: + +By agent name: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=rate(gen_ai_client_operation_duration_seconds_count{gen_ai_operation_name="invoke_agent"}[5m])' +``` + +By model: + +```bash +curl -s 
"$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=rate(gen_ai_client_token_usage_count[5m])' +``` + +Then query exemplars for the filtered metric to get trace IDs for correlation. + + +## Resource-Level Correlation + +### service.name Across All Signals + +The `service.name` resource attribute is the primary key for correlating telemetry across all three signals at the service level. + +Find all traces from a specific service: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where serviceName = '\''my-service'\'' | stats count() as span_count, avg(durationInNanos) as avg_duration by serviceName"}' +``` + +Find all logs from the same service: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where `resource.attributes.service.name` = '\''my-service'\'' | stats count() by severityText"}' +``` + +Find all metrics from the same service in Prometheus: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=rate(http_server_duration_seconds_count{service_name="my-service"}[5m])' +``` + +### GenAI Resource Labels in Prometheus + +Query metrics filtered by GenAI resource attributes that are promoted to Prometheus labels: + +By agent: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(gen_ai_client_operation_duration_seconds_count{gen_ai_operation_name="invoke_agent"}[5m])) by (gen_ai_operation_name)' +``` + +By provider and model: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(gen_ai_client_token_usage_count[5m])) by (gen_ai_request_model)' +``` + +### How Resource Attributes Flow Through the Stack + +1. 
The OTel SDK sets resource attributes (`service.name`, `service.version`, etc.) on all telemetry +2. The OTel Collector's `resourcedetection` processor enriches telemetry with environment context (Docker, system info) +3. For traces and logs: resource attributes are stored in OpenSearch as part of the document +4. For metrics: the Prometheus `promote_resource_attributes` configuration (in `docker-compose/prometheus/prometheus.yml`) promotes resource attributes to metric labels, making them queryable in PromQL + +This ensures the same `service.name` value appears in traces (`serviceName` field), logs (`resource.attributes.service.name` field), and metrics (`service_name` label) — enabling service-level correlation across all backends. + + +## Correlation Workflows + +### Workflow 1: Metric Spike Investigation + +Investigate a metric anomaly by correlating from metrics → traces → logs. + +**Step 1 — Detect the spike via PromQL:** + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=rate(http_server_duration_seconds_count[5m])' +``` + +Look for services with unusually high request rates or latency. + +**Step 2 — Query exemplars to get trace IDs from the spike window:** + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query_exemplars" \ + --data-urlencode 'query=http_server_duration_seconds_bucket' \ + --data-urlencode 'start=' \ + --data-urlencode 'end=' +``` + +Extract `trace_id` values from the exemplar response. 
+ +**Step 3 — Query the Trace_Index for those traces:** + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | fields traceId, spanId, parentSpanId, serviceName, name, startTime, endTime, durationInNanos, `status.code` | sort startTime"}' +``` + +**Step 4 — Query the Log_Index for correlated logs:** + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where traceId = '\'''\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort `@timestamp`"}' +``` + +### Workflow 2: Error Log Investigation + +Start from an error log and trace back to the root cause. + +**Step 1 — Find error logs:** + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''ERROR'\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort - `@timestamp` | head 10"}' +``` + +**Step 2 — Extract the traceId from the error log and reconstruct the full trace tree:** + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | fields traceId, spanId, parentSpanId, serviceName, name, startTime, endTime, durationInNanos, `status.code` | sort startTime"}' +``` + +**Step 3 — Identify the root cause span (look for error status or exceptions):** + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: 
application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' AND `status.code` = 2 | fields traceId, spanId, serviceName, name, `events.attributes.exception.type`, `events.attributes.exception.message` | sort startTime"}' +``` + +**Step 4 — Get all logs for the error span to see the full context:** + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where traceId = '\'''\'' | fields traceId, spanId, severityText, body, `@timestamp` | sort `@timestamp`"}' +``` + +### Workflow 3: Slow Agent Investigation + +Investigate a slow agent invocation by correlating spans, child operations, logs, and metrics. + +**Step 1 — Find slow `invoke_agent` spans:** + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''invoke_agent'\'' AND durationInNanos > 5000000000 | fields traceId, spanId, `attributes.gen_ai.agent.name`, durationInNanos, startTime | sort - durationInNanos | head 10"}' +``` + +**Step 2 — Get all child spans to identify the bottleneck:** + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | fields traceId, spanId, parentSpanId, name, `attributes.gen_ai.operation.name`, durationInNanos, startTime | sort startTime"}' +``` + +Look for child spans with high `durationInNanos` — these are the bottleneck operations (e.g., slow tool calls, slow LLM responses). 
+
+**Step 3 — Check tool calls within the slow trace:**
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' AND `attributes.gen_ai.operation.name` = '\''execute_tool'\'' | fields spanId, `attributes.gen_ai.tool.name`, `attributes.gen_ai.tool.call.arguments`, durationInNanos | sort - durationInNanos"}'
+```
+
+**Step 4 — Get correlated logs for the slow spans:**
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=logs-otel-v1-* | where traceId = '\'''\'' | fields spanId, severityText, body, `@timestamp` | sort `@timestamp`"}'
+```
+
+**Step 5 — Check GenAI token usage metrics for the agent via PromQL:**
+
+```bash
+curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \
+  --data-urlencode 'query=sum(rate(gen_ai_client_token_usage_sum[5m])) by (gen_ai_operation_name, gen_ai_request_model)'
+```
+
+The `_sum` series of the token-usage histogram gives tokens consumed per second (the `_count` series would only give the call rate). Check if the agent is consuming unusually high token counts, which may explain slow response times.
+
+
+## Dynamic Field Discovery for Correlation
+
+When correlating across signals, use `describe` or `_mapping` to discover available fields dynamically. This is especially useful when index schemas differ from the defaults.
+ +### Discover Trace Index Fields + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "describe otel-v1-apm-span-000001"}' +``` + +### Discover Log Index Fields + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "describe logs-otel-v1-000001"}' +``` + +### Discover Service Map Fields + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "describe otel-v2-apm-service-map-000001"}' +``` + +Use the field names from `describe` output to construct correlation queries when the default field names don't match your index schema. + +## Advanced Correlation Patterns + +### Batch Log Correlation via traceId IN List + +When you have a set of traceIds from span queries, use `IN` to fetch all correlated logs in one query: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where traceId IN ('\'''\'', '\'''\'', '\'''\'') | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort `@timestamp`"}' +``` + +This is more efficient than querying logs one traceId at a time. + +### Log Correlation with Fallback for Missing Trace Context + +Some logs may have empty or null traceId. 
Include those alongside correlated logs: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where `resource.attributes.service.name` = '\''frontend'\'' | where (traceId IN ('\'''\'', '\'''\'') OR traceId = '\'''\'' OR isnull(traceId)) | sort - `@timestamp` | head 50"}' +``` + +### Remote Service Dependency Correlation + +Identify which remote services a given service calls using `coalesce()` across OTel attribute variants. Different instrumentation libraries (Node.js, Go, Python, .NET) use different attributes: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where serviceName = '\''checkout'\'' | where kind = '\''SPAN_KIND_CLIENT'\'' | eval _remoteService = coalesce(`attributes.net.peer.name`, `attributes.server.address`, `attributes.rpc.service`, `attributes.db.system`, `attributes.gen_ai.system`, '\''unknown'\'') | stats count() as calls, avg(durationInNanos) as avg_latency by _remoteService | sort - calls"}' +``` + +### Field Reference by Protocol + +When correlating across different service types, these are the key fields by protocol: + +| Protocol | Remote Service Field | Operation Field | +|---|---|---| +| gRPC | `attributes.net.peer.name` or `attributes.server.address` | `attributes.rpc.method` | +| HTTP | `attributes.http.host` or `attributes.server.address` | `attributes.http.route` | +| Database | `attributes.db.system` + `attributes.server.address` | `attributes.db.statement` | +| Envoy/Istio | `attributes.upstream_cluster` | span `name` | +| LLM/GenAI | `attributes.gen_ai.system` + `attributes.server.address` | `attributes.gen_ai.request.model` | +| Message Queue | `attributes.messaging.destination.name` | span `name` | + +## References + +- [PPL Language 
Reference](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.md) — Official PPL syntax documentation. Fetch this if queries fail due to OpenSearch version differences or new syntax. + +## AWS Managed Service Variants + +### Amazon OpenSearch Service (SigV4) + +Replace the local OpenSearch endpoint and authentication with AWS SigV4 for all PPL queries in this skill: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:es" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + -X POST https://DOMAIN-ID.REGION.es.amazonaws.com/_plugins/_ppl \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | fields traceId, spanId, parentSpanId, serviceName, name, startTime, endTime, durationInNanos, `status.code` | sort startTime"}' +``` + +- Endpoint format: `https://DOMAIN-ID.REGION.es.amazonaws.com` +- Auth: `--aws-sigv4 "aws:amz:REGION:es"` with `--user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY"` +- The PPL API endpoint (`/_plugins/_ppl`) and query syntax are identical to the local stack +- No `-k` flag needed — AWS managed endpoints use valid TLS certificates + +### Amazon Managed Service for Prometheus (AMP) (SigV4) + +Replace the local Prometheus endpoint and authentication with AWS SigV4 for all PromQL and exemplar queries: + +Query exemplars: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:aps" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + 'https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query_exemplars' \ + --data-urlencode 'query=http_server_duration_seconds_bucket' \ + --data-urlencode 'start=2024-01-01T00:00:00Z' \ + --data-urlencode 'end=2024-01-02T00:00:00Z' +``` + +Query metrics: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:aps" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + 'https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query' \ + --data-urlencode 
'query=rate(http_server_duration_seconds_count{service_name="my-service"}[5m])' +``` + +- Endpoint format: `https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query` +- Auth: `--aws-sigv4 "aws:amz:REGION:aps"` with `--user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY"` +- PromQL query syntax and exemplar API are identical to local Prometheus; only the endpoint and authentication differ diff --git a/claude-code-observability-plugin/skills/logs/SKILL.md b/claude-code-observability-plugin/skills/logs/SKILL.md new file mode 100644 index 00000000..3a5c74ce --- /dev/null +++ b/claude-code-observability-plugin/skills/logs/SKILL.md @@ -0,0 +1,297 @@ +--- +name: logs +description: Query and search log data from OpenSearch using PPL for severity filtering, trace correlation, error patterns, and log volume analysis. +allowed-tools: + - Bash + - curl +--- + +# Log Querying with PPL + +## Overview + +This skill provides PPL (Piped Processing Language) query templates for searching and analyzing log data stored in OpenSearch. Logs are stored in the `logs-otel-v1-*` index pattern. All queries use the OpenSearch PPL API at `/_plugins/_ppl` with HTTPS and basic authentication. + +Credentials are read from the `.env` file (default: `admin` / `My_password_123!@#`). All curl commands use `-k` to skip TLS certificate verification for local development. + +## Connection Defaults + +All commands below use these variables. 
Set them in your environment or use the defaults: + +| Variable | Default | Description | +|---|---|---| +| `OPENSEARCH_ENDPOINT` | `https://localhost:9200` | OpenSearch base URL | +| `OPENSEARCH_USER` | `admin` | OpenSearch username | +| `OPENSEARCH_PASSWORD` | `My_password_123!@#` | OpenSearch password | + +## Base Command + +All PPL queries in this skill use this curl pattern: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": ""}' +``` + +The examples below show the full command for clarity, but only the PPL query varies. + +## Log Index Key Fields + +Key fields available in the `logs-otel-v1-*` index: + +| Field | Type | Description | +|---|---|---| +| `severityText` | keyword | Log level string (ERROR, WARN, INFO, DEBUG) | +| `severityNumber` | integer | Numeric severity (1–24, higher = more severe; ERROR=17, WARN=13, INFO=9, DEBUG=5) | +| `traceId` | keyword | Correlated trace identifier (links log to a distributed trace) | +| `spanId` | keyword | Correlated span identifier (links log to a specific span within a trace) | +| `resource.attributes.service.name` | keyword | Service that produced the log entry (use backtick-quoted `` `resource.attributes.service.name` `` in PPL queries) | +| `body` | text | Log message body content | +| `@timestamp` | date | Log entry timestamp | + +> **Note:** Unlike the trace span index (`otel-v1-apm-span-*`) which has a top-level `serviceName` field, the log index stores the service name at `resource.attributes.service.name`. Always use backtick quoting in PPL: `` `resource.attributes.service.name` ``. 
+ +## Severity Filtering + +### ERROR Logs + +Query all error-level logs: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''ERROR'\'' | fields traceId, spanId, `resource.attributes.service.name`, body, `@timestamp` | sort - `@timestamp` | head 20"}' +``` + +### WARN Logs + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''WARN'\'' | fields traceId, spanId, `resource.attributes.service.name`, body, `@timestamp` | sort - `@timestamp` | head 20"}' +``` + +### INFO Logs + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''INFO'\'' | fields traceId, spanId, `resource.attributes.service.name`, body, `@timestamp` | sort - `@timestamp` | head 20"}' +``` + +### Filter by Severity Number + +Use `severityNumber` for numeric comparisons. 
For example, find all logs at WARN level or above (severityNumber >= 13):
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=logs-otel-v1-* | where severityNumber >= 13 | fields severityText, severityNumber, `resource.attributes.service.name`, body, `@timestamp` | sort - `@timestamp` | head 20"}'
+```
+
+## Trace Correlation by traceId
+
+Find all logs associated with a specific trace:
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=logs-otel-v1-* | where traceId = '\'''\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort `@timestamp`"}'
+```
+
+Find error logs for a specific trace:
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=logs-otel-v1-* | where traceId = '\'''\'' AND severityText = '\''ERROR'\'' | fields spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort `@timestamp`"}'
+```
+
+## Error Patterns
+
+### Log Count by Severity and Service
+
+Identify error patterns by aggregating log counts grouped by severity level and service name (all severities are included; filter on `severityText` to narrow to errors):
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=logs-otel-v1-* | stats count() by severityText, `resource.attributes.service.name`"}'
+```
+
+### Error Count by Service Only
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=logs-otel-v1-* | where severityText = '\''ERROR'\'' | stats count() as error_count 
by `resource.attributes.service.name` | sort - error_count"}' +``` + +## Log Volume Over Time + +### Hourly Log Volume + +Analyze log volume over time using `stats count() by span(@timestamp, 1h)`: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | stats count() as log_count by span(`@timestamp`, 1h)"}' +``` + +### Configurable Interval + +Change the interval to suit your analysis. Common intervals: `5m`, `15m`, `1h`, `1d`. + +15-minute buckets: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | stats count() as log_count by span(`@timestamp`, 15m)"}' +``` + +### Error Volume Over Time + +Track error log volume specifically: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''ERROR'\'' | stats count() as error_count by span(`@timestamp`, 1h), `resource.attributes.service.name`"}' +``` + +## Body Content Search + +### String Matching + +Search log body content for a specific string using `where` with `like`: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where body like '\''%timeout%'\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort - `@timestamp` | head 20"}' +``` + +### Relevance Search with match + +Use the `match` relevance function for full-text search on the body field: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: 
application/json' \ + -d '{"query": "source=logs-otel-v1-* | where match(body, '\''connection refused'\'') | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort - `@timestamp` | head 20"}' +``` + +### Relevance Search with match_phrase + +Use `match_phrase` for exact phrase matching in the body: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where match_phrase(body, '\''failed to connect'\'') | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort - `@timestamp` | head 20"}' +``` + +## Cross-Signal Correlation + +### Log-to-Span Correlation by spanId + +Find all logs associated with a specific span to understand what happened during that operation: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where spanId = '\'''\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort `@timestamp`"}' +``` + +### Exception-Log Correlation with Traces + +Find error logs and their associated trace spans. 
First, find error logs with traceId: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''ERROR'\'' AND traceId != '\'''\'' | fields traceId, spanId, body, `resource.attributes.service.name`, `@timestamp` | sort - `@timestamp` | head 20"}' +``` + +Then query the trace index for the corresponding spans using the traceId from the error log: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | fields traceId, spanId, serviceName, name, `status.code`, durationInNanos, startTime | sort startTime"}' +``` + +Correlate exception spans with their associated error logs using shared traceId and spanId: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where traceId = '\'''\'' AND spanId = '\'''\'' AND severityText = '\''ERROR'\'' | fields body, severityText, `@timestamp`"}' +``` + +## PPL Commands for Log Analysis + +The following PPL commands are particularly useful when analyzing log data: + +| Command | Use Case | +|---|---| +| `stats` | Aggregate log counts by severity, service, or time bucket | +| `where` | Filter logs by severity level, traceId, spanId, service, or body content | +| `fields` | Select specific fields to return (body, severityText, traceId, etc.) 
| +| `sort` | Order results by timestamp or severity | +| `head` | Limit result count for quick exploration | +| `grok` | Extract structured fields from unstructured log body text using grok patterns | +| `parse` | Parse log body content using regex patterns to extract fields | +| `rex` | Extract fields from text using named capture groups | +| `patterns` | Discover common log message patterns automatically | +| `rare` | Find the least frequent log messages or error types | +| `top` | Find the most frequent log messages, services, or severity levels | +| `timechart` | Visualize log volume or error counts over time buckets | +| `eval` | Compute derived fields (e.g., classify severity ranges) | +| `dedup` | Remove duplicate log entries (e.g., deduplicate by body to find unique messages) | +| `fillnull` | Replace null field values with defaults for cleaner output | +| `regex` | Filter logs using regular expression patterns on field values | + +## References + +- [PPL Language Reference](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.md) — Official PPL syntax documentation. Fetch this if queries fail due to OpenSearch version differences or new syntax. +- [OpenTelemetry Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/) — Standard attribute names used in log records. 
+ +## AWS Managed OpenSearch + +To query logs on Amazon OpenSearch Service, replace the local endpoint and authentication with AWS SigV4: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:es" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + -X POST https://DOMAIN-ID.REGION.es.amazonaws.com/_plugins/_ppl \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''ERROR'\'' | fields traceId, spanId, `resource.attributes.service.name`, body, `@timestamp` | sort - `@timestamp` | head 20"}' +``` + +- Endpoint format: `https://DOMAIN-ID.REGION.es.amazonaws.com` +- Auth: `--aws-sigv4 "aws:amz:REGION:es"` with `--user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY"` +- The PPL API endpoint (`/_plugins/_ppl`) and query syntax are identical to the local stack +- No `-k` flag needed — AWS managed endpoints use valid TLS certificates diff --git a/claude-code-observability-plugin/skills/metrics/SKILL.md b/claude-code-observability-plugin/skills/metrics/SKILL.md new file mode 100644 index 00000000..04bd2bc6 --- /dev/null +++ b/claude-code-observability-plugin/skills/metrics/SKILL.md @@ -0,0 +1,178 @@ +--- +name: metrics +description: Query metrics from Prometheus using PromQL for HTTP request rates, latency percentiles, error rates, active connections, and GenAI token usage. +allowed-tools: + - Bash + - curl +--- + +# Metrics Querying with PromQL + +## Overview + +This skill provides PromQL query templates for querying metrics from Prometheus. All queries use the Prometheus HTTP API at `http://localhost:9090/api/v1/query`. No authentication is needed for local Prometheus. + +Prometheus runs on port 9090 using HTTP (not HTTPS). + +## Connection Defaults + +| Variable | Default | Description | +|---|---|---| +| `PROMETHEUS_ENDPOINT` | `http://localhost:9090` | Prometheus base URL | + +## Metric Discovery + +Different OTel SDK versions and languages emit HTTP metrics under different names. 
Before querying, discover which metric names are active in your stack: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/label/__name__/values" | python3 -c " +import json, sys +for m in json.load(sys.stdin).get('data', []): + if any(k in m for k in ['http_server', 'gen_ai', 'db_client']): + print(m)" +``` + +**Common HTTP metric name variants:** + +| Metric Name | Unit | Emitted By | +|---|---|---| +| `http_server_duration_milliseconds` | milliseconds | Python OTel SDK (older semconv) | +| `http_server_duration_seconds` | seconds | .NET, Java OTel SDKs | +| `http_server_request_duration_seconds` | seconds | Stable HTTP semconv (newer SDKs) | + +> **Important:** Replace the metric name in the queries below with whichever variant is active in your stack. The query patterns (rate, histogram_quantile, etc.) are identical — only the metric name changes. For histogram bucket queries, replace `_seconds_bucket` with `_milliseconds_bucket` as appropriate, and adjust latency thresholds accordingly (e.g., `le="0.25"` for seconds vs `le="250"` for milliseconds). 
+ +## HTTP Request Rate by Service + +Calculate the per-second HTTP request rate over a 5-minute window, grouped by service: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count[5m])) by (service_name)' +``` + +## HTTP Latency Percentiles + +### p95 Latency by Service + +Calculate the 95th percentile HTTP request latency by service: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(http_server_duration_seconds_bucket[5m])) by (le, service_name))' +``` + +### p99 Latency by Service + +Calculate the 99th percentile HTTP request latency by service: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.99, sum(rate(http_server_duration_seconds_bucket[5m])) by (le, service_name))' +``` + +## Error Rate (5xx Responses) + +Calculate the ratio of 5xx error responses to total requests by service: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])) by (service_name) / sum(rate(http_server_duration_seconds_count[5m])) by (service_name)' +``` + +> **Note on status code labels:** The label name varies by OTel SDK version. Older semconv uses `http_status_code`; newer stable semconv uses `http_response_status_code`. 
Use the Metric Discovery section to check which label is present, or query both: +> ``` +> sum(rate(http_server_duration_seconds_count{http_status_code=~"5.."}[5m])) by (service_name) +> ``` + +## Active Connections + +Query the current number of active HTTP connections by service: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(http_server_active_requests) by (service_name)' +``` + +## Database Operation Latency + +### DB Operation p95 Latency by Service + +Calculate the 95th percentile database operation latency by service: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(db_client_operation_duration_seconds_bucket[5m])) by (le, service_name))' +``` + +## GenAI-Specific Metrics + +### Token Usage by Operation and Model + +Query GenAI token usage histograms grouped by operation name and request model: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(gen_ai_client_token_usage_bucket[5m])) by (le, gen_ai_operation_name, gen_ai_request_model)' +``` + +Token usage p95 by operation and model: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(gen_ai_client_token_usage_bucket[5m])) by (le, gen_ai_operation_name, gen_ai_request_model))' +``` + +### Operation Duration by Operation and Model + +Query GenAI operation duration histograms grouped by operation and model: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(gen_ai_client_operation_duration_seconds_bucket[5m])) by (le, gen_ai_operation_name, gen_ai_request_model)' +``` + +Operation duration p95 by operation and model: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(gen_ai_client_operation_duration_seconds_bucket[5m])) by (le, gen_ai_operation_name, gen_ai_request_model))' +``` + +## 
Available Metric Names and Label Dimensions + +| Metric | Type | Labels | +|---|---|---| +| `http_server_duration_milliseconds` | histogram | `service_name`, `http_response_status_code` | +| `http_server_duration_seconds` | histogram | `service_name`, `http_response_status_code` | +| `http_server_request_duration_seconds` | histogram | `service_name`, `http_response_status_code` | +| `http_server_active_requests` | gauge | `service_name` | +| `db_client_operation_duration_seconds` | histogram | `service_name` | +| `gen_ai_client_token_usage` | histogram | `gen_ai.operation.name`, `gen_ai.request.model` | +| `gen_ai_client_operation_duration_seconds` | histogram | `gen_ai.operation.name`, `gen_ai.request.model` | + +> **Note on Prometheus label names:** Prometheus replaces dots in label names with underscores. The OTel attribute `gen_ai.operation.name` becomes the Prometheus label `gen_ai_operation_name` in PromQL queries. The table above shows the original OTel attribute names for reference. + +## PPL Alternative for OpenSearch-Ingested Metrics + +PPL can also query metrics stored in OpenSearch when metrics are ingested via Data Prepper, as an alternative to PromQL. This is useful for OpenSearch-native workflows where you want to query metrics alongside traces and logs using a single query language. When Data Prepper is configured to ingest metrics into OpenSearch, you can use PPL `source=` queries against the metrics index just as you would for traces and logs. + +## References + +- [PPL Language Reference](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.md) — Official PPL syntax documentation. Fetch this if queries fail due to OpenSearch version differences or new syntax. +- [Prometheus Querying Basics](https://prometheus.io/docs/prometheus/latest/querying/basics/) — PromQL syntax reference. 
+ +## AWS Managed Service for Prometheus + +To query metrics on Amazon Managed Service for Prometheus (AMP), replace the local endpoint and add AWS SigV4 authentication: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:aps" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + 'https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query' \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count[5m])) by (service_name)' +``` + +- Endpoint format: `https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query` +- Auth: `--aws-sigv4 "aws:amz:REGION:aps"` with `--user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY"` +- PromQL query syntax is identical between local Prometheus and Amazon Managed Prometheus; only the endpoint and authentication differ diff --git a/claude-code-observability-plugin/skills/osd-config/SKILL.md b/claude-code-observability-plugin/skills/osd-config/SKILL.md new file mode 100644 index 00000000..cfeefc06 --- /dev/null +++ b/claude-code-observability-plugin/skills/osd-config/SKILL.md @@ -0,0 +1,281 @@ +--- +name: osd-config +description: Query OpenSearch Dashboards APIs for workspace configuration, index pattern discovery, APM correlation configs, and saved objects. +allowed-tools: + - Bash + - curl +--- + +## Connection Defaults + +| Variable | Default | Description | +|---|---|---| +| `OSD_ENDPOINT` | `http://localhost:5601` | OpenSearch Dashboards base URL | +| `OPENSEARCH_USER` | `admin` | Username (same as OpenSearch) | +| `OPENSEARCH_PASSWORD` | `My_password_123!@#` | Password (same as OpenSearch) | + +Note: All OSD API calls require the `osd-xsrf: true` header. 
+ +## Workspace Discovery + +### List All Workspaces + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/workspaces/_list" \ + -H 'osd-xsrf: true' +``` + +### Get Workspace Details + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/workspaces/" \ + -H 'osd-xsrf: true' +``` + +### Associate Datasource with Workspace + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OSD_ENDPOINT/api/workspaces/_associate" \ + -H 'osd-xsrf: true' \ + -H 'Content-Type: application/json' \ + -d '{"workspaceId": "", "savedObjects": [{"type": "data-source", "id": ""}]}' +``` + +## Dashboards Settings + +### Get Default Workspace + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/opensearch-dashboards/settings" \ + -H 'osd-xsrf: true' +``` + +## Index Pattern Discovery + +### List All Index Patterns + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?type=index-pattern&per_page=100" \ + -H 'osd-xsrf: true' +``` + +### Workspace-Scoped Index Patterns + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/w//api/saved_objects/_find?type=index-pattern&per_page=100" \ + -H 'osd-xsrf: true' +``` + +### Get Index Pattern Field Mappings + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/index-pattern/" \ + -H 'osd-xsrf: true' +``` + +## Dataset Discovery + +Datasets are an evolution of index patterns that classify indices by signal type (logs, traces, metrics). Users define which indices are logs vs traces through the Dashboards UI. See [Dataset Discovery documentation](https://docs.opensearch.org/latest/observing-your-data/exploring-observability-data/datasets/) for details. 
+ +To discover datasets programmatically, query the saved objects API for index patterns with their `signalType`, `displayName`, and `description` fields: + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?fields=title&fields=type&fields=displayName&fields=signalType&fields=description&per_page=10000&type=index-pattern" \ + -H 'osd-xsrf: true' +``` + +Index patterns created by the observability stack init script include schema mappings that identify their signal type (e.g., `otelLogs` for log indices, trace-specific time fields for trace indices). These mappings are visible in the index pattern's `attributes.fields` property. + +## APM Configuration + +### Get APM Correlation Config + +The APM plugin stores `correlations` saved objects that define how traces, logs, and metrics are linked. Two correlation types are created by the init script: + +- **`trace-to-logs-*`** — Links a trace index pattern to a log index pattern for cross-signal navigation +- **`APM-Config-*`** — Ties together the traces index, service map index, and Prometheus datasource for the APM UI + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?type=correlations&per_page=100" \ + -H 'osd-xsrf: true' +``` + +### Workspace-Scoped APM Config + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/w//api/saved_objects/_find?type=correlations&per_page=100" \ + -H 'osd-xsrf: true' +``` + +## Saved Objects + +### Count Saved Objects by Type + +The `_find` API requires a `type` parameter. To get a count without loading objects, use `per_page=0`: + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?type=index-pattern&per_page=0" \ + -H 'osd-xsrf: true' +``` + +Common saved object types: `index-pattern`, `query`, `dashboard`, `visualization`, `config`, `correlations`, `data-source`, `data-connection`, `explore`. 
+ +### Find Correlations + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?type=correlations&per_page=100" \ + -H 'osd-xsrf: true' +``` + +### Find Data Sources (OpenSearch) + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?type=data-source&per_page=100" \ + -H 'osd-xsrf: true' +``` + +### Find Data Connections (Prometheus) + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?type=data-connection&per_page=100" \ + -H 'osd-xsrf: true' +``` + +### Find Explore Panels + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?type=explore&per_page=100" \ + -H 'osd-xsrf: true' +``` + +### Find Saved Queries + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?type=query&per_page=100" \ + -H 'osd-xsrf: true' +``` + +### Find Dashboards + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?type=dashboard&per_page=100" \ + -H 'osd-xsrf: true' +``` + +### Find Visualizations + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?type=visualization&per_page=100" \ + -H 'osd-xsrf: true' +``` + +## Data Connections + +### List Prometheus Data Connections + +To discover existing Prometheus data connections, use the saved objects API (the `data-connection` type): + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OSD_ENDPOINT/api/saved_objects/_find?type=data-connection&per_page=100" \ + -H 'osd-xsrf: true' +``` + +### Create Prometheus Data Connection + +The init script creates Prometheus data connections via the direct query API: + +```bash +curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OSD_ENDPOINT/api/directquery/dataconnections" \ + -H 'osd-xsrf: true' \ + -H 'Content-Type: 
application/json' \ + -d '{"name": "MyPrometheus", "connector": "prometheus", "allowedRoles": ["all_access"], "properties": {"prometheus.uri": "http://prometheus:9090", "prometheus.auth.type": "basicauth", "prometheus.auth.username": "", "prometheus.auth.password": ""}}' +``` + +## Dynamic Index Discovery via OpenSearch API + +When OSD is not available, query OpenSearch directly to discover indices and field mappings: + +### List All Observability Indices + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OPENSEARCH_ENDPOINT/_cat/indices?format=json&v" | python3 -c " +import sys, json +for idx in json.load(sys.stdin): + name = idx['index'] + if any(p in name for p in ['otel-', 'logs-otel-', 'apm-']): + print(f\"{name} docs={idx['docs.count']} size={idx['store.size']}\")" +``` + +### Get Trace Index Field Mappings + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OPENSEARCH_ENDPOINT/otel-v1-apm-span-*/_mapping?pretty" +``` + +### Get Log Index Field Mappings + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OPENSEARCH_ENDPOINT/logs-otel-v1-*/_mapping?pretty" +``` + +### Get Service Map Index Field Mappings + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OPENSEARCH_ENDPOINT/otel-v2-apm-service-map-*/_mapping?pretty" +``` + +### PPL Describe for Field Discovery + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "describe otel-v1-apm-span-000001"}' +``` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "describe logs-otel-v1-000001"}' +``` + +## Default Index Patterns + +When dynamic discovery is not possible, these are the default index patterns used by the observability stack: + +| Signal | Index Pattern | Description | +|---|---|---| +| Traces | 
`otel-v1-apm-span-*` | Trace span data with serviceName, traceId, spanId | +| Logs | `logs-otel-v1-*` | Log entries with resource.attributes.service.name | +| Service Maps | `otel-v2-apm-service-map-*` | Service topology with sourceNode, targetNode | + +## References + +- [OpenSearch Dashboards Saved Objects API](https://opensearch.org/docs/latest/dashboards/management/saved-objects-api/) — API reference for saved objects +- [Dataset Discovery](https://docs.opensearch.org/latest/observing-your-data/exploring-observability-data/datasets/) — Datasets for classifying indices by signal type +- [PPL Language Reference](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.md) — PPL syntax for describe and other commands diff --git a/claude-code-observability-plugin/skills/ppl-reference/SKILL.md b/claude-code-observability-plugin/skills/ppl-reference/SKILL.md new file mode 100644 index 00000000..3e259d0f --- /dev/null +++ b/claude-code-observability-plugin/skills/ppl-reference/SKILL.md @@ -0,0 +1,1279 @@ +--- +name: ppl-reference +description: Comprehensive PPL (Piped Processing Language) reference for OpenSearch with command syntax, functions, and examples for observability queries. +allowed-tools: + - Bash + - curl +--- + +# PPL Language Reference + +## Overview + +This is a comprehensive reference for the Piped Processing Language (PPL) used by OpenSearch. PPL queries follow a pipe-delimited syntax starting with `source=` and chaining commands with `|`. This reference covers all commands, functions, API endpoints, and usage patterns needed to construct observability queries against trace and log indices. 
+ +Grammar sourced from the `opensearch-project/sql` repository's `docs/user/ppl/` directory: +https://github.com/opensearch-project/sql + +## Connection Defaults + +| Variable | Default | Description | +|---|---|---| +| `OPENSEARCH_ENDPOINT` | `https://localhost:9200` | OpenSearch base URL | +| `OPENSEARCH_USER` | `admin` | OpenSearch username | +| `OPENSEARCH_PASSWORD` | `My_password_123!@#` | OpenSearch password | + +## Field Name Escaping + +Field names containing dots must be enclosed in backticks to avoid parsing errors: + +``` +`attributes.gen_ai.operation.name` +`attributes.gen_ai.usage.input_tokens` +`status.code` +`events.attributes.exception.type` +`@timestamp` +``` + +This is critical for OTel attribute fields which use dotted naming conventions. + +## API Endpoints + +### Query Endpoint + +Execute a PPL query: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() by serviceName"}' +``` + +Request body: `{"query": ""}` + +### Explain Endpoint + +Retrieve the query execution plan (useful for debugging and profiling): + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl/_explain" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `status.code` = 2 | stats count() by serviceName"}' +``` + +--- + +## Commands + +### Core Query Commands + +#### search / source + +Start a query by specifying the data source index pattern. + +**Syntax**: `search source=` or `source=` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | head 10"}' +``` + +#### where + +Filter results based on a condition. 
+ +**Syntax**: `where ` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `status.code` = 2 | head 10"}' +``` + +Supports: `=`, `!=`, `<`, `>`, `<=`, `>=`, `AND`, `OR`, `NOT`, `LIKE`, `IN`, `BETWEEN`, `IS NULL`, `IS NOT NULL`. + +#### fields + +Select specific fields to return. + +**Syntax**: `fields [+|-] ` + +Use `+` to include or `-` to exclude fields. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | fields traceId, spanId, serviceName, durationInNanos | head 10"}' +``` + +#### stats + +Aggregate data using statistical functions. + +**Syntax**: `stats ... [by ]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() as span_count, avg(durationInNanos) as avg_duration by serviceName"}' +``` + +Supports: `count()`, `sum()`, `avg()`, `max()`, `min()`, `var_samp()`, `var_pop()`, `stddev_samp()`, `stddev_pop()`, `distinct_count()`, `percentile()`, `earliest()`, `latest()`, `list()`, `values()`, `first()`, `last()`. + +#### sort + +Order results by one or more fields. + +**Syntax**: `sort [+|-] [, ...]` + +Use `+` for ascending (default), `-` for descending. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | sort - durationInNanos | head 10"}' +``` + +#### head + +Limit the number of results returned. 
+ +**Syntax**: `head [N]` (default N=10) + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | head 5"}' +``` + +#### eval + +Compute new fields from expressions. + +**Syntax**: `eval = [, ...]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eval duration_ms = durationInNanos / 1000000 | fields traceId, serviceName, duration_ms | sort - duration_ms | head 10"}' +``` + +#### dedup + +Remove duplicate results based on field values. + +**Syntax**: `dedup [N] [keepempty=] [consecutive=]` + +> **Caveat:** `dedup` may throw a ClassCastException on fields with mixed types (e.g., a field that contains both strings and numbers across documents). Ensure the dedup field has a consistent type. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | dedup serviceName | fields serviceName"}' +``` + +#### rename + +Rename one or more fields. + +**Syntax**: `rename AS [, ...]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | rename serviceName as service, durationInNanos as duration | fields traceId, service, duration | head 10"}' +``` + +#### top + +Find the most frequent values for a field. 
+ +**Syntax**: `top [N] [by ]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | top 5 serviceName"}' +``` + +#### rare + +Find the least frequent values for a field. + +**Syntax**: `rare [by ]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | rare `attributes.gen_ai.operation.name`"}' +``` + +#### table + +Display results in tabular format (alias for fields in some contexts). + +**Syntax**: `table ` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `status.code` = 2 | table traceId, spanId, serviceName, name | head 10"}' +``` + +### Time-Series Commands + +#### timechart + +Aggregate data into time buckets for time-series visualization. + +**Syntax**: `timechart span= ... [by ]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | timechart span=5m count() as span_count by serviceName"}' +``` + +Rate functions for timechart: `per_second()`, `per_minute()`, `per_hour()`, `per_day()` — compute rate of an aggregation per time unit. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | timechart span=1h per_minute(count()) as spans_per_min by serviceName"}' +``` + +#### chart + +General charting command for aggregation over arbitrary fields. + +**Syntax**: `chart ... 
by [span(, )]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | chart avg(durationInNanos) by serviceName"}' +``` + +#### bin + +Bucket numeric or date values into intervals. + +**Syntax**: `eval = bin(, )` or used within `stats ... by span(, )` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() by span(durationInNanos, 1000000000)"}' +``` + +#### trendline + +Calculate moving averages over sorted data. + +**Syntax**: `trendline [sort ] sma(, ) [as ]` + +SMA = Simple Moving Average. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | trendline sort startTime sma(10, durationInNanos) as avg_duration | fields startTime, durationInNanos, avg_duration | head 50"}' +``` + +#### streamstats + +Compute running (cumulative) statistics over ordered results. + +**Syntax**: `streamstats ... [by ]` + +> **Caveat:** `streamstats` processes all matching rows in memory. On large indices, this will fail with "insufficient resources" errors. Always add `| head N` before `streamstats` to limit data volume. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | sort startTime | streamstats count() as running_count, sum(`attributes.gen_ai.usage.input_tokens`) as cumulative_tokens | fields startTime, running_count, cumulative_tokens | head 50"}' +``` + +#### eventstats + +Add aggregation results as new fields to each row without collapsing rows (unlike `stats`). 
+ +**Syntax**: `eventstats ... [by ]` + +> **Caveat:** `eventstats` processes all matching rows in memory. On large indices, this will fail with "insufficient resources" errors. Always add `| head N` before `eventstats` to limit data volume. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eventstats avg(durationInNanos) as avg_svc_duration by serviceName | eval deviation = durationInNanos - avg_svc_duration | fields traceId, serviceName, durationInNanos, avg_svc_duration, deviation | sort - deviation | head 20"}' +``` + +### Parse/Extract Commands + +#### parse + +Extract fields from text using a regular expression with named groups. + +**Syntax**: `parse ''` + +> **Caveat:** `parse` may silently drop extracted fields on some OpenSearch versions. If extracted fields are missing from results, use `grok` or `rex` as more reliable alternatives. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | parse body '\''(?P\\w+): (?P.+)'\'' | fields level, msg | head 10"}' +``` + +#### grok + +Extract fields using Grok patterns (predefined regex patterns). + +**Syntax**: `grok ''` + +> **Caveat:** `grok` processes all matching rows in memory. On large indices, this will fail with "insufficient resources" errors. Always add `| head N` before `grok` to limit data volume. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | grok body '\''%{LOGLEVEL:level} %{GREEDYDATA:message}'\'' | fields level, message | head 10"}' +``` + +#### rex + +Extract fields using named capture groups (similar to parse but with Splunk-compatible syntax). 
+ +**Syntax**: `rex field= ''` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | rex field=body '\''(?\\d{3})'\'' | fields statuscode, body | head 10"}' +``` + +#### regex + +Filter results using a regular expression match on a field. + +**Syntax**: ` = regex ''` (used within `where`) + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where body like '\''%error%'\'' | fields traceId, body, severityText | head 10"}' +``` + +#### patterns + +Auto-discover log patterns by clustering similar log messages. + +**Syntax**: `patterns [pattern=''] [new_field=]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | patterns body | fields body, patterns_field | head 20"}' +``` + +#### spath + +Extract values from structured data (JSON, XML) using path expressions. + +**Syntax**: `spath input= [path=] [output=]` + +> **Note:** Verify the target field exists in your index before using `spath`. Run `describe ` first to confirm the field name. The example below uses a representative field; adjust to match your actual schema. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where isnotnull(`attributes.gen_ai.tool.name`) | spath input=`attributes.gen_ai.tool.name` | head 10"}' +``` + +### Join/Lookup Commands + +#### join + +Join results from two indices. + +**Syntax**: `join left= right= ON ` or `join on ` + +Types: `inner`, `left`, `right`, `cross`. 
+ +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | join left=s right=l ON s.traceId = l.traceId logs-otel-v1-* | fields s.spanId, s.name, l.severityText, l.body | head 10"}' +``` + +#### lookup + +Enrich results by looking up values from another index. + +**Syntax**: `lookup [AS ] [OUTPUT ]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | lookup otel-v2-apm-service-map serviceName AS `sourceNode` | fields serviceName, `targetNode`, durationInNanos | head 10"}' +``` + +> **Note:** The service map index (`otel-v2-apm-service-map`) uses nested fields `sourceNode` and `targetNode`, not `serviceName`. Match accordingly when joining or looking up against this index. + +#### graphlookup + +Perform graph traversal lookups for hierarchical or connected data. + +**Syntax**: `graphlookup connectFromField= connectToField= [maxDepth=] [as ]` + +> **Caveat:** `graphlookup` has limited support in OpenSearch 3.x PPL and may not work as expected. Test carefully against your OpenSearch version before relying on this command in production queries. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v2-apm-service-map | graphlookup otel-v2-apm-service-map connectFromField=`destination.domain` connectToField=serviceName maxDepth=3 as dependencies | head 10"}' +``` + +#### subquery + +Use a nested query as a data source or filter. + +**Syntax**: `where IN [ source= | ... 
| fields ]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId IN [ source=otel-v1-apm-span-* | where `status.code` = 2 | fields traceId ] | fields traceId, spanId, serviceName, name | head 20"}' +``` + +#### append + +Append results from another query to the current result set. + +**Syntax**: `append [ source= | ... ]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() as cnt by serviceName | append [ source=logs-otel-v1-* | stats count() as cnt by `resource.attributes.service.name` ] | head 20"}' +``` + +#### appendcol + +Append columns from another query to the current result set. + +**Syntax**: `appendcol [ ]` + +> **Caveat:** `source=` is not valid inside `appendcol []` subqueries. The subquery inside `appendcol` operates on the current result set, not a new index. Use `append` followed by reshaping if you need to bring in data from another index. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() as span_count | appendcol [ source=logs-otel-v1-* | stats count() as log_count ]"}' +``` + +#### appendpipe + +Append the results of a sub-pipeline to the current results. + +**Syntax**: `appendpipe [ ]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() as cnt by serviceName | appendpipe [ stats sum(cnt) as total ]"}' +``` + +### Transform Commands + +#### fillnull + +Replace null values with a specified value. 
+ +**Syntax**: `fillnull [with <value>] [<field-list>]` + +> **Caveat:** Backtick-quoted field names are not supported in the `fillnull` field list. Use `eval` to rename dotted fields to simple names before applying `fillnull`, or apply `fillnull` without a field list to fill all null fields. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eval input_tokens = `attributes.gen_ai.usage.input_tokens`, output_tokens = `attributes.gen_ai.usage.output_tokens` | fillnull with 0 input_tokens, output_tokens | fields traceId, input_tokens, output_tokens | head 10"}' +``` + +#### flatten + +Flatten nested fields into top-level fields. + +**Syntax**: `flatten <field>` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | flatten events | head 10"}' +``` + +#### expand + +Expand multi-value or array fields into separate rows. + +**Syntax**: `expand <field>` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | expand events | fields traceId, spanId, events | head 20"}' +``` + +#### transpose + +Pivot rows into columns. + +**Syntax**: `transpose [<limit>] [include_null=<bool>]` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() as cnt by serviceName | transpose"}' +``` + +#### convert + +Convert field types (e.g., string to number). + +**Syntax**: `convert <function>(<field>) [as <new-field>]` + +Functions: `auto()`, `num()`, `ip()`, `ctime()`, `dur2sec()`, `mktime()`, `mstime()`, `rmcomma()`, `rmunit()`, `memk()`, `none()`. 
+ +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eval duration_str = CAST(durationInNanos AS STRING) | convert num(duration_str) as duration_num | fields traceId, duration_num | head 10"}' +``` + +#### replace + +Replace values in a field using the `replace()` string function inside `eval`. + +**Syntax**: `eval = replace(, '', '')` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | eval severityText = replace(severityText, '\''ERROR'\'', '\''ERR'\'') | fields severityText, body | head 10"}' +``` + +#### reverse + +Reverse the order of results. + +**Syntax**: `reverse` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | sort startTime | head 20 | reverse"}' +``` + +### Multi-Value Commands + +#### mvexpand + +Expand a multi-value field into separate rows (one row per value). + +**Syntax**: `mvexpand ` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | mvexpand events | fields traceId, spanId, events | head 20"}' +``` + +#### mvcombine + +Combine multiple rows with the same key into a single row with a multi-value field. 
+ +**Syntax**: `mvcombine ` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | fields traceId, serviceName | mvcombine serviceName | head 10"}' +``` + +#### nomv + +Convert a multi-value field to a single-value field (takes the first value or joins with a delimiter). + +**Syntax**: `nomv ` + +> **Caveat:** `nomv` only works on string arrays, not nested object arrays. If the field contains nested objects (e.g., `events` with sub-fields), `nomv` will fail or produce unexpected results. Use `flatten` or `expand` for nested object arrays instead. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | nomv events | fields traceId, events | head 10"}' +``` + +### Aggregation/Totals Commands + +#### addcoltotals + +Add a summary row at the bottom with column totals. + +**Syntax**: `addcoltotals []` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() as cnt by serviceName | addcoltotals"}' +``` + +#### addtotals + +Add a new field to each row containing the sum of specified numeric fields. + +**Syntax**: `addtotals [row=] [col=] []` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats sum(`attributes.gen_ai.usage.input_tokens`) as input_tok, sum(`attributes.gen_ai.usage.output_tokens`) as output_tok by serviceName | addtotals"}' +``` + +### ML Commands + +#### ad + +Anomaly detection — identify anomalous values in numeric fields using built-in ML algorithms. 
+ +**Syntax**: `ad [time_field=] [number_of_trees=] [shingle_size=] [time_zone=]` + +> **Note:** The `ad` command does not take a positional field argument. It auto-detects the input field(s) from the preceding `stats` or `eval` output in the pipeline. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where durationInNanos > 0 | ad time_field=startTime number_of_trees=100 time_zone=\"UTC\" | head 50"}' +``` + +#### kmeans + +Cluster data points using the k-means algorithm. + +**Syntax**: `kmeans [centroids=] [iterations=] [distance_type=]` + +> **Note:** The `kmeans` command does not take positional field arguments. It operates on all numeric fields from the preceding pipeline output. Use `fields` or `eval` before `kmeans` to control which numeric fields are used for clustering. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where durationInNanos > 0 | fields traceId, serviceName, durationInNanos | kmeans centroids=3 | fields traceId, serviceName, durationInNanos, ClusterID | head 30"}' +``` + +#### ml + +General ML command for running machine learning algorithms on query results. + +**Syntax**: `ml action= [parameters...]` + +Supported algorithms include: `kmeans`, `ad` (anomaly detection). + +> **Note:** `ml action=rcf` is not a valid action in OpenSearch 3.x PPL. Random Cut Forest anomaly detection is accessed via the `ad` command directly (see the `ad` section above), not through `ml action=rcf`. 
+ +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where durationInNanos > 0 | ml action=kmeans centroids=3 | head 50"}' +``` + +### System Commands + +#### describe + +Inspect the index mapping and field types for an index. + +**Syntax**: `describe ` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "describe otel-v1-apm-span-*"}' +``` + +#### explain + +Show the query execution plan (used via the `_explain` API endpoint rather than as an inline command). + +**Syntax**: Use the `/_plugins/_ppl/_explain` endpoint with the query body. + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl/_explain" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `status.code` = 2 | stats count() by serviceName"}' +``` + +#### showdatasources + +List all configured data sources available for PPL queries. + +**Syntax**: `show datasources` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "show datasources"}' +``` + +#### multisearch + +Execute multiple PPL queries in a single request. Each query is independent. + +**Syntax**: Use the `/_plugins/_ppl` endpoint with multiple queries separated by newlines (NDJSON format), or execute sequentially. 
+ +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() as total_spans"}' +``` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | stats count() as total_logs"}' +``` + +### Display Commands + +#### fieldformat + +Format the display of a field's values without changing the underlying data. + +**Syntax**: `fieldformat = ` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eval duration_ms = durationInNanos / 1000000 | fieldformat duration_ms = CONCAT(CAST(duration_ms AS STRING), '\'' ms'\'') | fields traceId, serviceName, duration_ms | head 10"}' +``` + +--- + +## Span Expression Syntax + +The `span()` function buckets numeric or datetime values into intervals. Used with `stats`, `timechart`, and `chart`. 
+
+**Syntax**: `span(<field>, <interval>)`
+
+### Supported Time Units
+
+| Unit | Description | Example |
+|------|-------------|---------|
+| `ms` | Milliseconds | `span(startTime, 500ms)` |
+| `s` | Seconds | `span(startTime, 30s)` |
+| `m` | Minutes | `span(startTime, 5m)` |
+| `h` | Hours | `span(startTime, 1h)` |
+| `d` | Days | `span(startTime, 1d)` |
+| `w` | Weeks | `span(startTime, 1w)` |
+| `M` | Months | `span(startTime, 1M)` |
+| `q` | Quarters | `span(startTime, 1q)` |
+| `y` | Years | `span(startTime, 1y)` |
+
+### Numeric Spans
+
+For numeric fields, the interval is a plain number:
+
+```
+stats count() by span(durationInNanos, 1000000000)
+```
+
+### Example
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=otel-v1-apm-span-* | stats count() as span_count, avg(durationInNanos) as avg_duration by span(startTime, 1h)"}'
+```
+
+## Timechart Rate Functions
+
+Rate functions normalize aggregation values to a per-time-unit rate within `timechart`:
+
+| Function | Description |
+|----------|-------------|
+| `per_second()` | Aggregation value per second |
+| `per_minute()` | Aggregation value per minute |
+| `per_hour()` | Aggregation value per hour |
+| `per_day()` | Aggregation value per day |
+
+### Example
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=otel-v1-apm-span-* | timechart span=5m per_second(count()) as requests_per_sec"}'
+```
+
+---
+
+## Functions
+
+### Aggregation Functions
+
+Used with `stats`, `eventstats`, `streamstats`, `timechart`, and `chart` commands. 
+ +| Function | Syntax | Description | +|----------|--------|-------------| +| `COUNT` | `count()` | Count of events | +| `SUM` | `sum(field)` | Sum of numeric values | +| `AVG` | `avg(field)` | Arithmetic mean | +| `MAX` | `max(field)` | Maximum value | +| `MIN` | `min(field)` | Minimum value | +| `VAR_SAMP` | `var_samp(field)` | Sample variance | +| `VAR_POP` | `var_pop(field)` | Population variance | +| `STDDEV_SAMP` | `stddev_samp(field)` | Sample standard deviation | +| `STDDEV_POP` | `stddev_pop(field)` | Population standard deviation | +| `DISTINCT_COUNT` | `distinct_count(field)` | Count of distinct values | +| `PERCENTILE` | `percentile(field, pct)` | Value at the given percentile | +| `EARLIEST` | `earliest(field)` | Earliest (first chronological) value | +| `LATEST` | `latest(field)` | Latest (most recent) value | +| `LIST` | `list(field)` | All values as a list | +| `VALUES` | `values(field)` | Distinct values as a list | +| `FIRST` | `first(field)` | First value encountered | +| `LAST` | `last(field)` | Last value encountered | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() as total, avg(durationInNanos) as avg_ns, percentile(durationInNanos, 95) as p95_ns, distinct_count(serviceName) as services"}' +``` + +### Collection Functions + +Functions for working with multi-value fields and arrays. 
+ +| Function | Syntax | Description | +|----------|--------|-------------| +| `ARRAY` | `array(val1, val2, ...)` | Create an array from values | +| `SPLIT` | `split(field, delimiter)` | Split a string into an array | +| `MVJOIN` | `mvjoin(field, delimiter)` | Join multi-value field into a string | +| `MVCOUNT` | `mvcount(field)` | Count of values in a multi-value field | +| `MVINDEX` | `mvindex(field, index)` | Get value at index from multi-value field | +| `MVFIRST` | `mvfirst(field)` | First value of a multi-value field | +| `MVLAST` | `mvlast(field)` | Last value of a multi-value field | +| `MVAPPEND` | `mvappend(field1, field2)` | Append two multi-value fields | +| `MVDEDUP` | `mvdedup(field)` | Remove duplicates from multi-value field | +| `MVSORT` | `mvsort(field)` | Sort values in a multi-value field | +| `MVZIP` | `mvzip(field1, field2, delim)` | Zip two multi-value fields together | +| `MVRANGE` | `mvrange(start, end, step)` | Generate a range of numeric values | +| `MVFILTER` | `mvfilter(expression)` | Filter values in a multi-value field | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eval tokens = array(`attributes.gen_ai.usage.input_tokens`, `attributes.gen_ai.usage.output_tokens`) | fields traceId, tokens | head 10"}' +``` + +### Condition Functions + +Functions for conditional logic and null handling. 
+ +| Function | Syntax | Description | +|----------|--------|-------------| +| `ISNULL` | `isnull(field)` | Returns true if field is null | +| `ISNOTNULL` | `isnotnull(field)` | Returns true if field is not null | +| `IF` | `if(cond, true_val, false_val)` | Conditional expression | +| `IFNULL` | `ifnull(field, default)` | Return default if field is null | +| `NULLIF` | `nullif(val1, val2)` | Return null if val1 equals val2 | +| `CASE` | `case(cond1, val1, cond2, val2, ..., else_val)` | Multi-branch conditional | +| `COALESCE` | `coalesce(val1, val2, ...)` | First non-null value | +| `LIKE` | `field LIKE 'pattern'` | Wildcard pattern match (`%` and `_`) | +| `IN` | `field IN (val1, val2, ...)` | Check membership in a set | +| `BETWEEN` | `field BETWEEN val1 AND val2` | Range check (inclusive) | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eval status_label = case(`status.code` = 0, '\''UNSET'\'', `status.code` = 1, '\''OK'\'', `status.code` = 2, '\''ERROR'\'') | stats count() by status_label"}' +``` + +### Conversion Functions + +Functions for type casting and conversion. 
+ +| Function | Syntax | Description | +|----------|--------|-------------| +| `CAST` | `cast(field AS type)` | Cast to a specified type (STRING, INT, LONG, FLOAT, DOUBLE, BOOLEAN, DATE, TIMESTAMP) | +| `TOSTRING` | `tostring(field)` | Convert to string | +| `TONUMBER` | `tonumber(field)` | Convert to number | +| `TOINT` | `toint(field)` | Convert to integer | +| `TOLONG` | `tolong(field)` | Convert to long | +| `TOFLOAT` | `tofloat(field)` | Convert to float | +| `TODOUBLE` | `todouble(field)` | Convert to double | +| `TOBOOLEAN` | `toboolean(field)` | Convert to boolean | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eval duration_ms = CAST(durationInNanos AS DOUBLE) / 1000000.0 | fields traceId, serviceName, duration_ms | sort - duration_ms | head 10"}' +``` + +### Cryptographic Functions + +Functions for computing hash digests. + +| Function | Syntax | Description | +|----------|--------|-------------| +| `MD5` | `md5(field)` | MD5 hash of the value | +| `SHA1` | `sha1(field)` | SHA-1 hash of the value | +| `SHA2` | `sha2(field, numBits)` | SHA-2 hash (numBits: 224, 256, 384, 512) | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eval trace_hash = md5(traceId) | fields traceId, trace_hash | head 5"}' +``` + +### Datetime Functions + +Functions for date and time manipulation. 
+ +| Function | Syntax | Description | +|----------|--------|-------------| +| `NOW` | `now()` | Current timestamp | +| `CURDATE` | `curdate()` | Current date | +| `CURTIME` | `curtime()` | Current time | +| `DATE_FORMAT` | `date_format(date, fmt)` | Format a date (`%Y-%m-%d %H:%i:%s`) | +| `DATE_ADD` | `date_add(date, INTERVAL n unit)` | Add interval to date | +| `DATE_SUB` | `date_sub(date, INTERVAL n unit)` | Subtract interval from date | +| `DATEDIFF` | `datediff(date1, date2)` | Difference in days between two dates | +| `DAY` | `day(date)` | Day of month (1–31) | +| `MONTH` | `month(date)` | Month (1–12) | +| `YEAR` | `year(date)` | Year | +| `HOUR` | `hour(time)` | Hour (0–23) | +| `MINUTE` | `minute(time)` | Minute (0–59) | +| `SECOND` | `second(time)` | Second (0–59) | +| `DAYOFWEEK` | `dayofweek(date)` | Day of week (1=Sun, 7=Sat) | +| `DAYOFYEAR` | `dayofyear(date)` | Day of year (1–366) | +| `WEEK` | `week(date)` | Week number of the year | +| `UNIX_TIMESTAMP` | `unix_timestamp(date)` | Convert to Unix epoch seconds | +| `FROM_UNIXTIME` | `from_unixtime(epoch)` | Convert Unix epoch to timestamp | +| `TIMESTAMPADD` | `timestampadd(unit, n, ts)` | Add interval to timestamp | +| `TIMESTAMPDIFF` | `timestampdiff(unit, ts1, ts2)` | Difference between timestamps in given unit | +| `PERIOD_ADD` | `period_add(period, n)` | Add months to a period (YYMM/YYYYMM) | +| `PERIOD_DIFF` | `period_diff(p1, p2)` | Difference in months between periods | +| `MAKETIME` | `maketime(h, m, s)` | Create a time value | +| `MAKEDATE` | `makedate(year, dayofyear)` | Create a date from year and day-of-year | +| `ADDDATE` | `adddate(date, INTERVAL n unit)` | Alias for DATE_ADD | +| `SUBDATE` | `subdate(date, INTERVAL n unit)` | Alias for DATE_SUB | +| `SYSDATE` | `sysdate()` | Current date and time (evaluated at execution) | +| `UTC_DATE` | `utc_date()` | Current UTC date | +| `UTC_TIME` | `utc_time()` | Current UTC time | +| `UTC_TIMESTAMP` | `utc_timestamp()` | Current UTC timestamp 
| + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where startTime > DATE_SUB(NOW(), INTERVAL 1 HOUR) | stats count() as recent_spans by serviceName"}' +``` + +### Expressions + +Operators for arithmetic, comparison, and logical expressions used in `eval`, `where`, and other commands. + +#### Arithmetic Operators + +| Operator | Description | Example | +|----------|-------------|---------| +| `+` | Addition | `eval total = input_tokens + output_tokens` | +| `-` | Subtraction | `eval gap = endTime - startTime` | +| `*` | Multiplication | `eval cost = tokens * price_per_token` | +| `/` | Division | `eval duration_ms = durationInNanos / 1000000` | + +#### Comparison Operators + +| Operator | Description | +|----------|-------------| +| `=` | Equal to | +| `!=` or `<>` | Not equal to | +| `<` | Less than | +| `>` | Greater than | +| `<=` | Less than or equal to | +| `>=` | Greater than or equal to | + +#### Logical Operators + +| Operator | Description | +|----------|-------------| +| `AND` | Logical AND | +| `OR` | Logical OR | +| `NOT` | Logical NOT | +| `XOR` | Logical exclusive OR | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eval duration_ms = durationInNanos / 1000000, total_tokens = `attributes.gen_ai.usage.input_tokens` + `attributes.gen_ai.usage.output_tokens` | where duration_ms > 1000 AND total_tokens > 0 | fields traceId, serviceName, duration_ms, total_tokens | head 10"}' +``` + +### IP Functions + +Functions for IP address operations. 
+ +| Function | Syntax | Description | +|----------|--------|-------------| +| `CIDRMATCH` | `cidrmatch(ip_field, 'cidr')` | Check if IP is within a CIDR range | +| `GEOIP` | `geoip(ip_field)` | Geo-locate an IP address (returns country, region, city, lat/lon) | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where isnotnull(`attributes.net.peer.ip`) | where cidrmatch(`attributes.net.peer.ip`, '\''10.0.0.0/8'\'') | fields traceId, `attributes.net.peer.ip`, serviceName | head 10"}' +``` + +### JSON Functions + +Functions for working with JSON data. + +| Function | Syntax | Description | +|----------|--------|-------------| +| `JSON_EXTRACT` | `json_extract(field, path)` | Extract value at JSON path | +| `JSON_KEYS` | `json_keys(field)` | Get all keys from a JSON object | +| `JSON_VALID` | `json_valid(field)` | Check if value is valid JSON | +| `JSON_ARRAY` | `json_array(val1, val2, ...)` | Create a JSON array | +| `JSON_OBJECT` | `json_object(key1, val1, ...)` | Create a JSON object | +| `JSON_ARRAY_LENGTH` | `json_array_length(field)` | Length of a JSON array | +| `JSON_EXTRACT_PATH_TEXT` | `json_extract_path_text(field, path)` | Extract value as text from JSON path | +| `TO_JSON_STRING` | `to_json_string(field)` | Convert value to JSON string | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where json_valid(`attributes.gen_ai.tool.call.arguments`) | eval tool_args = json_extract(`attributes.gen_ai.tool.call.arguments`, '\''$'\'') | fields traceId, `attributes.gen_ai.tool.name`, tool_args | head 10"}' +``` + +### Math Functions + +Functions for mathematical operations. 
+ +| Function | Syntax | Description | +|----------|--------|-------------| +| `ABS` | `abs(val)` | Absolute value | +| `CEIL` | `ceil(val)` | Round up to nearest integer | +| `FLOOR` | `floor(val)` | Round down to nearest integer | +| `ROUND` | `round(val [, decimals])` | Round to N decimal places | +| `SQRT` | `sqrt(val)` | Square root | +| `POW` | `pow(base, exp)` | Exponentiation | +| `MOD` | `mod(a, b)` | Modulo (remainder) | +| `LOG` | `log(val)` | Natural logarithm | +| `LOG2` | `log2(val)` | Base-2 logarithm | +| `LOG10` | `log10(val)` | Base-10 logarithm | +| `LN` | `ln(val)` | Natural logarithm (alias for LOG) | +| `EXP` | `exp(val)` | e raised to the power of val | +| `SIGN` | `sign(val)` | Sign of value (-1, 0, 1) | +| `TRUNCATE` | `truncate(val, decimals)` | Truncate to N decimal places | +| `PI` | `pi()` | Value of π | +| `E` | `e()` | Value of Euler's number | +| `RAND` | `rand([seed])` | Random float between 0 and 1 | +| `ACOS` | `acos(val)` | Arc cosine | +| `ASIN` | `asin(val)` | Arc sine | +| `ATAN` | `atan(val)` | Arc tangent | +| `ATAN2` | `atan2(y, x)` | Two-argument arc tangent | +| `COS` | `cos(val)` | Cosine | +| `SIN` | `sin(val)` | Sine | +| `TAN` | `tan(val)` | Tangent | +| `COT` | `cot(val)` | Cotangent | +| `DEGREES` | `degrees(radians)` | Convert radians to degrees | +| `RADIANS` | `radians(degrees)` | Convert degrees to radians | +| `CONV` | `conv(val, from_base, to_base)` | Convert between number bases | +| `CRC32` | `crc32(val)` | CRC-32 checksum | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eval duration_ms = round(durationInNanos / 1000000.0, 2) | where duration_ms > 0 | fields traceId, serviceName, duration_ms | sort - duration_ms | head 10"}' +``` + +### Relevance Functions + +Full-text search functions for relevance-based querying. 
+ +| Function | Syntax | Description | +|----------|--------|-------------| +| `MATCH` | `match(field, query)` | Full-text match on a single field | +| `MATCH_PHRASE` | `match_phrase(field, phrase)` | Exact phrase match | +| `MATCH_BOOL_PREFIX` | `match_bool_prefix(field, query)` | Boolean prefix match | +| `MATCH_PHRASE_PREFIX` | `match_phrase_prefix(field, prefix)` | Phrase prefix match | +| `MULTI_MATCH` | `multi_match([field1, field2], query)` | Match across multiple fields | +| `QUERY_STRING` | `query_string([field1, field2], query)` | Lucene query string syntax | +| `SIMPLE_QUERY_STRING` | `simple_query_string([field1, field2], query)` | Simplified query string | +| `HIGHLIGHT` | `highlight(field)` | Return highlighted matching fragments | +| `SCORE` | `score(relevance_func)` | Return relevance score | +| `SCOREQUERY` | `scorequery(relevance_func)` | Filter by relevance score | +| `MATCH_QUERY` | `match_query(field, query)` | Alias for MATCH | +| `WILDCARD_QUERY` | `wildcard_query(field, pattern)` | Wildcard pattern match (`*` and `?`) | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where match(body, '\''timeout error'\'') | fields traceId, severityText, body | head 10"}' +``` + +### Statistical Functions + +Functions for computing statistical correlations and covariances. + +| Function | Syntax | Description | +|----------|--------|-------------| +| `COVAR_POP` | `covar_pop(field1, field2)` | Population covariance | +| `COVAR_SAMP` | `covar_samp(field1, field2)` | Sample covariance | + +> **Note:** `corr()` is not a recognized stats aggregation function in OpenSearch 3.x PPL. To approximate Pearson correlation, use `eval` with manual calculation or compute covariance and standard deviations separately. The `covar_samp` and `covar_pop` functions are supported. 
+ +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.usage.input_tokens` > 0 | stats covar_samp(`attributes.gen_ai.usage.input_tokens`, durationInNanos) as token_duration_covar"}' +``` + +### String Functions + +Functions for string manipulation. + +| Function | Syntax | Description | +|----------|--------|-------------| +| `CONCAT` | `concat(str1, str2, ...)` | Concatenate strings | +| `LENGTH` | `length(str)` | String length in bytes | +| `LOWER` | `lower(str)` | Convert to lowercase | +| `UPPER` | `upper(str)` | Convert to uppercase | +| `TRIM` | `trim(str)` | Remove leading/trailing whitespace | +| `LTRIM` | `ltrim(str)` | Remove leading whitespace | +| `RTRIM` | `rtrim(str)` | Remove trailing whitespace | +| `SUBSTRING` | `substring(str, start [, len])` | Extract substring | +| `LEFT` | `left(str, len)` | Leftmost N characters | +| `RIGHT` | `right(str, len)` | Rightmost N characters | +| `REPLACE` | `replace(str, from, to)` | Replace occurrences | +| `REVERSE` | `reverse(str)` | Reverse a string | +| `LOCATE` | `locate(substr, str [, pos])` | Position of substring | +| `POSITION` | `position(substr IN str)` | Position of substring | +| `ASCII` | `ascii(str)` | ASCII code of first character | +| `CHAR_LENGTH` | `char_length(str)` | Character count | +| `CHARACTER_LENGTH` | `character_length(str)` | Alias for CHAR_LENGTH | +| `OCTET_LENGTH` | `octet_length(str)` | Byte count | +| `BIT_LENGTH` | `bit_length(str)` | Bit count | +| `LPAD` | `lpad(str, len, pad)` | Left-pad to length | +| `RPAD` | `rpad(str, len, pad)` | Right-pad to length | +| `SPACE` | `space(n)` | String of N spaces | +| `REPEAT` | `repeat(str, n)` | Repeat string N times | +| `STRCMP` | `strcmp(str1, str2)` | Compare strings (-1, 0, 1) | +| `SUBSTR` | `substr(str, start [, len])` | Alias for SUBSTRING | +| `MID` | `mid(str, 
start, len)` | Alias for SUBSTRING | +| `FIELD` | `field(str, val1, val2, ...)` | Index of str in value list | +| `FIND_IN_SET` | `find_in_set(str, strlist)` | Position in comma-separated list | +| `FORMAT` | `format(val, decimals)` | Format number with commas and decimals | +| `INSERT` | `insert(str, pos, len, newstr)` | Insert string at position | +| `INSTR` | `instr(str, substr)` | Position of first occurrence | +| `REGEXP` | `regexp(str, pattern)` | Regex match (returns 1 or 0) | +| `REGEXP_EXTRACT` | `regexp_extract(str, pattern [, group])` | Extract regex capture group | +| `REGEXP_REPLACE` | `regexp_replace(str, pattern, replacement)` | Replace regex matches | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | eval body_lower = lower(body) | where body_lower like '\''%exception%'\'' | eval short_body = left(body, 200) | fields traceId, severityText, short_body | head 10"}' +``` + +### System Functions + +| Function | Syntax | Description | +|----------|--------|-------------| +| `TYPEOF` | `typeof(field)` | Returns the data type of a field value | + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | eval type_of_duration = typeof(durationInNanos) | fields traceId, durationInNanos, type_of_duration | head 5"}' +``` + +--- + +## Grammar Source + +This PPL reference is sourced from the `opensearch-project/sql` repository's `docs/user/ppl/` directory. + +Repository: https://github.com/opensearch-project/sql + +The PPL grammar is maintained as part of the OpenSearch SQL plugin. For the latest syntax additions and changes, consult the repository documentation directly. 
+ +## References + +- [PPL Language Reference](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.md) — Official PPL syntax documentation. Fetch this if queries fail due to OpenSearch version differences or new syntax. diff --git a/claude-code-observability-plugin/skills/slo-sli/SKILL.md b/claude-code-observability-plugin/skills/slo-sli/SKILL.md new file mode 100644 index 00000000..ce5e0678 --- /dev/null +++ b/claude-code-observability-plugin/skills/slo-sli/SKILL.md @@ -0,0 +1,625 @@ +--- +name: slo-sli +description: SLO/SLI definitions, Prometheus recording rules, error budget calculations, and burn rate alerting for service reliability management. +allowed-tools: + - Bash + - curl +--- + +# SLO/SLI Definitions and Error Budget Management + +## Overview + +This skill provides templates for implementing Service Level Objectives (SLOs) and Service Level Indicators (SLIs) using Prometheus recording rules, error budget calculations, and burn rate alerting. It follows the Google SRE book methodology for multi-window burn rate alerts. + +All Prometheus queries use the HTTP API at `http://localhost:9090/api/v1/query`. Credentials are not required for local Prometheus (HTTP, no auth). Recording rules and alerting rules are YAML blocks that can be added to the Prometheus configuration at `docker-compose/prometheus/prometheus.yml`. + +## Connection Defaults + +| Variable | Default | Description | +|---|---|---| +| `OPENSEARCH_ENDPOINT` | `https://localhost:9200` | OpenSearch base URL | +| `OPENSEARCH_USER` | `admin` | OpenSearch username | +| `OPENSEARCH_PASSWORD` | `My_password_123!@#` | OpenSearch password | +| `PROMETHEUS_ENDPOINT` | `http://localhost:9090` | Prometheus base URL | + + +## SLI Definition Templates + +### Availability SLI + +The availability SLI measures the ratio of successful requests (non-5xx) to total requests. A value of 1.0 means all requests succeeded; 0.99 means 1% failed. 
+ +> **Note on status code labels:** The label name varies by OTel SDK version. Older semconv uses `http_status_code`; newer stable semconv uses `http_response_status_code`. Use the Metric Discovery section in the metrics skill to check which label is present, and replace `http_response_status_code` in the queries below with the variant active in your stack. + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count{http_response_status_code!~"5.."}[5m])) / sum(rate(http_server_duration_seconds_count[5m]))' +``` + +Per-service availability: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count{http_response_status_code!~"5.."}[5m])) by (service_name) / sum(rate(http_server_duration_seconds_count[5m])) by (service_name)' +``` + +### Latency SLI + +The latency SLI measures the ratio of requests completing within a threshold (e.g., 250ms) to total requests. A value of 0.95 means 95% of requests finished within the threshold. + +> **Note on latency thresholds:** The `le` bucket boundary depends on the metric's unit. For `_seconds` metrics, use `le="0.25"` for 250ms. For `_milliseconds` metrics, use `le="250"`. Use the Metric Discovery section in the metrics skill to check which metric name is active. + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_bucket{le="0.25"}[5m])) / sum(rate(http_server_duration_seconds_count[5m]))' +``` + +Per-service latency SLI with a 500ms threshold: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(http_server_duration_seconds_bucket{le="0.5"}[5m])) by (service_name) / sum(rate(http_server_duration_seconds_count[5m])) by (service_name)' +``` + +### GenAI-Specific SLI + +The GenAI SLI measures agent response time objectives using the `gen_ai_client_operation_duration_seconds` histogram. 
For example, the ratio of GenAI operations completing within 5 seconds: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(gen_ai_client_operation_duration_seconds_bucket{le="5.0"}[5m])) by (gen_ai_operation_name) / sum(rate(gen_ai_client_operation_duration_seconds_count[5m])) by (gen_ai_operation_name)' +``` + +Per-model GenAI availability (non-error operations): + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sum(rate(gen_ai_client_operation_duration_seconds_count{gen_ai_operation_name!="error"}[5m])) by (gen_ai_request_model) / sum(rate(gen_ai_client_operation_duration_seconds_count[5m])) by (gen_ai_request_model)' +``` + + +## Prometheus Recording Rules + +Recording rules pre-compute SLI values at multiple time windows so that SLO compliance queries are fast and efficient. Add these rule groups to `docker-compose/prometheus/prometheus.yml` under the `rule_files` section. + +### Recording Rule Naming Convention + +Recording rules follow the pattern: + +| Pattern | Example | +|---|---| +| `sli:http_availability:ratio_rate` | `sli:http_availability:ratio_rate5m` | +| `sli:http_latency:ratio_rate` | `sli:http_latency:ratio_rate5m` | + +Windows: `5m`, `30m`, `1h`, `6h`, `1d`, `3d`, `30d` + +### Availability Recording Rules + +```yaml +groups: + - name: sli_availability + rules: + - record: sli:http_availability:ratio_rate5m + expr: | + sum(rate(http_server_duration_seconds_count{http_response_status_code!~"5.."}[5m])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[5m])) by (service_name) + labels: + sli: availability + + - record: sli:http_availability:ratio_rate30m + expr: | + sum(rate(http_server_duration_seconds_count{http_response_status_code!~"5.."}[30m])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[30m])) by (service_name) + labels: + sli: availability + + - record: sli:http_availability:ratio_rate1h + expr: | + 
sum(rate(http_server_duration_seconds_count{http_response_status_code!~"5.."}[1h])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[1h])) by (service_name) + labels: + sli: availability + + - record: sli:http_availability:ratio_rate6h + expr: | + sum(rate(http_server_duration_seconds_count{http_response_status_code!~"5.."}[6h])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[6h])) by (service_name) + labels: + sli: availability + + - record: sli:http_availability:ratio_rate1d + expr: | + sum(rate(http_server_duration_seconds_count{http_response_status_code!~"5.."}[1d])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[1d])) by (service_name) + labels: + sli: availability + + - record: sli:http_availability:ratio_rate3d + expr: | + sum(rate(http_server_duration_seconds_count{http_response_status_code!~"5.."}[3d])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[3d])) by (service_name) + labels: + sli: availability + + - record: sli:http_availability:ratio_rate30d + expr: | + sum(rate(http_server_duration_seconds_count{http_response_status_code!~"5.."}[30d])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[30d])) by (service_name) + labels: + sli: availability +``` + +### Latency Recording Rules + +```yaml +groups: + - name: sli_latency + rules: + - record: sli:http_latency:ratio_rate5m + expr: | + sum(rate(http_server_duration_seconds_bucket{le="0.25"}[5m])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[5m])) by (service_name) + labels: + sli: latency + + - record: sli:http_latency:ratio_rate30m + expr: | + sum(rate(http_server_duration_seconds_bucket{le="0.25"}[30m])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[30m])) by (service_name) + labels: + sli: latency + + - record: sli:http_latency:ratio_rate1h + expr: | + sum(rate(http_server_duration_seconds_bucket{le="0.25"}[1h])) by (service_name) + / + 
sum(rate(http_server_duration_seconds_count[1h])) by (service_name) + labels: + sli: latency + + - record: sli:http_latency:ratio_rate6h + expr: | + sum(rate(http_server_duration_seconds_bucket{le="0.25"}[6h])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[6h])) by (service_name) + labels: + sli: latency + + - record: sli:http_latency:ratio_rate1d + expr: | + sum(rate(http_server_duration_seconds_bucket{le="0.25"}[1d])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[1d])) by (service_name) + labels: + sli: latency + + - record: sli:http_latency:ratio_rate3d + expr: | + sum(rate(http_server_duration_seconds_bucket{le="0.25"}[3d])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[3d])) by (service_name) + labels: + sli: latency + + - record: sli:http_latency:ratio_rate30d + expr: | + sum(rate(http_server_duration_seconds_bucket{le="0.25"}[30d])) by (service_name) + / + sum(rate(http_server_duration_seconds_count[30d])) by (service_name) + labels: + sli: latency +``` + +Query a recording rule value: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sli:http_availability:ratio_rate30d' +``` + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sli:http_latency:ratio_rate1h' +``` + + +## Error Budget Calculation + +### Common SLO Targets and Allowed Downtime + +| SLO Target | Error Budget | Allowed Downtime (30 days) | Allowed Downtime (per day) | +|---|---|---|---| +| 99.9% | 0.1% | 43.2 minutes | 1.44 minutes | +| 99.5% | 0.5% | 3.6 hours | 7.2 minutes | +| 99.0% | 1.0% | 7.2 hours | 14.4 minutes | + +### Remaining Error Budget + +The remaining error budget tells you what fraction of your error budget is still available. A value of 1.0 means the full budget remains; 0.0 means the budget is exhausted; negative means you've exceeded it. 
+ +Formula: `1 - (1 - SLI) / (1 - SLO_target)` + +For a 99.9% SLO target using the 30-day availability SLI: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=1 - ((1 - sli:http_availability:ratio_rate30d) / (1 - 0.999))' +``` + +For a 99.5% SLO target: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=1 - ((1 - sli:http_availability:ratio_rate30d) / (1 - 0.995))' +``` + +For a 99.0% SLO target: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=1 - ((1 - sli:http_availability:ratio_rate30d) / (1 - 0.99))' +``` + +### Error Budget Consumption Rate + +The consumption rate shows how fast the error budget is being consumed. A value of 1.0 means the budget is being consumed at exactly the expected rate; values above 1.0 mean the budget is being consumed faster than sustainable. + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=(1 - sli:http_availability:ratio_rate1h) / (1 - 0.999)' +``` + +Per-service error budget consumption over the last day: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=(1 - sli:http_availability:ratio_rate1d) / (1 - 0.999)' +``` + + +## Burn Rate Queries + +Burn rate measures how fast you are consuming your error budget relative to the SLO. A burn rate of 1.0 means you will exactly exhaust the budget by the end of the SLO window. Higher values mean faster consumption. 
+ +### Single-Window Burn Rate + +Burn rate over a 1-hour window for a 99.9% SLO: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=(1 - sli:http_availability:ratio_rate1h) / (1 - 0.999)' +``` + +Burn rate over a 6-hour window: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=(1 - sli:http_availability:ratio_rate6h) / (1 - 0.999)' +``` + +### Multi-Window Burn Rate (Google SRE Book Pattern) + +The multi-window approach uses two conditions that must both be true before alerting. This reduces false positives by requiring both a short-term spike and a sustained trend. + +#### 14.4x Fast Burn — 1h window / 6h window + +Detects severe incidents that will exhaust the entire 30-day error budget in ~2 days. Both the 1-hour and 6-hour burn rates must exceed 14.4x: + +1-hour burn rate: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=(1 - sli:http_availability:ratio_rate1h) / (1 - 0.999) > 14.4' +``` + +6-hour burn rate (confirmation window): + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=(1 - sli:http_availability:ratio_rate6h) / (1 - 0.999) > 14.4' +``` + +#### 1x Slow Burn — 3d window / 30d window + +Detects slow, sustained degradation that will exhaust the error budget by the end of the SLO window. Both the 3-day and 30-day burn rates must exceed 1x: + +3-day burn rate: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=(1 - sli:http_availability:ratio_rate3d) / (1 - 0.999) > 1' +``` + +30-day burn rate (confirmation window): + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=(1 - sli:http_availability:ratio_rate30d) / (1 - 0.999) > 1' +``` + + +## Prometheus Alerting Rules for Burn Rate + +Add these alerting rules to the Prometheus configuration to trigger alerts when burn rates exceed thresholds. These follow the multi-window pattern from the Google SRE book. 
+ +### Availability Burn Rate Alerts + +```yaml +groups: + - name: slo_burn_rate_alerts + rules: + - alert: SLOAvailabilityFastBurn + expr: | + ( + (1 - sli:http_availability:ratio_rate1h) / (1 - 0.999) > 14.4 + and + (1 - sli:http_availability:ratio_rate6h) / (1 - 0.999) > 14.4 + ) + for: 2m + labels: + severity: critical + slo: availability + annotations: + summary: "High availability burn rate detected for {{ $labels.service_name }}" + description: "Service {{ $labels.service_name }} is consuming error budget at 14.4x the sustainable rate. At this rate, the 30-day budget will be exhausted in ~2 days." + + - alert: SLOAvailabilitySlowBurn + expr: | + ( + (1 - sli:http_availability:ratio_rate3d) / (1 - 0.999) > 1 + and + (1 - sli:http_availability:ratio_rate30d) / (1 - 0.999) > 1 + ) + for: 1h + labels: + severity: warning + slo: availability + annotations: + summary: "Sustained availability degradation for {{ $labels.service_name }}" + description: "Service {{ $labels.service_name }} has a burn rate above 1x over 3 days, confirmed by the 30-day window. Error budget will be exhausted before the SLO window ends." +``` + +### Latency Burn Rate Alerts + +```yaml +groups: + - name: slo_latency_burn_rate_alerts + rules: + - alert: SLOLatencyFastBurn + expr: | + ( + (1 - sli:http_latency:ratio_rate1h) / (1 - 0.999) > 14.4 + and + (1 - sli:http_latency:ratio_rate6h) / (1 - 0.999) > 14.4 + ) + for: 2m + labels: + severity: critical + slo: latency + annotations: + summary: "High latency burn rate detected for {{ $labels.service_name }}" + description: "Service {{ $labels.service_name }} latency SLI is degrading at 14.4x the sustainable rate." 
+ + - alert: SLOLatencySlowBurn + expr: | + ( + (1 - sli:http_latency:ratio_rate3d) / (1 - 0.999) > 1 + and + (1 - sli:http_latency:ratio_rate30d) / (1 - 0.999) > 1 + ) + for: 1h + labels: + severity: warning + slo: latency + annotations: + summary: "Sustained latency degradation for {{ $labels.service_name }}" + description: "Service {{ $labels.service_name }} latency SLI burn rate exceeds 1x over 3 days." +``` + +Query active alerts: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/alerts" +``` + +Query alerting rules: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/rules" +``` + + +## SLO Compliance Reporting + +### Current SLI Value + +Query the current availability SLI over the 30-day window for all services: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sli:http_availability:ratio_rate30d' +``` + +Query the current latency SLI over the 30-day window: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sli:http_latency:ratio_rate30d' +``` + +### Target Comparison + +Check which services are meeting the 99.9% availability SLO: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sli:http_availability:ratio_rate30d >= 0.999' +``` + +Check which services are violating the SLO: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sli:http_availability:ratio_rate30d < 0.999' +``` + +### Budget Remaining per Service + +Remaining error budget for each service against a 99.9% target: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=1 - ((1 - sli:http_availability:ratio_rate30d) / (1 - 0.999))' +``` + +### Burn Rate per Service + +Current burn rate for each service over the last hour: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=(1 - sli:http_availability:ratio_rate1h) / (1 - 0.999)' +``` + +Current burn rate over the last day: + +```bash +curl -s 
"$PROMETHEUS_ENDPOINT/api/v1/query" \
+  --data-urlencode 'query=(1 - sli:http_availability:ratio_rate1d) / (1 - 0.999)'
+```
+
+
+## SLO Setup Workflow
+
+Follow these steps to implement SLO monitoring for a service:
+
+### Step 1: Define SLIs
+
+Choose the SLIs that matter for your service. Most services need at least availability and latency:
+
+- **Availability SLI**: ratio of non-5xx responses to total responses
+- **Latency SLI**: ratio of requests under a threshold (e.g., 250ms) to total requests
+
+Verify the raw metrics exist in Prometheus:
+
+```bash
+curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \
+  --data-urlencode 'query=http_server_duration_seconds_count'
+```
+
+### Step 2: Add Recording Rules
+
+Add the recording rule groups from the [Prometheus Recording Rules](#prometheus-recording-rules) section to your Prometheus configuration. This pre-computes SLI values at all required time windows (5m, 30m, 1h, 6h, 1d, 3d, 30d).
+
+Save the rules to a file (e.g., `slo-rules.yml`) and reference it in `prometheus.yml`:
+
+```yaml
+rule_files:
+  - "slo-rules.yml"
+```
+
+Reload Prometheus to pick up the new rules. Note that the `/-/reload` endpoint is only enabled when Prometheus is started with the `--web.enable-lifecycle` flag; if it is not, restart the Prometheus process instead:
+
+```bash
+curl -s -X POST "$PROMETHEUS_ENDPOINT/-/reload"
+```
+
+Verify the recording rules are loaded:
+
+```bash
+curl -s "$PROMETHEUS_ENDPOINT/api/v1/rules" | python3 -m json.tool
+```
+
+### Step 3: Set Targets
+
+Choose SLO targets based on your service requirements:
+
+| Service Tier | Availability Target | Latency Target (p99 < threshold) |
+|---|---|---|
+| Critical (user-facing) | 99.9% | 99.9% within 250ms |
+| Standard (internal) | 99.5% | 99.5% within 500ms |
+| Best-effort (batch) | 99.0% | 99.0% within 2s |
+
+### Step 4: Add Alerts
+
+Add the burn rate alerting rules from the [Prometheus Alerting Rules for Burn Rate](#prometheus-alerting-rules-for-burn-rate) section. Adjust the SLO target value in the `expr` field to match your chosen target. 
+ +Verify alerts are configured: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/api/v1/rules" | python3 -m json.tool +``` + +### Step 5: Query Compliance + +Run the compliance report queries from the [SLO Compliance Reporting](#slo-compliance-reporting) section to verify everything is working: + +```bash +# Current SLI +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=sli:http_availability:ratio_rate30d' + +# Budget remaining +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=1 - ((1 - sli:http_availability:ratio_rate30d) / (1 - 0.999))' + +# Burn rate +curl -s "$PROMETHEUS_ENDPOINT/api/v1/query" \ + --data-urlencode 'query=(1 - sli:http_availability:ratio_rate1h) / (1 - 0.999)' + +# Active alerts +curl -s "$PROMETHEUS_ENDPOINT/api/v1/alerts" +``` + + +## References + +- [PPL Language Reference](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.md) — Official PPL syntax documentation. Fetch this if queries fail due to OpenSearch version differences or new syntax. +- [Prometheus Querying Basics](https://prometheus.io/docs/prometheus/latest/querying/basics/) — PromQL syntax reference. 
+ +## AWS Managed Service Variants + +### Amazon Managed Service for Prometheus (AMP) (SigV4) + +Replace the local Prometheus endpoint and authentication with AWS SigV4 for all PromQL queries in this skill: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:aps" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + 'https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query' \ + --data-urlencode 'query=sli:http_availability:ratio_rate30d' +``` + +- Endpoint format: `https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query` +- Auth: `--aws-sigv4 "aws:amz:REGION:aps"` with `--user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY"` +- PromQL query syntax is identical between local Prometheus and Amazon Managed Prometheus; only the endpoint and authentication differ + +Error budget query via AMP: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:aps" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + 'https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query' \ + --data-urlencode 'query=1 - ((1 - sli:http_availability:ratio_rate30d) / (1 - 0.999))' +``` + +Burn rate query via AMP: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:aps" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + 'https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query' \ + --data-urlencode 'query=(1 - sli:http_availability:ratio_rate1h) / (1 - 0.999)' +``` + +For Amazon Managed Prometheus, recording rules and alerting rules are managed via the AMP Rules Management API rather than local configuration files. Use `awscurl` or the AWS CLI to upload rule groups. 
diff --git a/claude-code-observability-plugin/skills/stack-health/SKILL.md b/claude-code-observability-plugin/skills/stack-health/SKILL.md new file mode 100644 index 00000000..9e4fad58 --- /dev/null +++ b/claude-code-observability-plugin/skills/stack-health/SKILL.md @@ -0,0 +1,333 @@ +--- +name: stack-health +description: Check observability stack component health, verify data ingestion, and troubleshoot common issues. +allowed-tools: + - Bash + - curl +--- + +# Stack Health and Troubleshooting + +## Overview + +This skill provides health check commands, data verification queries, and troubleshooting guidance for the observability stack. Use it to verify that OpenSearch, Prometheus, the OTel Collector, and Data Prepper are running correctly, and to diagnose data flow problems. + +Credentials are read from the `.env` file (default: `admin` / `My_password_123!@#`). All OpenSearch curl commands use HTTPS with `-k` to skip TLS certificate verification for local development. + +## Connection Defaults + +| Variable | Default | Description | +|---|---|---| +| `OPENSEARCH_ENDPOINT` | `https://localhost:9200` | OpenSearch base URL | +| `OPENSEARCH_USER` | `admin` | OpenSearch username | +| `OPENSEARCH_PASSWORD` | `My_password_123!@#` | OpenSearch password | + +## Health Checks + +### OpenSearch Cluster Health + +Check the overall cluster status (green, yellow, or red): + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" "$OPENSEARCH_ENDPOINT/_cluster/health?pretty" +``` + +A healthy cluster returns `"status": "green"` or `"status": "yellow"` (yellow is normal for single-node development clusters). + +### Prometheus Health + +Verify Prometheus is running and healthy: + +```bash +curl -s "$PROMETHEUS_ENDPOINT/-/healthy" +``` + +Returns `Prometheus Server is Healthy.` when operational. 
+ +### OTel Collector Metrics + +Check the OpenTelemetry Collector's internal metrics to verify it is receiving and exporting telemetry: + +```bash +curl -s http://localhost:8888/metrics +``` + +Look for `otelcol_receiver_accepted_spans_total`, `otelcol_exporter_sent_spans_total`, and `otelcol_exporter_send_failed_spans_total` in the output to confirm data flow. (OTel Collector metrics use the `_total` suffix for counters.) + +### OpenSearch Index Listing + +List all indices to verify data ingestion has created the expected trace, log, and service map indices: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" "$OPENSEARCH_ENDPOINT/_cat/indices?v" +``` + +You should see indices matching `otel-v1-apm-span-*`, `logs-otel-v1-*`, and `otel-v2-apm-service-map` if data is flowing. + +## Data Verification + +### Trace Document Count + +Verify trace data exists by counting documents in the trace index: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count()"}' +``` + +### Log Document Count + +Verify log data exists by counting documents in the log index: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | stats count()"}' +``` + +A count of 0 in either query indicates no data has been ingested for that signal. See the Troubleshooting section below. + +## Docker Compose Diagnostics + +### Check Container Status + +View the status of all stack containers: + +```bash +docker compose ps +``` + +All services should show `Up` or `Up (healthy)`. If a service is restarting or exited, check its logs. 
+
+### View Service Logs
+
+View logs for a specific service:
+
+```bash
+docker compose logs <service-name>
+```
+
+### Data Prepper Logs
+
+Check Data Prepper for pipeline errors or OpenSearch connection issues:
+
+```bash
+docker compose logs data-prepper
+```
+
+### OTel Collector Logs
+
+Check the OTel Collector for receiver, processor, or exporter errors:
+
+```bash
+docker compose logs otel-collector
+```
+
+## Troubleshooting Common Failures
+
+### OpenSearch Unreachable
+
+**Symptoms**: Connection refused on port 9200, curl commands timeout or fail.
+
+**Diagnostic steps**:
+
+1. Check if the OpenSearch container is running:
+   ```bash
+   docker compose ps opensearch
+   ```
+2. Verify port 9200 is exposed and listening:
+   ```bash
+   docker compose ps | grep 9200
+   ```
+3. Check the OpenSearch health endpoint directly:
+   ```bash
+   curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" "$OPENSEARCH_ENDPOINT/_cluster/health?pretty"
+   ```
+4. Check OpenSearch container logs for startup errors:
+   ```bash
+   docker compose logs opensearch
+   ```
+5. If the container is restarting, check for memory issues — OpenSearch requires at least 512MB heap. Verify `OPENSEARCH_JAVA_OPTS` in `docker-compose.yml`.
+
+### No Data in Indices
+
+**Symptoms**: Index listing shows no `otel-v1-apm-*` indices, or document counts are 0.
+
+**Diagnostic steps**:
+
+1. Verify the OTel Collector is receiving data — check its metrics:
+   ```bash
+   curl -s http://localhost:8888/metrics | grep otelcol_receiver_accepted_spans_total
+   ```
+2. Check the Data Prepper pipeline for errors:
+   ```bash
+   docker compose logs data-prepper | grep -i error
+   ```
+3. Verify the OTLP endpoint is reachable from your application. The OTel Collector listens on:
+   - gRPC: `localhost:4317`
+   - HTTP: `localhost:4318`
+4. Send test telemetry and verify it appears:
+   ```bash
+   curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" "$OPENSEARCH_ENDPOINT/_cat/indices?v"
+   ```
+5. 
Check that Data Prepper can connect to OpenSearch — look for authentication or TLS errors in Data Prepper logs. + +### Data Prepper Pipeline Errors + +**Symptoms**: Data reaches the OTel Collector but does not appear in OpenSearch indices. + +**Diagnostic steps**: + +1. Check Data Prepper logs for pipeline processing errors: + ```bash + docker compose logs data-prepper + ``` +2. Look for OpenSearch connection failures, authentication errors, or index creation failures in the logs. +3. Verify Data Prepper is receiving data from the OTel Collector on port 21890. +4. Restart Data Prepper if configuration was changed: + ```bash + docker compose restart data-prepper + ``` + +### OTel Collector Export Failures + +**Symptoms**: Applications send telemetry but data does not reach Data Prepper or Prometheus. + +**Diagnostic steps**: + +1. Check the OTel Collector's internal metrics for export failures: + ```bash + curl -s http://localhost:8888/metrics | grep otelcol_exporter_send_failed + ``` +2. Check OTel Collector logs for exporter errors: + ```bash + docker compose logs otel-collector + ``` +3. Verify the collector can reach Data Prepper (`data-prepper:21890`) and Prometheus (`prometheus:9090`) on the Docker network. +4. Check for batch processor backpressure or memory limiter drops in the collector metrics. + +## Port Reference + +| Component | Port | Protocol | +|---|---|---| +| OpenSearch | 9200 | HTTPS | +| OTel Collector (gRPC) | 4317 | gRPC | +| OTel Collector (HTTP) | 4318 | HTTP | +| Data Prepper | 21890 | HTTP | +| Prometheus | 9090 | HTTP | +| OpenSearch Dashboards | 5601 | HTTP | + +## PPL Diagnostic Commands + +### Describe Index Mappings + +Use the PPL `describe` command to inspect the field mappings and types of an index. 
This is useful for verifying which fields are available for querying: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "describe otel-v1-apm-span-*"}' +``` + +### Explain Query Execution Plan + +Use the PPL `_explain` endpoint to debug query execution plans. This shows how OpenSearch will execute a PPL query without actually running it: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl/_explain" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | head 10"}' +``` + +This is useful for diagnosing slow queries, understanding how filters are applied, and verifying that field names resolve correctly. + +## Dynamic Index Discovery + +### List All Observability Indices + +Discover which observability indices exist and their sizes: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OPENSEARCH_ENDPOINT/_cat/indices/otel-*,logs-otel-*?format=json&h=index,health,docs.count,store.size&s=index" +``` + +### Get Index Field Mappings + +Discover available fields in each index dynamically instead of relying on hardcoded field names: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OPENSEARCH_ENDPOINT/otel-v1-apm-span-*/_mapping?pretty" +``` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + "$OPENSEARCH_ENDPOINT/logs-otel-v1-*/_mapping?pretty" +``` + +### PPL Describe for Field Discovery + +Use PPL `describe` to list all fields and types in an index: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "describe otel-v1-apm-span-000001"}' +``` + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ 
+ -d '{"query": "describe logs-otel-v1-000001"}' +``` + +## References + +- [PPL Language Reference](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.md) — Official PPL syntax documentation. Fetch this if queries fail due to OpenSearch version differences or new syntax. + +## AWS Managed Variants + +### Amazon OpenSearch Service Health Check + +Replace the local endpoint and authentication with AWS SigV4: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:es" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + https://DOMAIN-ID.REGION.es.amazonaws.com/_cluster/health?pretty +``` + +Index listing on AWS managed OpenSearch: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:es" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + https://DOMAIN-ID.REGION.es.amazonaws.com/_cat/indices?v +``` + +- Endpoint format: `https://DOMAIN-ID.REGION.es.amazonaws.com` +- Auth: `--aws-sigv4 "aws:amz:REGION:es"` with `--user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY"` +- No `-k` flag needed — AWS managed endpoints use valid TLS certificates + +### Amazon Managed Service for Prometheus Health + +Check Prometheus health on Amazon Managed Service for Prometheus (AMP): + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:aps" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query \ + --data-urlencode 'query=up' +``` + +- Endpoint format: `https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query` +- Auth: `--aws-sigv4 "aws:amz:REGION:aps"` with `--user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY"` +- PromQL query syntax is identical to local Prometheus; only the endpoint and authentication differ diff --git a/claude-code-observability-plugin/skills/traces/SKILL.md b/claude-code-observability-plugin/skills/traces/SKILL.md new file mode 100644 index 00000000..4ac3bfb3 --- /dev/null +++ b/claude-code-observability-plugin/skills/traces/SKILL.md @@ -0,0 +1,536 
@@ +--- +name: traces +description: Query and investigate trace data from OpenSearch using PPL for agent invocations, tool executions, errors, latency, and token usage analysis. +allowed-tools: + - Bash + - curl +--- + +# Trace Querying with PPL + +## Overview + +This skill provides PPL (Piped Processing Language) query templates for investigating trace data stored in OpenSearch. Traces are stored in the `otel-v1-apm-span-*` index pattern and service dependency maps in `otel-v2-apm-service-map`. All queries use the OpenSearch PPL API at `/_plugins/_ppl` with HTTPS and basic authentication. + +Credentials are read from the `.env` file (default: `admin` / `My_password_123!@#`). All curl commands use `-k` to skip TLS certificate verification for local development. + +## Connection Defaults + +All commands below use these variables. Set them in your environment or use the defaults: + +| Variable | Default | Description | +|---|---|---| +| `OPENSEARCH_ENDPOINT` | `https://localhost:9200` | OpenSearch base URL | +| `OPENSEARCH_USER` | `admin` | OpenSearch username | +| `OPENSEARCH_PASSWORD` | `My_password_123!@#` | OpenSearch password | + +## Base Command + +All PPL queries in this skill use this curl pattern: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": ""}' +``` + +The examples below show the full command for clarity, but only the PPL query varies. 
+ +## Agent Invocation Spans + +Query all spans where a GenAI agent was invoked: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''invoke_agent'\'' | fields traceId, spanId, `attributes.gen_ai.agent.name`, `attributes.gen_ai.request.model`, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +## Tool Execution Spans + +Query all spans where a tool was executed: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''execute_tool'\'' | fields traceId, spanId, `attributes.gen_ai.tool.name`, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +## Slow Spans + +Identify spans exceeding a latency threshold. The default threshold is 5 seconds (5,000,000,000 nanoseconds). 
Adjust the value as needed: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where durationInNanos > 5000000000 | fields traceId, spanId, serviceName, name, durationInNanos, startTime | sort - durationInNanos | head 20"}' +``` + +To find slow agent invocations specifically: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''invoke_agent'\'' AND durationInNanos > 5000000000 | fields traceId, `attributes.gen_ai.agent.name`, durationInNanos | sort - durationInNanos"}' +``` + +## Error Spans + +Query spans with error status (`status.code` = 2 means ERROR in OTel): + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `status.code` = 2 | fields traceId, spanId, serviceName, name, `status.code`, startTime | sort - startTime | head 20"}' +``` + +## Token Usage by Model + +Aggregate input and output token usage grouped by the requested model: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.usage.input_tokens` > 0 | stats sum(`attributes.gen_ai.usage.input_tokens`) as total_input, sum(`attributes.gen_ai.usage.output_tokens`) as total_output by `attributes.gen_ai.request.model`"}' +``` + +## Token Usage by Agent + +Aggregate token usage grouped by agent name: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ 
+ -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.usage.input_tokens` > 0 | stats sum(`attributes.gen_ai.usage.input_tokens`) as total_input, sum(`attributes.gen_ai.usage.output_tokens`) as total_output by `attributes.gen_ai.agent.name`"}' +``` + +## Service Operations Listing + +List distinct services and their GenAI operation types with counts: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | stats count() by serviceName, `attributes.gen_ai.operation.name`"}' +``` + +## Service Map Queries + +> **Important:** The `sourceNode`, `targetNode`, `sourceOperation`, and `targetOperation` fields in the `otel-v2-apm-service-map-*` index are **nested struct objects**, not flat strings. Each node has the structure: +> ```json +> { +> "keyAttributes": { "name": "frontend", "environment": "generic:default" }, +> "groupByAttributes": { "telemetry": { "sdk": { "language": "python" } } }, +> "type": "service" +> } +> ``` +> PPL returns these as JSON objects. When aggregating by node (e.g., `stats ... by sourceNode`), PPL groups by the full struct which may produce null aggregations. To extract the service name, read the `sourceNode` field and parse the `keyAttributes.name` from the returned JSON. + +### Service Topology (Node Connections) + +Query the service dependency map to explore service-to-service connections. 
Use `dedup nodeConnectionHash` to get unique connections:
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=otel-v2-apm-service-map-* | dedup nodeConnectionHash | fields sourceNode, targetNode, sourceOperation, targetOperation"}'
+```
+
+### Service Operations from Service Map
+
+List all operation-level connections recorded in the service map. The query returns connections for every service; to focus on a single service, match its name inside the returned `sourceNode.keyAttributes.name` JSON:
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=otel-v2-apm-service-map-* | dedup operationConnectionHash | fields sourceNode, sourceOperation, targetNode, targetOperation"}'
+```
+
+### Dependency Count per Service
+
+Count how many downstream dependencies each service calls:
+
+```bash
+curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
+  -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "source=otel-v2-apm-service-map-* | dedup nodeConnectionHash | stats distinct_count(targetNode) as dependency_count by sourceNode"}'
+```
+
+## Remote Service Identification with coalesce()
+
+Different OTel instrumentation libraries use different attributes to identify remote services. 
Use `coalesce()` to check multiple fields in priority order: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where serviceName = '\''frontend'\'' | where kind = '\''SPAN_KIND_CLIENT'\'' | eval _remoteService = coalesce(`attributes.net.peer.name`, `attributes.server.address`, `attributes.upstream_cluster`, `attributes.rpc.service`, `attributes.peer.service`, `attributes.db.system`, `attributes.gen_ai.system`, `attributes.http.host`, `attributes.messaging.destination.name`, '\'''\'' ) | where _remoteService != '\'''\'' | stats count() as calls by _remoteService | sort - calls"}' +``` + +**Remote service attribute priority:** + +| Field | Used By | +|---|---| +| `attributes.net.peer.name` | Node.js (frontend) | +| `attributes.server.address` | Go, Java, .NET (checkout, cart) | +| `attributes.upstream_cluster` | Envoy/Istio (frontend-proxy) | +| `attributes.rpc.service` | gRPC services (recommendation) | +| `attributes.peer.service` | Older OTel conventions | +| `attributes.db.system` | Database clients (redis, postgresql) | +| `attributes.gen_ai.system` | LLM clients (openai) | +| `attributes.http.host` | HTTP clients | +| `attributes.messaging.destination.name` | Message queues (Kafka, RabbitMQ) | + +## Cross-Signal Correlation + +### Trace-Log Joins by traceId + +Find all logs associated with a specific trace by querying the log index with the same traceId: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=logs-otel-v1-* | where traceId = '\'''\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort `@timestamp`"}' +``` + +Join trace spans with correlated logs using PPL join: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + 
-X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | join left=s right=l ON s.traceId = l.traceId logs-otel-v1-* | fields s.spanId, s.name, l.severityText, l.body"}' +``` + +> **Caveat:** Cross-index PPL `join` may return 0 rows on OpenSearch 3.x due to engine limitations. If you get empty results, run two separate queries (one against `otel-v1-apm-span-*` and one against `logs-otel-v1-*`) filtered by the same `traceId`, then correlate the results at the application level. + +### Trace Tree Reconstruction + +Reconstruct the full trace tree by querying all spans for a traceId, sorted by startTime with parentSpanId for hierarchy: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | fields traceId, spanId, parentSpanId, serviceName, name, startTime, endTime, durationInNanos, `status.code` | sort startTime"}' +``` + +### Latency Gap Analysis + +Compare parent and child span timing to identify latency gaps within a trace. First retrieve all spans, then compare startTime/endTime values between parent and child: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' | fields spanId, parentSpanId, name, startTime, endTime, durationInNanos | sort startTime"}' +``` + +To find spans where the child started significantly after the parent, look for gaps between a parent's startTime and its children's startTime values. Large gaps indicate queuing, scheduling delays, or uninstrumented work. 
+ +### Root Span Identification + +Find the root span of a trace (where parentSpanId is empty or null): + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where traceId = '\'''\'' AND parentSpanId = '\'''\'' | fields traceId, spanId, serviceName, name, durationInNanos, startTime, endTime"}' +``` + +Find all root spans (entry points) across all traces: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where parentSpanId = '\'''\'' | fields traceId, spanId, serviceName, name, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +## GenAI Operation Types + +The OpenTelemetry GenAI semantic conventions define the following operation types in `attributes.gen_ai.operation.name`: + +| Operation Type | Description | +|---|---| +| `invoke_agent` | An agent invocation — the top-level span for an agent handling a request | +| `execute_tool` | A tool execution within an agent's reasoning loop | +| `chat` | An LLM chat completion call | +| `embeddings` | A text embedding generation call | +| `retrieval` | A retrieval operation (e.g., RAG vector search) | +| `create_agent` | Agent creation/initialization | +| `text_completion` | A text completion call (non-chat) | +| `generate_content` | A generic content generation call | + +### Filter by Operation Type + +#### invoke_agent + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''invoke_agent'\'' | fields traceId, spanId, `attributes.gen_ai.agent.name`, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +#### 
execute_tool + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''execute_tool'\'' | fields traceId, spanId, `attributes.gen_ai.tool.name`, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +#### chat + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''chat'\'' | fields traceId, spanId, `attributes.gen_ai.request.model`, `attributes.gen_ai.usage.input_tokens`, `attributes.gen_ai.usage.output_tokens`, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +#### embeddings + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''embeddings'\'' | fields traceId, spanId, `attributes.gen_ai.request.model`, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +#### retrieval + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''retrieval'\'' | fields traceId, spanId, serviceName, name, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +#### create_agent + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''create_agent'\'' | fields traceId, spanId, 
`attributes.gen_ai.agent.name`, `attributes.gen_ai.agent.id`, startTime | sort - startTime | head 20"}' +``` + +#### text_completion + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''text_completion'\'' | fields traceId, spanId, `attributes.gen_ai.request.model`, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +#### generate_content + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''generate_content'\'' | fields traceId, spanId, `attributes.gen_ai.request.model`, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +## Extended GenAI Attributes + +The OTel GenAI semantic conventions provide these extended attributes on trace spans: + +| Attribute | Type | Description | +|---|---|---| +| `attributes.gen_ai.agent.id` | keyword | Unique identifier for the agent instance | +| `attributes.gen_ai.agent.name` | keyword | Human-readable agent name | +| `attributes.gen_ai.agent.description` | keyword | Description of the agent's purpose | +| `attributes.gen_ai.agent.version` | keyword | Version of the agent | +| `attributes.gen_ai.conversation.id` | keyword | Identifier for a multi-turn conversation session | +| `attributes.gen_ai.tool.call.id` | keyword | Unique identifier for a specific tool call | +| `attributes.gen_ai.tool.type` | keyword | Type of tool (e.g., function, retrieval) | +| `attributes.gen_ai.tool.call.arguments` | text | JSON-encoded arguments passed to the tool | +| `attributes.gen_ai.tool.call.result` | text | JSON-encoded result returned by the tool | +| `attributes.gen_ai.request.model` | keyword | Model requested for the operation | +| 
`attributes.gen_ai.usage.input_tokens` | long | Number of input tokens consumed | +| `attributes.gen_ai.usage.output_tokens` | long | Number of output tokens generated | +| `attributes.gen_ai.operation.name` | keyword | Operation type (see GenAI Operation Types above) | + +## Exception and Error Querying + +### Query Spans with Exceptions + +Find spans that contain exception events with type, message, and stacktrace: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `events.attributes.exception.type` != '\'''\'' | fields traceId, spanId, serviceName, name, `events.attributes.exception.type`, `events.attributes.exception.message`, startTime | sort - startTime | head 20"}' +``` + +### Query Exception Stacktraces + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `events.attributes.exception.stacktrace` != '\'''\'' | fields traceId, spanId, `events.attributes.exception.type`, `events.attributes.exception.message`, `events.attributes.exception.stacktrace` | head 10"}' +``` + +> **Note:** This query may fail with "insufficient resources" on large indices because the `where` filter on `events.attributes.exception.stacktrace` must scan all documents. 
If this happens, add `| head 1000` before the `where` filter to limit the scan scope: `source=otel-v1-apm-span-* | head 1000 | where ...` + +### Query Spans by error_type Attribute + +Find spans with a specific error type category: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.error_type` != '\'''\'' | fields traceId, spanId, serviceName, `attributes.error_type`, `status.code`, startTime | sort - startTime | head 20"}' +``` + +### Error Spans with Exception Details + +Combine error status with exception information for a complete error view: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `status.code` = 2 | fields traceId, spanId, serviceName, name, `events.attributes.exception.type`, `events.attributes.exception.message`, `attributes.error_type`, startTime | sort - startTime | head 20"}' +``` + +## Conversation Tracking + +Track multi-turn conversations by grouping spans with the same conversation ID: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.conversation.id` != '\'''\'' | stats count() as turns, sum(`attributes.gen_ai.usage.input_tokens`) as total_input_tokens, sum(`attributes.gen_ai.usage.output_tokens`) as total_output_tokens by `attributes.gen_ai.conversation.id`"}' +``` + +View all spans in a specific conversation ordered by time: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where 
`attributes.gen_ai.conversation.id` = '\'''\'' | fields traceId, spanId, `attributes.gen_ai.operation.name`, `attributes.gen_ai.agent.name`, startTime, durationInNanos | sort startTime"}' +``` + +## Tool Call Inspection + +Inspect tool call arguments and results for debugging agent behavior: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''execute_tool'\'' | fields traceId, spanId, `attributes.gen_ai.tool.name`, `attributes.gen_ai.tool.call.id`, `attributes.gen_ai.tool.call.arguments`, `attributes.gen_ai.tool.call.result`, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +Inspect tool calls for a specific tool: + +```bash +curl -sk -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \ + -X POST "$OPENSEARCH_ENDPOINT/_plugins/_ppl" \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''execute_tool'\'' AND `attributes.gen_ai.tool.name` = '\'''\'' | fields traceId, `attributes.gen_ai.tool.call.arguments`, `attributes.gen_ai.tool.call.result`, durationInNanos, startTime | sort - startTime"}' +``` + +## PPL Commands for Trace Analysis + +The following PPL commands are particularly useful when analyzing trace data: + +| Command | Use Case | +|---|---| +| `stats` | Aggregate token usage, count spans by service, compute latency percentiles | +| `where` | Filter spans by operation type, status code, duration threshold, attribute values | +| `fields` | Select specific fields to return (traceId, spanId, attributes, etc.) 
| +| `sort` | Order results by startTime, durationInNanos, or other fields | +| `dedup` | Remove duplicate spans (e.g., deduplicate by traceId to get unique traces) | +| `top` | Find the most frequent values (e.g., top services, top error types) | +| `rare` | Find the least frequent values (e.g., rare operation types, rare error messages) | +| `timechart` | Visualize span counts or latency over time buckets | +| `eval` | Compute derived fields (e.g., convert nanoseconds to milliseconds) | +| `head` | Limit result count for quick exploration | +| `rename` | Rename fields for readability in output | +| `eventstats` | Add aggregation results as new fields to each row without collapsing rows | +| `trendline` | Calculate moving averages on latency or token usage over time | +| `streamstats` | Compute running statistics (e.g., cumulative token count) | +| `ad` | Anomaly detection on latency — identify spans with unusual duration patterns | + +## Trace Index Field Reference + +Key fields available in the `otel-v1-apm-span-*` index: + +| Field | Type | Description | +|---|---|---| +| `traceId` | keyword | Unique 128-bit trace identifier | +| `spanId` | keyword | Unique 64-bit span identifier | +| `parentSpanId` | keyword | Parent span ID (empty string for root spans) | +| `serviceName` | keyword | Service that produced the span | +| `name` | text | Span operation name | +| `kind` | keyword | Span kind (SERVER, CLIENT, INTERNAL, PRODUCER, CONSUMER) | +| `startTime` | date | Span start timestamp | +| `endTime` | date | Span end timestamp | +| `durationInNanos` | long | Span duration in nanoseconds | +| `status.code` | integer | Status code: 0=Unset, 1=Ok, 2=Error | +| `attributes.gen_ai.operation.name` | keyword | GenAI operation type | +| `attributes.gen_ai.agent.name` | keyword | Agent name | +| `attributes.gen_ai.agent.id` | keyword | Agent identifier | +| `attributes.gen_ai.agent.description` | keyword | Agent description | +| `attributes.gen_ai.agent.version` | keyword 
| Agent version | +| `attributes.gen_ai.request.model` | keyword | Requested model | +| `attributes.gen_ai.usage.input_tokens` | long | Input token count | +| `attributes.gen_ai.usage.output_tokens` | long | Output token count | +| `attributes.gen_ai.conversation.id` | keyword | Conversation identifier | +| `attributes.gen_ai.tool.name` | keyword | Tool name | +| `attributes.gen_ai.tool.call.id` | keyword | Tool call identifier | +| `attributes.gen_ai.tool.type` | keyword | Tool type | +| `attributes.gen_ai.tool.call.arguments` | text | Tool call arguments (JSON) | +| `attributes.gen_ai.tool.call.result` | text | Tool call result (JSON) | +| `attributes.error_type` | keyword | Error type category | +| `events.attributes.exception.type` | keyword | Exception class/type | +| `events.attributes.exception.message` | text | Exception message | +| `events.attributes.exception.stacktrace` | text | Exception stacktrace | + +## References + +- [PPL Language Reference](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.md) — Official PPL syntax documentation. Fetch this if queries fail due to OpenSearch version differences or new syntax. +- [OpenTelemetry GenAI Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/) — Standard attribute names for AI/LLM operations. 
+ +## AWS Managed OpenSearch + +To query traces on Amazon OpenSearch Service, replace the local endpoint and authentication with AWS SigV4: + +```bash +curl -s --aws-sigv4 "aws:amz:REGION:es" \ + --user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY" \ + -X POST https://DOMAIN-ID.REGION.es.amazonaws.com/_plugins/_ppl \ + -H 'Content-Type: application/json' \ + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''invoke_agent'\'' | fields traceId, spanId, `attributes.gen_ai.agent.name`, durationInNanos, startTime | sort - startTime | head 20"}' +``` + +- Endpoint format: `https://DOMAIN-ID.REGION.es.amazonaws.com` +- Auth: `--aws-sigv4 "aws:amz:REGION:es"` with `--user "$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY"` +- The PPL API endpoint (`/_plugins/_ppl`) and query syntax are identical to the local stack +- No `-k` flag needed — AWS managed endpoints use valid TLS certificates diff --git a/claude-code-observability-plugin/tests/README.md b/claude-code-observability-plugin/tests/README.md new file mode 100644 index 00000000..c68dc35b --- /dev/null +++ b/claude-code-observability-plugin/tests/README.md @@ -0,0 +1,87 @@ +# Tests + +Integration and property-based tests for the Claude Code Observability Plugin. + +## Prerequisites + +- Python 3.9+ +- Install dependencies: + ```bash + pip install -r requirements.txt + ``` + +## Running Tests + +### All tests (requires running observability stack) + +```bash +cd claude-code-observability-plugin/tests +pytest +``` + +Integration tests execute real curl commands against OpenSearch and Prometheus. If the stack is not running, tests are skipped automatically with a message. + +### Property tests only (no stack needed) + +```bash +pytest test_properties.py +``` + +Property tests validate skill file content (frontmatter, curl commands, PPL syntax, etc.) without requiring a running stack. 
+ +### Filter by tag + +```bash +pytest -m traces +pytest -m logs +pytest -m metrics +pytest -m stack_health +pytest -m ppl +pytest -m correlation +pytest -m apm_red +pytest -m slo_sli +``` + +### Verbose output + +```bash +pytest -v --tb=short +``` + +## Test Structure + +| File | Description | +|---|---| +| `test_runner.py` | YAML-driven integration tests. Loads fixtures from `fixtures/`, validates each against the Pydantic `TestFixture` model, executes commands via subprocess, and asserts expected JSON fields in responses. | +| `test_properties.py` | Property-based correctness tests (Hypothesis). Validates 10 properties: skill frontmatter validity, curl command auth/protocol, PPL/PromQL completeness, field lookup correctness, config parsing, RED queries, and SLO recording rules. | +| `conftest.py` | Session-scoped fixtures: `.env` config loading with fallback defaults, stack health check (auto-skip if stack is down), and custom pytest markers. | +| `models.py` | Pydantic `TestFixture` model with strict validation (`extra="forbid"`). | + +## Adding New Test Cases + +1. Create a YAML file in `fixtures/` (or add entries to an existing one). +2. Each entry must match the `TestFixture` schema: + +```yaml +- name: "descriptive test name" + description: "what this test validates" + command: "curl -sk -u admin:'My_password_123!@#' https://localhost:9200/..." + expected_status_code: 200 + expected_fields: + - "schema" + - "datarows" + tags: + - "traces" +``` + +3. 
Supported fields: + - `name` (str) — unique test identifier + - `description` (str) — what the test validates + - `command` (str) — shell command to execute + - `expected_status_code` (int) — expected HTTP status code of the response (e.g. 200) + - `expected_fields` (list[str]) — dot-separated JSON paths that must exist in the response; use `expected_min_results` (int, optional) to also require a minimum number of result rows + - `tags` (list[str]) — categories for marker-based filtering + - `before_test` (str, optional) — setup command run before the main command + - `after_test` (str, optional) — teardown command run after the main command + +4. Run `pytest` to verify the new fixture loads and passes. diff --git a/claude-code-observability-plugin/tests/conftest.py b/claude-code-observability-plugin/tests/conftest.py new file mode 100644 index 00000000..a5f7f1c5 --- /dev/null +++ b/claude-code-observability-plugin/tests/conftest.py @@ -0,0 +1,151 @@ +"""Pytest configuration and session-scoped fixtures for observability stack tests.""" + +import os + +import pytest +import requests +import urllib3 + +# Suppress InsecureRequestWarning for self-signed certs in dev stack +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +# --------------------------------------------------------------------------- # +# .env parsing and config loading +# --------------------------------------------------------------------------- # + +ENV_FILE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", ".env") + +DEFAULTS = { + "OPENSEARCH_HOST": "localhost", + "OPENSEARCH_PORT": "9200", + "OPENSEARCH_USER": "admin", + "OPENSEARCH_PASSWORD": "My_password_123!@#", + "PROMETHEUS_PORT": "9090", +} + + +def parse_env_file(path: str) -> "dict[str, str]": + """Read a .env file and return a dict of key-value pairs. + + Handles comments, blank lines, and optional quoting of values. + Returns an empty dict if the file does not exist. 
+ """ + env_vars = {} + try: + with open(path, encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if not line or line.startswith("#"): + continue + if "=" not in line: + continue + key, _, value = line.partition("=") + key = key.strip() + value = value.strip() + # Strip surrounding quotes (single or double) + if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'): + value = value[1:-1] + env_vars[key] = value + except FileNotFoundError: + pass + return env_vars + + +def load_config() -> "dict[str, str]": + """Build a config dict from the .env file with fallback defaults. + + Note: OPENSEARCH_HOST in .env is typically the Docker service name (e.g. + ``opensearch``) which is not reachable from the host. Tests always connect + via ``localhost`` unless overridden by the ``TEST_OPENSEARCH_HOST`` or + ``TEST_PROMETHEUS_HOST`` environment variables. + """ + env = parse_env_file(ENV_FILE_PATH) + + opensearch_host = os.environ.get( + "TEST_OPENSEARCH_HOST", DEFAULTS["OPENSEARCH_HOST"] + ) + opensearch_port = env.get("OPENSEARCH_PORT", DEFAULTS["OPENSEARCH_PORT"]) + opensearch_user = env.get("OPENSEARCH_USER", DEFAULTS["OPENSEARCH_USER"]) + opensearch_password = env.get("OPENSEARCH_PASSWORD", DEFAULTS["OPENSEARCH_PASSWORD"]) + prometheus_host = os.environ.get("TEST_PROMETHEUS_HOST", "localhost") + prometheus_port = env.get("PROMETHEUS_PORT", DEFAULTS["PROMETHEUS_PORT"]) + + return { + "opensearch_host": opensearch_host, + "opensearch_port": opensearch_port, + "opensearch_user": opensearch_user, + "opensearch_password": opensearch_password, + "prometheus_host": prometheus_host, + "prometheus_port": prometheus_port, + "opensearch_url": f"https://{opensearch_host}:{opensearch_port}", + "prometheus_url": f"http://{prometheus_host}:{prometheus_port}", + } + + +# --------------------------------------------------------------------------- # +# Pytest markers +# --------------------------------------------------------------------------- # + +def 
pytest_configure(config: pytest.Config) -> None: + """Register custom markers for tag-based test filtering.""" + config.addinivalue_line("markers", "traces: trace query tests") + config.addinivalue_line("markers", "logs: log query tests") + config.addinivalue_line("markers", "metrics: metrics query tests") + config.addinivalue_line("markers", "stack_health: stack health check tests") + config.addinivalue_line("markers", "ppl: PPL system command tests") + config.addinivalue_line("markers", "correlation: cross-signal correlation tests") + config.addinivalue_line("markers", "apm_red: APM RED metrics tests") + config.addinivalue_line("markers", "slo_sli: SLO/SLI query tests") + config.addinivalue_line("markers", "topology: service topology discovery tests") + config.addinivalue_line("markers", "osd_config: OSD config and index discovery tests") + config.addinivalue_line("markers", "osd_dashboards: OSD Dashboards API tests (require OSD running)") + + +# --------------------------------------------------------------------------- # +# Session-scoped fixtures +# --------------------------------------------------------------------------- # + +@pytest.fixture(scope="session") +def stack_config() -> "dict[str, str]": + """Return the resolved stack configuration dict.""" + return load_config() + + +@pytest.fixture(scope="session", autouse=True) +def check_stack_health(stack_config: dict[str, str]) -> None: + """Verify the observability stack is reachable before running tests. + + Checks OpenSearch cluster health and Prometheus health endpoints. + Skips the entire test session if either service is unavailable. 
+ """ + opensearch_url = stack_config["opensearch_url"] + prometheus_url = stack_config["prometheus_url"] + auth = (stack_config["opensearch_user"], stack_config["opensearch_password"]) + + # Check OpenSearch + try: + resp = requests.get( + f"{opensearch_url}/_cluster/health", + auth=auth, + verify=False, + timeout=10, + ) + resp.raise_for_status() + except Exception as exc: + pytest.skip( + f"Observability stack is not running — OpenSearch unreachable at " + f"{opensearch_url}: {exc}" + ) + + # Check Prometheus + try: + resp = requests.get( + f"{prometheus_url}/-/healthy", + timeout=10, + ) + resp.raise_for_status() + except Exception as exc: + pytest.skip( + f"Observability stack is not running — Prometheus unreachable at " + f"{prometheus_url}: {exc}" + ) diff --git a/claude-code-observability-plugin/tests/fixtures/.gitkeep b/claude-code-observability-plugin/tests/fixtures/.gitkeep new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/claude-code-observability-plugin/tests/fixtures/.gitkeep @@ -0,0 +1 @@ + diff --git a/claude-code-observability-plugin/tests/fixtures/apm-red.yaml b/claude-code-observability-plugin/tests/fixtures/apm-red.yaml new file mode 100644 index 00000000..26d63305 --- /dev/null +++ b/claude-code-observability-plugin/tests/fixtures/apm-red.yaml @@ -0,0 +1,161 @@ +## APM RED metrics integration tests +## Uses http_server_duration_milliseconds (the active HTTP metric in this stack). 
+ +- name: red_rate_promql + description: PromQL per-service HTTP request rate over a 5-minute window + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=sum(rate(http_server_duration_milliseconds_count[5m])) by (service_name)' + expected_status_code: 200 + expected_fields: + - status + - data + expected_min_results: 1 + tags: + - apm_red + +- name: red_error_promql + description: PromQL error rate ratio of 5xx responses to total requests by service + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=sum(rate(http_server_duration_milliseconds_count{http_response_status_code=~"5.."}[5m])) by (service_name) / sum(rate(http_server_duration_milliseconds_count[5m])) by (service_name)' + expected_status_code: 200 + expected_fields: + - status + - data + tags: + - apm_red + +- name: red_duration_promql + description: PromQL p95 latency by service using histogram_quantile + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(http_server_duration_milliseconds_bucket[5m])) by (le, service_name))' + expected_status_code: 200 + expected_fields: + - status + - data + expected_min_results: 1 + tags: + - apm_red + +- name: red_rate_ppl + description: PPL request count from trace spans grouped by serviceName + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | stats count() as request_count by serviceName"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + expected_min_results: 1 + tags: + - apm_red + +- name: red_error_ppl + description: PPL error count from trace spans where status.code is 2 (OTel ERROR) grouped by serviceName + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": 
"source=otel-v1-apm-span-* | where `status.code` = 2 | stats count() as error_count by serviceName"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + expected_min_results: 1 + tags: + - apm_red + +- name: red_duration_ppl + description: PPL p95 duration percentile from trace span durations grouped by serviceName + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | stats percentile(durationInNanos, 95) as p95 by serviceName"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + expected_min_results: 1 + tags: + - apm_red + +- name: red_dp_request_gauge + description: Data Prepper APM request gauge metric per service + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=sum(request{namespace="span_derived"}) by (service)' + expected_status_code: 200 + expected_fields: + - status + - data + expected_min_results: 1 + tags: + - apm_red + +- name: red_dp_error_gauge + description: Data Prepper APM error gauge metric per service + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=sum(error{namespace="span_derived"}) by (service)' + expected_status_code: 200 + expected_fields: + - status + - data + tags: + - apm_red + +- name: red_dp_fault_gauge + description: Data Prepper APM fault gauge metric per service + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=sum(fault{namespace="span_derived"}) by (service)' + expected_status_code: 200 + expected_fields: + - status + - data + tags: + - apm_red + +- name: red_dp_latency_histogram + description: Data Prepper APM p95 latency histogram per service + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=histogram_quantile(0.95, sum(latency_seconds_seconds_bucket{namespace="span_derived"}) by (le, service))' + 
expected_status_code: 200 + expected_fields: + - status + - data + expected_min_results: 1 + tags: + - apm_red + +- name: red_genai_rate_promql + description: PromQL GenAI operation rate by operation name and model + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=sum(rate(gen_ai_client_operation_duration_seconds_count[5m])) by (gen_ai_operation_name, gen_ai_request_model)' + expected_status_code: 200 + expected_fields: + - status + - data + expected_min_results: 1 + tags: + - apm_red + +- name: red_genai_duration_promql + description: PromQL GenAI operation p95 latency by operation name and model + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(gen_ai_client_operation_duration_seconds_bucket[5m])) by (le, gen_ai_operation_name, gen_ai_request_model))' + expected_status_code: 200 + expected_fields: + - status + - data + expected_min_results: 1 + tags: + - apm_red diff --git a/claude-code-observability-plugin/tests/fixtures/correlation.yaml b/claude-code-observability-plugin/tests/fixtures/correlation.yaml new file mode 100644 index 00000000..94d43559 --- /dev/null +++ b/claude-code-observability-plugin/tests/fixtures/correlation.yaml @@ -0,0 +1,85 @@ +## Cross-signal correlation integration tests + +- name: trace_to_log_correlation + description: Find logs with trace context by querying the log index for entries with non-empty traceId + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | where traceId != '\'''\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - correlation + +- name: log_to_trace_correlation + description: Find spans from log entries by querying the trace index for spans with 
non-empty spanId + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | where spanId != '\'''\'' | fields traceId, spanId, parentSpanId, serviceName, name, startTime, endTime, durationInNanos, `status.code` | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - correlation + +- name: exemplar_query + description: Query Prometheus exemplars API for trace context attached to http_server_duration_milliseconds_bucket metric samples + command: >- + curl -s 'http://localhost:9090/api/v1/query_exemplars' + --data-urlencode 'query=http_server_duration_milliseconds_bucket' + --data-urlencode 'start=2024-01-01T00:00:00Z' + --data-urlencode 'end=2030-01-01T00:00:00Z' + expected_status_code: 200 + expected_fields: + - status + - data + tags: + - correlation + +- name: resource_correlation_by_service + description: Correlate spans for each service with per-service span counts and average duration + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | stats count() as span_count, avg(durationInNanos) as avg_duration by serviceName | sort - span_count | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - correlation + +- name: error_log_to_error_span_correlation + description: Find ERROR logs and verify spans exist for the same services + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''ERROR'\'' AND traceId != '\'''\'' | fields traceId, spanId, body, `resource.attributes.service.name`, `@timestamp` | head 10"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - 
correlation + +- name: log_volume_vs_span_volume_by_service + description: Compare log volume against span volume per service using separate queries + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | stats count() as log_count by `resource.attributes.service.name` | sort - log_count | head 15"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - correlation diff --git a/claude-code-observability-plugin/tests/fixtures/logs.yaml b/claude-code-observability-plugin/tests/fixtures/logs.yaml new file mode 100644 index 00000000..a9318877 --- /dev/null +++ b/claude-code-observability-plugin/tests/fixtures/logs.yaml @@ -0,0 +1,125 @@ +- name: error_logs + description: Query all error-level logs filtered by severityText + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''ERROR'\'' | fields traceId, spanId, `resource.attributes.service.name`, body, `@timestamp` | sort - `@timestamp` | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - logs + +- name: warn_logs + description: Query all warn-level logs filtered by severityText + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''WARN'\'' | fields traceId, spanId, `resource.attributes.service.name`, body, `@timestamp` | sort - `@timestamp` | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - logs + +- name: severity_number_filter + description: Filter logs at WARN level or above using severityNumber >= 13 + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST 
https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | where severityNumber >= 13 | fields severityText, severityNumber, `resource.attributes.service.name`, body, `@timestamp` | sort - `@timestamp` | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - logs + +- name: trace_correlation + description: Find logs that have trace context (non-empty traceId) + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | where traceId != '\'''\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - logs + +- name: error_patterns_by_severity_service + description: Aggregate log counts grouped by severity level and service name + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | stats count() by severityText, `resource.attributes.service.name`"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - logs + +- name: error_count_by_service + description: Count error logs grouped by service name sorted by error count + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | where severityText = '\''ERROR'\'' | stats count() as error_count by `resource.attributes.service.name` | sort - error_count"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - logs + +- name: log_volume_hourly + description: Analyze log volume over time using hourly time buckets + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST 
https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | stats count() as log_count by span(`@timestamp`, 1h)"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - logs + +- name: body_search_like + description: Search log body content for a specific string using where with like + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | where body like '\''%error%'\'' | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort - `@timestamp` | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - logs + +- name: body_search_match + description: Full-text relevance search on log body using match function + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | where match(body, '\''failed'\'') | fields traceId, spanId, severityText, body, `resource.attributes.service.name`, `@timestamp` | sort - `@timestamp` | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - logs diff --git a/claude-code-observability-plugin/tests/fixtures/metrics.yaml b/claude-code-observability-plugin/tests/fixtures/metrics.yaml new file mode 100644 index 00000000..bf008724 --- /dev/null +++ b/claude-code-observability-plugin/tests/fixtures/metrics.yaml @@ -0,0 +1,92 @@ +## Metrics query integration tests +## Uses http_server_duration_milliseconds (the active HTTP metric in this stack). +## If your stack uses http_server_duration_seconds, replace the metric name accordingly. 
+ +- name: http_request_rate + description: Query per-second HTTP request rate over 5m window grouped by service + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=sum(rate(http_server_duration_milliseconds_count[5m])) by (service_name)' + expected_status_code: 200 + expected_fields: + - status + - data + expected_min_results: 1 + tags: + - metrics + +- name: http_latency_p95 + description: Query 95th percentile HTTP request latency by service + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=histogram_quantile(0.95, sum(rate(http_server_duration_milliseconds_bucket[5m])) by (le, service_name))' + expected_status_code: 200 + expected_fields: + - status + - data + expected_min_results: 1 + tags: + - metrics + +- name: http_latency_p99 + description: Query 99th percentile HTTP request latency by service + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=histogram_quantile(0.99, sum(rate(http_server_duration_milliseconds_bucket[5m])) by (le, service_name))' + expected_status_code: 200 + expected_fields: + - status + - data + expected_min_results: 1 + tags: + - metrics + +- name: http_error_rate + description: Query ratio of 5xx error responses to total requests by service + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=sum(rate(http_server_duration_milliseconds_count{http_response_status_code=~"5.."}[5m])) by (service_name) / sum(rate(http_server_duration_milliseconds_count[5m])) by (service_name)' + expected_status_code: 200 + expected_fields: + - status + - data + tags: + - metrics + +- name: active_connections + description: Query current number of active HTTP connections by service + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=sum(http_server_active_requests) by (service_name)' + expected_status_code: 200 + expected_fields: + - status + - data + tags: + - metrics + +- name: 
genai_token_usage + description: Query GenAI token usage total by operation and model + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=gen_ai_client_token_usage_total' + expected_status_code: 200 + expected_fields: + - status + - data + expected_min_results: 1 + tags: + - metrics + +- name: genai_operation_duration + description: Query GenAI operation duration (seconds) by operation and model + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=gen_ai_client_operation_duration_seconds_count' + expected_status_code: 200 + expected_fields: + - status + - data + expected_min_results: 1 + tags: + - metrics diff --git a/claude-code-observability-plugin/tests/fixtures/osd-config.yaml b/claude-code-observability-plugin/tests/fixtures/osd-config.yaml new file mode 100644 index 00000000..d942bc3f --- /dev/null +++ b/claude-code-observability-plugin/tests/fixtures/osd-config.yaml @@ -0,0 +1,231 @@ +## OSD Config skill integration tests +## Tests both OpenSearch-direct APIs (always available) and OSD Dashboards APIs (require OSD running) + +# --- OpenSearch-direct APIs (no OSD dependency) --- + +- name: osd_index_discovery + description: Discover observability indices via _cat/indices API filtered to otel/logs patterns + command: >- + curl -sk -u admin:'My_password_123!@#' + 'https://localhost:9200/_cat/indices/otel-*,logs-otel-*?format=json' + expected_status_code: 200 + expected_fields: [] + expected_min_results: 1 + tags: + - osd_config + +- name: osd_trace_index_mapping + description: Get trace index field mappings for dynamic field discovery + command: >- + curl -sk -u admin:'My_password_123!@#' + 'https://localhost:9200/otel-v1-apm-span-000001/_mapping?pretty' + expected_status_code: 200 + expected_fields: + - otel-v1-apm-span-000001 + tags: + - osd_config + +- name: osd_log_index_mapping + description: Get log index field mappings for dynamic field discovery + command: >- + curl -sk -u 
admin:'My_password_123!@#' + 'https://localhost:9200/logs-otel-v1-000001/_mapping?pretty' + expected_status_code: 200 + expected_fields: + - logs-otel-v1-000001 + tags: + - osd_config + +- name: osd_service_map_index_mapping + description: Get service map index field mappings for dynamic field discovery + command: >- + curl -sk -u admin:'My_password_123!@#' + 'https://localhost:9200/otel-v2-apm-service-map-000001/_mapping?pretty' + expected_status_code: 200 + expected_fields: + - otel-v2-apm-service-map-000001 + tags: + - osd_config + +- name: osd_ppl_describe_traces + description: PPL describe command for trace index field discovery + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "describe otel-v1-apm-span-000001"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + expected_min_results: 1 + tags: + - osd_config + +- name: osd_ppl_describe_logs + description: PPL describe command for log index field discovery + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "describe logs-otel-v1-000001"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + expected_min_results: 1 + tags: + - osd_config + +# --- OSD Dashboards APIs (require OSD running on port 5601) --- + +- name: osd_workspace_list + description: List all OSD workspaces via the workspaces API + command: >- + curl -s -u admin:'My_password_123!@#' + 'http://localhost:5601/api/workspaces/_list' + -H 'osd-xsrf: true' + expected_status_code: 200 + expected_fields: + - success + tags: + - osd_config + - osd_dashboards + +- name: osd_saved_objects_count + description: Get saved object count for a specific type (type param is required by _find API) + command: >- + curl -s -u admin:'My_password_123!@#' + 
'http://localhost:5601/api/saved_objects/_find?type=config&per_page=0' + -H 'osd-xsrf: true' + expected_status_code: 200 + expected_fields: + - total + tags: + - osd_config + - osd_dashboards + +- name: osd_index_patterns + description: List all index patterns registered in OSD + command: >- + curl -s -u admin:'My_password_123!@#' + 'http://localhost:5601/api/saved_objects/_find?type=index-pattern&per_page=100' + -H 'osd-xsrf: true' + expected_status_code: 200 + expected_fields: + - total + - saved_objects + tags: + - osd_config + - osd_dashboards + +- name: osd_saved_queries + description: Find saved queries registered in OSD + command: >- + curl -s -u admin:'My_password_123!@#' + 'http://localhost:5601/api/saved_objects/_find?type=query&per_page=100' + -H 'osd-xsrf: true' + expected_status_code: 200 + expected_fields: + - total + - saved_objects + tags: + - osd_config + - osd_dashboards + +- name: osd_dashboards_list + description: Find dashboards registered in OSD + command: >- + curl -s -u admin:'My_password_123!@#' + 'http://localhost:5601/api/saved_objects/_find?type=dashboard&per_page=100' + -H 'osd-xsrf: true' + expected_status_code: 200 + expected_fields: + - total + - saved_objects + tags: + - osd_config + - osd_dashboards + +- name: osd_visualizations_list + description: Find visualizations registered in OSD + command: >- + curl -s -u admin:'My_password_123!@#' + 'http://localhost:5601/api/saved_objects/_find?type=visualization&per_page=100' + -H 'osd-xsrf: true' + expected_status_code: 200 + expected_fields: + - total + - saved_objects + tags: + - osd_config + - osd_dashboards + +- name: osd_correlations_list + description: Find APM correlation saved objects (trace-to-logs and APM-Config types) + command: >- + curl -s -u admin:'My_password_123!@#' + 'http://localhost:5601/api/saved_objects/_find?type=correlations&per_page=100' + -H 'osd-xsrf: true' + expected_status_code: 200 + expected_fields: + - total + - saved_objects + tags: + - osd_config + - 
osd_dashboards + +- name: osd_data_sources_list + description: Find OpenSearch data source saved objects + command: >- + curl -s -u admin:'My_password_123!@#' + 'http://localhost:5601/api/saved_objects/_find?type=data-source&per_page=100' + -H 'osd-xsrf: true' + expected_status_code: 200 + expected_fields: + - total + - saved_objects + tags: + - osd_config + - osd_dashboards + +- name: osd_data_connections_list + description: Find data connection saved objects (Prometheus datasources) + command: >- + curl -s -u admin:'My_password_123!@#' + 'http://localhost:5601/api/saved_objects/_find?type=data-connection&per_page=100' + -H 'osd-xsrf: true' + expected_status_code: 200 + expected_fields: + - total + - saved_objects + tags: + - osd_config + - osd_dashboards + +- name: osd_explore_list + description: Find PromQL explore panels registered in OSD + command: >- + curl -s -u admin:'My_password_123!@#' + 'http://localhost:5601/api/saved_objects/_find?type=explore&per_page=100' + -H 'osd-xsrf: true' + expected_status_code: 200 + expected_fields: + - total + - saved_objects + tags: + - osd_config + - osd_dashboards + +- name: osd_settings + description: Get OSD settings including default workspace and index + command: >- + curl -s -u admin:'My_password_123!@#' + 'http://localhost:5601/api/opensearch-dashboards/settings' + -H 'osd-xsrf: true' + expected_status_code: 200 + expected_fields: + - settings + tags: + - osd_config + - osd_dashboards diff --git a/claude-code-observability-plugin/tests/fixtures/ppl.yaml b/claude-code-observability-plugin/tests/fixtures/ppl.yaml new file mode 100644 index 00000000..025064df --- /dev/null +++ b/claude-code-observability-plugin/tests/fixtures/ppl.yaml @@ -0,0 +1,112 @@ +## PPL system command and query integration tests + +- name: ppl_describe + description: Use PPL describe command to inspect the trace index mapping and field types + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 
'Content-Type: application/json' + -d '{"query": "describe otel-v1-apm-span-000001"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - ppl + +- name: ppl_explain + description: Use PPL _explain endpoint to retrieve the query execution plan for a simple query + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl/_explain + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | head 5"}' + expected_status_code: 200 + expected_fields: + - calcite + tags: + - ppl + +- name: ppl_stats_aggregation + description: Test PPL stats command with multiple aggregation functions + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | stats count() as total, avg(durationInNanos) as avg_ns, max(durationInNanos) as max_ns, min(durationInNanos) as min_ns by serviceName | sort - total | head 10"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - ppl + +- name: ppl_distinct_count + description: Test PPL distinct_count to find unique operation names per service + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | stats distinct_count(name) as unique_ops by serviceName | sort - unique_ops | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - ppl + +- name: ppl_top_operations + description: Test PPL top command to find the most frequent span names + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | top 10 name"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - ppl + +- name: 
ppl_rare_operations + description: Test PPL rare command to find the least frequent span names + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | rare name"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - ppl + +- name: ppl_eval_duration_ms + description: Test PPL eval command to compute derived duration in milliseconds + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | eval duration_ms = durationInNanos / 1000000 | fields serviceName, name, duration_ms | sort - duration_ms | head 10"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - ppl + +- name: ppl_describe_log_index + description: Use PPL describe to inspect the log index field mappings + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "describe logs-otel-v1-000001"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - ppl diff --git a/claude-code-observability-plugin/tests/fixtures/slo-sli.yaml b/claude-code-observability-plugin/tests/fixtures/slo-sli.yaml new file mode 100644 index 00000000..8732cf2b --- /dev/null +++ b/claude-code-observability-plugin/tests/fixtures/slo-sli.yaml @@ -0,0 +1,47 @@ +- name: sli_availability + description: "Availability SLI: ratio of non-5xx requests to total requests over 30m window" + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=1 - (sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[30m])) / sum(rate(http_server_duration_seconds_count[30m])))' + expected_status_code: 200 + expected_fields: + - status + - data + tags: + - slo_sli + +- name: 
sli_latency + description: "Latency SLI: ratio of requests completing within 500ms to total requests over 30m window" + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=sum(rate(http_server_duration_seconds_bucket{le="0.5"}[30m])) / sum(rate(http_server_duration_seconds_count[30m]))' + expected_status_code: 200 + expected_fields: + - status + - data + tags: + - slo_sli + +- name: error_budget_remaining + description: "Error budget remaining for 99.9% SLO target using 30m availability SLI" + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=1 - ((1 - (sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[30m])) / sum(rate(http_server_duration_seconds_count[30m])))) / 0.999)' + expected_status_code: 200 + expected_fields: + - status + - data + tags: + - slo_sli + +- name: burn_rate_fast + description: "Fast burn rate over 1h window for 99.9% SLO target" + command: >- + curl -s 'http://localhost:9090/api/v1/query' + --data-urlencode 'query=sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[1h])) / sum(rate(http_server_duration_seconds_count[1h])) / (1 - 0.999)' + expected_status_code: 200 + expected_fields: + - status + - data + tags: + - slo_sli diff --git a/claude-code-observability-plugin/tests/fixtures/stack-health.yaml b/claude-code-observability-plugin/tests/fixtures/stack-health.yaml new file mode 100644 index 00000000..bcf92978 --- /dev/null +++ b/claude-code-observability-plugin/tests/fixtures/stack-health.yaml @@ -0,0 +1,63 @@ +## Stack health check integration tests + +- name: opensearch_cluster_health + description: Check OpenSearch cluster health status returns valid JSON with cluster_name and status fields + command: >- + curl -sk -u admin:'My_password_123!@#' + https://localhost:9200/_cluster/health?pretty + expected_status_code: 200 + expected_fields: + - cluster_name + - status + tags: + - stack-health + +- name: 
opensearch_cat_indices + description: List OpenSearch indices in JSON format to verify data ingestion created expected trace and log indices + command: >- + curl -sk -u admin:'My_password_123!@#' + 'https://localhost:9200/_cat/indices?format=json&v' + expected_status_code: 200 + expected_fields: [] + tags: + - stack-health + +- name: prometheus_targets + description: Query Prometheus targets API to verify scrape targets are configured + command: >- + curl -s http://localhost:9090/api/v1/targets + expected_status_code: 200 + expected_fields: + - status + - data + tags: + - stack-health + +- name: trace_data_exists + description: Verify trace data exists by counting spans + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | stats count() as total"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - stack-health + +- name: log_data_exists + description: Verify log data exists by counting log entries + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=logs-otel-v1-* | stats count() as total"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - stack-health + diff --git a/claude-code-observability-plugin/tests/fixtures/topology.yaml b/claude-code-observability-plugin/tests/fixtures/topology.yaml new file mode 100644 index 00000000..faecada4 --- /dev/null +++ b/claude-code-observability-plugin/tests/fixtures/topology.yaml @@ -0,0 +1,78 @@ +## Service topology discovery tests against the service map index + +- name: topology_list_services + description: List all unique services from the service map using dedup nodeConnectionHash + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + 
-d '{"query": "source=otel-v2-apm-service-map-* | dedup nodeConnectionHash | fields sourceNode, targetNode | head 50"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - topology + +- name: topology_service_operations + description: List operations per service using dedup operationConnectionHash + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v2-apm-service-map-* | dedup operationConnectionHash | fields sourceNode, sourceOperation, targetNode, targetOperation | head 50"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - topology + +- name: topology_dependency_count + description: Count downstream dependencies per service + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v2-apm-service-map-* | dedup nodeConnectionHash | stats count() as connections by sourceNode | sort - connections"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - topology + +- name: topology_service_attributes + description: Get service attributes (most recent) from service map + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v2-apm-service-map-* | fields sourceNode, timestamp | sort - timestamp | head 10"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - topology + +- name: topology_index_discovery + description: Discover observability indices via _cat/indices API + command: >- + curl -sk -u admin:'My_password_123!@#' + 'https://localhost:9200/_cat/indices/otel-*?format=json' + expected_status_code: 200 + expected_fields: [] + tags: + - topology + +- name: topology_trace_field_mapping + description: Get 
trace index field mappings for dynamic field discovery + command: >- + curl -sk -u admin:'My_password_123!@#' + 'https://localhost:9200/otel-v1-apm-span-000001/_mapping?pretty' + expected_status_code: 200 + expected_fields: + - otel-v1-apm-span-000001 + tags: + - topology diff --git a/claude-code-observability-plugin/tests/fixtures/traces.yaml b/claude-code-observability-plugin/tests/fixtures/traces.yaml new file mode 100644 index 00000000..5bb09447 --- /dev/null +++ b/claude-code-observability-plugin/tests/fixtures/traces.yaml @@ -0,0 +1,141 @@ +## Trace query integration tests using real data from the observability stack + +- name: agent_invocations + description: Query all agent invocation spans where gen_ai.operation.name is invoke_agent + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''invoke_agent'\'' | fields traceId, spanId, `attributes.gen_ai.agent.name`, `attributes.gen_ai.request.model`, durationInNanos, startTime | sort - startTime | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - traces + +- name: tool_executions + description: Query all tool execution spans where gen_ai.operation.name is execute_tool + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''execute_tool'\'' | fields traceId, spanId, `attributes.gen_ai.tool.name`, durationInNanos, startTime | sort - startTime | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - traces + +- name: slow_spans + description: Identify spans exceeding 5 second latency threshold + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST 
https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | where durationInNanos > 5000000000 | fields traceId, spanId, serviceName, name, durationInNanos, startTime | sort - durationInNanos | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - traces + +- name: error_spans + description: Query spans with error status code 2 (OTel ERROR) + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | where `status.code` = 2 | fields traceId, spanId, serviceName, name, `status.code`, startTime | sort - startTime | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - traces + +- name: token_usage_by_model + description: Aggregate input and output token usage grouped by requested model + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.usage.input_tokens` > 0 | stats sum(`attributes.gen_ai.usage.input_tokens`) as total_input, sum(`attributes.gen_ai.usage.output_tokens`) as total_output by `attributes.gen_ai.request.model`"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - traces + +- name: service_operations_list + description: List distinct operations for each service to understand available endpoints + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | stats distinct_count(name) as operation_count, count() as span_count by serviceName | sort - span_count | head 20"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - traces + +- name: service_map_query + 
description: Query the service dependency map to understand inter-service communication + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v2-apm-service-map-* | fields sourceNode, targetNode, sourceOperation, targetOperation | head 50"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - traces + +- name: genai_agent_performance + description: Compare GenAI agent performance by average duration and invocation count + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | where `attributes.gen_ai.operation.name` = '\''invoke_agent'\'' | stats count() as invocations, avg(durationInNanos) as avg_duration_ns by `attributes.gen_ai.agent.name` | sort - invocations"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - traces + +- name: checkout_trace_tree + description: Reconstruct a trace tree for the checkout service by finding spans with parent-child relationships + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | where serviceName = '\''checkout'\'' | fields traceId, spanId, parentSpanId, name, durationInNanos, startTime | sort startTime | head 30"}' + expected_status_code: 200 + expected_fields: + - schema + - datarows + tags: + - traces + +- name: frontend_http_spans + description: Query HTTP server spans from the frontend service to understand user-facing request patterns + command: >- + curl -sk -u admin:'My_password_123!@#' + -X POST https://localhost:9200/_plugins/_ppl + -H 'Content-Type: application/json' + -d '{"query": "source=otel-v1-apm-span-* | where serviceName = '\''frontend'\'' AND kind = '\''SPAN_KIND_SERVER'\'' | 
class TestFixture(BaseModel):
    """Schema for one YAML-defined integration-test fixture.

    Each entry in ``tests/fixtures/*.yaml`` is expected to validate against
    this model. ``extra="forbid"`` rejects unknown keys, so a typo in a
    fixture file fails validation instead of being silently ignored.
    """

    # Forbid undeclared keys so fixture typos surface as validation errors.
    model_config = ConfigDict(extra="forbid")

    # Unique identifier for the test case.
    name: str
    # Human-readable summary of what the fixture checks.
    description: str
    # Shell command to execute (fixtures use curl against the stack).
    command: str
    # HTTP status code the command is expected to return.
    expected_status_code: int
    # Top-level response fields that must be present (e.g. "schema",
    # "datarows" for PPL, "status"/"data" for Prometheus); [] when the
    # response shape is not asserted.
    expected_fields: List[str]
    # Grouping tags used to select subsets of fixtures.
    tags: List[str]
    # Optional setup command run before the test — presumably a shell
    # command like `command`; confirm against the test runner.
    before_test: Optional[str] = None
    # Optional teardown command run after the test.
    after_test: Optional[str] = None
    # Minimum number of results the response must contain; None means
    # an empty result set is acceptable.
    expected_min_results: Optional[int] = None
+ +Validates correctness properties defined in the design document: + P1: Skill file frontmatter validity + P2: PPL curl command completeness + P3: OpenSearch curl command authentication + P4: Prometheus curl command protocol + P5: PPL command documentation completeness + P6: PromQL curl command completeness + P7: Recursive field lookup correctness + P8: Config loader .env parsing with fallback + P9: RED query completeness + P10: SLO recording rule validity +""" + +import os +import re +import sys +import tempfile +from pathlib import Path + +import pytest +import yaml +from hypothesis import given, settings +from hypothesis import strategies as st + +# --------------------------------------------------------------------------- +# Path setup — allow imports from sibling test modules +# --------------------------------------------------------------------------- + +sys.path.insert(0, str(Path(__file__).parent)) + +from conftest import DEFAULTS, parse_env_file +from test_runner import field_exists + + +# --------------------------------------------------------------------------- +# Override the session-scoped autouse stack health check from conftest.py. +# Property tests validate static file content and do not need a running stack. 
+# --------------------------------------------------------------------------- + +@pytest.fixture(scope="session", autouse=True) +def check_stack_health(): + """No-op override — property tests do not require a running stack.""" + yield + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +SKILLS_DIR = Path(__file__).parent.parent / "skills" + +SKILL_FILES = sorted( + p for p in SKILLS_DIR.rglob("SKILL.md") +) + + +def _read_skill(path: Path) -> str: + """Return the full text of a skill file.""" + return path.read_text(encoding="utf-8") + + +def _parse_frontmatter(text: str) -> dict: + """Extract and parse YAML frontmatter from a markdown file.""" + parts = text.split("---", 2) + if len(parts) < 3: + return {} + return yaml.safe_load(parts[1]) or {} + + +def _extract_code_blocks(text: str) -> list[str]: + """Return all fenced code blocks from markdown text.""" + return re.findall(r"```[^\n]*\n(.*?)```", text, re.DOTALL) + + +def _extract_bash_code_blocks(text: str) -> list[str]: + """Return all ```bash fenced code blocks from markdown text.""" + return re.findall(r"```bash\n(.*?)```", text, re.DOTALL) + + +# ========================================================================= +# Property 1: Skill file frontmatter validity +# Validates: Requirements 1.5, 7.1, 7.2, 7.3, 7.4 +# ========================================================================= + + +@pytest.mark.parametrize("skill_path", SKILL_FILES, ids=[p.parent.name for p in SKILL_FILES]) +def test_property_1_frontmatter_validity(skill_path: Path) -> None: + """Every skill file must have valid YAML frontmatter with name, description, and allowed-tools.""" + text = _read_skill(skill_path) + fm = _parse_frontmatter(text) + + assert "name" in fm, f"{skill_path.name}: missing 'name' in frontmatter" + assert isinstance(fm["name"], str) and fm["name"].strip(), ( + f"{skill_path.name}: 
'name' must be a non-empty string" + ) + + assert "description" in fm, f"{skill_path.name}: missing 'description' in frontmatter" + assert isinstance(fm["description"], str) and fm["description"].strip(), ( + f"{skill_path.name}: 'description' must be a non-empty string" + ) + + assert "allowed-tools" in fm, f"{skill_path.name}: missing 'allowed-tools' in frontmatter" + assert isinstance(fm["allowed-tools"], list) and len(fm["allowed-tools"]) > 0, ( + f"{skill_path.name}: 'allowed-tools' must be a non-empty list" + ) + + +# ========================================================================= +# Property 2: PPL curl command completeness +# Validates: Requirements 2.9, 3.6 +# ========================================================================= + + +def _collect_ppl_curl_blocks() -> list[tuple[str, str]]: + """Extract code blocks containing PPL queries from traces.md and logs.md. + + Returns (file_name, code_block) pairs for blocks that contain + an observability index pattern (``otel-v1-apm-`` or ``logs-otel-v1-``). 
+ """ + results: list[tuple[str, str]] = [] + for name in ("traces", "logs"): + path = SKILLS_DIR / name / "SKILL.md" + if not path.exists(): + continue + blocks = _extract_bash_code_blocks(_read_skill(path)) + for block in blocks: + if ("otel-v1-apm-" in block or "logs-otel-v1-" in block) and "aws-sigv4" not in block: + # Use first meaningful line as id + first_line = block.strip().split("\n")[0][:80] + results.append((f"{name}: {first_line}", block)) + return results + + +_PPL_CURL_BLOCKS = _collect_ppl_curl_blocks() + + +@pytest.mark.parametrize( + "block", + [b for _, b in _PPL_CURL_BLOCKS], + ids=[label for label, _ in _PPL_CURL_BLOCKS], +) +def test_property_2_ppl_curl_completeness(block: str) -> None: + """Every PPL code block in traces.md / logs.md must be a complete curl command.""" + assert "/_plugins/_ppl" in block, "Missing PPL API endpoint (/_plugins/_ppl)" + assert "-u admin:" in block or "-u admin'" in block or "$OPENSEARCH_USER" in block, "Missing basic auth (-u admin: or $OPENSEARCH_USER)" + assert "https" in block.lower() or "$OPENSEARCH_ENDPOINT" in block, "Missing HTTPS protocol or $OPENSEARCH_ENDPOINT" + assert '"query"' in block, 'Missing JSON body with "query" field' + + +# ========================================================================= +# Property 3: OpenSearch curl command authentication +# Validates: Requirements 7.6, 8.1, 8.2 +# ========================================================================= + + +def _collect_opensearch_curl_commands() -> list[tuple[str, str]]: + """Extract all curl commands targeting OpenSearch across all skill files. + + Identifies OpenSearch commands by: port 9200, /_plugins/, /_cluster/, /_cat/. + Excludes AWS SigV4 variant commands (those use different auth). 
+ """ + os_patterns = re.compile(r"(localhost:9200|/_plugins/|/_cluster/|/_cat/)") + results: list[tuple[str, str]] = [] + for skill_path in SKILL_FILES: + text = _read_skill(skill_path) + blocks = _extract_bash_code_blocks(text) + for block in blocks: + if os_patterns.search(block) and "aws-sigv4" not in block: + first_line = block.strip().split("\n")[0][:60] + results.append((f"{skill_path.parent.name}: {first_line}", block)) + return results + + +_OS_CURL_COMMANDS = _collect_opensearch_curl_commands() + + +@pytest.mark.parametrize( + "block", + [b for _, b in _OS_CURL_COMMANDS], + ids=[label for label, _ in _OS_CURL_COMMANDS], +) +def test_property_3_opensearch_curl_auth(block: str) -> None: + """Every OpenSearch curl command must use HTTPS, -k flag, and basic auth.""" + assert "https" in block.lower() or "$OPENSEARCH_ENDPOINT" in block, "OpenSearch command must use HTTPS or $OPENSEARCH_ENDPOINT" + assert "-k" in block or "-sk" in block, "OpenSearch command must include -k flag" + assert "-u admin:" in block or "-u admin'" in block or "$OPENSEARCH_USER" in block, ( + "OpenSearch command must include basic auth (-u admin: or $OPENSEARCH_USER)" + ) + + +# ========================================================================= +# Property 4: Prometheus curl command protocol +# Validates: Requirements 8.3 +# ========================================================================= + + +def _collect_prometheus_curl_commands() -> list[tuple[str, str]]: + """Extract all curl commands targeting Prometheus (port 9090) across all skill files. + + Excludes AWS SigV4 variant commands. 
+ """ + results: list[tuple[str, str]] = [] + for skill_path in SKILL_FILES: + text = _read_skill(skill_path) + blocks = _extract_bash_code_blocks(text) + for block in blocks: + if "9090" in block and "aws-sigv4" not in block: + first_line = block.strip().split("\n")[0][:60] + results.append((f"{skill_path.parent.name}: {first_line}", block)) + return results + + +_PROM_CURL_COMMANDS = _collect_prometheus_curl_commands() + + +@pytest.mark.parametrize( + "block", + [b for _, b in _PROM_CURL_COMMANDS], + ids=[label for label, _ in _PROM_CURL_COMMANDS], +) +def test_property_4_prometheus_curl_protocol(block: str) -> None: + """Every Prometheus curl command must use HTTP (not HTTPS).""" + # Find URLs targeting port 9090 + urls = re.findall(r"https?://[^\s'\"]+9090[^\s'\"]*", block) + for url in urls: + assert url.startswith("http://"), ( + f"Prometheus URL must use HTTP, not HTTPS: {url}" + ) + + +# ========================================================================= +# Property 5: PPL command documentation completeness +# Validates: Requirements 6.11 +# ========================================================================= + + +def _parse_ppl_command_sections() -> list[tuple[str, str]]: + """Parse ppl-reference.md and extract each command section (### heading). + + Returns (heading_text, section_body) pairs for command sections under + the ``## Commands`` top-level section. 
+ """ + path = SKILLS_DIR / "ppl-reference" / "SKILL.md" + if not path.exists(): + return [] + + text = _read_skill(path) + + # Find the ## Commands section + commands_match = re.search(r"^## Commands\s*$", text, re.MULTILINE) + if not commands_match: + return [] + + # Find the next ## section (or end of file) to bound the Commands block + commands_start = commands_match.end() + next_h2 = re.search(r"^## (?!Commands)", text[commands_start:], re.MULTILINE) + commands_text = text[commands_start : commands_start + next_h2.start()] if next_h2 else text[commands_start:] + + # Split on ### or #### headings to get individual command sections + # We look for #### headings (individual commands) within ### category groups + sections: list[tuple[str, str]] = [] + # Match #### headings (the actual command entries) + pattern = re.compile(r"^####\s+(.+)$", re.MULTILINE) + matches = list(pattern.finditer(commands_text)) + + for i, m in enumerate(matches): + heading = m.group(1).strip() + start = m.end() + end = matches[i + 1].start() if i + 1 < len(matches) else len(commands_text) + body = commands_text[start:end] + sections.append((heading, body)) + + return sections + + +_PPL_COMMAND_SECTIONS = _parse_ppl_command_sections() + + +@pytest.mark.parametrize( + "section_body", + [body for _, body in _PPL_COMMAND_SECTIONS], + ids=[heading for heading, _ in _PPL_COMMAND_SECTIONS], +) +def test_property_5_ppl_command_doc_completeness(section_body: str) -> None: + """Each PPL command section must have a code block, description text, and an observability example.""" + # Must have at least one code block (syntax/usage) + code_blocks = re.findall(r"```", section_body) + assert len(code_blocks) >= 2, "Command section must contain at least one code block" + + # Must have description text (non-empty text outside code blocks) + text_outside_blocks = re.sub(r"```[^`]*```", "", section_body, flags=re.DOTALL) + stripped = text_outside_blocks.strip() + assert len(stripped) > 10, "Command section 
must contain description text" + + # Must have at least one example using an observability index pattern + # Most commands use otel-v1-apm-*, but some (graphlookup) use otel-v2-apm-* + # and system commands (showdatasources) use system queries without index patterns + has_otel_index = "otel-v1-apm-" in section_body or "otel-v2-apm-" in section_body or "logs-otel-v1-" in section_body + has_system_query = "show datasources" in section_body.lower() or "describe" in section_body.lower() + assert has_otel_index or has_system_query, ( + "Command section must include at least one example using an observability index pattern " + "(otel-v1-apm-* or otel-v2-apm-*) or a system query" + ) + + +# ========================================================================= +# Property 6: PromQL curl command completeness +# Validates: Requirements 4.7 +# ========================================================================= + + +def _collect_promql_blocks_from_metrics() -> list[tuple[str, str]]: + """Extract code blocks from metrics.md that contain PromQL queries. + + Identifies PromQL by patterns: rate(, histogram_quantile(, sum(. 
+ """ + path = SKILLS_DIR / "metrics" / "SKILL.md" + if not path.exists(): + return [] + + text = _read_skill(path) + blocks = _extract_bash_code_blocks(text) + promql_pattern = re.compile(r"(rate\(|histogram_quantile\(|sum\()") + results: list[tuple[str, str]] = [] + for block in blocks: + if promql_pattern.search(block) and "aws-sigv4" not in block: + first_line = block.strip().split("\n")[0][:80] + results.append((f"metrics: {first_line}", block)) + return results + + +_PROMQL_METRICS_BLOCKS = _collect_promql_blocks_from_metrics() + + +@pytest.mark.parametrize( + "block", + [b for _, b in _PROMQL_METRICS_BLOCKS], + ids=[label for label, _ in _PROMQL_METRICS_BLOCKS], +) +def test_property_6_promql_curl_completeness(block: str) -> None: + """Every PromQL code block in metrics.md must target localhost:9090/api/v1/query.""" + assert "localhost:9090/api/v1/query" in block or "$PROMETHEUS_ENDPOINT/api/v1/query" in block, ( + "PromQL block must contain curl command targeting localhost:9090/api/v1/query or $PROMETHEUS_ENDPOINT/api/v1/query" + ) + + +# ========================================================================= +# Property 7: Recursive field lookup correctness +# Validates: Requirements 11.12 +# ========================================================================= + +# Strategy: generate nested dicts with string keys and arbitrary leaf values +_json_leaves = st.one_of( + st.none(), + st.booleans(), + st.integers(min_value=-1000, max_value=1000), + st.floats(allow_nan=False, allow_infinity=False), + st.text(min_size=0, max_size=10), +) + +_json_strategy = st.recursive( + _json_leaves, + lambda children: st.dictionaries( + st.text( + alphabet="abcdefghijklmnopqrstuvwxyz", + min_size=1, + max_size=5, + ), + children, + min_size=0, + max_size=4, + ), + max_leaves=20, +) + +# Strategy for dot-separated field paths +_path_strategy = st.lists( + st.text(alphabet="abcdefghijklmnopqrstuvwxyz", min_size=1, max_size=5), + min_size=1, + max_size=4, +).map(lambda 
parts: ".".join(parts)) + + +def _path_actually_exists(obj: object, path: str) -> bool: + """Ground-truth check: walk the dot-separated path through nested dicts.""" + keys = path.split(".") + current = obj + for key in keys: + if not isinstance(current, dict) or key not in current: + return False + current = current[key] + return True + + +@given(obj=_json_strategy, path=_path_strategy) +@settings(max_examples=100) +def test_property_7_recursive_field_lookup(obj: object, path: str) -> None: + """field_exists must return True iff the dot-separated path exists in the JSON object.""" + # Feature: claude-code-observability-plugin, Property 7: Recursive field lookup correctness + # **Validates: Requirements 11.12** + expected = _path_actually_exists(obj, path) + actual = field_exists(obj, path) + assert actual == expected, ( + f"field_exists({obj!r}, {path!r}) returned {actual}, expected {expected}" + ) + + +# ========================================================================= +# Property 8: Config loader .env parsing with fallback +# Validates: Requirements 11.15 +# ========================================================================= + +# Strategy: generate lines that look like .env content +_env_key = st.sampled_from([ + "OPENSEARCH_HOST", "OPENSEARCH_PORT", "OPENSEARCH_USER", + "OPENSEARCH_PASSWORD", "PROMETHEUS_PORT", +]) +_env_value = st.text( + alphabet="abcdefghijklmnopqrstuvwxyz0123456789_-.", + min_size=1, + max_size=20, +) +_env_line = st.one_of( + # Valid KEY=VALUE line + st.tuples(_env_key, _env_value).map(lambda kv: f"{kv[0]}={kv[1]}"), + # Comment line + st.just("# this is a comment"), + # Blank line + st.just(""), + # Garbage line (no =) + st.text(alphabet="abcdefghijklmnopqrstuvwxyz", min_size=1, max_size=10), +) +_env_content = st.lists(_env_line, min_size=0, max_size=10).map(lambda lines: "\n".join(lines)) + + +@given(content=_env_content) +@settings(max_examples=100) +def test_property_8_config_loader_env_parsing(content: str) -> None: + 
"""parse_env_file must return parsed values or fall back to DEFAULTS.""" + # Feature: claude-code-observability-plugin, Property 8: Config loader .env parsing with fallback + # **Validates: Requirements 11.15** + with tempfile.NamedTemporaryFile(mode="w", suffix=".env", delete=False) as f: + f.write(content) + tmp_path = f.name + + try: + parsed = parse_env_file(tmp_path) + + # Every parsed key-value must match what's in the file + for key, value in parsed.items(): + # The value should appear in the content as KEY=VALUE + assert f"{key}=" in content, ( + f"Parsed key {key!r} not found in .env content" + ) + + # For known default keys, verify fallback works + for key, default_val in DEFAULTS.items(): + if key in parsed: + # Parsed value should be a non-empty string + assert isinstance(parsed[key], str) + # If not parsed, the caller (load_config) would use the default + # We just verify parse_env_file doesn't crash + finally: + os.unlink(tmp_path) + + +# ========================================================================= +# Property 9: RED query completeness +# Validates: Requirements 13.13 +# ========================================================================= + + +def _collect_red_query_blocks() -> list[tuple[str, str]]: + """Extract query code blocks from apm-red.md. + + Identifies PromQL blocks (rate(, histogram_quantile(, sum() and + PPL blocks (source=otel-v1-apm-). Excludes AWS SigV4 variants. 
+ """ + path = SKILLS_DIR / "apm-red" / "SKILL.md" + if not path.exists(): + return [] + + text = _read_skill(path) + blocks = _extract_bash_code_blocks(text) + promql_pat = re.compile(r"(rate\(|histogram_quantile\()") + ppl_pat = re.compile(r"source=otel-v1-apm-") + + results: list[tuple[str, str]] = [] + for block in blocks: + if "aws-sigv4" in block: + continue + if promql_pat.search(block) or ppl_pat.search(block): + first_line = block.strip().split("\n")[0][:80] + results.append((f"apm-red: {first_line}", block)) + return results + + +_RED_QUERY_BLOCKS = _collect_red_query_blocks() + + +@pytest.mark.parametrize( + "block", + [b for _, b in _RED_QUERY_BLOCKS], + ids=[label for label, _ in _RED_QUERY_BLOCKS], +) +def test_property_9_red_query_completeness(block: str) -> None: + """RED query blocks must have correct curl commands for their query type.""" + promql_pat = re.compile(r"(rate\(|histogram_quantile\()") + ppl_pat = re.compile(r"source=otel-v1-apm-") + + is_ppl = ppl_pat.search(block) + is_promql = promql_pat.search(block) + + if is_promql and not is_ppl: + assert "localhost:9090" in block or "$PROMETHEUS_ENDPOINT" in block, ( + "PromQL RED block must target Prometheus at localhost:9090 or $PROMETHEUS_ENDPOINT" + ) + if is_ppl: + assert "-u admin:" in block or "-u admin'" in block or "$OPENSEARCH_USER" in block, ( + "PPL RED block must include OpenSearch basic auth or $OPENSEARCH_USER" + ) + + +# ========================================================================= +# Property 10: SLO recording rule validity +# Validates: Requirements 14.4, 14.5, 14.6 +# ========================================================================= + + +def _collect_slo_recording_rules() -> list[tuple[str, str]]: + """Extract YAML code blocks from slo-sli.md that contain recording rules. + + Looks for ```yaml blocks with a ``record:`` field. 
+ """ + path = SKILLS_DIR / "slo-sli" / "SKILL.md" + if not path.exists(): + return [] + + text = _read_skill(path) + yaml_blocks = re.findall(r"```yaml\n(.*?)```", text, re.DOTALL) + + results: list[tuple[str, str]] = [] + for block in yaml_blocks: + if "record:" in block: + # Parse the YAML to extract individual rules + try: + parsed = yaml.safe_load(block) + except yaml.YAMLError: + continue + if not isinstance(parsed, dict): + continue + groups = parsed.get("groups", []) + if not isinstance(groups, list): + continue + for group in groups: + rules = group.get("rules", []) + if not isinstance(rules, list): + continue + for rule in rules: + if "record" in rule: + record_name = rule.get("record", "unknown") + results.append((f"slo-sli: {record_name}", yaml.dump(rule))) + return results + + +_SLO_RECORDING_RULES = _collect_slo_recording_rules() + + +@pytest.mark.parametrize( + "rule_yaml", + [r for _, r in _SLO_RECORDING_RULES], + ids=[label for label, _ in _SLO_RECORDING_RULES], +) +def test_property_10_slo_recording_rule_validity(rule_yaml: str) -> None: + """Each SLO recording rule must have record with sli: prefix and an expr with PromQL.""" + rule = yaml.safe_load(rule_yaml) + assert isinstance(rule, dict), "Recording rule must be a YAML dict" + + # Must have 'record' field with sli: prefix + assert "record" in rule, "Recording rule must have a 'record' field" + record_name = rule["record"] + assert isinstance(record_name, str) and record_name.startswith("sli:"), ( + f"Recording rule 'record' must start with 'sli:' prefix, got: {record_name}" + ) + + # Must have 'expr' field with a PromQL expression + assert "expr" in rule, "Recording rule must have an 'expr' field" + expr = rule["expr"] + assert isinstance(expr, str) and len(expr.strip()) > 0, ( + "Recording rule 'expr' must be a non-empty PromQL expression" + ) diff --git a/claude-code-observability-plugin/tests/test_runner.py b/claude-code-observability-plugin/tests/test_runner.py new file mode 100644 index 
00000000..8bcbc6b0 --- /dev/null +++ b/claude-code-observability-plugin/tests/test_runner.py @@ -0,0 +1,227 @@ +"""YAML-driven test execution for observability plugin skill commands. + +Loads YAML fixture files from tests/fixtures/, validates each against the +Pydantic TestFixture model, and executes commands via subprocess with +configurable timeout. Uses pytest.mark.parametrize to generate one test +case per fixture and applies pytest markers based on fixture tags. +""" + +import json +import os +import subprocess +from pathlib import Path + +import pytest +import yaml + +from models import TestFixture + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +FIXTURES_DIR = Path(__file__).parent / "fixtures" +DEFAULT_TIMEOUT = 30 # seconds + +# Tag string → pytest marker mapping +TAG_MARKER_MAP = { + "traces": "traces", + "logs": "logs", + "metrics": "metrics", + "stack-health": "stack_health", + "stack_health": "stack_health", + "ppl": "ppl", + "correlation": "correlation", + "apm_red": "apm_red", + "slo_sli": "slo_sli", + "topology": "topology", + "osd_config": "osd_config", + "osd_dashboards": "osd_dashboards", +} + +# --------------------------------------------------------------------------- +# Fixture loading helpers +# --------------------------------------------------------------------------- + + +def _load_fixtures() -> list[TestFixture]: + """Load and validate all YAML fixture files from the fixtures directory.""" + fixtures: list[TestFixture] = [] + if not FIXTURES_DIR.is_dir(): + return fixtures + + for yaml_path in sorted(FIXTURES_DIR.glob("*.yaml")): + with open(yaml_path, encoding="utf-8") as fh: + raw = yaml.safe_load(fh) + if raw is None: + continue + if not isinstance(raw, list): + raw = [raw] + for entry in raw: + fixtures.append(TestFixture(**entry)) + return fixtures + + +def _fixture_ids(fixtures: list[TestFixture]) -> 
list[str]: + """Return human-readable test IDs from fixture names.""" + return [f.name for f in fixtures] + + +# --------------------------------------------------------------------------- +# Recursive field lookup +# --------------------------------------------------------------------------- + + +def field_exists(obj: object, path: str) -> bool: + """Check whether a dot-separated *path* exists in a nested dict. + + >>> field_exists({"data": {"result": [1]}}, "data.result") + True + >>> field_exists({"data": {"result": [1]}}, "data.missing") + False + """ + keys = path.split(".") + current = obj + for key in keys: + if not isinstance(current, dict) or key not in current: + return False + current = current[key] + return True + + +# --------------------------------------------------------------------------- +# Marker application +# --------------------------------------------------------------------------- + + +def _apply_markers(fixture: TestFixture) -> list[pytest.MarkDecorator]: + """Derive pytest markers from fixture tags.""" + markers: list[pytest.MarkDecorator] = [] + for tag in fixture.tags: + marker_name = TAG_MARKER_MAP.get(tag) + if marker_name is not None: + markers.append(getattr(pytest.mark, marker_name)) + return markers + + +# --------------------------------------------------------------------------- +# Parametrized test +# --------------------------------------------------------------------------- + +_ALL_FIXTURES = _load_fixtures() + + +@pytest.mark.parametrize( + "fixture", + _ALL_FIXTURES, + ids=_fixture_ids(_ALL_FIXTURES), +) +def test_fixture(fixture: TestFixture) -> None: + """Execute a single YAML-defined test fixture. + + Steps: + 1. Run ``before_test`` hook (if present). + 2. Execute the main command. + 3. Run ``after_test`` hook (if present). + 4. Assert exit code is 0. + 5. Parse stdout as JSON. + 6. Assert all ``expected_fields`` exist in the response. 
+ """ + timeout = int(os.environ.get("TEST_TIMEOUT", DEFAULT_TIMEOUT)) + + # --- before_test hook ---------------------------------------------------- + if fixture.before_test: + before = subprocess.run( + fixture.before_test, + shell=True, + capture_output=True, + timeout=timeout, + ) + assert before.returncode == 0, ( + f"before_test hook failed (rc={before.returncode}): " + f"{before.stderr.decode()}" + ) + + # --- Main command -------------------------------------------------------- + result = subprocess.run( + fixture.command, + shell=True, + capture_output=True, + timeout=timeout, + ) + assert result.returncode == 0, ( + f"Command failed (rc={result.returncode}): {result.stderr.decode()}" + ) + + # --- after_test hook ----------------------------------------------------- + if fixture.after_test: + after = subprocess.run( + fixture.after_test, + shell=True, + capture_output=True, + timeout=timeout, + ) + assert after.returncode == 0, ( + f"after_test hook failed (rc={after.returncode}): " + f"{after.stderr.decode()}" + ) + + # --- Parse and validate JSON response ------------------------------------ + stdout = result.stdout.decode() + try: + response = json.loads(stdout) + except json.JSONDecodeError as exc: + pytest.fail(f"Response is not valid JSON: {exc}\nstdout: {stdout[:500]}") + + # --- Assert expected fields exist ---------------------------------------- + missing = [f for f in fixture.expected_fields if not field_exists(response, f)] + assert not missing, ( + f"Missing expected fields in response: {missing}\n" + f"Response keys: {list(response.keys()) if isinstance(response, dict) else type(response)}" + ) + + # --- Assert minimum result count (if specified) ------------------------- + if fixture.expected_min_results is not None: + # Generic list-type responses (e.g., _cat/indices JSON array) + if isinstance(response, list): + actual = len(response) + assert actual >= fixture.expected_min_results, ( + f"Expected at least {fixture.expected_min_results} 
items in " + f"response array, got {actual}" + ) + # PPL responses: {"schema": [...], "datarows": [[...], ...], "total": N} + elif "datarows" in response: + actual = len(response["datarows"]) + assert actual >= fixture.expected_min_results, ( + f"Expected at least {fixture.expected_min_results} datarows, " + f"got {actual}" + ) + # Prometheus responses: {"status": "success", "data": {"result": [...]}} + elif ( + isinstance(response.get("data"), dict) + and "result" in response["data"] + ): + actual = len(response["data"]["result"]) + assert actual >= fixture.expected_min_results, ( + f"Expected at least {fixture.expected_min_results} results in " + f"data.result, got {actual}" + ) + + +# --------------------------------------------------------------------------- +# Dynamic marker application via pytest_collection_modifyitems +# --------------------------------------------------------------------------- + + +def pytest_collection_modifyitems(items: list[pytest.Item]) -> None: + """Apply pytest markers to parametrized test items based on fixture tags.""" + for item in items: + # The fixture is stored in callspec params by parametrize + fixture = getattr(item, "callspec", None) + if fixture is None: + continue + fixture_obj = fixture.params.get("fixture") + if not isinstance(fixture_obj, TestFixture): + continue + for marker in _apply_markers(fixture_obj): + item.add_marker(marker) diff --git a/docs/starlight-docs/astro.config.mjs b/docs/starlight-docs/astro.config.mjs index 08b6918a..bfc06cb6 100644 --- a/docs/starlight-docs/astro.config.mjs +++ b/docs/starlight-docs/astro.config.mjs @@ -144,6 +144,11 @@ export default defineConfig({ collapsed: true, autogenerate: { directory: 'forecasting' }, }, + { + label: 'Claude Code', + collapsed: true, + autogenerate: { directory: 'claude-code' }, + }, ], }), ], diff --git a/docs/starlight-docs/src/content/docs/claude-code/index.md b/docs/starlight-docs/src/content/docs/claude-code/index.md new file mode 100644 index 
00000000..d6f34d56 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/claude-code/index.md @@ -0,0 +1,200 @@ +--- +title: Claude Code +description: Give Claude observability skills for querying traces, logs, and metrics from your OpenSearch stack +--- + +The Claude Code Observability Plugin teaches Claude how to query and investigate traces, logs, and metrics from your OpenSearch-based observability stack. It provides eight skill files containing PPL query templates for OpenSearch, PromQL query templates for Prometheus, and ready-to-execute curl commands. + +The plugin follows the open [Agent Skills](https://agentskills.io/) specification, so the same skill files work across Claude Code (CLI), Claude for VS Code, and Claude Desktop. + +## What the plugin provides + +| Skill | What it does | +|---|---| +| **Traces** | PPL queries for agent invocations, tool executions, slow spans, error spans, token usage, service maps, remote service identification | +| **Logs** | PPL queries for severity filtering, trace correlation, error patterns, log volume, full-text search | +| **Metrics** | PromQL queries for HTTP rates, latency percentiles, error rates, GenAI metrics | +| **Stack Health** | Health checks, troubleshooting, port reference, diagnostic commands | +| **PPL Reference** | 50+ PPL commands with syntax, examples, and function reference | +| **Correlation** | Cross-signal workflows linking traces, logs, and metrics with batch correlation and `coalesce()` patterns | +| **APM RED** | Rate/Errors/Duration methodology queries with safe division, `topk()`, and availability patterns | +| **SLO/SLI** | SLI definitions, recording rules, error budgets, burn rate alerts | + +## Prerequisites + +- A running [Observability Stack](/docs/get-started/installation/) (or any OpenSearch + Prometheus setup) +- One of: Claude Code CLI, Claude for VS Code, or Claude Desktop + +## Installation + +### Claude Code (CLI and VS Code extension) + +First, add the repository as a plugin 
marketplace:

```
/plugin marketplace add https://github.com/opensearch-project/observability-stack.git
```

Then install the plugin (the marketplace registered above is named `observability-stack`):

```
/plugin install observability@observability-stack
```

All eight skills are registered and Claude automatically routes to the right one based on your question.

Verify the skills loaded:

```
/skills
```

### Claude Desktop

Claude Desktop supports custom skills through **Settings → Capabilities → Skills**. Each skill must be uploaded as a separate ZIP file.

Pre-built ZIP files are attached to each [GitHub release](https://github.com/opensearch-project/observability-stack/releases) — one per skill:

| ZIP file | Skill |
|---|---|
| `traces.zip` | Trace querying and investigation |
| `logs.zip` | Log searching and correlation |
| `metrics.zip` | PromQL metrics queries |
| `stack-health.zip` | Health checks and troubleshooting |
| `ppl-reference.zip` | PPL syntax reference |
| `correlation.zip` | Cross-signal correlation |
| `apm-red.zip` | RED methodology metrics |
| `slo-sli.zip` | SLO/SLI definitions and alerts |

To install:

1. Download or clone the repository:
   ```bash
   git clone https://github.com/opensearch-project/observability-stack.git
   ```

2. In Claude Desktop, go to **Settings → Capabilities → Skills**

3. Click **Add** and upload each ZIP file from `claude-code-observability-plugin/dist/`

4. Enable each skill after uploading

:::note
Claude Desktop requires one ZIP per skill — you cannot bundle all skills into a single ZIP. Upload all eight for the full observability experience.
:::

## Try it out

Once installed, start asking questions:

```
Show me the slowest traces from the last hour
```

```
What's the error rate across services?
+``` + +``` +Check if the observability stack is healthy +``` + +``` +Calculate the error budget for a 99.9% availability SLO +``` + +See the [Usage Guide](/docs/claude-code/usage/) for 50+ sample questions across all eight skills. + +## Configuration + +### Default endpoints + +The plugin defaults to the local Observability Stack: + +| Service | Endpoint | Auth | +|---|---|---| +| OpenSearch | `https://localhost:9200` | Basic auth (`admin` / `My_password_123!@#`), skip TLS verify | +| Prometheus | `http://localhost:9090` | None | + +### Custom endpoints + +Set environment variables to override defaults: + +```bash +export OPENSEARCH_ENDPOINT=https://your-opensearch-host:9200 +export PROMETHEUS_ENDPOINT=http://your-prometheus-host:9090 +``` + +### AWS managed services + +The skill files include AWS SigV4 variants for Amazon OpenSearch Service and Amazon Managed Service for Prometheus. When using managed services, the query syntax stays the same — only the endpoint URL and authentication method change. + +## Index patterns + +The plugin queries these OpenSearch indices: + +| Signal | Index Pattern | Key Fields | +|---|---|---| +| Traces | `otel-v1-apm-span-*` | `traceId`, `spanId`, `serviceName`, `name`, `durationInNanos`, `status.code`, `attributes.gen_ai.*` | +| Logs | `logs-otel-v1-*` | `traceId`, `spanId`, `severityText`, `body`, `resource.attributes.service.name`, `@timestamp` | +| Service Maps | `otel-v2-apm-service-map-*` | `sourceNode`, `targetNode`, `sourceOperation`, `targetOperation` | + +:::note +The log index uses `resource.attributes.service.name` (backtick-quoted in PPL as `` `resource.attributes.service.name` ``) instead of the top-level `serviceName` field found in the trace span index. 
+::: + +## Running the tests + +The plugin includes a test suite to validate skill file correctness: + +```bash +cd claude-code-observability-plugin/tests +pip install -r requirements.txt + +# Property tests (no running stack needed) +pytest test_properties.py -v + +# Integration tests (requires running stack) +pytest -v +``` + +## Troubleshooting + +### "Observability stack is not running" + +Tests and skills require OpenSearch and Prometheus to be running locally: + +```bash +docker compose up -d opensearch prometheus +``` + +### OpenSearch returns "Unauthorized" + +Check the password in `.env` matches the default: `My_password_123!@#` + +### No trace or log data + +The stack includes example services that generate telemetry automatically. Verify they're running: + +```bash +docker compose ps | grep -E "canary|weather|travel" +``` + +### Prometheus crash-looping (exit code 137) + +Clear the corrupted WAL data: + +```bash +docker compose stop prometheus +docker compose rm -f prometheus +docker volume rm observability-stack_prometheus-data +docker compose up -d prometheus +``` + +## Related links + +- [Usage Guide](/docs/claude-code/usage/) — 50+ sample questions with real examples +- [MCP Server](/docs/mcp/) — query OpenSearch via Model Context Protocol +- [Investigate Traces](/docs/investigate/discover-traces/) — explore traces in OpenSearch Dashboards +- [Investigate Logs](/docs/investigate/discover-logs/) — explore logs in OpenSearch Dashboards +- [Send Data](/docs/send-data/) — instrument your applications with OpenTelemetry diff --git a/docs/starlight-docs/src/content/docs/claude-code/showcase.md b/docs/starlight-docs/src/content/docs/claude-code/showcase.md new file mode 100644 index 00000000..801cd6c0 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/claude-code/showcase.md @@ -0,0 +1,376 @@ +--- +title: Showcase +description: Real-world examples demonstrating the power of Claude Code with observability data +--- + +These examples demonstrate real 
investigative workflows using Claude Code with the observability plugin. Each scenario shows natural language questions and the actual queries Claude executes behind the scenes. + +## AI Agent Cost Analysis + +Track token consumption across LLM models to optimize costs. + +``` +> Which LLM model is consuming the most tokens? +``` + +Claude queries trace spans for GenAI token usage: + +```sql +source=otel-v1-apm-span-* +| where `attributes.gen_ai.usage.input_tokens` > 0 +| stats sum(`attributes.gen_ai.usage.input_tokens`) as input_tokens, + sum(`attributes.gen_ai.usage.output_tokens`) as output_tokens + by `attributes.gen_ai.request.model` +| eval total_tokens = input_tokens + output_tokens +| sort - total_tokens +``` + +**Follow-up:** + +``` +> Compare the cost efficiency — which model has the best output-to-input ratio? +``` + +```sql +source=otel-v1-apm-span-* +| where `attributes.gen_ai.usage.input_tokens` > 0 +| stats sum(`attributes.gen_ai.usage.input_tokens`) as input, + sum(`attributes.gen_ai.usage.output_tokens`) as output, + count() as calls, avg(durationInNanos) as avg_latency + by `attributes.gen_ai.request.model` +| eval efficiency = output * 1.0 / input +| sort - efficiency +``` + +--- + +## Incident Investigation: E-Commerce Checkout Failures + +A real scenario: checkout errors are spiking. Use Claude to go from alert to root cause. + +**Step 1 — Detect the problem:** + +``` +> What is the current error rate for the checkout service? 
+``` + +```promql +sum(rate(http_server_duration_seconds_count{ + http_response_status_code=~"5..", service_name="checkout" +}[5m])) +/ +clamp_min(sum(rate(http_server_duration_seconds_count{ + service_name="checkout" +}[5m])), 1) +* 100 +``` + +**Step 2 — Find the failing operations:** + +``` +> Show me the error spans from checkout sorted by time +``` + +```sql +source=otel-v1-apm-span-* +| where serviceName = 'checkout' AND `status.code` = 2 +| fields traceId, spanId, name, `events.attributes.exception.type`, + `events.attributes.exception.message`, startTime +| sort - startTime | head 20 +``` + +**Step 3 — Trace a specific failure:** + +``` +> Show me the full trace tree for that traceId +``` + +```sql +source=otel-v1-apm-span-* +| where traceId = '' +| fields spanId, parentSpanId, serviceName, name, durationInNanos, + `status.code`, startTime +| sort startTime +``` + +**Step 4 — Find correlated logs:** + +``` +> Show me the logs for that trace +``` + +```sql +source=logs-otel-v1-* +| where traceId = '' +| fields spanId, severityText, body, `resource.attributes.service.name`, + `@timestamp` +| sort `@timestamp` +``` + +**Step 5 — Check if this is a new problem:** + +``` +> What was the error rate trend for checkout over the last 6 hours? +``` + +```promql +sum(rate(http_server_duration_seconds_count{ + http_response_status_code=~"5..", service_name="checkout" +}[5m])) +``` + +--- + +## Multi-Agent Orchestration Debugging + +When a travel planner agent fans out to weather and events sub-agents, trace the entire orchestration. 
+ +``` +> Show me all Travel Planner agent invocations and their sub-agent calls +``` + +```sql +source=otel-v1-apm-span-* +| where `attributes.gen_ai.operation.name` = 'invoke_agent' +| stats count() as invocations, avg(durationInNanos) as avg_duration_ns + by `attributes.gen_ai.agent.name` +| eval avg_duration_ms = avg_duration_ns / 1000000 +| sort - invocations +``` + +**Follow-up — Find slow orchestrations:** + +``` +> Which Travel Planner invocations took longer than 30 seconds? +``` + +```sql +source=otel-v1-apm-span-* +| where `attributes.gen_ai.operation.name` = 'invoke_agent' + AND `attributes.gen_ai.agent.name` = 'Travel Planner' + AND durationInNanos > 30000000000 +| fields traceId, spanId, durationInNanos, startTime +| sort - durationInNanos +``` + +**Drill into a slow trace:** + +``` +> Show me all spans in that trace — I want to see which sub-agent was slow +``` + +```sql +source=otel-v1-apm-span-* +| where traceId = '' +| fields spanId, parentSpanId, serviceName, name, + `attributes.gen_ai.agent.name`, `attributes.gen_ai.operation.name`, + durationInNanos +| eval duration_ms = durationInNanos / 1000000 +| sort startTime +``` + +--- + +## Service Dependency Discovery + +Automatically discover what each service depends on, even when different instrumentation libraries use different attributes. + +``` +> What remote services does the checkout service call? 
+```
+
+```sql
+source=otel-v1-apm-span-*
+| where serviceName = 'checkout' AND kind = 'SPAN_KIND_CLIENT'
+| eval _remoteService = coalesce(
+    `attributes.net.peer.name`,
+    `attributes.server.address`,
+    `attributes.rpc.service`,
+    `attributes.db.system`,
+    `attributes.gen_ai.system`,
+    'unknown')
+| stats count() as calls, avg(durationInNanos) as avg_latency_ns
+    by _remoteService
+| eval avg_latency_ms = avg_latency_ns / 1000000
+| sort - calls
+```
+
+**Follow-up — Check database performance:**
+
+```
+> Show me the slowest database queries from checkout
+```
+
+```sql
+source=otel-v1-apm-span-*
+| where serviceName = 'checkout'
+    AND `attributes.db.system` != ''
+| fields name, `attributes.db.system`, `attributes.db.statement`,
+    durationInNanos
+| eval duration_ms = durationInNanos / 1000000
+| sort - duration_ms | head 20
+```
+
+---
+
+## Error Budget Monitoring
+
+Track SLO compliance and error budget consumption for SRE workflows.
+
+```
+> How much error budget do we have left for a 99.9% SLO?
+```
+
+```promql
+1 - (
+  sum(rate(http_server_duration_seconds_count{
+    http_response_status_code=~"5.."}[30m]))
+  / sum(rate(http_server_duration_seconds_count[30m]))
+  / (1 - 0.999)
+)
+```
+
+**Follow-up — Check burn rate:**
+
+```
+> Are we burning error budget too fast? Show me the burn rate.
+```
+
+```promql
+sum(rate(http_server_duration_seconds_count{
+  http_response_status_code=~"5.."}[1h]))
+/ sum(rate(http_server_duration_seconds_count[1h]))
+/ (1 - 0.999)
+```
+
+A burn rate above 14.4x over 1 hour indicates a fast burn that would exhaust a 30-day error budget in roughly 2 days.
+
+---
+
+## Log Pattern Discovery
+
+Find recurring error patterns across services to prioritize fixes.
+
+```
+> What are the most common error messages across all services?
+``` + +```sql +source=logs-otel-v1-* +| where severityText = 'ERROR' +| stats count() as occurrences + by body, `resource.attributes.service.name` +| sort - occurrences | head 20 +``` + +**Follow-up — Trend analysis:** + +``` +> How has the error volume changed hour by hour today? +``` + +```sql +source=logs-otel-v1-* +| where severityText = 'ERROR' +| stats count() as error_count + by span(`@timestamp`, 1h), `resource.attributes.service.name` +| sort `span(`@timestamp`, 1h)` +``` + +--- + +## Cross-Service Latency Investigation + +Find where time is being spent across service boundaries. + +``` +> Show me the top 5 services with the worst p99 latency +``` + +```promql +topk(5, + histogram_quantile(0.99, + sum(rate(http_server_duration_seconds_bucket[5m])) + by (le, service_name))) +``` + +**Follow-up — Drill into the slowest service:** + +``` +> What operations on the frontend service are the slowest? +``` + +```promql +topk(10, + histogram_quantile(0.95, + sum(rate(http_server_duration_seconds_bucket{ + service_name="frontend" + }[5m])) by (le, http_route))) +``` + +**Then correlate with traces:** + +``` +> Show me the actual slow traces from the frontend for that route +``` + +```sql +source=otel-v1-apm-span-* +| where serviceName = 'frontend' + AND kind = 'SPAN_KIND_SERVER' + AND durationInNanos > 1000000000 +| fields traceId, name, durationInNanos, startTime +| eval duration_ms = durationInNanos / 1000000 +| sort - duration_ms | head 20 +``` + +--- + +## Tool Execution Analysis + +Debug AI agent tool calls — what's failing, what's slow, and why. + +``` +> Which tools are failing the most? 
+``` + +```sql +source=otel-v1-apm-span-* +| where `attributes.gen_ai.operation.name` = 'execute_tool' + AND `status.code` = 2 +| stats count() as failures by `attributes.gen_ai.tool.name` +| sort - failures +``` + +**Follow-up — Inspect a failing tool:** + +``` +> Show me the arguments and results for the last 10 get_current_weather calls +``` + +```sql +source=otel-v1-apm-span-* +| where `attributes.gen_ai.tool.name` = 'get_current_weather' +| fields traceId, `attributes.gen_ai.tool.call.arguments`, + `attributes.gen_ai.tool.call.result`, + `status.code`, durationInNanos, startTime +| sort - startTime | head 10 +``` + +--- + +## Comparing All Services at a Glance + +Get a complete RED dashboard for every service in one investigation. + +``` +> Give me a health dashboard — show rate, error rate, and p95 latency for all services +``` + +Claude runs three PromQL queries simultaneously and presents a unified view: + +| Query | Signal | +|---|---| +| `sum(rate(http_server_duration_seconds_count[5m])) by (service_name)` | Rate | +| `sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])) by (service_name) / clamp_min(sum(rate(http_server_duration_seconds_count[5m])) by (service_name), 1) * 100` | Error % | +| `histogram_quantile(0.95, sum(rate(http_server_duration_seconds_bucket[5m])) by (le, service_name))` | p95 Latency | diff --git a/docs/starlight-docs/src/content/docs/claude-code/usage.md b/docs/starlight-docs/src/content/docs/claude-code/usage.md new file mode 100644 index 00000000..612fee9e --- /dev/null +++ b/docs/starlight-docs/src/content/docs/claude-code/usage.md @@ -0,0 +1,249 @@ +--- +title: Usage Guide +description: Sample questions and real-world workflows for each observability skill +--- + +This guide shows how to use the Claude Code Observability Plugin through natural language. Each section demonstrates a skill with sample questions and what Claude does behind the scenes. 
+ +## Traces + +Query distributed trace data to understand how requests flow through services and AI agents. + +### Sample questions + +**Service overview:** +- "Which services have the most trace spans?" +- "How many distinct operations does each service have?" + +**GenAI agent analysis:** +- "How many times was each AI agent invoked?" +- "What is the average response time for the Travel Planner agent?" +- "Show me token usage by model — which model consumes the most tokens?" +- "Find the slowest agent invocations in the last hour" + +**Error investigation:** +- "Show me all error spans from the checkout service" +- "Which services have the most errors?" +- "Find failed tool executions — what tools are failing?" + +**Latency analysis:** +- "Find all spans taking longer than 5 seconds" +- "What is the p95 duration for each service?" + +**Service dependencies:** +- "What remote services does the frontend call?" +- "Show me the service dependency map" +- "How many downstream dependencies does each service have?" + +### What Claude does + +When you ask *"Show me token usage by model"*, Claude runs this PPL query: + +```sql +source=otel-v1-apm-span-* +| where `attributes.gen_ai.usage.input_tokens` > 0 +| stats sum(`attributes.gen_ai.usage.input_tokens`) as total_input, + sum(`attributes.gen_ai.usage.output_tokens`) as total_output + by `attributes.gen_ai.request.model` +``` + +When you ask *"What remote services does the frontend call?"*, Claude uses `coalesce()` across multiple OTel attributes: + +```sql +source=otel-v1-apm-span-* +| where serviceName = 'frontend' | where kind = 'SPAN_KIND_CLIENT' +| eval _remoteService = coalesce( + `attributes.net.peer.name`, `attributes.server.address`, + `attributes.rpc.service`, `attributes.db.system`, + `attributes.gen_ai.system`, '') +| where _remoteService != '' +| stats count() as calls by _remoteService | sort - calls +``` + +--- + +## Logs + +Search, filter, and analyze log entries across all services. 
+ +### Sample questions + +**Severity filtering:** +- "Show me all ERROR logs" +- "How many errors does each service have?" +- "Show me WARN and ERROR logs from the last hour" + +**Full-text search:** +- "Find all logs mentioning 'timeout'" +- "Search for logs containing 'connection refused'" + +**Error analysis:** +- "Which service has the most error logs?" +- "Show me the error log breakdown by service and severity" + +**Log volume:** +- "Show me log volume over time in hourly buckets" +- "Show the error rate trend over the last 24 hours" + +### What Claude does + +When you ask *"Which service has the most error logs?"*, Claude runs: + +```sql +source=logs-otel-v1-* +| where severityText = 'ERROR' +| stats count() as errors by `resource.attributes.service.name` +| sort - errors +``` + +:::note +The log index uses `resource.attributes.service.name` instead of the top-level `serviceName` field found in trace spans. +::: + +--- + +## Metrics + +Query Prometheus for HTTP rates, latency percentiles, and GenAI-specific metrics. + +### Sample questions + +- "What is the current request rate for each service?" +- "Show me p95 and p99 latency for all services" +- "What is the 5xx error rate by service?" +- "Show me GenAI token usage rate by model" +- "Which services have the highest error rates?" + +### What Claude does + +When you ask *"What is the p95 latency?"*, Claude runs: + +```promql +histogram_quantile(0.95, + sum(rate(http_server_duration_seconds_bucket[5m])) by (le, service_name)) +``` + +--- + +## Stack Health + +Check component health, verify data ingestion, and troubleshoot issues. + +### Sample questions + +- "Is the observability stack healthy?" +- "How many trace spans and logs are in the system?" +- "List all OpenSearch indices" +- "Check the Prometheus scrape targets" + +--- + +## Correlation + +Connect traces, logs, and metrics for end-to-end incident investigation. 
+ +### Sample questions + +- "Find all logs for trace ID abc123def456" +- "Show me error logs that have trace context" +- "Compare span counts vs log counts for each service" +- "Which traces are associated with 'connection refused' errors?" + +### Real-world workflow + +**"I see high error rates — what's happening?"** + +1. Claude checks Prometheus error rate by service +2. Identifies the service with elevated errors (e.g., `weather-agent`) +3. Queries error logs for that service +4. Extracts traceId from error logs +5. Reconstructs the full trace tree +6. Shows the complete timeline from metric spike to root cause + +--- + +## APM RED + +Rate, Errors, and Duration metrics for service-level monitoring. + +### Sample questions + +- "Show me RED metrics for all services" +- "What is the request rate, error rate, and p95 latency for checkout?" +- "Which service has the highest error rate?" +- "Show me the top 5 services by fault rate" +- "What is the availability for each service?" + +### What Claude does + +Claude runs three PromQL queries with safe division patterns: + +```promql +-- Rate +sum(rate(http_server_duration_seconds_count[5m])) by (service_name) + +-- Errors (with clamp_min to avoid division by zero) +sum(rate(http_server_duration_seconds_count{http_response_status_code=~"5.."}[5m])) by (service_name) +/ clamp_min(sum(rate(http_server_duration_seconds_count[5m])) by (service_name), 1) * 100 + +-- Duration +histogram_quantile(0.95, + sum(rate(http_server_duration_seconds_bucket[5m])) by (le, service_name)) +``` + +--- + +## SLO/SLI + +Define, measure, and alert on service level objectives. + +### Sample questions + +- "What is the current availability SLI for all services?" +- "What percentage of requests complete within 500ms?" +- "How much error budget do we have remaining for a 99.9% SLO?" +- "What is the current burn rate?" 
+- "Help me set up SLO recording rules for Prometheus" + +--- + +## PPL Reference + +Claude's built-in guide for constructing novel PPL queries beyond the standard templates. + +### Sample questions + +- "How do I write a PPL query to join traces with logs?" +- "Show me the PPL syntax for regex field extraction" +- "How do I use timechart to visualize error trends?" +- "Help me write a query to find the top 10 slowest operations per service" + +--- + +## Power user tips + +### Combining skills + +Ask questions that span multiple skills — Claude automatically routes to the right ones: + +- "The checkout service is slow. Show me its p95 latency, recent error logs, and the slowest traces." +- "Compare the error rate in Prometheus with actual error spans in OpenSearch" +- "An agent is failing — show me the traces, associated logs, and token usage" + +### Iterative investigation + +Claude remembers context within a conversation, so you can drill down: + +``` +> Show me services with error spans + → weather-agent has 150 errors + +> Show me the error spans from weather-agent + → most errors are "External API returned 503" + +> Find the traces for those errors and show me the associated logs + → correlates traces → logs + +> What was the error rate trend for weather-agent over the last 6 hours? + → queries Prometheus for the time series +```