diff --git a/README.md b/README.md index bccb2c1d..fa42d9a3 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Observability Stack is an open-source stack designed for modern distributed syst - **OpenSearch**: Stores and indexes logs and traces for search and analysis - **Prometheus**: Stores time-series metrics data - **OpenSearch Dashboards**: Provides web-based visualization and exploration +- **PPL (Piped Processing Language)**: Native query language for logs and traces — pipe-based, human-readable, 50+ commands ## See it in action @@ -441,8 +442,22 @@ The current configuration includes a custom OpenSearch Dockerfile (`docker-compo Track progress: [OpenSearch 3.5.0 Release](https://github.com/opensearch-project/OpenSearch/releases) +## Query Language: PPL + +The Observability Stack uses **Piped Processing Language (PPL)** as its native query language for logs and traces. PPL is a pipe-based language designed for the way operators actually investigate data: + +``` +source = logs-otel-v1* +| where severityNumber >= 17 +| stats count() as errors by `resource.attributes.service.name` +| sort - errors +``` + +PPL provides 50+ commands and 200+ functions covering search, aggregation, pattern discovery, machine learning, joins, and more. See the [PPL documentation](https://observability.opensearch.org/docs/ppl/) for the full reference with live playground examples. 
+ ## Documentation +- [PPL Language Reference](https://observability.opensearch.org/docs/ppl/) - Query language documentation with live examples - [AGENTS.md](AGENTS.md) - AI-optimized repository documentation - [CONTRIBUTING.md](CONTRIBUTING.md) - Development workflow and contribution guidelines - [examples/](examples/) - Language-specific instrumentation examples diff --git a/docs/starlight-docs/astro.config.mjs b/docs/starlight-docs/astro.config.mjs index f3a92c1f..96d451ae 100644 --- a/docs/starlight-docs/astro.config.mjs +++ b/docs/starlight-docs/astro.config.mjs @@ -63,24 +63,6 @@ export default defineConfig({ }, ], }, - { - label: 'Agent Observability', - collapsed: true, - items: [ - { label: 'Overview', link: '/ai-observability/' }, - { label: 'Getting Started', link: '/ai-observability/getting-started/' }, - { label: 'Framework Integrations', link: '/send-data/ai-agents/integrations/' }, - { label: 'Agent Tracing', link: '/ai-observability/agent-tracing/' }, - { label: 'Agent Graph & Path', link: '/ai-observability/agent-tracing/graph/' }, - { label: 'Evaluation & Scoring', link: '/ai-observability/evaluation/' }, - { label: 'Evaluation Integrations', link: '/ai-observability/evaluation-integrations/' }, - ], - }, - { - label: 'Agent Health', - collapsed: true, - autogenerate: { directory: 'agent-health' }, - }, { label: 'Send Data', collapsed: true, @@ -104,11 +86,109 @@ export default defineConfig({ }, ], }, + { + label: 'PPL - Query Language', + collapsed: true, + items: [ + { label: 'Overview', link: '/ppl/' }, + { label: 'Command Reference', link: '/ppl/commands/' }, + { + label: 'Search & Filter', + collapsed: true, + items: [ + { label: 'search', link: '/ppl/commands/search/' }, + { label: 'where', link: '/ppl/commands/where/' }, + ], + }, + { + label: 'Fields & Transformation', + collapsed: true, + items: [ + { label: 'fields', link: '/ppl/commands/fields/' }, + { label: 'eval', link: '/ppl/commands/eval/' }, + { label: 'rename', link: 
'/ppl/commands/rename/' }, + { label: 'fillnull', link: '/ppl/commands/fillnull/' }, + { label: 'expand', link: '/ppl/commands/expand/' }, + { label: 'flatten', link: '/ppl/commands/flatten/' }, + ], + }, + { + label: 'Aggregation & Statistics', + collapsed: true, + items: [ + { label: 'stats', link: '/ppl/commands/stats/' }, + { label: 'eventstats', link: '/ppl/commands/eventstats/' }, + { label: 'streamstats', link: '/ppl/commands/streamstats/' }, + { label: 'timechart', link: '/ppl/commands/timechart/' }, + { label: 'trendline', link: '/ppl/commands/trendline/' }, + ], + }, + { + label: 'Sorting & Limiting', + collapsed: true, + items: [ + { label: 'sort', link: '/ppl/commands/sort/' }, + { label: 'head', link: '/ppl/commands/head/' }, + { label: 'dedup', link: '/ppl/commands/dedup/' }, + { label: 'top', link: '/ppl/commands/top/' }, + { label: 'rare', link: '/ppl/commands/rare/' }, + ], + }, + { + label: 'Text Extraction', + collapsed: true, + items: [ + { label: 'parse', link: '/ppl/commands/parse/' }, + { label: 'grok', link: '/ppl/commands/grok/' }, + { label: 'rex', link: '/ppl/commands/rex/' }, + { label: 'patterns', link: '/ppl/commands/patterns/' }, + { label: 'spath', link: '/ppl/commands/spath/' }, + ], + }, + { + label: 'Data Combination', + collapsed: true, + items: [ + { label: 'join', link: '/ppl/commands/join/' }, + { label: 'lookup', link: '/ppl/commands/lookup/' }, + ], + }, + { + label: 'Machine Learning', + collapsed: true, + items: [ + { label: 'ml', link: '/ppl/commands/ml/' }, + ], + }, + { + label: 'Metadata', + collapsed: true, + items: [ + { label: 'describe', link: '/ppl/commands/describe/' }, + ], + }, + { label: 'Function Reference', link: '/ppl/functions/' }, + { label: 'Observability Examples', link: '/ppl/examples/' }, + ], + }, { label: 'Discover', collapsed: true, autogenerate: { directory: 'investigate' }, }, + { + label: 'Agent Observability', + collapsed: true, + items: [ + { label: 'Overview', link: '/ai-observability/' }, + 
{ label: 'Getting Started', link: '/ai-observability/getting-started/' }, + { label: 'Framework Integrations', link: '/send-data/ai-agents/integrations/' }, + { label: 'Agent Tracing', link: '/ai-observability/agent-tracing/' }, + { label: 'Agent Graph & Path', link: '/ai-observability/agent-tracing/graph/' }, + { label: 'Evaluation & Scoring', link: '/ai-observability/evaluation/' }, + { label: 'Evaluation Integrations', link: '/ai-observability/evaluation-integrations/' }, + ], + }, { label: 'Application Monitoring', collapsed: true, @@ -120,7 +200,7 @@ export default defineConfig({ autogenerate: { directory: 'dashboards' }, }, { - label: 'Alerting & Detection', + label: 'Alerting', collapsed: true, items: [ { label: 'Alerting', link: '/alerting/' }, @@ -129,7 +209,12 @@ export default defineConfig({ ], }, { - label: 'Reference', + label: 'Agent Health', + collapsed: true, + autogenerate: { directory: 'agent-health' }, + }, + { + label: 'SDKs, MCP & Clients', collapsed: true, items: [ { label: 'Python SDK', link: '/send-data/ai-agents/python/' }, diff --git a/docs/starlight-docs/src/content/docs/agent-health/cli.md b/docs/starlight-docs/src/content/docs/agent-health/cli.md index 4ebaeceb..52e9b1d4 100644 --- a/docs/starlight-docs/src/content/docs/agent-health/cli.md +++ b/docs/starlight-docs/src/content/docs/agent-health/cli.md @@ -28,7 +28,7 @@ agent-health [serve] [options] |--------|-------------|---------| | `-p, --port ` | Server port | `4001` | | `-e, --env-file ` | Load env file | `.env` | -| `--no-browser` | Skip auto-open browser | — | +| `--no-browser` | Skip auto-open browser | - | ```bash agent-health --port 8080 --env-file prod.env @@ -94,14 +94,14 @@ agent-health benchmark [options] | Option | Description | Default | |--------|-------------|---------| -| `-n, --name ` | Benchmark name or ID | — | -| `-f, --file ` | JSON file of test cases to import and benchmark | — | +| `-n, --name ` | Benchmark name or ID | - | +| `-f, --file ` | JSON file of test 
cases to import and benchmark | - | | `-a, --agent ` | Agent key (repeatable) | First enabled agent | | `-m, --model ` | Model override | Agent default | | `-o, --output ` | Output: `table`, `json` | `table` | -| `--export ` | Export results to file | — | +| `--export ` | Export results to file | - | | `--format ` | Report format for `--export`: `json`, `html`, `pdf` | `json` | -| `-v, --verbose` | Show per-test-case results and errors | — | +| `-v, --verbose` | Show per-test-case results and errors | - | | `--stop-server` | Stop the server after benchmark completes | Keep running | **Modes:** @@ -149,11 +149,11 @@ agent-health report -b [options] | Option | Description | Default | |--------|-------------|---------| -| `-b, --benchmark ` | Benchmark name or ID **(required)** | — | +| `-b, --benchmark ` | Benchmark name or ID **(required)** | - | | `-r, --runs ` | Comma-separated run IDs | All runs | | `-f, --format ` | Report format: `json`, `html`, `pdf` | `html` | | `-o, --output ` | Output file path | Auto-generated | -| `--stdout` | Write to stdout (JSON format only) | — | +| `--stdout` | Write to stdout (JSON format only) | - | ```bash agent-health report -b "Baseline" # HTML report (all runs) diff --git a/docs/starlight-docs/src/content/docs/agent-health/configuration/index.md b/docs/starlight-docs/src/content/docs/agent-health/configuration/index.md index aab7de47..9787eb9b 100644 --- a/docs/starlight-docs/src/content/docs/agent-health/configuration/index.md +++ b/docs/starlight-docs/src/content/docs/agent-health/configuration/index.md @@ -29,9 +29,9 @@ Settings are loaded in this order (later overrides earlier): | 2. Environment variables (.env file) | -3. JSON config file (agent-health.config.json) — auto-created +3. JSON config file (agent-health.config.json) - auto-created | -4. TypeScript config file (agent-health.config.ts) — optional, for custom agents/connectors +4. 
TypeScript config file (agent-health.config.ts) - optional, for custom agents/connectors ``` ## JSON config file @@ -138,9 +138,9 @@ Required for the Bedrock LLM judge and Claude Code agent. |----------|-------------|---------| | `AWS_PROFILE` | AWS profile to use | `default` | | `AWS_REGION` | AWS region | `us-west-2` | -| `AWS_ACCESS_KEY_ID` | Explicit access key (alternative to profile) | — | -| `AWS_SECRET_ACCESS_KEY` | Explicit secret key | — | -| `AWS_SESSION_TOKEN` | Session token (for temporary credentials) | — | +| `AWS_ACCESS_KEY_ID` | Explicit access key (alternative to profile) | - | +| `AWS_SECRET_ACCESS_KEY` | Explicit secret key | - | +| `AWS_SESSION_TOKEN` | Session token (for temporary credentials) | - | ### OpenSearch Storage (optional) @@ -148,9 +148,9 @@ Override the default file-based storage with an OpenSearch cluster. | Variable | Description | Default | |----------|-------------|---------| -| `OPENSEARCH_STORAGE_ENDPOINT` | Storage cluster URL | — | -| `OPENSEARCH_STORAGE_USERNAME` | Username | — | -| `OPENSEARCH_STORAGE_PASSWORD` | Password | — | +| `OPENSEARCH_STORAGE_ENDPOINT` | Storage cluster URL | - | +| `OPENSEARCH_STORAGE_USERNAME` | Username | - | +| `OPENSEARCH_STORAGE_PASSWORD` | Password | - | | `OPENSEARCH_STORAGE_TLS_SKIP_VERIFY` | Skip TLS verification | `false` | ### OpenSearch Observability (optional) @@ -159,9 +159,9 @@ For viewing agent traces and logs. 
| Variable | Description | Default | |----------|-------------|---------| -| `OPENSEARCH_LOGS_ENDPOINT` | Logs cluster URL | — | -| `OPENSEARCH_LOGS_USERNAME` | Username | — | -| `OPENSEARCH_LOGS_PASSWORD` | Password | — | +| `OPENSEARCH_LOGS_ENDPOINT` | Logs cluster URL | - | +| `OPENSEARCH_LOGS_USERNAME` | Username | - | +| `OPENSEARCH_LOGS_PASSWORD` | Password | - | | `OPENSEARCH_LOGS_TRACES_INDEX` | Traces index pattern | `otel-v1-apm-span-*` | | `OPENSEARCH_LOGS_INDEX` | Logs index pattern | `ml-commons-logs-*` | @@ -191,5 +191,5 @@ $ agent-health doctor ## Next steps -- [Connectors](/docs/agent-health/configuration/connectors/) — create custom connectors for your agent type -- [CLI Reference](/docs/agent-health/cli/) — all commands and options +- [Connectors](/docs/agent-health/configuration/connectors/) - create custom connectors for your agent type +- [CLI Reference](/docs/agent-health/cli/) - all commands and options diff --git a/docs/starlight-docs/src/content/docs/agent-health/evaluations/experiments.md b/docs/starlight-docs/src/content/docs/agent-health/evaluations/experiments.md index 5c37ceb5..e6e4afbb 100644 --- a/docs/starlight-docs/src/content/docs/agent-health/evaluations/experiments.md +++ b/docs/starlight-docs/src/content/docs/agent-health/evaluations/experiments.md @@ -31,16 +31,16 @@ To compare agents, run the same experiment multiple times with different agent/m ## Running experiments from the CLI ```bash -# Quick mode — auto-creates a benchmark from all stored test cases +# Quick mode - auto-creates a benchmark from all stored test cases npx @opensearch-project/agent-health benchmark -# Named mode — runs a specific existing benchmark +# Named mode - runs a specific existing benchmark npx @opensearch-project/agent-health benchmark -n "Baseline" -a my-agent -# File mode — imports test cases from JSON and runs them +# File mode - imports test cases from JSON and runs them npx @opensearch-project/agent-health benchmark -f ./test-cases.json -a 
my-agent -# With export — save results to file +# With export - save results to file npx @opensearch-project/agent-health benchmark -f ./test-cases.json -n "My Run" -a my-agent --export results.json ``` diff --git a/docs/starlight-docs/src/content/docs/agent-health/evaluations/index.md b/docs/starlight-docs/src/content/docs/agent-health/evaluations/index.md index 67fad638..b64036a1 100644 --- a/docs/starlight-docs/src/content/docs/agent-health/evaluations/index.md +++ b/docs/starlight-docs/src/content/docs/agent-health/evaluations/index.md @@ -32,7 +32,7 @@ A "Golden Path" is the expected trajectory an agent should follow to successfull - What reasoning steps are expected - What the final response should contain -The LLM judge doesn't require an exact match — it evaluates whether the agent's actual trajectory achieves the expected outcomes through reasonable steps, even if the specific path differs. +The LLM judge doesn't require an exact match - it evaluates whether the agent's actual trajectory achieves the expected outcomes through reasonable steps, even if the specific path differs. 
## LLM Judge output @@ -64,5 +64,5 @@ AWS_SECRET_ACCESS_KEY=your_secret ## Next steps -- [Test Cases](/docs/agent-health/evaluations/test-cases/) — create and manage evaluation scenarios -- [Experiments](/docs/agent-health/evaluations/experiments/) — run batch evaluations and compare results +- [Test Cases](/docs/agent-health/evaluations/test-cases/) - create and manage evaluation scenarios +- [Experiments](/docs/agent-health/evaluations/experiments/) - run batch evaluations and compare results diff --git a/docs/starlight-docs/src/content/docs/agent-health/evaluations/test-cases.md b/docs/starlight-docs/src/content/docs/agent-health/evaluations/test-cases.md index 6c7204d6..868fcff1 100644 --- a/docs/starlight-docs/src/content/docs/agent-health/evaluations/test-cases.md +++ b/docs/starlight-docs/src/content/docs/agent-health/evaluations/test-cases.md @@ -76,8 +76,8 @@ npx @opensearch-project/agent-health benchmark -f test-cases.json -a another-age ## Tips for good test cases -- **Make prompts specific and unambiguous** — avoid vague instructions -- **Include all necessary context data** — the agent shouldn't need to guess -- **Define clear, measurable expected outcomes** — the judge needs concrete criteria -- **Start with simple cases, add complexity gradually** — build confidence before testing edge cases -- **Use labels for organization** — filter and group test cases by category, difficulty, or domain +- **Make prompts specific and unambiguous** - avoid vague instructions +- **Include all necessary context data** - the agent shouldn't need to guess +- **Define clear, measurable expected outcomes** - the judge needs concrete criteria +- **Start with simple cases, add complexity gradually** - build confidence before testing edge cases +- **Use labels for organization** - filter and group test cases by category, difficulty, or domain diff --git a/docs/starlight-docs/src/content/docs/agent-health/getting-started.md 
b/docs/starlight-docs/src/content/docs/agent-health/getting-started.md index 05999ad7..31920ac0 100644 --- a/docs/starlight-docs/src/content/docs/agent-health/getting-started.md +++ b/docs/starlight-docs/src/content/docs/agent-health/getting-started.md @@ -10,7 +10,7 @@ This guide walks you through using Agent Health to evaluate AI agents. The appli ## Prerequisites **Required:** -- **Node.js 18+** — [download here](https://nodejs.org/) +- **Node.js 18+** - [download here](https://nodejs.org/) - **npm** (comes with Node.js) **Optional (for production use):** @@ -51,7 +51,7 @@ Agent Health includes a built-in Travel Planner multi-agent demo, along with a D - Simulates a multi-agent Travel Planner system with realistic trajectories - Agent types: Travel Coordinator, Weather Agent, Events Agent, Booking Agent, Budget Agent -- No external endpoint required — select "Demo Agent" in the agent dropdown +- No external endpoint required - select "Demo Agent" in the agent dropdown ### Demo Judge @@ -125,7 +125,7 @@ Each step shows timestamp, duration, tool arguments (for actions), full tool out ## Next steps -- [Connect your own agent](/docs/agent-health/configuration/) — configure Agent Health for your agent -- [Create custom test cases](/docs/agent-health/evaluations/test-cases/) — build test cases for your domain -- [Run experiments](/docs/agent-health/evaluations/experiments/) — batch evaluate across agents and models -- [View traces](/docs/agent-health/traces/) — visualize OpenTelemetry traces from your agent +- [Connect your own agent](/docs/agent-health/configuration/) - configure Agent Health for your agent +- [Create custom test cases](/docs/agent-health/evaluations/test-cases/) - build test cases for your domain +- [Run experiments](/docs/agent-health/evaluations/experiments/) - batch evaluate across agents and models +- [View traces](/docs/agent-health/traces/) - visualize OpenTelemetry traces from your agent diff --git 
a/docs/starlight-docs/src/content/docs/agent-health/index.md b/docs/starlight-docs/src/content/docs/agent-health/index.md index 9fb2bbf4..ee4c6375 100644 --- a/docs/starlight-docs/src/content/docs/agent-health/index.md +++ b/docs/starlight-docs/src/content/docs/agent-health/index.md @@ -5,7 +5,7 @@ sidebar: hidden: true --- -Agent Health is an evaluation and observability framework for AI agents. It helps you measure agent performance through "Golden Path" trajectory comparison — where an LLM judge evaluates agent actions against expected outcomes. Check out the [GitHub repository](https://github.com/opensearch-project/agent-health) for source code and contributions. +Agent Health is an evaluation and observability framework for AI agents. It helps you measure agent performance through "Golden Path" trajectory comparison - where an LLM judge evaluates agent actions against expected outcomes. Check out the [GitHub repository](https://github.com/opensearch-project/agent-health) for source code and contributions. 
## Quick start @@ -50,8 +50,8 @@ For creating custom connectors, see [Connectors](/docs/agent-health/configuratio ## Next steps -- [Getting Started](/docs/agent-health/getting-started/) — step-by-step walkthrough from install to first evaluation -- [Evaluations](/docs/agent-health/evaluations/) — how evaluations, test cases, and experiments work -- [Trace Visualization](/docs/agent-health/traces/) — real-time trace monitoring and comparison -- [Configuration](/docs/agent-health/configuration/) — connect your own agent and configure the environment -- [CLI Reference](/docs/agent-health/cli/) — all CLI commands and options +- [Getting Started](/docs/agent-health/getting-started/) - step-by-step walkthrough from install to first evaluation +- [Evaluations](/docs/agent-health/evaluations/) - how evaluations, test cases, and experiments work +- [Trace Visualization](/docs/agent-health/traces/) - real-time trace monitoring and comparison +- [Configuration](/docs/agent-health/configuration/) - connect your own agent and configure the environment +- [CLI Reference](/docs/agent-health/cli/) - all CLI commands and options diff --git a/docs/starlight-docs/src/content/docs/agent-health/traces/index.md b/docs/starlight-docs/src/content/docs/agent-health/traces/index.md index 289853a2..4604de71 100644 --- a/docs/starlight-docs/src/content/docs/agent-health/traces/index.md +++ b/docs/starlight-docs/src/content/docs/agent-health/traces/index.md @@ -15,9 +15,9 @@ Agent Health's trace visualization is separate from the [Agent Traces](/docs/ai- Navigate to **Traces** in the sidebar for real-time trace monitoring: -- **Live tailing** — auto-refresh traces every 10 seconds with pause/resume controls -- **Agent filter** — filter traces by specific agent -- **Text search** — search span names and attributes +- **Live tailing** - auto-refresh traces every 10 seconds with pause/resume controls +- **Agent filter** - filter traces by specific agent +- **Text search** - search span names and 
attributes ## View modes @@ -34,9 +34,9 @@ Click the **Maximize** button on any trace visualization to open full-screen mod The comparison view supports side-by-side trace analysis: -- **Aligned view** — spans from different runs aligned by similarity -- **Merged view** — combined flow visualization showing all traces -- **Horizontal/Vertical orientation** — toggle layout for your preference +- **Aligned view** - spans from different runs aligned by similarity +- **Merged view** - combined flow visualization showing all traces +- **Horizontal/Vertical orientation** - toggle layout for your preference ## Enabling trace collection diff --git a/docs/starlight-docs/src/content/docs/ai-observability/evaluation-integrations.mdx b/docs/starlight-docs/src/content/docs/ai-observability/evaluation-integrations.mdx index c6845582..9d78ffff 100644 --- a/docs/starlight-docs/src/content/docs/ai-observability/evaluation-integrations.mdx +++ b/docs/starlight-docs/src/content/docs/ai-observability/evaluation-integrations.mdx @@ -205,12 +205,12 @@ def test_agent_quality(): assert avg_accuracy >= 0.8, f"Accuracy dropped to {avg_accuracy}" ``` -Run with: `pytest test_agent.py` — results are recorded as OTel experiment spans and available in OpenSearch Dashboards. +Run with: `pytest test_agent.py` - results are recorded as OTel experiment spans and available in OpenSearch Dashboards. 
--- ## Related links -- [Evaluation & Scoring](/docs/ai-observability/evaluation/) — core `score()`, `evaluate()`, `Experiment` API -- [Python SDK reference](/docs/send-data/ai-agents/python/) — full SDK documentation -- [Agent Health — Experiments](/docs/agent-health/evaluations/experiments/) — UI and CLI-based experiment workflows +- [Evaluation & Scoring](/docs/ai-observability/evaluation/) - core `score()`, `evaluate()`, `Experiment` API +- [Python SDK reference](/docs/send-data/ai-agents/python/) - full SDK documentation +- [Agent Health - Experiments](/docs/agent-health/evaluations/experiments/) - UI and CLI-based experiment workflows diff --git a/docs/starlight-docs/src/content/docs/ai-observability/evaluation.mdx b/docs/starlight-docs/src/content/docs/ai-observability/evaluation.mdx index ec656c56..15909adf 100644 --- a/docs/starlight-docs/src/content/docs/ai-observability/evaluation.mdx +++ b/docs/starlight-docs/src/content/docs/ai-observability/evaluation.mdx @@ -9,15 +9,15 @@ import { Aside } from '@astrojs/starlight/components'; The Python SDK provides three evaluation capabilities that all emit data through the standard OTLP pipeline: -- **`score()`** — attach quality scores to individual traces or spans -- **`evaluate()`** — run an agent against a dataset with automated scorer functions -- **`Experiment`** — upload pre-computed results from any evaluation framework +- **`score()`** - attach quality scores to individual traces or spans +- **`evaluate()`** - run an agent against a dataset with automated scorer functions +- **`Experiment`** - upload pre-computed results from any evaluation framework All evaluation data lands in the same OpenSearch index as your traces, so you can query scores alongside agent spans. --- -## `score()` — attach scores to traces +## `score()` - attach scores to traces Submits an evaluation score as an OTEL span linked to the trace being scored. 
@@ -74,7 +74,7 @@ def run(query: str) -> str: --- -## `evaluate()` — run experiments +## `evaluate()` - run experiments Executes a task function against each item in a dataset, runs scorer functions, and records everything as OTel experiment spans. @@ -116,7 +116,7 @@ print(result.summary) | `name` | `str` | Experiment name. | | `task` | `Callable` | Function that takes input and returns output. Use `@observe` for full tracing. | | `data` | `list[dict]` | Test cases: `"input"` (required), `"expected"`, `"case_id"`, `"case_name"` (optional). | -| `scores` | `list[Callable]` | Scorer functions — each receives `(input, output, expected)`. | +| `scores` | `list[Callable]` | Scorer functions - each receives `(input, output, expected)`. | | `metadata` | `dict` | Attached to the root experiment span. | | `record_io` | `bool` | Record input/output/expected as span attributes. Default `False`. | @@ -154,27 +154,27 @@ class EvalScore: ```mermaid flowchart TD - A["test_suite_run — experiment root"] --> B["test_case — case 1"] - A --> C["test_case — case 2"] + A["test_suite_run - experiment root"] --> B["test_case - case 1"] + A --> C["test_case - case 2"] B --> D["invoke_agent my_agent"] B --> E["evaluation result events"] D --> F["execute_tool ..."] ``` -Agent traces from the task become children of `test_case` spans — full waterfall from experiment to individual LLM calls. +Agent traces from the task become children of `test_case` spans - full waterfall from experiment to individual LLM calls. ### Result types ```python result = evaluate(...) 
result.summary # ExperimentSummary -result.summary.scores # dict[str, ScoreSummary] — avg, min, max, count per metric -result.cases # list[CaseResult] — per-case input, output, scores, status +result.summary.scores # dict[str, ScoreSummary] - avg, min, max, count per metric +result.cases # list[CaseResult] - per-case input, output, scores, status ``` --- -## `Experiment` — upload pre-computed results +## `Experiment` - upload pre-computed results Use `Experiment` when you already have evaluation results from another framework (RAGAS, DeepEval, pytest, custom) and want to upload them as OTel spans. @@ -233,7 +233,7 @@ print(f"V2: {result_b.summary.scores['accuracy'].avg:.2f}") --- -## `OpenSearchTraceRetriever` — query stored traces +## `OpenSearchTraceRetriever` - query stored traces Retrieves traces from OpenSearch for building evaluation pipelines. Requires the `[opensearch]` extra: @@ -262,7 +262,7 @@ retriever = OpenSearchTraceRetriever( ### Methods -**`list_root_spans()`** — find recent agent traces: +**`list_root_spans()`** - find recent agent traces: ```python roots = retriever.list_root_spans(services=["my-agent"], max_results=20) @@ -274,9 +274,9 @@ roots = retriever.list_root_spans(services=["my-agent"], max_results=20) | `since` | `datetime` | 15 min ago | Only traces started after this time. | | `max_results` | `int` | `50` | Maximum root spans to return. | -Returns `list[SpanRecord]` — one per trace (the root span with no parent). +Returns `list[SpanRecord]` - one per trace (the root span with no parent). -**`get_traces()`** — fetch full trace with all spans: +**`get_traces()`** - fetch full trace with all spans: ```python session = retriever.get_traces(roots[0].trace_id) @@ -292,18 +292,18 @@ for trace in session.traces: Returns a `SessionRecord`. 
-**`find_evaluated_trace_ids()`** — filter out already-scored traces: +**`find_evaluated_trace_ids()`** - filter out already-scored traces: ```python evaluated = retriever.find_evaluated_trace_ids([s.trace_id for s in roots]) to_score = [s for s in roots if s.trace_id not in evaluated] ``` -Returns `set[str]` — trace IDs that already have an evaluation span. +Returns `set[str]` - trace IDs that already have an evaluation span. ### Return types -**`SpanRecord`** — normalised view of one span: +**`SpanRecord`** - normalised view of one span: ```python @dataclass @@ -327,7 +327,7 @@ class SpanRecord: raw: dict = {} # Original OpenSearch document ``` -**`Message`** — a single user or assistant message: +**`Message`** - a single user or assistant message: ```python @dataclass @@ -336,7 +336,7 @@ class Message: content: str ``` -**`TraceRecord`** — all spans sharing a single trace ID: +**`TraceRecord`** - all spans sharing a single trace ID: ```python @dataclass @@ -345,7 +345,7 @@ class TraceRecord: spans: list[SpanRecord] = [] ``` -**`SessionRecord`** — all traces for a session/conversation: +**`SessionRecord`** - all traces for a session/conversation: ```python @dataclass @@ -391,7 +391,7 @@ retriever = OpenSearchTraceRetriever( ## Related links -- [Evaluation Integrations](/docs/ai-observability/evaluation-integrations/) — use DeepEval, RAGAS, MLflow, pytest with the observability stack -- [Python SDK reference](/docs/send-data/ai-agents/python/) — `register`, `observe`, `enrich` documentation -- [Agent Tracing UI](/docs/ai-observability/agent-tracing/) — explore traces in OpenSearch Dashboards -- [Agent Health — Experiments](/docs/agent-health/evaluations/experiments/) — UI and CLI-based experiment workflows +- [Evaluation Integrations](/docs/ai-observability/evaluation-integrations/) - use DeepEval, RAGAS, MLflow, pytest with the observability stack +- [Python SDK reference](/docs/send-data/ai-agents/python/) - `register`, `observe`, `enrich` documentation +- 
[Agent Tracing UI](/docs/ai-observability/agent-tracing/) - explore traces in OpenSearch Dashboards +- [Agent Health - Experiments](/docs/agent-health/evaluations/experiments/) - UI and CLI-based experiment workflows diff --git a/docs/starlight-docs/src/content/docs/ai-observability/getting-started.mdx b/docs/starlight-docs/src/content/docs/ai-observability/getting-started.mdx index 86ed4bd3..57ca9ef2 100644 --- a/docs/starlight-docs/src/content/docs/ai-observability/getting-started.mdx +++ b/docs/starlight-docs/src/content/docs/ai-observability/getting-started.mdx @@ -1,6 +1,6 @@ --- title: "Getting Started" -description: "Instrument an AI agent, view traces, and score quality — end to end in 5 minutes" +description: "Instrument an AI agent, view traces, and score quality - end to end in 5 minutes" sidebar: order: 0 --- @@ -11,7 +11,7 @@ This guide walks you through instrumenting an AI agent with the Python SDK, view ## End-to-end platform -From code to insight — the platform covers the full AI observability lifecycle: +From code to insight - the platform covers the full AI observability lifecycle: ```mermaid flowchart LR @@ -111,7 +111,7 @@ flowchart LR ```python from opensearch_genai_observability_sdk_py import register, observe, Op, enrich - # One-line setup — connects to the local OTel Collector + # One-line setup - connects to the local OTel Collector register( endpoint="http://localhost:4318/v1/traces", service_name="my-agent", @@ -195,7 +195,7 @@ flowchart LR ## What's next -- [Python SDK reference](/docs/send-data/ai-agents/python/) — full API documentation for `register`, `observe`, `enrich`, and AWS auth -- [Evaluation & Scoring](/docs/ai-observability/evaluation/) — `score()`, `evaluate()`, `Experiment`, and `OpenSearchTraceRetriever` in depth -- [Agent Tracing UI](/docs/ai-observability/agent-tracing/) — explore traces, graphs, and timelines in OpenSearch Dashboards -- [Agent Health](/docs/agent-health/) — evaluate agents with Golden Path comparison, 
LLM judges, and batch experiments +- [Python SDK reference](/docs/send-data/ai-agents/python/) - full API documentation for `register`, `observe`, `enrich`, and AWS auth +- [Evaluation & Scoring](/docs/ai-observability/evaluation/) - `score()`, `evaluate()`, `Experiment`, and `OpenSearchTraceRetriever` in depth +- [Agent Tracing UI](/docs/ai-observability/agent-tracing/) - explore traces, graphs, and timelines in OpenSearch Dashboards +- [Agent Health](/docs/agent-health/) - evaluate agents with Golden Path comparison, LLM judges, and batch experiments diff --git a/docs/starlight-docs/src/content/docs/ai-observability/index.md b/docs/starlight-docs/src/content/docs/ai-observability/index.md index c97d6a99..84aef60e 100644 --- a/docs/starlight-docs/src/content/docs/ai-observability/index.md +++ b/docs/starlight-docs/src/content/docs/ai-observability/index.md @@ -3,7 +3,7 @@ title: AI Observability description: Observe, debug, and evaluate AI agent workflows with OpenTelemetry GenAI conventions --- -The Observability Stack provides end-to-end tooling for AI agent observability — from instrumenting your code to viewing traces, scoring quality, and running evaluations. +The Observability Stack provides end-to-end tooling for AI agent observability - from instrumenting your code to viewing traces, scoring quality, and running evaluations. 
## End-to-end platform @@ -36,41 +36,41 @@ flowchart LR ## Capabilities -- **Agent tracing** — visualize LLM agent execution as trace trees, DAG graphs, and timelines -- **GenAI semantic conventions** — standard `gen_ai.*` attributes for model, tokens, tools, and sessions -- **Evaluation & scoring** — attach quality scores to traces, run experiments against datasets -- **Trace retrieval** — query stored traces from OpenSearch for evaluation pipelines -- **Auto-instrumentation** — OpenAI, Anthropic, Bedrock, LangChain, and 20+ libraries traced automatically -- **MCP server** — query OpenSearch from AI agents via the built-in Model Context Protocol server +- **Agent tracing** - visualize LLM agent execution as trace trees, DAG graphs, and timelines +- **GenAI semantic conventions** - standard `gen_ai.*` attributes for model, tokens, tools, and sessions +- **Evaluation & scoring** - attach quality scores to traces, run experiments against datasets +- **Trace retrieval** - query stored traces from OpenSearch for evaluation pipelines +- **Auto-instrumentation** - OpenAI, Anthropic, Bedrock, LangChain, and 20+ libraries traced automatically +- **MCP server** - query OpenSearch from AI agents via the built-in Model Context Protocol server ## Getting started Start here for a hands-on walkthrough from `pip install` to seeing traces and scoring quality: -- **[Getting Started](/docs/ai-observability/getting-started/)** — instrument an agent, view traces, score quality in 5 minutes +- **[Getting Started](/docs/ai-observability/getting-started/)** - instrument an agent, view traces, score quality in 5 minutes ## Instrument Send agent trace data to the observability stack: -- [Python SDK](/docs/send-data/ai-agents/python/) — `@observe`, `enrich()`, auto-instrumentation, AWS SigV4 -- [TypeScript SDK](/docs/send-data/ai-agents/typescript/) — coming soon -- [AI Agents overview](/docs/send-data/ai-agents/) — why use the SDK vs manual OTel +- [Python 
SDK](/docs/send-data/ai-agents/python/) - `@observe`, `enrich()`, auto-instrumentation, AWS SigV4 +- [TypeScript SDK](/docs/send-data/ai-agents/typescript/) - coming soon +- [AI Agents overview](/docs/send-data/ai-agents/) - why use the SDK vs manual OTel ## Analyze Explore traces in OpenSearch Dashboards: -- [Agent Tracing](/docs/ai-observability/agent-tracing/) — the Agent Traces UI, span tables, detail flyouts -- [Agent Graph & Path](/docs/ai-observability/agent-tracing/graph/) — DAG visualization, trace tree, and timeline views +- [Agent Tracing](/docs/ai-observability/agent-tracing/) - the Agent Traces UI, span tables, detail flyouts +- [Agent Graph & Path](/docs/ai-observability/agent-tracing/graph/) - DAG visualization, trace tree, and timeline views ## Evaluate Score agent quality and run experiments: -- [Evaluation & Scoring](/docs/ai-observability/evaluation/) — `score()`, `evaluate()`, `Experiment`, trace retrieval -- **[Agent Health](/docs/agent-health/)** — Golden Path trajectory comparison, LLM judge scoring, batch experiments via UI and CLI +- [Evaluation & Scoring](/docs/ai-observability/evaluation/) - `score()`, `evaluate()`, `Experiment`, trace retrieval +- **[Agent Health](/docs/agent-health/)** - Golden Path trajectory comparison, LLM judge scoring, batch experiments via UI and CLI ## Connect -- [MCP Server](/docs/mcp/) — query OpenSearch from AI agents via MCP +- [MCP Server](/docs/mcp/) - query OpenSearch from AI agents via MCP diff --git a/docs/starlight-docs/src/content/docs/alerting/index.md b/docs/starlight-docs/src/content/docs/alerting/index.md index af875cf9..bd986885 100644 --- a/docs/starlight-docs/src/content/docs/alerting/index.md +++ b/docs/starlight-docs/src/content/docs/alerting/index.md @@ -7,9 +7,9 @@ OpenSearch Alerting lets you define monitors that watch your observability data ## Key concepts -- **Monitors**: Scheduled queries that check your data at regular intervals. 
Monitors can query any OpenSearch index — logs, traces, metrics, or custom indices. +- **Monitors**: Scheduled queries that check your data at regular intervals. Monitors can query any OpenSearch index - logs, traces, metrics, or custom indices. - **Triggers**: Conditions attached to monitors that define when an alert should fire. For example, "trigger when error count exceeds 100 in the last 5 minutes." -- **Actions**: What happens when a trigger fires — send a message to Slack, PagerDuty, email, a custom webhook, or any channel supported by the OpenSearch Notifications plugin. +- **Actions**: What happens when a trigger fires - send a message to Slack, PagerDuty, email, a custom webhook, or any channel supported by the OpenSearch Notifications plugin. - **Alerts**: Active instances of triggered conditions. Alerts have states (active, acknowledged, completed) and can be managed from the Alerting dashboard. ## Monitor types @@ -24,7 +24,7 @@ OpenSearch Alerting lets you define monitors that watch your observability data ## Getting started 1. Open OpenSearch Dashboards and navigate to **Alerting** (under the main menu). -2. Create a **destination** (notification channel) — Slack, email, webhook, etc. +2. Create a **destination** (notification channel) - Slack, email, webhook, etc. 3. Create a **monitor** with a query against your observability data. 4. Add a **trigger** with a condition and an **action** that sends to your destination. 5. The monitor runs on its schedule and fires alerts when conditions are met. @@ -50,4 +50,4 @@ Set the trigger to fire when the document count exceeds your threshold, and conf ## Learn more -For the full alerting reference — including API operations, composite monitors, alert acknowledgment, and notification channel configuration — see the [Alerting documentation](https://docs.opensearch.org/latest/observing-your-data/alerting/index/) in the official OpenSearch docs. 
+For the full alerting reference - including API operations, composite monitors, alert acknowledgment, and notification channel configuration - see the [Alerting documentation](https://docs.opensearch.org/latest/observing-your-data/alerting/index/) in the official OpenSearch docs. diff --git a/docs/starlight-docs/src/content/docs/anomaly-detection/index.md b/docs/starlight-docs/src/content/docs/anomaly-detection/index.md index dac02519..32da3377 100644 --- a/docs/starlight-docs/src/content/docs/anomaly-detection/index.md +++ b/docs/starlight-docs/src/content/docs/anomaly-detection/index.md @@ -3,12 +3,12 @@ title: Anomaly Detection description: Detect anomalies in your observability data using machine learning --- -OpenSearch Anomaly Detection uses machine learning to automatically identify unusual patterns in your time-series data. It learns the normal behavior of your metrics and alerts you when something deviates from the expected pattern — without requiring manual threshold configuration. +OpenSearch Anomaly Detection uses machine learning to automatically identify unusual patterns in your time-series data. It learns the normal behavior of your metrics and alerts you when something deviates from the expected pattern - without requiring manual threshold configuration. ## Key concepts - **Detector**: A configuration that defines what data to monitor, which features (aggregations) to track, and how often to check. Each detector uses the Random Cut Forest (RCF) algorithm to model normal behavior. -- **Features**: The aggregations a detector monitors — for example, average CPU usage, request count, or error rate over a time window. +- **Features**: The aggregations a detector monitors - for example, average CPU usage, request count, or error rate over a time window. - **Real-time detection**: The detector runs continuously and flags anomalies as new data arrives. Results are available within the detection interval (typically 1-10 minutes). 
- **Historical detection**: Run a detector against past data to identify anomalies retroactively. Useful for validating detector configuration or investigating past incidents. - **Anomaly grade**: A score from 0 to 1 indicating how severe the anomaly is. Higher grades mean the data point is further from expected behavior. @@ -30,11 +30,11 @@ Anomaly detection works on any time-series data indexed in OpenSearch. Common ob 1. Open OpenSearch Dashboards and navigate to **Anomaly Detection** (under the main menu). 2. Click **Create detector**. 3. Select the index to monitor (e.g., `logs-otel-v1*` or a metrics index). -4. Define one or more **features** — aggregations over fields in your data. +4. Define one or more **features** - aggregations over fields in your data. 5. Set the **detection interval** (how often the detector evaluates). 6. Optionally configure a **category field** to run detection per group (e.g., per service name). 7. Start the detector in real-time mode. -8. View results on the detector's dashboard — anomaly grade, confidence, and feature values over time. +8. View results on the detector's dashboard - anomaly grade, confidence, and feature values over time. ## Pairing with alerting @@ -42,4 +42,4 @@ Combine anomaly detection with [Alerting](/docs/alerting/) to get notified when ## Learn more -For the full reference — including detector APIs, tuning parameters, and multi-category detection — see the [Anomaly Detection documentation](https://docs.opensearch.org/latest/observing-your-data/ad/index/) in the official OpenSearch docs. +For the full reference - including detector APIs, tuning parameters, and multi-category detection - see the [Anomaly Detection documentation](https://docs.opensearch.org/latest/observing-your-data/ad/index/) in the official OpenSearch docs. 
diff --git a/docs/starlight-docs/src/content/docs/apm/index.md b/docs/starlight-docs/src/content/docs/apm/index.md index 5ca0b72a..40839c78 100644 --- a/docs/starlight-docs/src/content/docs/apm/index.md +++ b/docs/starlight-docs/src/content/docs/apm/index.md @@ -11,8 +11,8 @@ Application Monitoring gives you a real-time view of how your services are perfo In OpenSearch Dashboards, navigate to **Observability** > **Application Monitoring**. The sidebar shows: -- **Services** — catalog of all instrumented services with filtering, detail views, and correlation links -- **Application Map** — interactive topology graph of service dependencies +- **Services** - catalog of all instrumented services with filtering, detail views, and correlation links +- **Application Map** - interactive topology graph of service dependencies ## Key capabilities @@ -28,9 +28,9 @@ A filterable table of all instrumented services showing latency (P99), throughpu Drill into any service to see three tabs: -- **Overview** — KPI cards (throughput, fault rate, error rate, availability, latency P99) with sparklines and trend arrows, latency by dependencies, requests by operations, and availability by operations charts. -- **Operations** — table of all operations with expandable rows showing per-operation request, fault, error, and latency charts. -- **Dependencies** — table of downstream dependencies with expandable rows showing per-dependency charts. +- **Overview** - KPI cards (throughput, fault rate, error rate, availability, latency P99) with sparklines and trend arrows, latency by dependencies, requests by operations, and availability by operations charts. +- **Operations** - table of all operations with expandable rows showing per-operation request, fault, error, and latency charts. +- **Dependencies** - table of downstream dependencies with expandable rows showing per-dependency charts. 
### Correlations diff --git a/docs/starlight-docs/src/content/docs/apm/service-map.md b/docs/starlight-docs/src/content/docs/apm/service-map.md index 6aa940fc..1bb68a31 100644 --- a/docs/starlight-docs/src/content/docs/apm/service-map.md +++ b/docs/starlight-docs/src/content/docs/apm/service-map.md @@ -18,19 +18,19 @@ The breadcrumb at the top reads **Application > Services**, reflecting the curre The Application Map toolbar provides: -- **Search bar** — filter by service name or environment -- **Time range selector** — choose a time window, with a Refresh button -- **Zoom in / Zoom out** — adjust the map zoom level -- **Fit to screen** — reset the view to fit all nodes +- **Search bar** - filter by service name or environment +- **Time range selector** - choose a time window, with a Refresh button +- **Zoom in / Zoom out** - adjust the map zoom level +- **Fit to screen** - reset the view to fit all nodes ## Filters panel A filters panel on the left side lets you narrow what the map displays: -- **Group by** — select an attribute to group services (see [Group by view](#group-by-view) below) -- **Fault Rate (5xx)** — filter by fault rate buckets: < 1%, 1–5%, > 5% -- **Error Rate (4xx)** — filter by error rate buckets: < 1%, 1–5%, > 5% -- **Environment** — filter to a specific deployment environment +- **Group by** - select an attribute to group services (see [Group by view](#group-by-view) below) +- **Fault Rate (5xx)** - filter by fault rate buckets: < 1%, 1–5%, > 5% +- **Error Rate (4xx)** - filter by error rate buckets: < 1%, 1–5%, > 5% +- **Environment** - filter to a specific deployment environment Applied filters appear as chips above the map (e.g., "Fault Rate (5xx): > 5% x"). Click the **x** on a chip to remove it, or click **Clear all** to reset all filters. @@ -65,10 +65,10 @@ Click any service node to open a detail panel on the right side of the map. 
The **Metrics** Four time-series charts for the selected time range: -- **Requests** — request volume over time -- **Latency** — P50, P90, and P99 latency lines -- **Faults (5xx)** — 5xx fault count over time -- **Errors (4xx)** — 4xx error count over time +- **Requests** - request volume over time +- **Latency** - P50, P90, and P99 latency lines +- **Faults (5xx)** - 5xx fault count over time +- **Errors (4xx)** - 4xx error count over time ## Group by view diff --git a/docs/starlight-docs/src/content/docs/apm/services.md b/docs/starlight-docs/src/content/docs/apm/services.md index 9f3a52f8..1ded0396 100644 --- a/docs/starlight-docs/src/content/docs/apm/services.md +++ b/docs/starlight-docs/src/content/docs/apm/services.md @@ -39,11 +39,11 @@ The main table lists all instrumented services: Use the left-hand filters to narrow down the catalog: -- **Environment** — filter by deployment environment -- **Latency** — range slider to set minimum and maximum latency thresholds -- **Throughput** — range slider to set minimum and maximum throughput thresholds -- **Failure ratio** — bucket filters: < 1%, 1–5%, > 5% -- **Attributes** — filter by resource attributes such as `telemetry.sdk.language` +- **Environment** - filter by deployment environment +- **Latency** - range slider to set minimum and maximum latency thresholds +- **Throughput** - range slider to set minimum and maximum throughput thresholds +- **Failure ratio** - bucket filters: < 1%, 1–5%, > 5% +- **Attributes** - filter by resource attributes such as `telemetry.sdk.language` ## Service detail view @@ -60,9 +60,9 @@ A panel showing which downstream dependencies have the highest fault rates for t **Correlated data** Quick links to related telemetry: -- **View service attributes** — resource attributes attached to the service -- **View correlated spans** — spans associated with this service -- **View correlated logs** — logs correlated to this service's traces +- **View service attributes** - resource attributes 
attached to the service +- **View correlated spans** - spans associated with this service +- **View correlated logs** - logs correlated to this service's traces **KPI cards** Five cards summarizing key performance indicators, each with a sparkline chart, the current value, the previous-period value, and a trend arrow: @@ -104,24 +104,24 @@ The Operations tab lists every operation for the service. **Expandable rows** Click a row to expand it and see inline charts: -- **Requests / Faults** — request volume and 5xx fault count over time -- **Errors / Latency** — error count and latency (P50, P90, P99) over time +- **Requests / Faults** - request volume and 5xx fault count over time +- **Errors / Latency** - error count and latency (P50, P90, P99) over time **Filters sidebar** -- **Availability** — bucket filters for availability ranges -- **Operations** — search box to filter by operation name -- **Error rate** — filter by error rate range -- **Latency** — filter by latency range -- **Requests** — filter by request count range -- **Service operations** — filter to specific service-side operations -- **Remote operations** — filter to specific remote (downstream) operations +- **Availability** - bucket filters for availability ranges +- **Operations** - search box to filter by operation name +- **Error rate** - filter by error rate range +- **Latency** - filter by latency range +- **Requests** - filter by request count range +- **Service operations** - filter to specific service-side operations +- **Remote operations** - filter to specific remote (downstream) operations **Correlation flyout** Click a correlation icon on any operation row to open a flyout panel. The flyout header shows the service name, environment badge, and operation filter badge. It contains two tabs: -- **Correlated spans** — a table of spans with columns for Time, Status (OK/ERROR badge), HTTP Status, Kind, Operation, and Span ID. 
The Span ID is a clickable link that navigates to the span in Explore Traces. -- **Correlated logs** — a table of log entries correlated to the selected operation. +- **Correlated spans** - a table of spans with columns for Time, Status (OK/ERROR badge), HTTP Status, Kind, Operation, and Span ID. The Span ID is a clickable link that navigates to the span in Explore Traces. +- **Correlated logs** - a table of log entries correlated to the selected operation. ![Correlation flyout showing correlated spans for the frontend service GET operation](/docs/images/apm/service-span-correlations.png) @@ -148,9 +148,9 @@ Click a row to expand it and see inline charts for request volume, errors, and l **Filters sidebar** -- **Availability** — bucket filters for availability ranges -- **Dependency service** — search box to filter by dependency name -- **Error rate** — filter by error rate range +- **Availability** - bucket filters for availability ranges +- **Dependency service** - search box to filter by dependency name +- **Error rate** - filter by error rate range ## How service data is generated diff --git a/docs/starlight-docs/src/content/docs/claude-code/index.md b/docs/starlight-docs/src/content/docs/claude-code/index.md index d6f34d56..2fe098b9 100644 --- a/docs/starlight-docs/src/content/docs/claude-code/index.md +++ b/docs/starlight-docs/src/content/docs/claude-code/index.md @@ -53,7 +53,7 @@ Verify the skills loaded: Claude Desktop supports custom skills through **Settings → Capabilities → Skills**. Each skill must be uploaded as a separate ZIP file. -Pre-built ZIP files are attached to each [GitHub release](https://github.com/opensearch-project/observability-stack/releases) — one per skill: +Pre-built ZIP files are attached to each [GitHub release](https://github.com/opensearch-project/observability-stack/releases) - one per skill: | ZIP file | Skill | |---|---| @@ -80,7 +80,7 @@ To install: 4. 
Enable each skill after uploading :::note -Claude Desktop requires one ZIP per skill — you cannot bundle all skills into a single ZIP. Upload all eight for the full observability experience. +Claude Desktop requires one ZIP per skill - you cannot bundle all skills into a single ZIP. Upload all eight for the full observability experience. ::: ## Try it out @@ -127,7 +127,7 @@ export PROMETHEUS_ENDPOINT=http://your-prometheus-host:9090 ### AWS managed services -The skill files include AWS SigV4 variants for Amazon OpenSearch Service and Amazon Managed Service for Prometheus. When using managed services, the query syntax stays the same — only the endpoint URL and authentication method change. +The skill files include AWS SigV4 variants for Amazon OpenSearch Service and Amazon Managed Service for Prometheus. When using managed services, the query syntax stays the same - only the endpoint URL and authentication method change. ## Index patterns @@ -193,8 +193,8 @@ docker compose up -d prometheus ## Related links -- [Usage Guide](/docs/claude-code/usage/) — 50+ sample questions with real examples -- [MCP Server](/docs/mcp/) — query OpenSearch via Model Context Protocol -- [Investigate Traces](/docs/investigate/discover-traces/) — explore traces in OpenSearch Dashboards -- [Investigate Logs](/docs/investigate/discover-logs/) — explore logs in OpenSearch Dashboards -- [Send Data](/docs/send-data/) — instrument your applications with OpenTelemetry +- [Usage Guide](/docs/claude-code/usage/) - 50+ sample questions with real examples +- [MCP Server](/docs/mcp/) - query OpenSearch via Model Context Protocol +- [Investigate Traces](/docs/investigate/discover-traces/) - explore traces in OpenSearch Dashboards +- [Investigate Logs](/docs/investigate/discover-logs/) - explore logs in OpenSearch Dashboards +- [Send Data](/docs/send-data/) - instrument your applications with OpenTelemetry diff --git a/docs/starlight-docs/src/content/docs/claude-code/showcase.md 
b/docs/starlight-docs/src/content/docs/claude-code/showcase.md index 801cd6c0..9cdc4dcc 100644 --- a/docs/starlight-docs/src/content/docs/claude-code/showcase.md +++ b/docs/starlight-docs/src/content/docs/claude-code/showcase.md @@ -28,7 +28,7 @@ source=otel-v1-apm-span-* **Follow-up:** ``` -> Compare the cost efficiency — which model has the best output-to-input ratio? +> Compare the cost efficiency - which model has the best output-to-input ratio? ``` ```sql @@ -48,7 +48,7 @@ source=otel-v1-apm-span-* A real scenario: checkout errors are spiking. Use Claude to go from alert to root cause. -**Step 1 — Detect the problem:** +**Step 1 - Detect the problem:** ``` > What is the current error rate for the checkout service? @@ -65,7 +65,7 @@ clamp_min(sum(rate(http_server_duration_seconds_count{ * 100 ``` -**Step 2 — Find the failing operations:** +**Step 2 - Find the failing operations:** ``` > Show me the error spans from checkout sorted by time @@ -79,7 +79,7 @@ source=otel-v1-apm-span-* | sort - startTime | head 20 ``` -**Step 3 — Trace a specific failure:** +**Step 3 - Trace a specific failure:** ``` > Show me the full trace tree for that traceId @@ -93,7 +93,7 @@ source=otel-v1-apm-span-* | sort startTime ``` -**Step 4 — Find correlated logs:** +**Step 4 - Find correlated logs:** ``` > Show me the logs for that trace @@ -107,7 +107,7 @@ source=logs-otel-v1-* | sort `@timestamp` ``` -**Step 5 — Check if this is a new problem:** +**Step 5 - Check if this is a new problem:** ``` > What was the error rate trend for checkout over the last 6 hours? @@ -138,7 +138,7 @@ source=otel-v1-apm-span-* | sort - invocations ``` -**Follow-up — Find slow orchestrations:** +**Follow-up - Find slow orchestrations:** ``` > Which Travel Planner invocations took longer than 30 seconds? 
@@ -156,7 +156,7 @@ source=otel-v1-apm-span-* **Drill into a slow trace:** ``` -> Show me all spans in that trace — I want to see which sub-agent was slow +> Show me all spans in that trace - I want to see which sub-agent was slow ``` ```sql @@ -195,7 +195,7 @@ source=otel-v1-apm-span-* | sort - calls ``` -**Follow-up — Check database performance:** +**Follow-up - Check database performance:** ``` > Show me the slowest database queries from checkout @@ -230,7 +230,7 @@ Track SLO compliance and error budget consumption for SRE workflows. ) ``` -**Follow-up — Check burn rate:** +**Follow-up - Check burn rate:** ``` > Are we burning error budget too fast? Show me the burn rate. @@ -263,7 +263,7 @@ source=logs-otel-v1-* | sort - occurrences | head 20 ``` -**Follow-up — Trend analysis:** +**Follow-up - Trend analysis:** ``` > How has the error volume changed hour by hour today? @@ -294,7 +294,7 @@ topk(5, by (le, service_name))) ``` -**Follow-up — Drill into the slowest service:** +**Follow-up - Drill into the slowest service:** ``` > What operations on the frontend service are the slowest? @@ -328,7 +328,7 @@ source=otel-v1-apm-span-* ## Tool Execution Analysis -Debug AI agent tool calls — what's failing, what's slow, and why. +Debug AI agent tool calls - what's failing, what's slow, and why. ``` > Which tools are failing the most? @@ -342,7 +342,7 @@ source=otel-v1-apm-span-* | sort - failures ``` -**Follow-up — Inspect a failing tool:** +**Follow-up - Inspect a failing tool:** ``` > Show me the arguments and results for the last 10 get_current_weather calls @@ -364,7 +364,7 @@ source=otel-v1-apm-span-* Get a complete RED dashboard for every service in one investigation. 
``` -> Give me a health dashboard — show rate, error rate, and p95 latency for all services +> Give me a health dashboard - show rate, error rate, and p95 latency for all services ``` Claude runs three PromQL queries simultaneously and presents a unified view: diff --git a/docs/starlight-docs/src/content/docs/claude-code/usage.md b/docs/starlight-docs/src/content/docs/claude-code/usage.md index 612fee9e..0338d3e1 100644 --- a/docs/starlight-docs/src/content/docs/claude-code/usage.md +++ b/docs/starlight-docs/src/content/docs/claude-code/usage.md @@ -18,13 +18,13 @@ Query distributed trace data to understand how requests flow through services an **GenAI agent analysis:** - "How many times was each AI agent invoked?" - "What is the average response time for the Travel Planner agent?" -- "Show me token usage by model — which model consumes the most tokens?" +- "Show me token usage by model - which model consumes the most tokens?" - "Find the slowest agent invocations in the last hour" **Error investigation:** - "Show me all error spans from the checkout service" - "Which services have the most errors?" -- "Find failed tool executions — what tools are failing?" +- "Find failed tool executions - what tools are failing?" **Latency analysis:** - "Find all spans taking longer than 5 seconds" @@ -151,7 +151,7 @@ Connect traces, logs, and metrics for end-to-end incident investigation. ### Real-world workflow -**"I see high error rates — what's happening?"** +**"I see high error rates - what's happening?"** 1. Claude checks Prometheus error rate by service 2. Identifies the service with elevated errors (e.g., `weather-agent`) @@ -224,11 +224,11 @@ Claude's built-in guide for constructing novel PPL queries beyond the standard t ### Combining skills -Ask questions that span multiple skills — Claude automatically routes to the right ones: +Ask questions that span multiple skills - Claude automatically routes to the right ones: - "The checkout service is slow. 
Show me its p95 latency, recent error logs, and the slowest traces." - "Compare the error rate in Prometheus with actual error spans in OpenSearch" -- "An agent is failing — show me the traces, associated logs, and token usage" +- "An agent is failing - show me the traces, associated logs, and token usage" ### Iterative investigation diff --git a/docs/starlight-docs/src/content/docs/dashboards/build.md b/docs/starlight-docs/src/content/docs/dashboards/build.md index d3d9cc8d..92f71a88 100644 --- a/docs/starlight-docs/src/content/docs/dashboards/build.md +++ b/docs/starlight-docs/src/content/docs/dashboards/build.md @@ -3,13 +3,13 @@ title: "Build a Dashboard" description: "Create dashboards, add visualization panels, and arrange layouts for observability monitoring" --- -This guide walks through creating a dashboard from scratch — adding panels, choosing visualization types, configuring queries, and arranging the layout. +This guide walks through creating a dashboard from scratch - adding panels, choosing visualization types, configuring queries, and arranging the layout. ## Creating a new dashboard 1. Navigate to **Dashboards** in the left navigation 2. Select **Create** → **Dashboard** -3. You start with an empty canvas — add panels to populate it +3. You start with an empty canvas - add panels to populate it Alternatively, when you're in Discover and have a visualization you like, select **Save** → **Save to dashboard** → **New dashboard** to create a dashboard seeded with that visualization. 
@@ -59,7 +59,7 @@ Some rules of thumb: - Comparing categories → bar chart - Single number that matters → metric or gauge - Distribution shape → heat map or histogram (bar chart with `bin`) -- Proportions of a whole → pie chart (use sparingly — bar charts are usually clearer) +- Proportions of a whole → pie chart (use sparingly - bar charts are usually clearer) ## Configuring panels @@ -74,7 +74,7 @@ search earliest=-6h source = logs-otel-v1* | timechart span=5m count() by `resource.attributes.service.name` ``` -For metrics (PromQL — adjust metric names to match your environment): +For metrics (PromQL - adjust metric names to match your environment): ```promql sum by (service_name) (rate(http_server_request_duration_seconds_count[5m])) ``` @@ -82,7 +82,7 @@ sum by (service_name) (rate(http_server_request_duration_seconds_count[5m])) ### Axes and formatting - Set axis labels and units (requests/sec, milliseconds, bytes, percentage) -- Configure Y-axis scale (linear or logarithmic) — log scale is useful when values span orders of magnitude +- Configure Y-axis scale (linear or logarithmic) - log scale is useful when values span orders of magnitude - Set min/max bounds to keep charts consistent across panels - Choose color schemes that distinguish series clearly @@ -90,7 +90,7 @@ sum by (service_name) (rate(http_server_request_duration_seconds_count[5m])) - Position legends at the bottom, right, or hide them for single-series panels - Use legend values (min, max, avg, current) to add context without hovering -- For dashboards with many panels, hiding legends saves space — use panel titles instead +- For dashboards with many panels, hiding legends saves space - use panel titles instead ### Thresholds @@ -100,7 +100,7 @@ Add horizontal threshold lines to panels to mark important boundaries: - Latency target (e.g., yellow line at 500ms, red at 1s) - Capacity limits (e.g., red line at 80% CPU) -Thresholds make it immediately obvious when a metric crosses a boundary — no 
mental math required. +Thresholds make it immediately obvious when a metric crosses a boundary - no mental math required. ## Layout and arrangement @@ -109,10 +109,10 @@ Thresholds make it immediately obvious when a metric crosses a boundary — no m Panels snap to a grid. Resize by dragging the bottom-right corner of a panel. Rearrange by dragging the panel header. Layout tips: -- Put the most important panels at the top — that's what people see first +- Put the most important panels at the top - that's what people see first - Group related panels together (e.g., all latency panels in one row, all error panels in another) - Use full-width panels for time-series charts that benefit from horizontal space -- Use narrow panels for metric values and gauges — they don't need much room +- Use narrow panels for metric values and gauges - they don't need much room ### Recommended dashboard layouts @@ -140,7 +140,7 @@ Layout tips: ## Time range controls -The dashboard time picker in the top bar sets the time range for all panels simultaneously. This keeps everything aligned — when you're investigating a 2am incident, every panel shows the same window. +The dashboard time picker in the top bar sets the time range for all panels simultaneously. This keeps everything aligned - when you're investigating a 2am incident, every panel shows the same window. Individual panels can override the dashboard time range if needed, but use this sparingly. Mismatched time ranges across panels create confusion. @@ -166,10 +166,10 @@ The fastest way to build a dashboard is to start in Discover: 3. Save it to a dashboard 4. Repeat for each question you want the dashboard to answer -This approach ensures every panel has a clear purpose — it answers a question you actually asked during investigation. Dashboards built this way tend to be more useful than ones designed abstractly. +This approach ensures every panel has a clear purpose - it answers a question you actually asked during investigation. 
Dashboards built this way tend to be more useful than ones designed abstractly. ## Next steps -- [Sharing Dashboards](/docs/dashboards/sharing/) — share, export, and best practices -- [Discover Logs](/docs/investigate/discover-logs/) — build log queries to power dashboard panels -- [Discover Metrics](/docs/investigate/discover-metrics/) — build PromQL queries for metrics panels +- [Sharing Dashboards](/docs/dashboards/sharing/) - share, export, and best practices +- [Discover Logs](/docs/investigate/discover-logs/) - build log queries to power dashboard panels +- [Discover Metrics](/docs/investigate/discover-metrics/) - build PromQL queries for metrics panels diff --git a/docs/starlight-docs/src/content/docs/dashboards/index.md b/docs/starlight-docs/src/content/docs/dashboards/index.md index 02ea5fed..ac732b45 100644 --- a/docs/starlight-docs/src/content/docs/dashboards/index.md +++ b/docs/starlight-docs/src/content/docs/dashboards/index.md @@ -3,7 +3,7 @@ title: "Dashboards & Visualize" description: "Build, customize, and share observability dashboards in OpenSearch" --- -Dashboards are where investigation results become operational views. In OpenSearch, dashboards combine visualizations from logs, traces, and metrics into a single pane — giving teams a shared, real-time picture of system health. +Dashboards are where investigation results become operational views. In OpenSearch, dashboards combine visualizations from logs, traces, and metrics into a single pane - giving teams a shared, real-time picture of system health. ## What dashboards are for @@ -21,7 +21,7 @@ Dashboards serve different audiences and purposes: Dashboards and Discover work together. The typical workflow: -1. Investigate in Discover — build a query, explore the data, find the right visualization +1. Investigate in Discover - build a query, explore the data, find the right visualization 2. Save the visualization to a dashboard (new or existing) 3. 
The dashboard panel stays live, updating as new data arrives 4. When something looks wrong on a dashboard, click through to Discover to dig deeper @@ -47,11 +47,11 @@ Filters let you narrow the data across all panels at once without editing indivi 1. Select **Add filter** in the filter bar 2. Choose a field (e.g., `resource.attributes.service.name`, `severity.text`, `service_name`) -3. Pick an operator — `is`, `is not`, `is one of`, `exists`, etc. +3. Pick an operator - `is`, `is not`, `is one of`, `exists`, etc. 4. Set the value (e.g., `checkout-service`) 5. The filter applies to every panel on the dashboard immediately -You can stack multiple filters. They combine with AND logic — all conditions must match. +You can stack multiple filters. They combine with AND logic - all conditions must match. ### Filter use cases @@ -71,10 +71,10 @@ Pin filters when you're investigating a specific service or environment and want ### Filters and template variables (coming soon) -Currently, dashboards support global filters for narrowing data across all panels. Template variable support — dropdowns that let you switch between services, environments, or time intervals — is planned for a future release. Once available, variables will complement filters by providing reusable, designed-in drill-down controls. +Currently, dashboards support global filters for narrowing data across all panels. Template variable support - dropdowns that let you switch between services, environments, or time intervals - is planned for a future release. Once available, variables will complement filters by providing reusable, designed-in drill-down controls. 
## Getting started -- [Build a Dashboard](/docs/dashboards/build/) — create dashboards, add panels, choose visualization types, and arrange layouts -- [Sharing Dashboards](/docs/dashboards/sharing/) — share, export, and best practices -- [Troubleshooting](/docs/dashboards/troubleshooting/) — diagnose panel issues, inspect queries, and fix common problems +- [Build a Dashboard](/docs/dashboards/build/) - create dashboards, add panels, choose visualization types, and arrange layouts +- [Sharing Dashboards](/docs/dashboards/sharing/) - share, export, and best practices +- [Troubleshooting](/docs/dashboards/troubleshooting/) - diagnose panel issues, inspect queries, and fix common problems diff --git a/docs/starlight-docs/src/content/docs/dashboards/sharing.md b/docs/starlight-docs/src/content/docs/dashboards/sharing.md index efd8cedd..d0501170 100644 --- a/docs/starlight-docs/src/content/docs/dashboards/sharing.md +++ b/docs/starlight-docs/src/content/docs/dashboards/sharing.md @@ -3,12 +3,12 @@ title: "Sharing Dashboards" description: "Share dashboards, export reports, and manage dashboard best practices" --- -Once you've built a dashboard, share it with the people who need it — team members, on-call engineers, or stakeholders. +Once you've built a dashboard, share it with the people who need it - team members, on-call engineers, or stakeholders. ## Share with team members Dashboards are accessible to anyone with the appropriate OpenSearch permissions. Share by: -- Sending the dashboard URL directly — the URL includes the dashboard ID and current time range +- Sending the dashboard URL directly - the URL includes the dashboard ID and current time range - Adding the dashboard to a team's bookmarked dashboards list - Referencing it in runbooks and incident response documentation @@ -17,7 +17,7 @@ Dashboards are accessible to anyone with the appropriate OpenSearch permissions. For sharing a point-in-time view (e.g., during a post-incident review): 1. 
Set the dashboard to the time range you want to capture 2. Select **Share** → **Snapshot** -3. The snapshot preserves the exact state — data, time range, and filters +3. The snapshot preserves the exact state - data, time range, and filters Snapshots are read-only and don't update with new data. They're a record of what the dashboard looked like at a specific moment. @@ -43,9 +43,9 @@ Import by navigating to **Dashboards** → **Import** and uploading the JSON fil ### Design for the audience -- **On-call engineers:** prioritize real-time data, error rates, and latency. Keep it scannable — if something is wrong, it should be obvious in 5 seconds. +- **On-call engineers:** prioritize real-time data, error rates, and latency. Keep it scannable - if something is wrong, it should be obvious in 5 seconds. - **Team leads:** include trend data, error rate trends, and week-over-week comparisons. These dashboards are checked daily, not during incidents. -- **Stakeholders:** high-level summaries — availability percentage, request volume, key business metrics. Minimize technical detail. +- **Stakeholders:** high-level summaries - availability percentage, request volume, key business metrics. Minimize technical detail. ### Keep dashboards focused @@ -73,7 +73,7 @@ Export dashboard JSON and commit it to your repository. 
This gives you: ## Next steps -- [Build a Dashboard](/docs/dashboards/build/) — create dashboards, add panels, and arrange layouts -- [Discover Logs](/docs/investigate/discover-logs/) — build log queries for dashboard panels -- [Discover Traces](/docs/investigate/discover-traces/) — build trace queries for dashboard panels -- [Discover Metrics](/docs/investigate/discover-metrics/) — build PromQL queries for metrics panels +- [Build a Dashboard](/docs/dashboards/build/) - create dashboards, add panels, and arrange layouts +- [Discover Logs](/docs/investigate/discover-logs/) - build log queries for dashboard panels +- [Discover Traces](/docs/investigate/discover-traces/) - build trace queries for dashboard panels +- [Discover Metrics](/docs/investigate/discover-metrics/) - build PromQL queries for metrics panels diff --git a/docs/starlight-docs/src/content/docs/dashboards/troubleshooting.md b/docs/starlight-docs/src/content/docs/dashboards/troubleshooting.md index 8d8a2347..3a886b3d 100644 --- a/docs/starlight-docs/src/content/docs/dashboards/troubleshooting.md +++ b/docs/starlight-docs/src/content/docs/dashboards/troubleshooting.md @@ -24,19 +24,19 @@ When a dashboard panel shows unexpected data, errors, or loads slowly, these tec ### Panel shows stale data -- Check auto-refresh — if it's disabled, the dashboard only updates when you manually refresh or change the time range -- Check the time range — "Last 1 hour" is relative and updates on refresh, but a custom absolute range (e.g., "March 4, 2pm–3pm") is fixed -- Check data ingestion — if the pipeline is delayed, the data may not have arrived yet +- Check auto-refresh - if it's disabled, the dashboard only updates when you manually refresh or change the time range +- Check the time range - "Last 1 hour" is relative and updates on refresh, but a custom absolute range (e.g., "March 4, 2pm–3pm") is fixed +- Check data ingestion - if the pipeline is delayed, the data may not have arrived yet ## Performance issues ### 
Dashboard loads slowly 1. **Identify the slow panel:** Open browser developer tools → Network tab → refresh the dashboard. Sort requests by duration to find the slowest query. -2. **Use panel inspect:** Open the slow panel's inspect view → Request tab. Look at the query — is it scanning too much data? +2. **Use panel inspect:** Open the slow panel's inspect view → Request tab. Look at the query - is it scanning too much data? 3. **Common fixes:** - Narrow the dashboard time range - - Reduce the number of panels — each panel fires a separate query + - Reduce the number of panels - each panel fires a separate query - Simplify expensive queries (remove joins, reduce distinct count operations) - Use `| head N` in PPL queries to limit result sets - For PromQL, use shorter range vectors (`[5m]` instead of `[1h]`) @@ -54,9 +54,9 @@ The slow panel likely has an expensive query. Use inspect to see the query and r This is usually a rendering problem, not a query problem: -- Too many data points in a single panel (e.g., a line chart with 100,000 points) — increase the `timechart span` to reduce granularity -- Too many panels on one dashboard — split into multiple focused dashboards -- Large data tables without pagination — add `| head 100` to limit rows +- Too many data points in a single panel (e.g., a line chart with 100,000 points) - increase the `timechart span` to reduce granularity +- Too many panels on one dashboard - split into multiple focused dashboards +- Large data tables without pagination - add `| head 100` to limit rows ## Filter issues @@ -80,9 +80,9 @@ Every dashboard panel has a built-in inspect tool that shows you exactly what's 2. Click the panel menu (three dots or gear icon) 3. Select **Inspect** 4. You'll see tabs for: - - **Data** — the raw data returned by the query, as a table. Check whether the values match what you expect. 
- - **Request** — the exact query sent to OpenSearch, including any filters and time ranges the dashboard applied - - **Response** — the raw JSON response from OpenSearch, including timing and metadata + - **Data** - the raw data returned by the query, as a table. Check whether the values match what you expect. + - **Request** - the exact query sent to OpenSearch, including any filters and time ranges the dashboard applied + - **Response** - the raw JSON response from OpenSearch, including timing and metadata ### What to look for @@ -99,11 +99,11 @@ For deeper debugging, use your browser's developer tools alongside panel inspect 1. Open developer tools (`Cmd+Option+I` on macOS) 2. Switch to the **Network** tab 3. Refresh the dashboard or change the time range -4. Watch for requests to OpenSearch endpoints — each panel fires its own query +4. Watch for requests to OpenSearch endpoints - each panel fires its own query 5. Look for: - - **Failed requests** (red) — server errors, timeouts, or permission issues - - **Slow requests** — sort by duration to find the panel that's dragging down the whole dashboard - - **Response size** — very large responses can slow down rendering even if the query is fast + - **Failed requests** (red) - server errors, timeouts, or permission issues + - **Slow requests** - sort by duration to find the panel that's dragging down the whole dashboard + - **Response size** - very large responses can slow down rendering even if the query is fast ### Console tab @@ -116,7 +116,7 @@ When reporting a dashboard issue: 1. Use panel inspect to capture the request and response 2. Note the dashboard time range and any active filters 3. Check the browser console for errors -4. Try the same query directly in Discover — if it works there but not on the dashboard, the issue is in the panel configuration +4. Try the same query directly in Discover - if it works there but not on the dashboard, the issue is in the panel configuration 5. 
Export the dashboard JSON (Share → Export) to preserve the exact configuration for debugging -6. Ask in the [#observability channel](https://opensearch.org/slack) on the OpenSearch Slack workspace — the community is active and responsive +6. Ask in the [#observability channel](https://opensearch.org/slack) on the OpenSearch Slack workspace - the community is active and responsive 7. Search or file an issue on the [OpenSearch GitHub repo](https://github.com/opensearch-project/OpenSearch-Dashboards) for bugs or feature requests diff --git a/docs/starlight-docs/src/content/docs/forecasting/index.md b/docs/starlight-docs/src/content/docs/forecasting/index.md index 22ebc40e..6f955516 100644 --- a/docs/starlight-docs/src/content/docs/forecasting/index.md +++ b/docs/starlight-docs/src/content/docs/forecasting/index.md @@ -9,7 +9,7 @@ OpenSearch Forecasting extends the anomaly detection framework to predict future - **Forecaster**: A configuration similar to an anomaly detector that defines what data to forecast, which features to predict, and the forecast horizon. - **Forecast horizon**: How far into the future to predict. The forecaster generates predicted values for each interval up to the horizon. -- **Features**: The aggregations to forecast — the same feature types used in anomaly detection (averages, counts, sums, etc.). +- **Features**: The aggregations to forecast - the same feature types used in anomaly detection (averages, counts, sums, etc.). - **Confidence intervals**: Each predicted value includes upper and lower bounds indicating the range of expected values. ## How it fits the Observability Stack @@ -27,7 +27,7 @@ Forecasting is useful for capacity planning and proactive operations: 1. Open OpenSearch Dashboards and navigate to **Forecasting** (available alongside Anomaly Detection). 2. Create a **forecaster** by selecting an index and defining features to predict. -3. Set the **forecast horizon** — how many intervals ahead to predict. +3. 
Set the **forecast horizon** - how many intervals ahead to predict. 4. Run the forecaster to generate predictions. 5. View predicted values alongside actual data to validate accuracy. @@ -37,4 +37,4 @@ Combine forecasting with [Alerting](/docs/alerting/) to get notified when foreca ## Learn more -For the full reference — including forecaster APIs, tuning parameters, and supported aggregation types — see the [Forecasting documentation](https://docs.opensearch.org/latest/observing-your-data/forecast/index/) in the official OpenSearch docs. +For the full reference - including forecaster APIs, tuning parameters, and supported aggregation types - see the [Forecasting documentation](https://docs.opensearch.org/latest/observing-your-data/forecast/index/) in the official OpenSearch docs. diff --git a/docs/starlight-docs/src/content/docs/get-started/core-concepts.md b/docs/starlight-docs/src/content/docs/get-started/core-concepts.md index 76cc6926..cdb7cbcc 100644 --- a/docs/starlight-docs/src/content/docs/get-started/core-concepts.md +++ b/docs/starlight-docs/src/content/docs/get-started/core-concepts.md @@ -5,7 +5,7 @@ description: Key terms and ideas for the OpenSearch Observability Stack ## Traces and spans -A **trace** represents a single end-to-end operation (for example, an API request or an agent workflow). Each trace is composed of **spans** — individual units of work with a start time, duration, and metadata. Spans form a tree structure with parent-child relationships. +A **trace** represents a single end-to-end operation (for example, an API request or an agent workflow). Each trace is composed of **spans** - individual units of work with a start time, duration, and metadata. Spans form a tree structure with parent-child relationships. ## OpenTelemetry (OTel) @@ -21,7 +21,7 @@ Service maps are auto-generated by Data Prepper from trace data. 
They visualize ## RED metrics -Rate, Errors, and Duration — the three golden signals computed automatically by Data Prepper from ingested trace spans. These power the APM views in OpenSearch Dashboards. +Rate, Errors, and Duration - the three golden signals computed automatically by Data Prepper from ingested trace spans. These power the APM views in OpenSearch Dashboards. ## PPL (Piped Processing Language) @@ -38,6 +38,6 @@ PromQL is Prometheus's query language for time-series metrics. The Observability ## Index patterns OpenSearch stores observability data in indices following naming conventions: -- `otel-v1-apm-span-*` — trace spans -- `otel-v2-apm-service-map` — service map data -- `logs-otel-v1*` — log data +- `otel-v1-apm-span-*` - trace spans +- `otel-v2-apm-service-map` - service map data +- `logs-otel-v1*` - log data diff --git a/docs/starlight-docs/src/content/docs/get-started/installation.mdx b/docs/starlight-docs/src/content/docs/get-started/installation.mdx index 909aa8cb..f0cbed66 100644 --- a/docs/starlight-docs/src/content/docs/get-started/installation.mdx +++ b/docs/starlight-docs/src/content/docs/get-started/installation.mdx @@ -57,7 +57,7 @@ grep -E '^OPENSEARCH_(USER|PASSWORD)=' .env ### Example services -The stack ships with example AI agent services that generate agent traces automatically — a multi-agent travel planner, weather agent, and events agent. These are enabled by default. +The stack ships with example AI agent services that generate agent traces automatically - a multi-agent travel planner, weather agent, and events agent. These are enabled by default. To disable them, comment out this line in `.env`: @@ -101,6 +101,6 @@ To fully remove, delete the repository directory. 
## Next steps -- [Platform Overview](/docs/get-started/overview/) — architecture and data flow -- [Core Concepts](/docs/get-started/core-concepts/) — key terms and ideas -- [Quickstart](/docs/get-started/quickstart/first-traces/) — send your first traces +- [Platform Overview](/docs/get-started/overview/) - architecture and data flow +- [Core Concepts](/docs/get-started/core-concepts/) - key terms and ideas +- [Quickstart](/docs/get-started/quickstart/first-traces/) - send your first traces diff --git a/docs/starlight-docs/src/content/docs/get-started/quickstart/first-dashboard.md b/docs/starlight-docs/src/content/docs/get-started/quickstart/first-dashboard.md index 9cbd091b..30de7ed1 100644 --- a/docs/starlight-docs/src/content/docs/get-started/quickstart/first-dashboard.md +++ b/docs/starlight-docs/src/content/docs/get-started/quickstart/first-dashboard.md @@ -38,5 +38,5 @@ Select **Save**, give your dashboard a name, and optionally add it to an Observa ## Next steps -- [Dashboards](/docs/dashboards/) — advanced dashboard features -- [Discover Metrics](/docs/investigate/discover-metrics/) — PromQL-based metric exploration +- [Dashboards](/docs/dashboards/) - advanced dashboard features +- [Discover Metrics](/docs/investigate/discover-metrics/) - PromQL-based metric exploration diff --git a/docs/starlight-docs/src/content/docs/get-started/quickstart/first-traces.md b/docs/starlight-docs/src/content/docs/get-started/quickstart/first-traces.md index 6b8197cb..4bd9c868 100644 --- a/docs/starlight-docs/src/content/docs/get-started/quickstart/first-traces.md +++ b/docs/starlight-docs/src/content/docs/get-started/quickstart/first-traces.md @@ -44,6 +44,6 @@ For other languages, see [Send Data](/docs/send-data/). 
## Next steps -- [Create Your First Dashboard](/docs/get-started/quickstart/first-dashboard/) — build custom visualizations -- [Agent Tracing](/docs/ai-observability/agent-tracing/) — trace AI agent workflows with GenAI semantic conventions -- [Send Data](/docs/send-data/) — more instrumentation options +- [Create Your First Dashboard](/docs/get-started/quickstart/first-dashboard/) - build custom visualizations +- [Agent Tracing](/docs/ai-observability/agent-tracing/) - trace AI agent workflows with GenAI semantic conventions +- [Send Data](/docs/send-data/) - more instrumentation options diff --git a/docs/starlight-docs/src/content/docs/index.mdx b/docs/starlight-docs/src/content/docs/index.mdx index df892d06..87e45332 100644 --- a/docs/starlight-docs/src/content/docs/index.mdx +++ b/docs/starlight-docs/src/content/docs/index.mdx @@ -7,7 +7,7 @@ import { LinkCard, CardGrid, Tabs, TabItem, Aside } from '@astrojs/starlight/com import IconCard from '../../components/IconCard.astro'; import IconCardGrid from '../../components/IconCardGrid.astro'; -OpenSearch Observability Stack is an **open-source, OpenTelemetry-native observability platform** ([GitHub](https://github.com/opensearch-project/observability-stack)) for monitoring services, infrastructure, and AI agents. Install locally via Docker Compose — traces, logs, Prometheus metrics, service maps, and agent tracing out of the box. +OpenSearch Observability Stack is an **open-source, OpenTelemetry-native observability platform** ([GitHub](https://github.com/opensearch-project/observability-stack)) for monitoring services, infrastructure, and AI agents. Install locally via Docker Compose - traces, logs, Prometheus metrics, service maps, and agent tracing out of the box. 
@@ -20,7 +20,7 @@ OpenSearch Observability Stack is an **open-source, OpenTelemetry-native observa - + ## Quickstarts @@ -29,18 +29,34 @@ OpenSearch Observability Stack is an **open-source, OpenTelemetry-native observa +## Query language: PPL + +The Observability Stack is powered by **Piped Processing Language (PPL)** - a pipe-based, human-readable query language purpose-built for observability. PPL is the native language for querying logs and traces, giving you a single, consistent syntax across signal types. + +```sql +source = logs-otel-v1* +| where severityNumber >= 17 +| stats count() as errors by `resource.attributes.service.name` +| sort - errors +``` + +With **50+ commands** and **200+ built-in functions**, PPL covers everything from simple filtering to machine learning anomaly detection - all in a pipeline you can read top to bottom. + + + ## Why Observability Stack? - **Open source**: Fully open source, no vendor lock-in, self-host everything - **OpenTelemetry-native**: All data ingestion uses OTel protocols and [semantic conventions](https://opentelemetry.io/docs/specs/semconv/) +- **PPL-native**: Pipe-based query language across logs and traces - no query DSL, no steep learning curve - **GenAI-first**: Purpose-built views for AI agent tracing using standard `gen_ai.*` attributes -- **Local-first**: Runs entirely on your machine via Docker Compose — no cloud account required +- **Local-first**: Runs entirely on your machine via Docker Compose - no cloud account required - **Production path**: Same components (OpenSearch, Prometheus, OTel Collector) scale to production ## Community -- [GitHub](https://github.com/opensearch-project/observability-stack) — issues, PRs, and discussions -- [Contributing guide](https://github.com/opensearch-project/observability-stack/blob/main/CONTRIBUTING.md) — how to contribute +- [GitHub](https://github.com/opensearch-project/observability-stack) - issues, PRs, and discussions +- [Contributing 
guide](https://github.com/opensearch-project/observability-stack/blob/main/CONTRIBUTING.md) - how to contribute diff --git a/docs/starlight-docs/src/content/docs/investigate/index.md b/docs/starlight-docs/src/content/docs/investigate/index.md index a5ce8e15..ae1ef71c 100644 --- a/docs/starlight-docs/src/content/docs/investigate/index.md +++ b/docs/starlight-docs/src/content/docs/investigate/index.md @@ -3,7 +3,7 @@ title: "Investigate" description: "Explore, query, and analyze your observability data across logs, traces, and metrics using Discover" --- -Observability investigation in OpenSearch centers on the Discover experience — a consistent querying interface available across logs, traces, and metrics, each on its own dedicated page. Analysts use Discover to understand system behavior, diagnose issues, and uncover patterns. +Observability investigation in OpenSearch centers on the Discover experience - a consistent querying interface available across logs, traces, and metrics, each on its own dedicated page. Analysts use Discover to understand system behavior, diagnose issues, and uncover patterns. ## Discover: your investigation starting point @@ -55,14 +55,20 @@ source = otel-v1-apm-span-* | head 10 ``` -For the full PPL command reference, see the [PPL documentation](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.md). For hands-on examples using OTEL data, see [Discover Logs](/docs/investigate/discover-logs/) and [Discover Traces](/docs/investigate/discover-traces/). 
+For the full PPL reference, see: +- **[PPL Language Overview](/docs/ppl/)** - Why PPL and how it fits into the stack +- **[Command Reference](/docs/ppl/commands/)** - Syntax and examples for all 50+ commands +- **[Function Reference](/docs/ppl/functions/)** - 200+ built-in functions +- **[Observability Examples](/docs/ppl/examples/)** - Real-world queries with live playground links + +For hands-on exploration in the Discover UI, see [Discover Logs](/docs/investigate/discover-logs/) and [Discover Traces](/docs/investigate/discover-traces/). ### PromQL PromQL is a functional query language for selecting and aggregating time-series metrics. It supports instant queries, range queries, and built-in functions for rates, aggregations, and mathematical operations. :::caution[Placeholder queries] -The PromQL examples below use standard OpenTelemetry metric names. Your environment may use different metric names and labels — adjust accordingly. +The PromQL examples below use standard OpenTelemetry metric names. Your environment may use different metric names and labels - adjust accordingly. ::: **Sample queries:** @@ -116,7 +122,7 @@ After running a query against logs or traces, analysts can build visualizations | Gauge chart | Progress toward thresholds or goals | | Tag cloud | Frequency analysis of terms or categories | -Discover automatically recommends the most appropriate visualization based on your query results — for example, a single metric with a date column defaults to a line chart, while categorical columns with high cardinality trigger a heat map. +Discover automatically recommends the most appropriate visualization based on your query results - for example, a single metric with a date column defaults to a line chart, while categorical columns with high cardinality trigger a heat map. 
## Save to dashboard @@ -138,7 +144,7 @@ When your investigation is complete, save your query for future use and share it - **Save:** Preserve the query text, filters, time range, and selected fields. Access saved queries from the **Open** menu in Discover. - **Share:** Share your saved query with team members so they can rerun the same investigation, build on your work, or use it as a starting point for their own analysis. -Saved queries become reusable building blocks — use them to standardize investigation runbooks, onboard new team members, or create a library of common diagnostic queries for your organization. +Saved queries become reusable building blocks - use them to standardize investigation runbooks, onboard new team members, or create a library of common diagnostic queries for your organization. ## Datasets @@ -146,7 +152,7 @@ Datasets provide a unified way to discover and select data sources for your quer ## Correlations -Correlations let you jump between related signals — linking a log entry to the trace that produced it, or navigating from a slow trace to the associated logs. See [Correlations](/docs/investigate/correlations/) for details. +Correlations let you jump between related signals - linking a log entry to the trace that produced it, or navigating from a slow trace to the associated logs. See [Correlations](/docs/investigate/correlations/) for details. ## Troubleshooting diff --git a/docs/starlight-docs/src/content/docs/investigate/troubleshooting.md b/docs/starlight-docs/src/content/docs/investigate/troubleshooting.md index 21ef37fd..3b0babc0 100644 --- a/docs/starlight-docs/src/content/docs/investigate/troubleshooting.md +++ b/docs/starlight-docs/src/content/docs/investigate/troubleshooting.md @@ -11,7 +11,7 @@ When a query returns unexpected results, no results, or errors, these techniques | Possible cause | How to check | Fix | |---|---|---| -| Time range too narrow | Check the date picker — does it cover when the data was ingested? 
| Widen the time range | +| Time range too narrow | Check the date picker - does it cover when the data was ingested? | Widen the time range | | Wrong index pattern | Check `source =` in PPL or the index selector | Switch to the correct index (e.g., `logs-otel-v1*` not `logs-otel-v1`) | | Filter too restrictive | Check the filter bar for active filters | Remove or loosen filters | | Field name mismatch | Check the network payload for the exact field names | Use autocomplete to verify field names, or check the index mapping | @@ -23,13 +23,13 @@ PPL syntax errors usually point to the exact position of the problem. Common mis - Missing pipe (`|`) between commands - Using `==` instead of `=` in `where` clauses (PPL uses `=` for equality, not `==`) -- Quoting issues — field names with dots (like `severity.text`) don't need quotes, but string values do +- Quoting issues - field names with dots (like `severity.text`) don't need quotes, but string values do - Using `AND`/`OR` without proper parentheses for precedence -- REX capture group names containing underscores (not allowed — use letters and digits only) +- REX capture group names containing underscores (not allowed - use letters and digits only) ### Query takes too long or times out -- Narrow the time range — this is the single biggest performance lever +- Narrow the time range - this is the single biggest performance lever - Add `where` clauses early in the pipeline to filter before aggregating - Avoid `join` and `subquery` on large indices without tight filters on both sides - Add `| head N` to limit result sets @@ -42,34 +42,34 @@ This means the query exceeded the cluster's memory or compute limits.
To work ar - Reduce the time range significantly (try `earliest=-15m` instead of hours) - Remove expensive operations like `join`, `subquery`, or `eventstats` and simplify the query -- Break complex queries into smaller steps — run the inner query first, note the results, then build the outer query -- If on a local development cluster (like observability-stack), the resource limits are tight by design — simplify queries or increase Docker resource allocation +- Break complex queries into smaller steps - run the inner query first, note the results, then build the outer query +- If on a local development cluster (like observability-stack), the resource limits are tight by design - simplify queries or increase Docker resource allocation ### Results look wrong -- Check for null values in fields you're aggregating — nulls can skew `avg()`, `count()`, and other stats. Use `fillnull` to handle them -- Verify field types — aggregating on a string field that looks numeric won't work as expected -- Check for duplicate events — if data is being ingested from multiple pipelines, you may be double-counting +- Check for null values in fields you're aggregating - nulls can skew `avg()`, `count()`, and other stats. 
Use `fillnull` to handle them +- Verify field types - aggregating on a string field that looks numeric won't work as expected +- Check for duplicate events - if data is being ingested from multiple pipelines, you may be double-counting - Look at the raw data (`| fields *`) before aggregating to confirm the data looks right at the source ## PromQL-specific issues ### No data points -- Verify the metric name exists — use autocomplete or the metrics explorer to browse available metrics -- Check label names and values — PromQL label matchers are case-sensitive -- Ensure the scrape interval covers your range vector — `rate(metric[1m])` needs at least 2 data points within 1 minute +- Verify the metric name exists - use autocomplete or the metrics explorer to browse available metrics +- Check label names and values - PromQL label matchers are case-sensitive +- Ensure the scrape interval covers your range vector - `rate(metric[1m])` needs at least 2 data points within 1 minute ### Unexpected NaN or Inf values -- `NaN` usually means division by zero — add a `> 0` filter to the denominator -- `Inf` can appear in `histogram_quantile()` when there aren't enough buckets — check that the `le` label exists and has sufficient values -- Missing `rate()` around counters — counters only go up, so raw values aren't useful. Always wrap counters in `rate()` or `increase()` +- `NaN` usually means division by zero - add a `> 0` filter to the denominator +- `Inf` can appear in `histogram_quantile()` when there aren't enough buckets - check that the `le` label exists and has sufficient values +- Missing `rate()` around counters - counters only go up, so raw values aren't useful. 
Always wrap counters in `rate()` or `increase()` ### Rate returns 0 when data exists -- The range vector window might be too small — if your scrape interval is 30s, use at least `[1m]` for `rate()` -- Counter resets (service restarts) can cause `rate()` to return 0 for one interval — this is normal +- The range vector window might be too small - if your scrape interval is 30s, use at least `[1m]` for `rate()` +- Counter resets (service restarts) can cause `rate()` to return 0 for one interval - this is normal ## Browser developer tools @@ -82,16 +82,16 @@ When the above techniques don't resolve the issue, your browser's developer tool 3. Run your query in Discover 4. Look for the request to the OpenSearch query endpoint (typically a POST to `/_plugins/_ppl` or `/_plugins/_query`) 5. Click the request to see: - - **Request payload** — the exact query sent to OpenSearch, including any filters or time ranges the UI added - - **Response body** — the raw data returned, before the UI formats it - - **Status code** — 200 (success), 400 (bad query syntax), 500 (server error), etc. - - **Timing** — how long the query took to execute + - **Request payload** - the exact query sent to OpenSearch, including any filters or time ranges the UI added + - **Response body** - the raw data returned, before the UI formats it + - **Status code** - 200 (success), 400 (bad query syntax), 500 (server error), etc. + - **Timing** - how long the query took to execute This is especially useful when the UI shows an unhelpful error message. The response body often contains a more detailed error from the query engine. 
### What to look for in the payload -- Check that the time range in the request matches what you expect — the UI may be adding time filters you didn't write +- Check that the time range in the request matches what you expect - the UI may be adding time filters you didn't write - Look for additional `where` clauses injected by dashboard filters or Discover's filter bar - Verify the index pattern in the `source` matches your intended target - Check for query size limits (`size`, `head`) that might be truncating results @@ -105,5 +105,5 @@ If you've exhausted these techniques: 3. Try running a simplified version of the query (just `source = index | head 10`) to confirm basic connectivity 4. Check OpenSearch cluster health and resource usage 5. Consult the [PPL documentation](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.md) or [PromQL documentation](https://prometheus.io/docs/prometheus/latest/querying/basics/) for syntax reference -6. Ask in the [#observability channel](https://opensearch.org/slack) on the OpenSearch Slack workspace — the community is active and responsive +6. Ask in the [#observability channel](https://opensearch.org/slack) on the OpenSearch Slack workspace - the community is active and responsive 7. 
Search or file an issue on the [OpenSearch GitHub repo](https://github.com/opensearch-project/OpenSearch) for bugs or feature requests diff --git a/docs/starlight-docs/src/content/docs/mcp/index.md b/docs/starlight-docs/src/content/docs/mcp/index.md index 4c938a0f..696ea64a 100644 --- a/docs/starlight-docs/src/content/docs/mcp/index.md +++ b/docs/starlight-docs/src/content/docs/mcp/index.md @@ -54,5 +54,5 @@ The MCP server is particularly useful for AI-assisted investigation workflows: ## Learn more -- [Introducing MCP in OpenSearch](https://opensearch.org/blog/introducing-mcp-in-opensearch/) — announcement blog post with architecture details -- [Model Context Protocol specification](https://modelcontextprotocol.io/) — the MCP standard +- [Introducing MCP in OpenSearch](https://opensearch.org/blog/introducing-mcp-in-opensearch/) - announcement blog post with architecture details +- [Model Context Protocol specification](https://modelcontextprotocol.io/) - the MCP standard diff --git a/docs/starlight-docs/src/content/docs/ppl/commands.md b/docs/starlight-docs/src/content/docs/ppl/commands.md new file mode 100644 index 00000000..67720678 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands.md @@ -0,0 +1,1121 @@ +--- +title: "PPL Command Reference" +description: "Complete reference for all PPL commands - syntax, parameters, and examples with live playground links for OpenTelemetry observability data." +--- + +import { Tabs, TabItem, Aside } from '@astrojs/starlight/components'; + +This reference covers every PPL command available in OpenSearch. 
Each command includes syntax, parameters, and examples you can run against live OpenTelemetry data in the [playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))). + + + +## Query structure + +Every PPL query starts with a `search` command (or just `source=`), followed by a pipeline of commands separated by the pipe character (`|`): + +```sql +source = <index> +| command1 +| command2 +| command3 +``` + +In the Discover UI, the `source` is set automatically by the selected dataset, so queries typically begin with `|`: + +```sql +| where severityText = 'ERROR' +| stats count() as errors by `resource.attributes.service.name` +``` + +--- + +## Search and filter + +### search + +Retrieve documents from an index. This is always the first command in a PPL query. The `search` keyword can be omitted. + +**Syntax:** +``` +search source=<index> [<boolean-expression>] +source=<index> [<boolean-expression>] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `<index>` | Yes | Index name or pattern to query | +| `<boolean-expression>` | No | Initial filter condition | + +**Example - Get all logs:** +```sql +source = logs-otel-v1* +``` + +**Example - Search with inline filter:** +```sql +source = logs-otel-v1* severityText = 'ERROR' +``` + +Try in playground → + +--- + +### where + +Filter results using boolean expressions. Only rows where the expression evaluates to `true` are returned.
+ +**Syntax:** +``` +where +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `` | Yes | Condition that evaluates to true/false | + +Supports operators: `=`, `!=`, `>`, `<`, `>=`, `<=`, `AND`, `OR`, `NOT`, `LIKE`, `IN`, `BETWEEN`, `IS NULL`, `IS NOT NULL`. + +**Example - Filter error logs:** +```sql +| where severityText = 'ERROR' or severityText = 'FATAL' +| head 20 +``` + +Try in playground → + +**Example - Compound conditions:** +```sql +| where severityNumber >= 17 AND `resource.attributes.service.name` = 'checkout' +| head 20 +``` + +Try in playground → + +--- + +### regex + +*(experimental, since 3.3)* + +Filter results by matching field values against a regular expression pattern. + +**Syntax:** +``` +regex = +regex != +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `` | Yes | Field to match against | +| `` | Yes | Java regex pattern | + +**Example - Filter services matching a pattern:** +```sql +| regex `resource.attributes.service.name` = ".*agent.*" +| head 20 +``` + +Try in playground → + +--- + +### subquery + +*(experimental, since 3.0)* + +Embed one PPL query inside another for complex filtering. + +**Syntax:** +``` +where [not] in [ source= | ... ] +where [not] exists [ source= | ... ] +``` + +**Example - Find logs from services that have errors in traces:** +```sql +source = logs-otel-v1* +| where `resource.attributes.service.name` in [ + source = otel-v1-apm-span-* + | where status.code = 2 + ] +``` + +--- + +## Field selection and transformation + +### fields + +Keep or remove fields from search results. 
+ +**Syntax:** +``` +fields [+|-] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `` | Yes | Comma-delimited list of fields | +| `+` or `-` | No | `+` includes (default), `-` excludes | + +**Example - Select specific fields:** +```sql +| fields time, body, severityText, `resource.attributes.service.name` +| head 20 +``` + +**Example - Exclude fields:** +```sql +| fields - traceId, spanId +| head 20 +``` + +Try in playground → + +--- + +### table + +*(experimental, since 3.3)* + +Alias for `fields` with enhanced syntax. Supports space-delimited field lists. + +**Syntax:** +``` +table [+|-] +``` + +**Example:** +```sql +| table time body severityText +| head 20 +``` + +Try in playground → + +--- + +### rename + +Rename one or more fields. Supports wildcard patterns. + +**Syntax:** +``` +rename AS [, AS ]... +``` + +**Example:** +```sql +| rename `resource.attributes.service.name` as service +| head 20 +``` + +Try in playground → + +--- + +### eval + +Evaluate an expression and append (or overwrite) the result as a new field. + +**Syntax:** +``` +eval = [, = ]... +``` + +**Example - Calculate duration in milliseconds:** +```sql +source = otel-v1-apm-span-* +| eval duration_ms = durationInNanos / 1000000 +| sort - duration_ms +| head 10 +``` + +**Example - Concatenate fields:** +```sql +| eval service_operation = concat(`resource.attributes.service.name`, '/', body) +| head 20 +``` + +Try in playground → + +--- + +### convert + +*(experimental, since 3.5)* + +Transform field values to numeric values using specialized conversion functions. + +**Syntax:** +``` +convert (auto | ctime | dur2sec | memk | mktime | mstime | num | rmcomma | rmunit) () [as ] [, ...] +``` + +--- + +### replace + +*(experimental, since 3.4)* + +Replace text in one or more fields. + +**Syntax:** +``` +replace (, ) in [, ]... 
+``` + +**Example:** +```sql +| replace ("error", "ERROR") in body +| head 20 +``` + +Try in playground → + +--- + +### fillnull + +*(experimental, since 3.0)* + +Fill null values with a specified value. + +**Syntax:** +``` +fillnull value= +fillnull using = [, = ] +``` + +**Example:** +```sql +| fillnull value='unknown' `resource.attributes.service.name` +| head 20 +``` + +Try in playground → + +--- + +### expand + +*(experimental, since 3.1)* + +Expand a nested array field into multiple documents (one per array element). + +**Syntax:** +``` +expand [as ] +``` + +--- + +### flatten + +*(experimental, since 3.1)* + +Flatten a struct/object field into separate top-level fields. + +**Syntax:** +``` +flatten [as ()] +``` + +--- + +## Aggregation and statistics + +### stats + +Calculate aggregations from search results. The workhorse of PPL analytics. + +**Syntax:** +``` +stats ... [by ] +stats ... [by span(, ) [as ], ] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `` | Yes | Aggregation function (count, sum, avg, max, min, etc.) | +| `by ` | No | Group results by one or more fields | +| `span(, )` | No | Create time or numeric buckets | + +**Example - Count logs by service:** +```sql +| stats count() as log_count by `resource.attributes.service.name` +``` + +Try in playground → + +**Example - Error rate by service:** +```sql +| stats count() as total, + sum(case(severityText = 'ERROR', 1 else 0)) as errors + by `resource.attributes.service.name` +| eval error_rate = errors * 100.0 / total +| sort - error_rate +``` + +Try in playground → + +**Example - Time-bucketed log volume:** +```sql +| stats count() as volume by span(time, 5m) as time_bucket +``` + +Try in playground → + +--- + +### eventstats + +*(experimental, since 3.1)* + +Like `stats`, but appends the aggregation result as a new field to **every event** instead of collapsing rows. + +**Syntax:** +``` +eventstats ... 
[by ] +``` + +**Example - Add service-level average alongside each log:** +```sql +source = otel-v1-apm-span-* +| eventstats avg(durationInNanos) as avg_duration by serviceName +| eval deviation = durationInNanos - avg_duration +| where deviation > avg_duration * 2 +``` + +--- + +### streamstats + +*(experimental, since 3.4)* + +Calculate cumulative or rolling statistics as events are processed in order. + +**Syntax:** +``` +streamstats [current=] [window=] ... [by ] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `current` | No | Include current event in calculation (default: true) | +| `window` | No | Number of events for rolling window (default: 0 = all) | + +**Example - Rolling average latency over last 10 spans:** +```sql +source = otel-v1-apm-span-* +| sort startTime +| streamstats window=10 avg(durationInNanos) as rolling_avg by serviceName +``` + +--- + +### bin + +*(experimental, since 3.3)* + +Group numeric or time values into buckets of equal intervals. + +**Syntax:** +``` +bin [span=] [bins=] +``` + +**Example:** +```sql +source = otel-v1-apm-span-* +| eval duration_ms = durationInNanos / 1000000 +| bin duration_ms span=100 +| stats count() as spans by duration_ms +``` + +--- + +### timechart + +*(experimental, since 3.3)* + +Create time-based aggregations - perfect for dashboards and trend analysis. + +**Syntax:** +``` +timechart [timefield=] [span=] [by ] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `timefield` | No | Time field (default: `@timestamp`) | +| `span` | No | Time interval (default: `1m`) | +| `limit` | No | Max distinct values for `by` field (default: 10) | + +**Example - Log volume over time by service:** +```sql +| timechart timefield=time span=5m count() by `resource.attributes.service.name` +``` + +Try in playground → + +--- + +### chart + +*(experimental, since 3.4)* + +Apply statistical aggregations with row and column splits for visualization. 
+ +**Syntax:** +``` +chart [by ] +chart [over ] [by ] +``` + +**Example:** +```sql +| chart count() by `resource.attributes.service.name`, severityText +``` + +Try in playground → + +--- + +### trendline + +*(experimental, since 3.0)* + +Calculate moving averages of fields - simple moving average (SMA) or weighted moving average (WMA). + +**Syntax:** +``` +trendline [sort ] (sma|wma)(, ) [as ] +``` + +**Example:** +```sql +source = otel-v1-apm-span-* +| sort startTime +| trendline sma(5, durationInNanos) as latency_trend +``` + +--- + +### addtotals + +*(stable, since 3.5)* + +Add row and column totals to aggregation results. + +**Syntax:** +``` +addtotals [col=] [row=] [fieldname=] [labelfield=] [label=] [] +``` + +--- + +### addcoltotals + +*(stable, since 3.5)* + +Add a totals row at the bottom of results. + +**Syntax:** +``` +addcoltotals [labelfield=] [label=] [] +``` + +--- + +### transpose + +*(stable, since 3.5)* + +Transpose rows to columns - useful for pivoting aggregation results. + +**Syntax:** +``` +transpose [] [header_field=] [include_empty=] [column_name=] +``` + +--- + +## Sorting and limiting + +### sort + +Sort results by one or more fields. + +**Syntax:** +``` +sort [] [+|-] [, [+|-] ]... +sort [] [asc|desc] [, [asc|desc]]... +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `` | Yes | Field to sort by | +| `+` or `asc` | No | Ascending (default) | +| `-` or `desc` | No | Descending | +| `` | No | Number of results to return | + +**Example - Most recent logs first:** +```sql +| sort - time +| head 20 +``` + +Try in playground → + +**Example - Slowest traces:** +```sql +source = otel-v1-apm-span-* +| sort - durationInNanos +| head 10 +``` + +--- + +### head + +Return the first N results. Default is 10. 
+ +**Syntax:** +``` +head [<size>] [from <offset>] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `<size>` | No | Number of results (default: 10) | +| `<offset>` | No | Number of results to skip | + +**Example:** +```sql +| sort - time +| head 50 +``` + +Try in playground → + +--- + +### reverse + +*(experimental, since 3.2)* + +Reverse the display order of results. + +**Syntax:** +``` +reverse +``` + +--- + +## Deduplication and ranking + +### dedup + +Remove duplicate documents based on field values. + +**Syntax:** +``` +dedup [<count>] <field-list> [keepempty=<bool>] [consecutive=<bool>] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `<field-list>` | Yes | Fields that define uniqueness | +| `<count>` | No | Number of duplicates to keep per group (default: 1) | +| `keepempty` | No | Keep documents with null values (default: false) | +| `consecutive` | No | Only remove consecutive duplicates (default: false) | + +**Example - One log per unique service:** +```sql +| dedup `resource.attributes.service.name` +``` + +Try in playground → + +--- + +### top + +Find the most common values of a field. + +**Syntax:** +``` +top [<N>] <field-list> [by <group-field>] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `<N>` | No | Number of top values (default: 10) | +| `<field-list>` | Yes | Fields to find top values for | + +**Example - Top services by log volume:** +```sql +| top 5 `resource.attributes.service.name` +``` + +Try in playground → + +--- + +### rare + +Find the least common values of a field - useful for spotting anomalies. + +**Syntax:** +``` +rare <field-list> [by <group-field>] +``` + +**Example - Rarest severity levels:** +```sql +| rare severityText +``` + +Try in playground → + +--- + +## Text extraction and pattern matching + +### parse + +Extract fields from text using regular expressions with named capture groups.
+ +**Syntax:** +``` +parse <field> <regex-pattern> +``` + +**Example - Extract HTTP status codes from log bodies:** +```sql +| parse body 'HTTP/\d\.\d"\s+(?<statusCode>\d{3})' +| stats count() as requests by statusCode +``` + +Try in playground → + +--- + +### grok + +*(stable, since 2.4)* + +Extract fields using grok patterns - a higher-level abstraction over regex using predefined patterns like `%{IP}`, `%{NUMBER}`, `%{GREEDYDATA}`. + +**Syntax:** +``` +grok <field> <grok-pattern> +``` + +**Example - Parse structured log lines:** +```sql +| grok body '%{IP:client_ip} - %{DATA:user} \[%{HTTPDATE:timestamp}\] "%{WORD:method} %{DATA:url}"' +| head 20 +``` + +Try in playground → + +--- + +### rex + +*(experimental, since 3.3)* + +Extract fields from text using regex named capture groups, with additional options for sed-mode text substitution. + +**Syntax:** +``` +rex [mode=<mode>] field=<field> <pattern> [max_match=<int>] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `field` | Yes | Source field | +| `<pattern>` | Yes | Regex with named groups `(?<name>...)` | +| `mode` | No | `extract` (default) or `sed` for substitution | +| `max_match` | No | Max matches to extract (default: 1) | + +**Example - Extract key-value pairs from logs:** +```sql +| rex field=body "status=(?<status>\w+)\s+latency=(?<latency>\d+)" +| head 20 +``` + +Try in playground → + +--- + +### spath + +*(experimental, since 3.3)* + +Extract fields from structured JSON data within a text field. + +**Syntax:** +``` +spath input=<field> [output=<field>] [path=<path>] +``` + +**Example:** +```sql +| spath input=body path=error.message output=error_msg +| where isnotnull(error_msg) +| stats count() by error_msg +``` + +Try in playground → + +--- + +### patterns + +*(stable, since 2.4)* + +Automatically discover log patterns by extracting and clustering similar log lines. This is one of PPL's most powerful observability features - it replaces hours of manual regex work with a single command.
+ +**Syntax:** +``` +patterns [method=simple_pattern|brain] [mode=label|aggregation] [max_sample_count=] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `` | Yes | Text field to analyze | +| `method` | No | `simple_pattern` (default) or `brain` for smarter clustering | +| `mode` | No | `label` adds pattern field, `aggregation` groups by pattern | +| `max_sample_count` | No | Sample logs per pattern (default: 10) | + +**Example - Discover log patterns:** +```sql +| patterns body method=simple_pattern mode=aggregation +``` + +Try in playground → + +--- + +## Data combination and enrichment + +### join + +*(stable, since 3.0)* + +Combine two datasets together. Supports inner, left, right, full, semi, anti, and cross joins. + +**Syntax:** +``` +[joinType] join [left=] [right=] on +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `joinType` | No | `inner` (default), `left`, `right`, `full`, `semi`, `anti`, `cross` | +| `on ` | Yes | Join condition | +| `` | Yes | Index name or subsearch | + +**Example - Correlate logs with trace data:** +```sql +source = logs-otel-v1* +| left join on traceId = traceId [ + source = otel-v1-apm-span-* + ] +``` + +--- + +### lookup + +*(experimental, since 3.0)* + +Enrich data by looking up values from a reference index. + +**Syntax:** +``` +lookup [as ] [replace|append ] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `` | Yes | Reference index to look up from | +| `` | Yes | Key field in the lookup index | +| `replace` or `append` | No | `replace` overwrites, `append` fills nulls only | + +--- + +### append + +*(experimental, since 3.3)* + +Append results of a subsearch to the bottom of the main search results. 
+ +**Syntax:** +``` +append [] +``` + +**Example - Combine log stats with trace stats:** +```sql +source = logs-otel-v1* +| stats count() as log_count by `resource.attributes.service.name` +| append [ + source = otel-v1-apm-span-* + | stats count() as log_count by serviceName as `resource.attributes.service.name` + ] +``` + +--- + +### appendcol + +*(experimental, since 3.1)* + +Append subsearch results as additional columns alongside the main results. + +**Syntax:** +``` +appendcol [override=] [] +``` + +--- + +### multisearch + +*(experimental, since 3.4)* + +Execute multiple search queries and combine results. + +**Syntax:** +``` +multisearch [] [, ]... +``` + +--- + +## Multivalue fields + +### mvcombine + +*(stable, since 3.4)* + +Combine values of a field across rows into a multivalue array. + +**Syntax:** +``` +mvcombine +``` + +--- + +### nomv + +*(stable, since 3.6)* + +Convert a multivalue field to a single string by joining elements with newlines. + +**Syntax:** +``` +nomv +``` + +--- + +### mvexpand + +*(stable, since 3.6)* + +Expand a multi-valued field into separate documents (one per value). + +**Syntax:** +``` +mvexpand [limit=] +``` + +--- + +## Machine learning + +### ml + +*(stable, since 2.5)* + +Apply machine learning algorithms directly in your query pipeline. 
+ +**Syntax (Anomaly Detection - RCF):** +``` +ml action='train' algorithm='rcf' [time_field=] [anomaly_rate=] +``` + +**Syntax (Clustering - K-Means):** +``` +ml action='train' algorithm='kmeans' [centroids=] [iterations=] [distance_type=] +``` + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `algorithm` | Yes | `rcf` (Random Cut Forest) or `kmeans` | +| `time_field` | Yes (RCF time-series) | Timestamp field for time-series anomaly detection | +| `centroids` | No | Number of clusters for kmeans (default: 2) | +| `anomaly_rate` | No | Expected anomaly rate for RCF (default: 0.005) | + +**Example - Anomaly detection on time-series data:** +```sql +source = otel-v1-apm-span-* +| stats avg(durationInNanos) as avg_latency by span(startTime, 1m) as minute +| ml action='train' algorithm='rcf' time_field='minute' +| where is_anomaly = 1 +``` + +--- + +### kmeans + +*(stable, since 1.3)* + +Apply k-means clustering directly on query results. + +**Syntax:** +``` +kmeans [centroids=] [iterations=] [distance_type=COSINE|L1|EUCLIDEAN] +``` + +--- + +## Metadata and debugging + +### describe + +*(stable, since 2.1)* + +Query the metadata (field names, types) of an index. + +**Syntax:** +``` +describe +``` + +**Example:** +```sql +describe logs-otel-v1* +``` + +--- + +### explain + +*(stable, since 3.1)* + +Show the execution plan of a query - useful for debugging and optimization. + +**Syntax:** +``` +explain [simple|standard|cost|extended] +``` + +--- + +### showdatasources + +*(stable, since 2.4)* + +List all configured data sources in the PPL engine. + +**Syntax:** +``` +show datasources +``` + +--- + +## Graph traversal + +### graphlookup + +*(experimental, since 3.6)* + +Perform recursive graph traversal on a collection using BFS - useful for tracing service dependency chains. 
+ +**Syntax:** +``` +graphlookup source= connectFromField= connectToField= as [maxDepth=] [depthField=] +``` + +--- + +## All commands at a glance + +| Command | Since | Status | Description | +|---------|-------|--------|-------------| +| [search](#search) | 1.0 | stable | Retrieve documents from an index | +| [where](#where) | 1.0 | stable | Filter with boolean expressions | +| [fields](#fields) | 1.0 | stable | Keep or remove fields | +| [table](#table) | 3.3 | experimental | Alias for fields with enhanced syntax | +| [rename](#rename) | 1.0 | stable | Rename fields | +| [eval](#eval) | 1.0 | stable | Evaluate expressions, create fields | +| [convert](#convert) | 3.5 | experimental | Convert field values to numeric | +| [replace](#replace) | 3.4 | experimental | Replace text in fields | +| [fillnull](#fillnull) | 3.0 | experimental | Fill null values | +| [expand](#expand) | 3.1 | experimental | Expand nested arrays | +| [flatten](#flatten) | 3.1 | experimental | Flatten struct fields | +| [stats](#stats) | 1.0 | stable | Aggregation and grouping | +| [eventstats](#eventstats) | 3.1 | experimental | Aggregation appended to each event | +| [streamstats](#streamstats) | 3.4 | experimental | Cumulative/rolling statistics | +| [bin](#bin) | 3.3 | experimental | Group into numeric/time buckets | +| [timechart](#timechart) | 3.3 | experimental | Time-based charts | +| [chart](#chart) | 3.4 | experimental | Aggregation with row/column splits | +| [trendline](#trendline) | 3.0 | experimental | Moving averages | +| [addtotals](#addtotals) | 3.5 | stable | Row and column totals | +| [addcoltotals](#addcoltotals) | 3.5 | stable | Column totals | +| [transpose](#transpose) | 3.5 | stable | Transpose rows to columns | +| [sort](#sort) | 1.0 | stable | Sort results | +| [reverse](#reverse) | 3.2 | experimental | Reverse result order | +| [head](#head) | 1.0 | stable | Return first N results | +| [dedup](#dedup) | 1.0 | stable | Remove duplicates | +| [top](#top) | 1.0 | 
stable | Most common values | +| [rare](#rare) | 1.0 | stable | Least common values | +| [parse](#parse) | 1.3 | stable | Regex field extraction | +| [grok](#grok) | 2.4 | stable | Grok pattern extraction | +| [rex](#rex) | 3.3 | experimental | Regex extraction with options | +| [regex](#regex) | 3.3 | experimental | Regex-based filtering | +| [spath](#spath) | 3.3 | experimental | JSON field extraction | +| [patterns](#patterns) | 2.4 | stable | Log pattern discovery | +| [join](#join) | 3.0 | stable | Combine datasets | +| [append](#append) | 3.3 | experimental | Append subsearch results | +| [appendcol](#appendcol) | 3.1 | experimental | Append as columns | +| [lookup](#lookup) | 3.0 | experimental | Enrich from lookup index | +| [multisearch](#multisearch) | 3.4 | experimental | Multi-query combination | +| [subquery](#subquery) | 3.0 | experimental | Nested query filtering | +| [ml](#ml) | 2.5 | stable | Machine learning algorithms | +| [kmeans](#kmeans) | 1.3 | stable | K-means clustering | +| [mvcombine](#mvcombine) | 3.4 | stable | Combine multivalue fields | +| [nomv](#nomv) | 3.6 | stable | Multivalue to string | +| [mvexpand](#mvexpand) | 3.6 | stable | Expand multivalue fields | +| [graphlookup](#graphlookup) | 3.6 | experimental | Recursive graph traversal | +| [describe](#describe) | 2.1 | stable | Index metadata | +| [explain](#explain) | 3.1 | stable | Query execution plan | +| [showdatasources](#showdatasources) | 2.4 | stable | List data sources | + +## Syntax conventions + +| Notation | Meaning | +|----------|---------| +| `` | Replace with actual value | +| `[optional]` | Can be omitted | +| `(a \| b)` | Required choice between options | +| `[a \| b]` | Optional choice between options | +| `...` | Preceding element can repeat | + +## Further reading + +- **[Function Reference](/docs/ppl/functions/)** - 200+ built-in functions +- **[Observability Examples](/docs/ppl/examples/)** - Real-world OTel queries +- **[PPL source 
documentation](https://github.com/opensearch-project/sql/tree/main/docs/user/ppl)** - Upstream PPL docs in the OpenSearch SQL plugin diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/dedup.md b/docs/starlight-docs/src/content/docs/ppl/commands/dedup.md new file mode 100644 index 00000000..57bad9a3 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/dedup.md @@ -0,0 +1,121 @@ +--- +title: "dedup" +description: "Remove duplicate documents based on field values - deduplicate results for unique combinations." +--- + +import { Aside } from '@astrojs/starlight/components'; + +The `dedup` command removes duplicate documents from search results based on the values of one or more specified fields. By default, it keeps the first occurrence of each unique combination of field values and discards subsequent duplicates. + +You can retain more than one duplicate per combination by specifying a count, preserve rows that have null values with `keepempty=true`, and limit deduplication to consecutive rows only with `consecutive=true`. + +## Syntax + +```sql +dedup [<int>] <field-list> [keepempty=<bool>] [consecutive=<bool>] +``` + +## Arguments + +| Argument | Required | Type | Default | Description | +|----------|----------|------|---------|-------------| +| `<field-list>` | Yes | Comma-delimited field names | -- | The fields used to determine uniqueness. At least one field is required. When multiple fields are specified, uniqueness is based on the combination of all field values. | +| `<int>` | No | Integer (> 0) | `1` | The number of duplicate documents to retain for each unique combination of field values. | +| `keepempty` | No | Boolean | `false` | When `true`, keeps documents where any field in the field list has a `NULL` value or is missing. When `false`, those documents are discarded. | +| `consecutive` | No | Boolean | `false` | When `true`, removes only consecutive duplicate documents rather than all duplicates. 
| + + + +## Usage notes + +- **Operates on field combinations.** When you specify multiple fields, `dedup` considers the combination of values across all those fields. For example, `dedup service, severity` keeps one row for each unique (service, severity) pair. +- **`keepempty=true` preserves rows with null values.** By default, rows where any of the specified fields is null are removed. Set `keepempty=true` to retain them. +- **`consecutive=true` only removes adjacent duplicates.** This is useful when your data is sorted and you want to collapse runs of identical values while preserving non-adjacent duplicates. +- **Common pattern: one representative per group.** Use `dedup` to get one sample document per unique value of a field. This is faster than `stats` when you need the actual document, not just a count. + +## Examples + +### Deduplicate on a single field + +Keep one log entry per unique severity level: + +```sql +| dedup severityText +``` + +Try in playground → + +### Keep multiple duplicates per group + +Keep up to 2 log entries per severity level: + +```sql +| dedup 2 severityText +``` + +Try in playground → + +### Deduplicate on multiple fields + +Keep one log entry per unique combination of service and severity: + +```sql +| dedup `resource.attributes.service.name`, severityText +``` + +Try in playground → + +### Preserve rows with null values + +Keep one log per unique traceId, including logs that have no traceId: + +```sql +| dedup traceId keepempty=true +``` + +Try in playground → + +### One representative error log per OTel service + +Get one sample error log from each service to quickly see what kinds of errors each service produces: + +```sql +| where severityText = 'ERROR' +| dedup `resource.attributes.service.name` +``` + +Try in playground → + +## Extended examples + +### Unique service-severity combinations with OTel context + +Find every distinct combination of service and severity level, showing one sample log body for each. 
This is useful for building a quick inventory of what each service is logging: + +```sql +| dedup `resource.attributes.service.name`, severityText +| sort `resource.attributes.service.name`, severityText +``` + +Try in playground → + +### Deduplicate traces to find one slow span per service + +Get one representative slow span (over 1 second) from each service in your OTel trace data: + +```sql +source = otel-v1-apm-span-* +| where durationInNanos > 1000000000 +| dedup serviceName +| sort - durationInNanos +``` + +## See also + +- [top](/docs/ppl/commands/top/) - Find the most common values of a field +- [rare](/docs/ppl/commands/rare/) - Find the least common values of a field +- [stats](/docs/ppl/commands/stats/) - Aggregate results when you need counts rather than sample documents +- [head](/docs/ppl/commands/head/) - Limit the number of results returned +- [PPL Command Reference](/docs/ppl/commands/) - All PPL commands diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/describe.md b/docs/starlight-docs/src/content/docs/ppl/commands/describe.md new file mode 100644 index 00000000..b7b8d4a7 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/describe.md @@ -0,0 +1,144 @@ +--- +title: "describe" +description: "Query index metadata - discover available fields, types, and schema information." +--- + +import { Aside } from '@astrojs/starlight/components'; + + + +The `describe` command queries index metadata, returning field names, data types, and schema information. It must be used as the **first command** in a PPL query -- it cannot appear after a pipe. This is your starting point when exploring an unfamiliar index. + +## Syntax + +```sql +describe [<dataSource>.][<schema>.]<tableName> +``` + +## Arguments + +### Required + +| Argument | Description | +|----------|-------------| +| `<tableName>` | The index or index pattern to describe. Supports wildcards (e.g., `logs-otel-v1*`). 
| + +### Optional + +| Argument | Default | Description | +|----------|---------|-------------| +| `<dataSource>` | OpenSearch default | The data source to query. | +| `<schema>` | Default schema | The schema containing the table. | + +## Output columns + +The `describe` command returns metadata rows with the following key columns: + +| Column | Description | +|--------|-------------| +| `TABLE_NAME` | Name of the index. | +| `COLUMN_NAME` | Name of the field. | +| `TYPE_NAME` | Data type of the field (e.g., `string`, `bigint`, `timestamp`, `object`, `nested`). | + +Additional columns include `TABLE_CAT`, `TABLE_SCHEM`, `DATA_TYPE`, `COLUMN_SIZE`, `NULLABLE`, `ORDINAL_POSITION`, and others following JDBC metadata conventions. + +## Usage notes + +- `describe` must be the first command in the query. You cannot pipe data into `describe`. +- Combine `describe` with `where` and `fields` to filter and focus on specific columns or types. +- Use wildcard index patterns to describe fields across multiple indices at once. +- The output helps you discover the correct field names and types before writing more complex queries -- especially useful for OTel indices where field names follow dotted semantic conventions. 
+ +## Examples + +### Describe an index + +List all fields and their types in the OTel logs index: + +```sql +describe logs-otel-v1* +``` + +Try in playground → + +### Find fields of a specific type + +Filter for all `bigint` (long) fields: + +```sql +describe logs-otel-v1* +| where TYPE_NAME = 'bigint' +``` + +Try in playground → + +### Find fields by name pattern + +Search for fields containing `service` in their name: + +```sql +describe logs-otel-v1* +| where like(COLUMN_NAME, '%service%') +``` + +Try in playground → + +### List all gen_ai fields + +Discover GenAI semantic convention fields in your OTel index: + +```sql +describe logs-otel-v1* +| where like(COLUMN_NAME, '%gen_ai%') +``` + +Try in playground → + +### Describe the trace index + +Explore the span index schema to understand available trace fields: + +```sql +describe otel-v1-apm-span-* +| sort COLUMN_NAME +``` + +## Extended examples + +### Compare schemas across OTel signal indices + +Describe both the log and trace indices to find common fields for cross-signal correlation: + +```sql +describe logs-otel-v1* +| where COLUMN_NAME IN ('traceId', 'spanId', 'time', 'severityText', 'body') +``` + +Try in playground → + +Then compare with the trace index: + +```sql +describe otel-v1-apm-span-* +| where COLUMN_NAME IN ('traceId', 'spanId', 'startTime', 'endTime', 'serviceName') +``` + +### Discover all nested object fields + +Find all object and nested field types to understand the document structure: + +```sql +describe logs-otel-v1* +| where TYPE_NAME = 'object' OR TYPE_NAME = 'nested' +| sort COLUMN_NAME +``` + +Try in playground → + +## See also + +- [fields](/docs/ppl/commands/fields/) -- select or exclude fields from query results +- [showdatasources](/docs/ppl/commands/#showdatasources) -- list all configured data sources +- [search](/docs/ppl/commands/search/) -- retrieve documents from an index diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/eval.md 
b/docs/starlight-docs/src/content/docs/ppl/commands/eval.md new file mode 100644 index 00000000..ddc0821a --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/eval.md @@ -0,0 +1,140 @@ +--- +title: "eval" +description: "Create computed fields by evaluating expressions - arithmetic, string operations, conditionals, and more." +--- + +## Description + +The `eval` command evaluates an expression and appends the result as a new field to each event in the search results. If a field with the same name already exists, its value is overwritten. Use `eval` whenever you need to derive new values: unit conversions, string manipulation, conditional categorization, date arithmetic, or type casting. + +`eval` supports arithmetic, string, date/time, conditional, and type conversion expressions. It is commonly paired with `stats` to prepare fields before aggregation or to compute derived metrics after aggregation. + +> **Note:** The `eval` command is executed on the coordinating node and is not pushed down to the OpenSearch query DSL. + +--- + +## Syntax + +```sql +eval <field> = <expression> [, <field> = <expression>]... +``` + +--- + +## Arguments + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `<field>` | Yes | The name of the field to create or update. If the field does not exist, a new field is added. If it already exists, its value is overwritten. | +| `<expression>` | Yes | The expression to evaluate. Supports arithmetic operators (`+`, `-`, `*`, `/`, `%`), string functions, date functions, conditional functions (`if()`, `case()`), type casts (`CAST`), and more. | + +--- + +## Usage notes + +- **Multiple assignments in a single `eval`**: Separate them with commas. This is more efficient than chaining multiple `eval` commands. 
+ ```sql + | eval duration_ms = durationInNanos / 1000000, status_label = if(`status.code` = 0, 'OK', 'Error') + ``` + +- **Later assignments can reference earlier ones**: Within the same `eval`, a field defined on the left can be used by an expression on the right. + ```sql + | eval doubled = value * 2, quadrupled = doubled * 2 + ``` + +- **Overwriting existing fields**: If you assign to an existing field name, the original value is replaced for all downstream commands. The original data in the index is not modified. + +- **String concatenation**: Use the `+` operator to concatenate strings. When mixing types, cast numeric values to strings first with `CAST(field AS STRING)`. + +- **Conditional expressions**: Use `if(condition, true_value, false_value)` for simple two-way branching, or `case(condition1, value1, condition2, value2, ... else default)` for multi-way branching. + +- **Works with all PPL functions**: Any function available in PPL (string, math, date, type conversion) can be used in an `eval` expression. + +- **No aggregation functions in `eval`**: Aggregation functions like `count()` or `avg()` belong in `stats`, not `eval`. Use `eval` after `stats` to compute derived metrics from aggregated values. 
+ +--- + +## Basic examples + +### Arithmetic -- convert nanoseconds to milliseconds + +```sql +source = otel-v1-apm-span-* +| eval duration_ms = durationInNanos / 1000000 +``` + +### String concatenation + +```sql +source = logs-otel-v1* +| eval service_severity = `resource.attributes.service.name` + ' - ' + severityText +``` + +Try in playground → + +### Conditional with `if()` + +```sql +source = logs-otel-v1* +| eval is_error = if(severityText = 'ERROR', 'yes', 'no') +``` + +Try in playground → + +### Multi-way conditional with `case()` + +```sql +source = otel-v1-apm-span-* +| eval latency_tier = case( + durationInNanos < 100000000, 'fast', + durationInNanos < 500000000, 'moderate', + durationInNanos < 1000000000, 'slow' + else 'critical') +``` + +### Type casting with string concatenation + +```sql +source = otel-v1-apm-span-* +| eval span_info = 'Service: ' + serviceName + ', Duration (ns): ' + CAST(durationInNanos AS STRING) +``` + +--- + +## Extended examples + +### OTel: Categorize log severity into alert levels + +Derive an `alert_level` field from the numeric severity of log events, useful for routing alerts or filtering dashboards. + +```sql +| eval alert_level = case( + severityNumber >= 21, 'CRITICAL', + severityNumber >= 17, 'ERROR', + severityNumber >= 13, 'WARN', + severityNumber >= 9, 'INFO' + else 'DEBUG') +| stats count() as cnt by alert_level, `resource.attributes.service.name` +``` + +Try in playground → + +### OTel: Build a composite service identifier + +Combine the service name and severity into a single field for downstream grouping or display. 
+ +```sql +| eval service_status = `resource.attributes.service.name` + ' [' + severityText + ']' +| head 20 +``` + +Try in playground → + +--- + +## See also + +- [stats](/docs/ppl/commands/stats/) -- aggregate results (often used after `eval`) +- [fields](/docs/ppl/commands/fields/) -- select which fields to display +- [where](/docs/ppl/commands/where/) -- filter results using expressions +- [sort](/docs/ppl/commands/sort/) -- order results by computed fields diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/eventstats.md b/docs/starlight-docs/src/content/docs/ppl/commands/eventstats.md new file mode 100644 index 00000000..44bccebb --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/eventstats.md @@ -0,0 +1,129 @@ +--- +title: "eventstats" +description: "Add aggregation statistics as new fields to every event - enrich each row with group-level context." +--- + +import { Tabs, TabItem, Aside } from '@astrojs/starlight/components'; + + + +The `eventstats` command calculates summary statistics and appends them as new fields to **every** event in your results. Unlike `stats`, which collapses rows into an aggregation table, `eventstats` preserves every original event and adds the computed values alongside. + +This makes `eventstats` ideal for comparing individual events against group-level context -- flagging outliers, calculating deviation from the norm, or adding percentile baselines to each row. + +## Syntax + +```sql +eventstats [bucket_nullable=] ... [by ] +``` + +## Arguments + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `` | Yes | One or more aggregation functions (e.g., `avg(field)`, `count()`, `max(field)`). Each produces a new field in every row. | +| `bucket_nullable` | No | Whether `null` values form their own group in `by` aggregations. Default is controlled by `plugins.ppl.syntax.legacy.preferred`. | +| `` | No | Group results by one or more fields or expressions. 
Syntax: `by [span-expression,] [field,]...`. Without a `by` clause, statistics are computed across all events. | +| `span(, )` | No | Split a numeric or time field into buckets. Example: `span(durationInNanos, 1000000000)` creates 1-second buckets; `span(time, 1h)` creates hourly buckets. | + +### Time units for span expressions + +`ms` (milliseconds), `s` (seconds), `m` (minutes), `h` (hours), `d` (days), `w` (weeks), `M` (months), `q` (quarters), `y` (years). + +## Supported aggregation functions + +`COUNT`, `SUM`, `AVG`, `MAX`, `MIN`, `VAR_SAMP`, `VAR_POP`, `STDDEV_SAMP`, `STDDEV_POP`, `DISTINCT_COUNT` / `DC`, `EARLIEST`, `LATEST`. + +## Usage notes + +- **Use `eventstats` when you need both the raw event and the aggregate.** If you only need the aggregation table, use `stats` instead -- it is faster. +- **Combine with `eval` and `where`** to calculate deviations or filter outliers. For example, compute `avg(latency)` per service with `eventstats`, then `eval deviation = latency - avg_latency` and `where deviation > threshold`. +- **Span expressions** let you bucket time or numeric fields, which is useful for comparing events within time windows. +- **`bucket_nullable=false`** excludes rows with `null` group-by values from aggregation (their aggregated field is also `null`). Use `bucket_nullable=true` to treat `null` as a valid group. + +## Examples + +### Average, sum, and count by group + +Calculate aggregate latency statistics per service and add them to every span: + +```sql +source = otel-v1-apm-span-* +| eventstats avg(durationInNanos), sum(durationInNanos), count() by serviceName +| head 50 +``` + +Every row retains its original fields, plus new aggregate columns with the group-level values. 
+ +### Count by span and group + +Count trace spans within 1-hour time buckets, grouped by service: + +```sql +source = otel-v1-apm-span-* +| eventstats count() as cnt by span(startTime, 1h) as time_bucket, serviceName +| head 50 +``` + +### Filter after enrichment + +Add the service-level average latency, then keep only spans that deviate significantly: + +```sql +source = otel-v1-apm-span-* +| eventstats avg(durationInNanos) as avg_duration by serviceName +| eval deviation = durationInNanos - avg_duration +| where abs(deviation) > avg_duration * 2 +| sort - deviation +``` + +### Null bucket handling + +Exclude `null` group-by values from aggregation: + +```sql +source = otel-v1-apm-span-* +| eventstats bucket_nullable=false count() as cnt by `status.code` +| head 50 +``` + +Rows where `status.code` is `null` receive `null` for `cnt`. + +## Extended examples + +### Add service average latency to each span + +Compute per-service average latency and attach it to every span, then identify outliers: + +```sql +source = otel-v1-apm-span-* +| eventstats avg(durationInNanos) as avg_duration by serviceName +| eval deviation = durationInNanos - avg_duration +| where deviation > avg_duration * 2 +| sort - deviation +| head 20 +``` + +### Flag high-severity log spikes per service + +Count logs per service and severity, then flag services with unusually high error counts: + +```sql +source = logs-otel-v1* +| eventstats count() as svc_error_count by `resource.attributes.service.name`, severityText +| where severityText = 'ERROR' +| eventstats avg(svc_error_count) as avg_errors +| where svc_error_count > avg_errors * 3 +| dedup `resource.attributes.service.name` +``` + +Try in playground → + +## See also + +- [stats](/docs/ppl/commands/stats/) - aggregate and collapse rows +- [streamstats](/docs/ppl/commands/streamstats/) - cumulative and rolling window statistics +- [trendline](/docs/ppl/commands/trendline/) - moving averages +- [Command Reference](/docs/ppl/commands/) - all 
PPL commands diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/expand.md b/docs/starlight-docs/src/content/docs/ppl/commands/expand.md new file mode 100644 index 00000000..5a97b905 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/expand.md @@ -0,0 +1,109 @@ +--- +title: "expand" +description: "Expand nested array fields into multiple documents - one row per array element." +--- + +import { Aside } from '@astrojs/starlight/components'; + + + +The `expand` command transforms a single document containing a nested array field into multiple documents, one per array element. All other fields in the original document are duplicated across the resulting rows. This is useful for working with OTel attributes stored as arrays or nested structures. + +## Syntax + +```sql +expand <field> [as <alias>] +``` + +## Arguments + +### Required + +| Argument | Description | +|----------|-------------| +| `<field>` | The array field to expand. Must be a nested array type. | + +### Optional + +| Argument | Default | Description | +|----------|---------|-------------| +| `as <alias>` | Original field name | An alias for the expanded field in the output. | + +## Usage notes + +- Only **nested array** fields are supported. Primitive fields that store array-like strings cannot be expanded. For string fields containing JSON arrays, use [spath](/docs/ppl/commands/spath/) to parse them first. +- If the array field is empty (`[]`), the row is retained with the expanded field set to `null`. +- Expanding a field with N elements produces N rows. Be mindful of result set size when expanding large arrays. +- After expansion, each row contains the individual array element (or its alias), along with all other fields from the original document duplicated. +- Combine `expand` with [flatten](/docs/ppl/commands/flatten/) to first expand an array of objects, then flatten each object's fields into top-level columns. 
+ +## Examples + + + +### Expand an array field + +Expand the `resource.attributes` array from OTel log records into individual rows, one per attribute: + +```sql +source = logs-otel-v1* +| expand resource.attributes +``` + +### Expand with an alias + +Expand and rename the expanded field for clarity: + +```sql +source = logs-otel-v1* +| expand resource.attributes as attr +``` + +### Filter after expansion + +Expand resource attributes into rows, then filter for a specific attribute key: + +```sql +source = logs-otel-v1* +| expand resource.attributes as attr +| flatten attr +| where key = 'service.name' +``` + +## Extended examples + +### Expand and flatten OTel resource attributes + +OTel data often stores attributes as arrays of key-value objects. Expand the array first, then flatten each object to access individual attributes: + +```sql +source = logs-otel-v1* +| expand resource.attributes as attr +| flatten attr +``` + +Try in playground → + +### Expand nested scope attributes for instrumentation analysis + +Examine individual scope attributes from OTel log records to understand which instrumentation libraries are producing logs: + +```sql +source = logs-otel-v1* +| expand instrumentationScope.attributes as scope_attr +| flatten scope_attr +| stats count() as log_count by key, value +| sort - log_count +``` + +Try in playground → + +## See also + +- [flatten](/docs/ppl/commands/flatten/) -- flatten struct/object fields into top-level columns +- [spath](/docs/ppl/commands/spath/) -- parse JSON strings before expanding +- [eval](/docs/ppl/commands/eval/) -- transform expanded values diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/fields.md b/docs/starlight-docs/src/content/docs/ppl/commands/fields.md new file mode 100644 index 00000000..6694dd95 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/fields.md @@ -0,0 +1,142 @@ +--- +title: "fields" +description: "Keep or remove fields from search results - control which columns 
appear in output." +--- + +## Description + +The `fields` command specifies which fields (columns) to include in or exclude from the search results. It operates in two modes: + +- **Include mode** (`+`, default) - keeps only the listed fields and drops everything else. +- **Exclude mode** (`-`) - removes the listed fields and keeps everything else. + +Use `fields` to reduce clutter, focus on relevant data, and improve query performance by limiting the amount of data transferred. + +## Syntax + +```sql +fields [+|-] +``` + +## Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `` | Yes | A comma-delimited or space-delimited list of field names. Supports wildcard patterns (`*`). | +| `+` or `-` | No | `+` (include mode, default) keeps only the listed fields. `-` (exclude mode) removes the listed fields from the output. | + +## Usage notes + +- **Reduces data transfer**: Selecting only the fields you need reduces the amount of data returned from OpenSearch, which can significantly improve query performance for wide indices with many fields. +- **Wildcard patterns**: Use `*` to match field names by prefix (`severity*`), suffix (`*Id`), or substring (`*attr*`). Wildcards are expanded against the index schema. +- **Field order**: The order of fields in the output matches the order you specify in the `fields` command. +- **Automatic deduplication**: If a field is both explicitly listed and matched by a wildcard pattern, it appears only once in the output. +- **Backtick-quoted field names**: OTel fields with dots in their names (e.g., `resource.attributes.service.name`) must be enclosed in backticks (`` ` ``) to prevent them from being interpreted as nested field access. For example: `` `resource.attributes.service.name` ``. +- **Space or comma delimiters**: Fields can be separated by commas, spaces, or a mix of both. All three forms are equivalent: `fields a, b, c`, `fields a b c`, `fields a, b c`. 
+- **Multiple fields commands**: You can chain `fields` commands. For example, first include a broad set, then exclude specific fields from that set. +- **Full wildcard**: Use `fields *` or `` fields `*` `` to select all fields in the index schema, including fields with null values. Use the backtick form if the plain `*` does not return all expected fields. + +## Basic examples + +### Select specific fields + +Return only the timestamp, log body, and severity from log results: + +```sql +source=logs-otel-v1* +| fields time, body, severityText +``` + +Try in playground → + +### Exclude a field + +Start with a set of fields, then remove one: + +```sql +source=logs-otel-v1* +| fields time, body, severityText, traceId +| fields - traceId +``` + +Try in playground → + +### Space-delimited syntax + +Fields can be separated by spaces instead of commas: + +```sql +source=logs-otel-v1* +| fields time body severityText +``` + +Try in playground → + +### Prefix wildcard + +Select all fields whose names start with `severity`: + +```sql +source=logs-otel-v1* +| fields severity* +``` + +Try in playground → + +### Suffix wildcard + +Select all fields whose names end with `Id`: + +```sql +source=logs-otel-v1* +| fields *Id +``` + +Try in playground → + +## Extended examples + +### Select OTel log fields with backticks + +When working with OpenTelemetry data, field names contain dots. Use backticks to reference them correctly. + +```sql +source=logs-otel-v1* +| where severityText = 'ERROR' +| fields time, body, severityText, `resource.attributes.service.name`, `attributes.gen_ai.operation.name` +| head 20 +``` + +Try in playground → + +### Exclude verbose fields for a clean log view + +Remove high-cardinality or noisy fields to focus on the essentials during an investigation. This is especially useful when browsing raw log data in Discover. 
+ +```sql +source=logs-otel-v1* +| where severityNumber >= 17 +| fields - `attributes.event.domain`, `attributes.event.name`, instrumentationScope +| head 50 +``` + +Try in playground → + +### Wildcard to select attribute groups + +Use a wildcard pattern to grab all GenAI-related attributes at once: + +```sql +source=logs-otel-v1* +| where ISNOTNULL(`attributes.gen_ai.operation.name`) +| fields time, body, `attributes.gen_ai*` +| head 20 +``` + +Try in playground → + +## See also + +- [`search`](/docs/ppl/commands/search/) - The starting point of every PPL query +- [`where`](/docs/ppl/commands/where/) - Filter results using boolean expressions +- [PPL Commands](/docs/ppl/commands/) - Full command reference diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/fillnull.md b/docs/starlight-docs/src/content/docs/ppl/commands/fillnull.md new file mode 100644 index 00000000..682369b7 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/fillnull.md @@ -0,0 +1,151 @@ +--- +title: "fillnull" +description: "Replace null values with specified defaults - clean up missing data for analysis and visualization." +--- + +import { Aside } from '@astrojs/starlight/components'; + + + +The `fillnull` command replaces null values in one or more fields with a specified value. This is essential for cleaning up data before aggregation, visualization, or export -- null values can break charts and skew statistics. + + + +## Syntax + +Three equivalent syntax forms are available: + +```sql +fillnull with [in ] +``` + +```sql +fillnull using = [, = ]... +``` + +```sql +fillnull value= [] +``` + +## Arguments + +### Required + +| Argument | Description | +|----------|-------------| +| `` | The replacement value for null fields. | + +### Optional + +| Argument | Description | +|----------|-------------| +| `` | Fields in which to replace nulls. Comma-delimited with `with`/`using` syntax, space-delimited with `value=` syntax. When omitted, all fields are processed. 
| +| ` = ` | Per-field replacement values (only with `using` syntax). | + +## Usage notes + +- When applying the same value to all fields without specifying field names, **all fields must be of the same type**. For mixed types, use separate `fillnull` commands or specify fields explicitly. +- The replacement value type must match the field type. You cannot fill a string field with a numeric value or vice versa. +- The `using` syntax is the most flexible form because it lets you assign different default values to different fields in a single command. +- Use `fillnull` before `stats` or `timechart` to ensure null values do not create unwanted `NULL` categories in grouped results. + +## Examples + +### Fill missing service names with a default + +Replace null service name values with `unknown`: + +```sql +source = logs-otel-v1* +| fillnull with 'unknown' in `resource.attributes.service.name` +| stats count() as log_count by `resource.attributes.service.name` +``` + +Try in playground → + +### Fill multiple fields with the same value + +Replace nulls in both `severityText` and `resource.attributes.service.name`: + +```sql +source = logs-otel-v1* +| fillnull with 'N/A' in severityText, `resource.attributes.service.name` +``` + +Try in playground → + +### Per-field defaults with the using syntax + +Assign different default values to different fields: + +```sql +source = logs-otel-v1* +| fillnull using severityText = 'INFO', `resource.attributes.service.name` = 'unknown-service' +``` + +Try in playground → + +### Fill all fields using the value= syntax + +Replace nulls across all string fields with a placeholder: + +```sql +source = logs-otel-v1* +| fillnull value='' +``` + +Try in playground → + +### Clean data before visualization + +Fill nulls before a timechart to prevent `NULL` categories from appearing in charts: + +```sql +source = logs-otel-v1* +| fillnull with 'unknown' in `resource.attributes.service.name` +| timechart timefield=time span=5m count() by 
`resource.attributes.service.name` +``` + +Try in playground → + +## Extended examples + +### Clean OTel log data for a service health dashboard + +Fill multiple fields with appropriate defaults before aggregating for a dashboard panel: + +```sql +source = logs-otel-v1* +| fillnull using severityText = 'UNSET', `resource.attributes.service.name` = 'unknown' +| stats count() as total, + sum(case(severityText = 'ERROR' OR severityText = 'FATAL', 1 else 0)) as errors + by `resource.attributes.service.name` +| eval error_rate = round(errors * 100.0 / total, 2) +| sort - error_rate +``` + +Try in playground → + +### Fill missing trace context for log-trace correlation + +When correlating logs with traces, fill missing trace IDs to identify uncorrelated logs: + +```sql +source = logs-otel-v1* +| fillnull using traceId = 'no-trace', spanId = 'no-span' +| stats count() as log_count by traceId +| where traceId = 'no-trace' +``` + +Try in playground → + +## See also + +- [eval](/docs/ppl/commands/eval/) -- create computed fields or conditional replacements with `case()` +- [where](/docs/ppl/commands/where/) -- filter out null values with `IS NOT NULL` +- [stats](/docs/ppl/commands/stats/) -- aggregation (benefits from clean non-null data) +- [timechart](/docs/ppl/commands/timechart/) -- time-based charts (null `by` fields create `NULL` categories) diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/flatten.md b/docs/starlight-docs/src/content/docs/ppl/commands/flatten.md new file mode 100644 index 00000000..436627e1 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/flatten.md @@ -0,0 +1,103 @@ +--- +title: "flatten" +description: "Flatten struct or object fields into separate top-level fields - simplify nested data structures." +--- + +import { Aside } from '@astrojs/starlight/components'; + + + +The `flatten` command converts a struct or object field into individual top-level fields within a document. 
Each key in the struct becomes its own column. The resulting fields are ordered **lexicographically** by their original key names. + +## Syntax + +```sql +flatten <field> [as (<alias-list>)] +``` + +## Arguments + +### Required + +| Argument | Description | +|----------|-------------| +| `<field>` | The struct or object field to flatten. Only object and nested field types are supported. | + +### Optional + +| Argument | Default | Description | +|----------|---------|-------------| +| `as (<alias-list>)` | Original key names | Comma-separated aliases for the flattened fields. Must be enclosed in parentheses if more than one alias. The number of aliases must match the number of keys, and they map in **lexicographic order** of the original keys. | + +## Usage notes + +- Do **not** apply `flatten` to array fields. Use [expand](/docs/ppl/commands/expand/) to split arrays into rows first, then `flatten` each resulting object. +- When a field contains a nested array, only the first element of the array is flattened. +- The `flatten` command may not work as expected if flattened fields are hidden. For example, `source=logs-otel-v1* | fields instrumentationScope | flatten instrumentationScope` fails because sub-fields like `instrumentationScope.name` are hidden after `fields instrumentationScope`. Instead, use `source=logs-otel-v1* | flatten instrumentationScope`. +- Aliases must follow the lexicographic order of original keys. For a struct with keys `b`, `c`, `Z`, provide aliases in the order `Z`, `b`, `c` (uppercase sorts before lowercase). 
+ +## Examples + + + +### Flatten an object field + +Flatten the `instrumentationScope` object from OTel log records into its component fields (`name`, `version`, `attributes`): + +```sql +source = logs-otel-v1* +| flatten instrumentationScope +``` + +### Flatten with aliases + +Rename flattened fields using aliases (in lexicographic order of original keys: `attributes`, `name`, `version`): + +```sql +source = logs-otel-v1* +| flatten instrumentationScope as (scope_attrs, scope_name, scope_version) +``` + +### Flatten after filtering + +Filter for error logs first, then flatten to reduce data volume before restructuring: + +```sql +source = logs-otel-v1* +| where severityText = 'ERROR' +| flatten instrumentationScope +``` + +## Extended examples + +### Flatten OTel span attributes for analysis + +OTel span documents store HTTP metadata in nested objects. Flatten them for easier querying: + +```sql +source = otel-v1-apm-span-* +| flatten attributes +| where `http.status_code` >= 400 +| stats count() as error_count by serviceName, `http.status_code` +``` + +### Expand and flatten a nested array of objects + +Combine `expand` and `flatten` to work with arrays of structured objects. 
First expand the array into rows, then flatten each object: + +```sql +source = logs-otel-v1* +| expand resource.attributes as attr +| flatten attr +| sort - key +``` + +## See also + +- [expand](/docs/ppl/commands/expand/) -- expand array fields into multiple rows (use before `flatten` for arrays of objects) +- [spath](/docs/ppl/commands/spath/) -- extract fields from JSON strings +- [fields](/docs/ppl/commands/fields/) -- select or exclude fields from results diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/grok.md b/docs/starlight-docs/src/content/docs/ppl/commands/grok.md new file mode 100644 index 00000000..21c85577 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/grok.md @@ -0,0 +1,173 @@ +--- +title: "grok" +description: "Extract fields using grok patterns - a higher-level alternative to regex with 200+ predefined patterns." +--- + +import { Aside } from '@astrojs/starlight/components'; + +The `grok` command parses a text field using grok pattern syntax and appends the extracted fields to the search results. Grok provides over 200 predefined patterns (`%{IP}`, `%{NUMBER}`, `%{HOSTNAME}`, etc.) that wrap common regular expressions, making extraction more readable and less error-prone than writing raw regex. + + + +## Syntax + +```sql +grok +``` + +## Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `` | Yes | The text field to parse. | +| `` | Yes | A grok pattern using `%{PATTERN:fieldname}` syntax. Each `%{PATTERN:fieldname}` creates a new string field. If a field with the same name already exists, it is overwritten. Raw regex can be mixed with grok patterns. | + +## Usage notes + +- Grok patterns are built on top of regular expressions but provide a more readable, reusable syntax. +- Use the `%{PATTERN:fieldname}` syntax to extract a named field. If you omit `:fieldname`, the match is consumed but no field is created. 
+- The grok pattern must match the **entire** string from start to end for extraction to succeed. Use `%{GREEDYDATA}` or `%{GREEDYDATA:name}` at the end of your pattern to consume any remaining text (including trailing newlines via `[\s\S]`). +- When parsing a null field, the result is an empty string. +- Each unnamed `%{PATTERN}` must be unique within a single grok expression, or you will get a "Duplicate key" error. Give each pattern a unique field name to avoid this. +- Grok shares the same [limitations](/docs/ppl/commands/parse/#limitations) as the `parse` command. + +### Commonly used grok patterns + +| Pattern | Matches | Example | +|---------|---------|---------| +| `%{IP:ip}` | IPv4 or IPv6 address | `192.168.1.1` | +| `%{NUMBER:num}` | Integer or floating-point number | `42`, `3.14` | +| `%{WORD:word}` | Single word (no whitespace) | `ERROR` | +| `%{HOSTNAME:host}` | Hostname or FQDN | `api.example.com` | +| `%{GREEDYDATA:msg}` | Everything (greedy match) | any remaining text | +| `%{IPORHOST:server}` | IP address or hostname | `10.0.0.1` or `web01` | +| `%{URI:url}` | Full URI | `https://example.com/path?q=1` | +| `%{URIPATH:path}` | URI path component | `/api/v1/agents` | +| `%{POSINT:code}` | Positive integer | `200`, `404` | +| `%{DATA:val}` | Non-greedy match (minimal) | short text segments | + +## Basic examples + +### Extract HTTP method, path, and status from Envoy access logs + +The frontend-proxy service emits Envoy access logs in the body field. Use grok patterns to parse the timestamp, HTTP method, request path, and response status: + +```sql +source=logs-otel-v1* +| where like(body, '%HTTP/1.1"%') +| grok body '\[%{DATA:ts}\] "%{WORD:method} %{DATA:path} HTTP/%{DATA:ver}" %{POSINT:status} %{GREEDYDATA:rest}' +| head 20 +``` + +| body | method | path | status | +|------|--------|------|--------| +| [2026-02-26T18:04:21.634Z] "GET /api/data HTTP/1.1" 200 - via_upstream ... 
| GET | /api/data | 200 | +| [2026-02-26T18:04:23.059Z] "POST /api/product-ask-ai-assistant/0PUK6V6EV0 HTTP/1.1" 200 ... | POST | /api/product-ask-ai-assistant/0PUK6V6EV0 | 200 | +| [2026-02-26T18:04:21.629Z] "GET /api/data/ HTTP/1.1" 308 - via_upstream ... | GET | /api/data/ | 308 | + +Try in playground → + +### Override an existing field + +Strip the Kafka broker prefix from log bodies, keeping only the message content: + +```sql +source=logs-otel-v1* +| where `resource.attributes.service.name` = 'kafka' +| where like(body, '%Broker%Creating%') +| grok body '\[%{DATA}\] %{GREEDYDATA:body}' +| head 20 +``` + +| body | +|------| +| Creating new partition __consumer_offsets-33 with topic id _xZjVwc_TO2HCCnHkcNIDg. | +| Creating new partition __consumer_offsets-15 with topic id _xZjVwc_TO2HCCnHkcNIDg. | +| Creating new partition __consumer_offsets-48 with topic id _xZjVwc_TO2HCCnHkcNIDg. | + +Try in playground → + +### Extract component name and broker ID from Kafka logs + +Use grok to parse the `[Component id=N]` prefix from Kafka broker log bodies: + +```sql +source=logs-otel-v1* +| where `resource.attributes.service.name` = 'kafka' +| grok body '\[%{DATA:component} id=%{NUMBER:brokerId}\] %{GREEDYDATA:message}' +| where length(component) > 0 +| head 20 +``` + +| body | component | brokerId | message | +|------|-----------|----------|---------| +| [Broker id=1] Creating new partition __consumer_offsets-33 ... | Broker | 1 | Creating new partition __consumer_offsets-33 ... | +| [RaftManager id=1] Completed transition to Leader ... | RaftManager | 1 | Completed transition to Leader ... | +| [QuorumController id=1] The request from broker 1 ... | QuorumController | 1 | The request from broker 1 ... 
| + +Try in playground → + +### Aggregate HTTP requests by method and status + +Parse Envoy access logs and count requests grouped by HTTP method and status code: + +```sql +source=logs-otel-v1* +| where `resource.attributes.service.name` = 'frontend-proxy' +| head 1000 +| grok body '\[%{DATA:ts}\] "%{WORD:method} %{DATA:path} HTTP/%{DATA:ver}" %{POSINT:status} %{GREEDYDATA:rest}' +| where length(method) > 0 +| stats count() as requests by method, status +| sort - requests +``` + +Try in playground → + +## Extended examples + +### Extract the first word from OTel log bodies + +OpenTelemetry log bodies often start with a keyword that indicates the log type. Use grok to extract the first word and aggregate: + +```sql +source=logs-otel-v1* +| head 1000 +| grok body '%{WORD:first} %{GREEDYDATA:rest}' +| where length(first) > 0 +| stats count() as occurrences by first +| sort - occurrences +| head 20 +``` + +This extracts the first word from each log body, then counts occurrences to identify the most common log message prefixes across all services. 
+ +Try in playground → + +### Identify top endpoints from Envoy access logs + +Parse Envoy access log bodies and aggregate by HTTP method and request path to find the busiest endpoints: + +```sql +source=logs-otel-v1* +| where `resource.attributes.service.name` = 'frontend-proxy' +| head 1000 +| grok body '\[%{DATA:ts}\] "%{WORD:method} %{DATA:path} HTTP/%{DATA:ver}" %{POSINT:status} %{GREEDYDATA:rest}' +| where length(method) > 0 +| stats count() as requests by method, path +| sort - requests +| head 20 +``` + +Try in playground → + + + +## See also + +- [parse](/docs/ppl/commands/parse/) -- extract fields using raw Java regex (more control, less readability) +- [rex](/docs/ppl/commands/rex/) -- regex extraction with sed-mode text replacement and multiple matches +- [patterns](/docs/ppl/commands/patterns/) -- automatically discover log patterns without writing any patterns diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/head.md b/docs/starlight-docs/src/content/docs/ppl/commands/head.md new file mode 100644 index 00000000..b7d7b249 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/head.md @@ -0,0 +1,126 @@ +--- +title: "head" +description: "Return the first N results from the search - limit output for exploration and top-N queries." +--- + +import { Aside } from '@astrojs/starlight/components'; + +The `head` command returns the first N results from a search result. The default number of results is 10. An optional offset skips a specified number of results before returning, enabling simple pagination. + +`head` is commonly placed at the end of a pipeline after `sort` to implement top-N queries (for example, "show the 10 slowest traces"). During exploration, always use `head` to limit the volume of data scanned and returned. 
+ + + +## Syntax + +```sql +head [<size>] [from <offset>] +``` + +## Arguments + +| Argument | Required | Type | Default | Description | +|----------|----------|------|---------|-------------| +| `<size>` | No | Integer | `10` | The number of results to return. Must be a positive integer. | +| `<offset>` | No | Integer | `0` | The number of results to skip before returning. Used with the `from` keyword. Must be a non-negative integer. | + +## Usage notes + +- **Always use during exploration.** Adding `head` at the end of a query prevents scanning the entire result set when you only need a sample. +- **Combine with `sort` for top-N patterns.** The idiomatic way to get "top N by some metric" in PPL is `sort - <field> | head N`. +- **Offset enables simple pagination.** Use `head <size> from <offset>` to page through results. For example, `head 10 from 20` returns results 21 through 30. +- **Order matters.** `head` operates on whatever the pipeline has produced up to that point. Placing it before `sort` limits the rows that get sorted; placing it after `sort` limits the sorted output. 
+ +## Examples + +### Return the default number of results + +Return the first 10 log entries (the default): + +```sql +| head +``` + +Try in playground → + +### Return a specific number of results + +Return the first 50 results: + +```sql +| head 50 +``` + +Try in playground → + +### Skip results with an offset + +Return 10 results starting from the 21st result (skip the first 20): + +```sql +| head 10 from 20 +``` + +Try in playground → + +### Top-N pattern: slowest traces + +Combine `sort` and `head` to find the 10 slowest spans: + +```sql +source = otel-v1-apm-span-* +| sort - durationInNanos +| head 10 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20sort%20-%20durationInNanos%0A%7C%20head%2010')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Top-N pattern: services with the most errors (OTel logs) + +Count error logs per service, sort descending, and return the top 5: + +```sql +| where severityText = 'ERROR' +| stats count() as error_count by `resource.attributes.service.name` +| sort - error_count +| head 5 +``` + +Try in playground → + +## Extended examples + +### Paginate through recent error logs + +Page through error logs 20 at a time. 
This query returns the second page (results 21-40): + +```sql +| where severityText = 'ERROR' +| sort - time +| head 20 from 20 +``` + +Try in playground → + +### Sample logs from each OTel service + +Get a quick sample of up to 5 logs per service with `dedup`, which applies a per-service limit where `head` would limit the overall result, then sort the sample by time: + +```sql +source = logs-otel-v1* +| dedup 5 `resource.attributes.service.name` +| sort - time +``` + +Try in playground → + +This is useful for initial exploration of what data each service is producing, without scanning the entire index. + +## See also + +- [sort](/docs/ppl/commands/sort/) - Sort results before applying `head` for top-N queries +- [dedup](/docs/ppl/commands/dedup/) - Deduplicate results for unique combinations +- [PPL Command Reference](/docs/ppl/commands/) - All PPL commands +- [Observability Examples](/docs/ppl/examples/) - Real-world OTel queries diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/join.md b/docs/starlight-docs/src/content/docs/ppl/commands/join.md new file mode 100644 index 00000000..4103620a --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/join.md @@ -0,0 +1,167 @@ +--- +title: "join" +description: "Combine two datasets together - correlate logs with traces, enrich data from reference indices." +--- + +import { Tabs, TabItem, Aside } from '@astrojs/starlight/components'; + + + +The `join` command combines two datasets by matching rows on a condition. The left side is your current pipeline (an index or piped commands); the right side is another index or a subsearch enclosed in square brackets. + +Use `join` to correlate logs with traces, enrich spans with service metadata, or combine any two datasets that share a common field. + +## Syntax + +The `join` command supports two syntax forms: basic and extended. 
+ +### Basic syntax + +```sql +[joinType] join [left = ] [right = ] (on | where) +``` + +### Extended syntax + +```sql +join [type=] [overwrite=] [max=] ( | [left = ] [right = ] (on | where) ) +``` + +## Arguments + +### Basic syntax parameters + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `joinType` | No | The type of join. Valid values: `inner` (default), `left`, `right`, `full`, `semi`, `anti`, `cross`. | +| `left = ` | No | Alias for the left dataset to disambiguate shared field names. Must appear before `right`. | +| `right = ` | No | Alias for the right dataset to disambiguate shared field names. | +| `` | Yes | A comparison expression placed after `on` or `where` that specifies how to match rows. | +| `` | Yes | The right-side dataset. Can be an index name or a subsearch in `[ ... ]`. | + +### Extended syntax parameters + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `type` | No | Join type when using extended syntax. Valid values: `inner` (default), `left`, `outer` (alias for `left`), `right`, `full`, `semi`, `anti`, `cross`. | +| `overwrite` | No | When using a field list, whether right-side fields with duplicate names overwrite left-side fields. Default: `true`. | +| `max` | No | Maximum number of right-side matches per left row. Default: `0` (unlimited) when legacy mode is enabled; `1` otherwise. | +| `` | No | Common field names used to build the join condition automatically. Fields must exist in both datasets. | +| `` | Yes | A comparison expression placed after `on` or `where` that specifies how to match rows. | +| `` | Yes | The right-side dataset. Can be an index name or a subsearch in `[ ... ]`. | + +## Join types + +| Type | Keyword | Description | +|------|---------|-------------| +| Inner | `inner join` | Returns only rows with a match on both sides. This is the default. | +| Left outer | `left join` | Returns all left rows. Unmatched right fields are `null`. 
| +| Right outer | `right join` | Returns all right rows. Unmatched left fields are `null`. Requires `plugins.calcite.all_join_types.allowed = true`. | +| Full outer | `full join` | Returns all rows from both sides. Unmatched fields are `null` on the missing side. Requires `plugins.calcite.all_join_types.allowed = true`. | +| Left semi | `left semi join` | Returns left rows that have at least one match on the right. No right-side fields are included. | +| Left anti | `left anti join` | Returns left rows that have **no** match on the right. Useful for finding orphaned records. | +| Cross | `cross join` | Returns the Cartesian product of both sides. Requires `plugins.calcite.all_join_types.allowed = true`. | + +## Usage notes + +- **Assign aliases** when both sides share field names. Without aliases, ambiguous field names are automatically prefixed with the table name or alias (e.g., `table1.id`, `table2.id`). +- **Keep the right side small.** The right dataset is loaded into memory. Filter or limit the right-side subsearch to keep queries efficient. +- **Right, full, and cross joins** are disabled by default for performance reasons. Enable them by setting `plugins.calcite.all_join_types.allowed` to `true`. +- **Subsearch row limit.** The maximum number of rows from a subsearch is controlled by `plugins.ppl.join.subsearch_maxout` (default: `50000`). +- When using the extended syntax with a **field list**, duplicate field names are deduplicated based on the `overwrite` setting. 
+ +## Examples + +### Join two indexes + +Correlate log entries with trace spans using `traceId` to see which spans produced each log line: + +```sql +source = logs-otel-v1* +| inner join left=l right=r ON l.traceId = r.traceId otel-v1-apm-span-* +``` + +### Join with a subsearch + +Join logs with a filtered subset of trace spans - only slow spans above a latency threshold: + +```sql +source = logs-otel-v1* as l +| where severityText = 'ERROR' +| left join ON l.traceId = r.traceId [ + source = otel-v1-apm-span-* + | where durationInNanos > 5000000000 + | sort - durationInNanos + | head 100 + ] as r +``` + +### Join using a field list (extended syntax) + +Join logs with trace spans on the shared `traceId` field using the field-list shorthand: + +```sql +source = logs-otel-v1* +| where severityText = 'ERROR' +| join type=left overwrite=true traceId [ + source = otel-v1-apm-span-* + | where durationInNanos > 1000000000 + | sort - durationInNanos + | head 100 + ] +``` + +### Semi join - find logs with matching spans + +Return only log events that have at least one matching trace span: + +```sql +source = logs-otel-v1* +| left semi join left=l right=r on l.traceId = r.traceId otel-v1-apm-span-* +``` + +### Anti join - find orphaned logs without spans + +Return log events that have no matching trace span - useful for finding gaps in instrumentation: + +```sql +source = logs-otel-v1* +| left anti join left=l right=r on l.traceId = r.traceId otel-v1-apm-span-* +``` + +## Extended examples + +### Correlate logs with trace spans + +Join log events with trace span data using `traceId` to see which spans produced each log line: + +```sql +source = logs-otel-v1* +| left join left=l right=r on l.traceId = r.traceId [ + source = otel-v1-apm-span-* + ] +| head 50 +``` + +### Enrich spans with service map data + +Join raw spans with the service map index to add dependency context: + +```sql +source = otel-v1-apm-span-* +| inner join left=span right=svc on span.serviceName = 
svc.serviceName [ + source = otel-v2-apm-service-map* + | dedup serviceName + ] +| sort - span.durationInNanos +| head 20 +``` + +## See also + +- [lookup](/docs/ppl/commands/lookup/) - simpler enrichment from a reference index +- [subquery](/docs/ppl/commands/) - filter using nested queries +- [append](/docs/ppl/commands/) - stack results vertically instead of joining +- [Command Reference](/docs/ppl/commands/) - all PPL commands diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/lookup.md b/docs/starlight-docs/src/content/docs/ppl/commands/lookup.md new file mode 100644 index 00000000..0489494e --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/lookup.md @@ -0,0 +1,134 @@ +--- +title: "lookup" +description: "Enrich events with data from a lookup index - add context like team names, environment labels, or cost data." +--- + +import { Tabs, TabItem, Aside } from '@astrojs/starlight/components'; + + + +The `lookup` command enriches your search results by matching rows against a reference index (dimension table) and pulling in additional fields. It is the simplest way to add context -- team ownership, environment labels, cost centers, or any static metadata -- to streaming event data. + +Compared with `join`, `lookup` is more efficient for one-to-one enrichment against a relatively small, static dataset. + +## Syntax + +```sql +lookup ( [AS ])... + [(replace | append | output) ( [AS ])...] +``` + +## Arguments + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `` | Yes | The name of the lookup index (dimension table) to match against. | +| `` | Yes | A key field in the lookup index used for matching, similar to a join key. Specify multiple fields as a comma-separated list. | +| `` | No | A key field in the source data to match against `lookupMappingField`. Defaults to the same name as `lookupMappingField`. Use `AS` to map differently named fields. 
| +| `replace \| append \| output` | No | Controls how matched values are applied. Default: `replace`. | +| `` | No | A field from the lookup index whose matched value is added to results. If omitted, all non-key fields from the lookup index are applied. | +| `` | No | The name of the result field where matched values are placed. Defaults to `inputField`. | + +## Output modes + +| Mode | Behavior | +|------|----------| +| `replace` | Overwrites existing field values with matched values from the lookup index. If no match is found, the field is set to `null`. This is the default. | +| `append` | Fills only missing (`null`) values in the source data. Existing non-null values are preserved. | +| `output` | Synonym for `replace`. Provided for compatibility. | + +## Usage notes + +- **Use `lookup` instead of `join`** when enriching events from a small, static reference table. It avoids the overhead of a full join. +- **`replace` overwrites existing values.** If the source data already has a `team` field and the lookup also provides `team`, the lookup value wins. Use `append` if you only want to fill gaps. +- **`append` only fills nulls.** Non-null values in the source data are never overwritten. If the `outputField` does not already exist in the source and you use `append`, the operation fails. Use `replace` to create new fields. +- **Multiple mapping fields** are supported. Separate them with commas to match on a composite key. +- When `` is omitted, **all fields** from the lookup index (except the mapping keys) are applied to the output. 
+ + + +## Examples + +### Basic lookup - replace values + +Enrich log events with team ownership from a `service_owners` reference index: + +```sql +source = logs-otel-v1* +| eval service = `resource.attributes.service.name` +| LOOKUP service_owners service_name AS service REPLACE team +``` + +### Append missing values only + +Fill in `team` where it is currently `null`, without overwriting existing values: + +```sql +source = logs-otel-v1* +| eval service = `resource.attributes.service.name` +| LOOKUP service_owners service_name AS service APPEND team +``` + +### Lookup without specifying input fields + +When no `inputField` is specified, all non-key fields from the lookup index are applied: + +```sql +source = logs-otel-v1* +| eval service = `resource.attributes.service.name` +| LOOKUP service_owners service_name AS service +``` + +### Map to a new output field + +Place matched values into a new field using `AS`: + +```sql +source = otel-v1-apm-span-* +| LOOKUP environments service_name AS serviceName REPLACE env AS deploy_env +``` + +### Using the OUTPUT keyword + +`OUTPUT` is a synonym for `REPLACE` and produces identical results: + +```sql +source = logs-otel-v1* +| eval service = `resource.attributes.service.name` +| LOOKUP service_owners service_name AS service OUTPUT team +``` + +## Extended examples + +### Enrich logs with service ownership + +Assume you have a `service_owners` index mapping `service.name` to `team`, `oncall`, and `tier`. 
Enrich log events with ownership context: + +```sql +source = logs-otel-v1* +| eval service = `resource.attributes.service.name` +| LOOKUP service_owners service.name AS service REPLACE team, oncall, tier +| head 50 +``` + +### Add environment labels to spans + +Enrich trace spans with deployment metadata from an `environments` reference index: + +```sql +source = otel-v1-apm-span-* +| LOOKUP environments service_name AS serviceName REPLACE env, region, cost_center +| where env = 'production' +| sort - durationInNanos +| head 20 +``` + +## See also + +- [join](/docs/ppl/commands/join/) - full join for complex multi-field correlation +- [eval](/docs/ppl/commands/eval/) - compute new fields from expressions +- [Command Reference](/docs/ppl/commands/) - all PPL commands diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/ml.md b/docs/starlight-docs/src/content/docs/ppl/commands/ml.md new file mode 100644 index 00000000..76eb2287 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/ml.md @@ -0,0 +1,203 @@ +--- +title: "ml" +description: "Apply machine learning algorithms in your query pipeline - anomaly detection and clustering without external tools." +--- + +import { Aside } from '@astrojs/starlight/components'; + + + +The `ml` command applies machine learning algorithms from the ML Commons plugin directly in your PPL query pipeline. It supports anomaly detection using Random Cut Forest (RCF) and clustering using k-means, running train-and-predict operations in a single step. 
+ + + +## Syntax + +**Anomaly detection (time-series):** + +```sql +ml action='train' algorithm='rcf' time_field= [] +``` + +**Anomaly detection (batch/non-time-series):** + +```sql +ml action='train' algorithm='rcf' [] +``` + +**K-means clustering:** + +```sql +ml action='train' algorithm='kmeans' [] +``` + +## Arguments + +### RCF time-series parameters + +| Argument | Required | Default | Description | +|----------|----------|---------|-------------| +| `time_field=` | Yes | -- | The timestamp field for time-series analysis. | +| `number_of_trees=` | No | `30` | Number of trees in the forest. | +| `shingle_size=` | No | `8` | Consecutive records in a shingle (sliding window). | +| `sample_size=` | No | `256` | Sample size for stream samplers. | +| `output_after=` | No | `32` | Minimum data points before results are produced. | +| `time_decay=` | No | `0.0001` | Decay factor for stream samplers. | +| `anomaly_rate=` | No | `0.005` | Expected anomaly rate (0.0 to 1.0). | +| `date_format=` | No | `yyyy-MM-dd HH:mm:ss` | Format of the time field. | +| `time_zone=` | No | `UTC` | Time zone of the time field. | +| `category_field=` | No | -- | Group input by category; prediction runs independently per group. | + +### RCF batch (non-time-series) parameters + +| Argument | Required | Default | Description | +|----------|----------|---------|-------------| +| `number_of_trees=` | No | `30` | Number of trees in the forest. | +| `sample_size=` | No | `256` | Random samples per tree from training data. | +| `output_after=` | No | `32` | Minimum data points before results are produced. | +| `training_data_size=` | No | Full dataset | Size of the training dataset. | +| `anomaly_score_threshold=` | No | `1.0` | Score threshold above which a point is anomalous. | +| `category_field=` | No | -- | Group input by category; prediction runs independently per group. 
| + +### K-means parameters + +| Argument | Required | Default | Description | +|----------|----------|---------|-------------| +| `centroids=` | No | `2` | Number of clusters. | +| `iterations=` | No | `10` | Maximum iterations for convergence. | +| `distance_type=` | No | `EUCLIDEAN` | Distance metric: `COSINE`, `L1`, or `EUCLIDEAN`. | + +## Output fields + +### RCF time-series output + +The command appends these fields to each row: + +| Field | Description | +|-------|-------------| +| `score` | Anomaly score (higher = more anomalous). | +| `anomaly_grade` | Anomaly grade (0.0 = normal, higher = more anomalous). | + +### RCF batch output + +| Field | Description | +|-------|-------------| +| `score` | Anomaly score. | +| `anomalous` | Boolean indicating whether the point is anomalous (`True`/`False`). | + +### K-means output + +| Field | Description | +|-------|-------------| +| `ClusterID` | The cluster assignment (integer starting from 0). | + +## Usage notes + +- For time-series RCF, ensure data is ordered by the time field before passing to `ml`. The algorithm expects sequential data. +- The `output_after` parameter controls the warm-up period. The first N data points will have a score of 0 while the model learns normal patterns. +- Batch RCF treats each data point independently, making it suitable for detecting outliers in non-sequential data. +- K-means works best when numeric fields are on similar scales. Consider normalizing with `eval` before clustering. +- Use `category_field` to run independent models per category (e.g., per service), avoiding cross-contamination between different baseline behaviors. 
+ +## Examples + +### Detect anomalous latency in time-series data + +Aggregate span duration into 1-minute buckets and detect anomalies: + +```sql +source = otel-v1-apm-span-* +| stats avg(durationInNanos) as avg_latency by span(startTime, 1m) as minute +| ml action='train' algorithm='rcf' time_field='minute' +| where anomaly_grade > 0 +| sort - anomaly_grade +``` + +### Time-series anomaly detection by service + +Run independent anomaly detection per service: + +```sql +source = otel-v1-apm-span-* +| stats avg(durationInNanos) as avg_latency by span(startTime, 1m) as minute, serviceName +| ml action='train' algorithm='rcf' time_field='minute' category_field='serviceName' +| where anomaly_grade > 0 +``` + +### Batch outlier detection on request durations + +Detect unusually slow spans without considering time ordering: + +```sql +source = otel-v1-apm-span-* +| ml action='train' algorithm='rcf' +| where anomalous = 'True' +``` + +### Cluster services by error and latency behavior + +Use k-means to group services into behavioral clusters based on error rate and average latency: + +```sql +source = otel-v1-apm-span-* +| stats avg(durationInNanos) as avg_duration, count() as total, sum(case(status.code = 2, 1 else 0)) as errors by serviceName +| eval error_rate = errors * 100.0 / total +| ml action='train' algorithm='kmeans' centroids=3 +``` + +### Tune anomaly detection sensitivity + +Lower the `anomaly_rate` and increase `shingle_size` for stricter detection with more context: + +```sql +source = otel-v1-apm-span-* +| stats avg(durationInNanos) as avg_latency by span(startTime, 1m) as minute +| ml action='train' algorithm='rcf' time_field='minute' anomaly_rate=0.001 shingle_size=16 +| where anomaly_grade > 0 +``` + +## Extended examples + +### End-to-end latency anomaly investigation (OTel) + +Detect anomalous latency spikes and then find the specific traces responsible: + +```sql +source = otel-v1-apm-span-* +| stats avg(durationInNanos) as avg_latency, 
max(durationInNanos) as max_latency by span(startTime, 5m) as window, serviceName +| ml action='train' algorithm='rcf' time_field='window' category_field='serviceName' +| where anomaly_grade > 0 +| sort - anomaly_grade +| head 10 +``` + +After identifying the anomalous time windows, investigate individual traces: + +```sql +source = otel-v1-apm-span-* +| where serviceName = 'checkout' +| sort - durationInNanos +| head 20 +``` + +### Cluster OTel services by operational profile + +Group services by their latency and throughput characteristics to identify operational tiers: + +```sql +source = otel-v1-apm-span-* +| stats avg(durationInNanos) as avg_duration, count() as throughput by serviceName +| eval avg_duration_ms = avg_duration / 1000000 +| ml action='train' algorithm='kmeans' centroids=3 distance_type=EUCLIDEAN +``` + +## See also + +- [stats](/docs/ppl/commands/stats/) -- aggregate data before feeding to ML algorithms +- [eventstats](/docs/ppl/commands/eventstats/) -- append aggregation results alongside original events +- [trendline](/docs/ppl/commands/trendline/) -- simple and weighted moving averages +- [eval](/docs/ppl/commands/eval/) -- normalize fields before clustering diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/parse.md b/docs/starlight-docs/src/content/docs/ppl/commands/parse.md new file mode 100644 index 00000000..7e910bbb --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/parse.md @@ -0,0 +1,174 @@ +--- +title: "parse" +description: "Extract fields from text using regular expressions - turn unstructured log data into structured fields." +--- + +import { Aside } from '@astrojs/starlight/components'; + +The `parse` command extracts new fields from a text field using a Java regular expression with named capture groups. Each named group in the pattern creates a new string field appended to the search results. The original field is preserved. 
+ + + +## Syntax + +```sql +parse <field> <pattern> +``` + +## Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `<field>` | Yes | The text field to parse. | +| `<pattern>` | Yes | A Java regular expression containing one or more named capture groups using `(?<name>pattern)` syntax. Each named group creates a new string field. If a field with the same name already exists, its values are overwritten. | + +## Usage notes + +- Named capture groups in the regex pattern become new fields. For example, `(?<host>.+)` creates a field called `host`. +- The pattern must match the **entire** string from start to end. Use `[\s\S]+` at the end of the pattern to consume any remaining content including trailing newlines. +- If a named group matches a field that already exists, the existing field is overwritten with the extracted value. +- Parsed fields are available for use in all subsequent pipe commands (`where`, `stats`, `sort`, `eval`, etc.). +- The pattern uses [Java regular expression syntax](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html). +- When parsing a null field, the result is an empty string. +- Fields created by `parse` cannot be re-parsed by another `parse` command. +- The source field used by `parse` cannot be overridden by `eval` and still produce correct results. + +**Common regex patterns:** + +| Pattern | Matches | +|---------|---------| +| `(?<ip>\d+\.\d+\.\d+\.\d+)` | IPv4 addresses | +| `(?<status>\d{3})` | HTTP status codes | +| `(?<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})` | ISO timestamps | +| `(?<method>GET\|POST\|PUT\|DELETE)` | HTTP methods | +| `(?<path>/[^\s]+)` | URL paths | +| `[\s\S]+` | Match remaining text (including newlines) | + +## Basic examples + +### Extract HTTP method, path, and status from Envoy access logs + +Parse the Envoy access log format emitted by the frontend-proxy service. 
The pattern must match the full body string: + +```sql +source=logs-otel-v1* +| where like(body, '%HTTP/1.1"%') +| parse body '\[(?<ts>[^\]]+)\] "(?<method>\w+) (?<path>\S+) HTTP/(?<version>[^"]+)" (?<status>\d+)[\s\S]+' +| head 20 +``` + +| body | ts | method | path | status | +|------|----|--------|------|--------| +| [2026-02-26T18:04:21.634Z] "GET /api/data HTTP/1.1" 200 ... | 2026-02-26T18:04:21.634Z | GET | /api/data | 200 | +| [2026-02-26T18:04:23.059Z] "POST /api/product-ask-ai-assistant/0PUK6V6EV0 HTTP/1.1" 200 ... | 2026-02-26T18:04:23.059Z | POST | /api/product-ask-ai-assistant/0PUK6V6EV0 | 200 | +| [2026-02-26T18:04:21.629Z] "GET /api/data/ HTTP/1.1" 308 ... | 2026-02-26T18:04:21.629Z | GET | /api/data/ | 308 | + +Try in playground → + +### Filter Envoy logs by status code + +Parse the status code from Envoy access logs and filter for non-2xx responses: + +```sql +source=logs-otel-v1* +| where `resource.attributes.service.name` = 'frontend-proxy' +| parse body '\[(?<ts>[^\]]+)\] "(?<method>\w+) (?<path>\S+) HTTP/(?<version>[^"]+)" (?<status>\d+)[\s\S]+' +| where cast(status as int) >= 300 +| sort status +| head 20 +``` + +| method | path | status | +|--------|------|--------| +| GET | /api/data/ | 308 | +| GET | /api/data/ | 308 | +| GET | /api/data/ | 308 | + +Try in playground → + +### Override an existing field + +Replace the `body` field with just the user action by using the same field name in the capture group. 
This works on load-generator log bodies that start with "User": + +```sql +source=logs-otel-v1* +| where like(body, 'User %') +| parse body 'User (?<body>.+)' +| head 20 +``` + +| body | +|------| +| viewing cart | +| getting recommendations for product: 0PUK6V6EV0 | +| getting ads for category: None | +| accessing index page | + +Try in playground → + +### Aggregate request counts by endpoint + +Parse the Envoy access log format and count requests per method and path: + +```sql +source=logs-otel-v1* +| where like(body, '%HTTP/1.1"%') +| parse body '\[(?<ts>[^\]]+)\] "(?<method>\w+) (?<path>\S+) HTTP/(?<version>[^"]+)" (?<status>\d+)[\s\S]+' +| stats count() as cnt by method, path +| sort - cnt +``` + +Try in playground → + +## Extended examples + +### Extract partition names from Kafka broker logs + +Parse the Kafka broker log body format to extract the broker ID and partition name: + +```sql +source=logs-otel-v1* +| where `resource.attributes.service.name` = 'kafka' +| where like(body, '%Broker%Creating%') +| parse body '\[Broker id=(?<brokerId>\d+)\] Creating new partition (?<partition>\S+) [\s\S]+' +| head 20 +``` + +This extracts the broker ID and partition name from Kafka log bodies that follow the `[Broker id=N] Creating new partition ...` pattern. + +Try in playground → + +### Extract product IDs from recommendation logs + +Parse recommendation log bodies to extract product IDs and count how often each product is recommended: + +```sql +source=logs-otel-v1* +| where like(body, '%product:%') +| parse body '(?<prefix>.+)product: (?<productId>.+)' +| stats count() as cnt by productId +| sort - cnt +``` + +Try in playground → + + + +## Limitations + +- Fields created by `parse` cannot be parsed again by a subsequent `parse` command. +- Fields created by `parse` cannot be overridden by `eval`. +- The source text field used by `parse` cannot be overridden and still produce correct results. +- The pattern must match the entire string. Use `[\s\S]+` at the end to consume remaining content including trailing newlines. 
+- Parsed fields cannot be filtered or sorted after they are used in a `stats` command. + +## See also + +- [grok](/docs/ppl/commands/grok/) -- extract fields using predefined grok patterns instead of raw regex +- [rex](/docs/ppl/commands/rex/) -- more powerful regex extraction with sed mode and multiple matches +- [patterns](/docs/ppl/commands/patterns/) -- automatically discover log patterns without writing regex +- [PPL Functions Reference](/docs/ppl/functions/) -- `regexp_match` and other string functions for regex filtering diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/patterns.md b/docs/starlight-docs/src/content/docs/ppl/commands/patterns.md new file mode 100644 index 00000000..5851d6fc --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/patterns.md @@ -0,0 +1,221 @@ +--- +title: "patterns" +description: "Automatically discover log patterns - cluster similar log messages without writing regex." +--- + +import { Aside } from '@astrojs/starlight/components'; + +The `patterns` command is one of PPL's most powerful features for log analysis. It automatically extracts patterns from unstructured text by replacing variable parts (numbers, IPs, timestamps, identifiers) with `<*>` placeholders, grouping similar log lines together. This replaces hours of manual regex writing with a single command. + + + +Two pattern extraction methods are available: + +- **`simple_pattern`** (default): Fast, regex-based extraction that replaces alphanumeric tokens with `<*>` placeholders. +- **`brain`**: A smarter ML-based clustering algorithm that preserves semantic meaning and produces more accurate groupings. + +Two output modes control how results are returned: + +- **`label`** (default): Adds a `patterns_field` column to each event showing its pattern. +- **`aggregation`**: Groups events by pattern and returns `pattern_count` and `sample_logs`. 
+ +## Syntax + +```sql +patterns <field> [by <byClause>] + [method=simple_pattern|brain] + [mode=label|aggregation] + [max_sample_count=<integer>] + [show_numbered_token=<boolean>] + [new_field=<new-field-name>] + [pattern=<regex>] + [buffer_limit=<integer>] + [variable_count_threshold=<integer>] + [frequency_threshold_percentage=<decimal>] +``` + +## Arguments + +### Common arguments + +| Argument | Required | Default | Description | +|----------|----------|---------|-------------| +| `<field>` | Yes | -- | The text field to analyze for log patterns. | +| `by <byClause>` | No | -- | Fields or scalar functions to group logs before pattern extraction. | +| `method` | No | `simple_pattern` | Pattern extraction method: `simple_pattern` (fast, regex-based) or `brain` (ML-based clustering). | +| `mode` | No | `label` | Output mode: `label` adds a pattern column to each event; `aggregation` groups events by pattern. | +| `max_sample_count` | No | `10` | Maximum number of sample log entries returned per pattern in `aggregation` mode. | +| `show_numbered_token` | No | `false` | When `true`, variables use numbered placeholders (`<token1>`, `<token2>`) instead of `<*>`, and aggregation mode includes a `tokens` mapping. | +| `new_field` | No | `patterns_field` | Name for the output field that contains the extracted pattern. | + +### simple_pattern arguments + +| Argument | Required | Default | Description | +|----------|----------|---------|-------------| +| `pattern` | No | Auto-detect | A custom Java regular expression that identifies characters to replace with `<*>` placeholders. For example, `[0-9]` replaces only digits. | + +### brain arguments + +| Argument | Required | Default | Description | +|----------|----------|---------|-------------| +| `buffer_limit` | No | `100000` | Maximum internal buffer size (minimum `50000`). | +| `variable_count_threshold` | No | `5` | Controls sensitivity to detecting constant vs. variable words. Lower values produce more general patterns. | +| `frequency_threshold_percentage` | No | `0.3` | Minimum word frequency percentage. 
Words below this threshold are ignored. | + +## Usage notes + +- The `patterns` command runs on the coordinator node, not on data nodes. It groups patterns from log messages that have already been returned. +- In **label mode**, each event gets an additional `patterns_field` column showing its pattern. Use this to visually identify similar log lines. +- In **aggregation mode**, the output contains one row per unique pattern with `pattern_count` and `sample_logs` columns. This is ideal for understanding log composition at a glance. +- The **brain method** is better at identifying which parts of a log message are variable vs. static. It produces more meaningful groupings than `simple_pattern`, especially for complex log formats. +- The `by` clause lets you discover patterns per group (e.g., per service, per severity level). +- Default cluster settings for `patterns` can be overridden with cluster settings prefixed `plugins.ppl.pattern.*`. + +### When to use patterns + +| Use case | Why patterns helps | +|----------|-------------------| +| Incident investigation | Quickly answer: "What log patterns appeared during the outage?" 
| +| Log volume reduction | Identify the noisiest patterns consuming storage and bandwidth | +| Anomaly detection | Spot new or rare patterns that were not seen before | +| Log categorization | Group thousands of unique messages into a manageable set of templates | +| Regex bootstrapping | Use discovered patterns as a starting point for `parse` or `grok` rules | + +## Basic examples + +### Simple pattern discovery (label mode) + +Add a pattern label to each log body: + +```sql +source = logs-otel-v1* +| patterns body method=simple_pattern +| head 20 +``` + +| body | patterns_field | +|------|----------------| +| 10.0.1.55 - GET /api/v1/agents 200 1234ms | \<*\>.\<*\>.\<*\>.\<*\> - \<*\> /\<*\>/\<*\>/\<*\> \<*\> \<*\>\<*\> | +| 192.168.1.10 - POST /api/v1/invoke 201 567ms | \<*\>.\<*\>.\<*\>.\<*\> - \<*\> /\<*\>/\<*\>/\<*\> \<*\> \<*\>\<*\> | +| 172.16.0.42 - GET /health 200 12ms | \<*\>.\<*\>.\<*\>.\<*\> - \<*\> /\<*\> \<*\> \<*\>\<*\> | + +Try in playground → + +### Aggregation mode -- group by pattern + +Count how many log entries match each pattern: + +```sql +source = logs-otel-v1* +| patterns body method=simple_pattern mode=aggregation +| head 20 +``` + +This returns one row per unique pattern with the count and up to 10 sample log lines. 
+ +Try in playground → + +### Brain method -- smarter clustering + +The brain method preserves more semantic meaning than simple_pattern: + +```sql +source = logs-otel-v1* +| patterns body method=brain +| head 20 +``` + +The brain algorithm identifies that HTTP methods (GET, POST, etc.), URL paths, and status codes are variable while structural elements (brackets, dashes, quotes) are constant, producing cleaner patterns like: + +``` +<*IP*> - <*> [<*>/<*>/<*>:<*>:<*>:<*> <*>] "<*> <*> HTTP/<*>" <*> <*> +``` + +Try in playground → + +### Custom regex pattern + +Replace only digits, preserving all other characters: + +```sql +source = logs-otel-v1* +| patterns body method=simple_pattern new_field='no_numbers' pattern='[0-9]' +| head 20 +``` + +Try in playground → + +### Aggregation with numbered tokens + +Enable numbered tokens to see exactly which parts of the pattern are variable: + +```sql +source = logs-otel-v1* +| patterns body method=simple_pattern mode=aggregation show_numbered_token=true +| head 1 +``` + +The output includes a `tokens` map showing what each `<tokenN>` placeholder matched, e.g. `{'<token1>': ['200'], '<token2>': ['404'], ...}`. + +Try in playground → + +## Extended examples + +### Discover patterns in OTel log bodies + +Find the dominant log patterns across all services in your OpenTelemetry data: + +```sql +source = logs-otel-v1* +| patterns body method=brain mode=aggregation +| sort - pattern_count +| head 20 +``` + +This reveals the most common log message shapes across your entire system. The `pattern_count` column shows which patterns dominate your log volume -- often a small number of patterns account for the vast majority of log entries. 
+ +Try in playground → + +### Find dominant patterns per service + +Use the `by` clause to discover patterns grouped by the originating service: + +```sql +source = logs-otel-v1* +| patterns body method=brain mode=aggregation by `resource.attributes.service.name` +| sort - pattern_count +| head 30 +``` + +This helps answer questions like: "Which service produces the most repetitive log patterns?" and "Are there services emitting unique patterns that might indicate errors?" + +Try in playground → + + + +## Configuring defaults + +Override the default `patterns` settings at the cluster level: + +```json +PUT _cluster/settings +{ + "persistent": { + "plugins.ppl.pattern.method": "brain", + "plugins.ppl.pattern.mode": "aggregation", + "plugins.ppl.pattern.max.sample.count": 5, + "plugins.ppl.pattern.buffer.limit": 50000, + "plugins.ppl.pattern.show.numbered.token": true + } +} +``` + +## See also + +- [parse](/docs/ppl/commands/parse/) -- extract specific fields using Java regex when you know the pattern +- [grok](/docs/ppl/commands/grok/) -- extract fields using predefined grok patterns for known log formats +- [rex](/docs/ppl/commands/rex/) -- regex extraction with sed mode and multiple match support diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/rare.md b/docs/starlight-docs/src/content/docs/ppl/commands/rare.md new file mode 100644 index 00000000..2c23c932 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/rare.md @@ -0,0 +1,121 @@ +--- +title: "rare" +description: "Find the least common values of a field - surface anomalies and unusual patterns." +--- + +import { Aside } from '@astrojs/starlight/components'; + +The `rare` command finds the least common values (or combinations of values) for the specified fields. It is the inverse of `top` -- instead of returning the most frequent values, it returns the least frequent. Results are sorted from least to most common. + +`rare` is a powerful tool for anomaly surfacing. 
In observability data, uncommon values often signal problems: a rare error type, a service name that only appeared recently, or an unusual status code can all indicate issues that deserve investigation. + + + +## Syntax + +```sql +rare [rare-options] <field-list> [by <group-field>] +``` + +## Arguments + +| Argument | Required | Type | Default | Description | +|----------|----------|------|---------|-------------| +| `<field-list>` | Yes | Comma-delimited field names | -- | The fields to find rare values for. When multiple fields are specified, `rare` finds the least common combinations. | +| `by <group-field>` | No | Field name(s) | -- | One or more fields to group the results by. Rare values are computed separately within each group. | +| `showcount` | No | Boolean | `true` | When `true`, includes a count column showing the frequency of each value. Set to `false` for cleaner output. | +| `countfield` | No | String | `count` | The name of the count column in the output. Only applies when `showcount=true`. | + +## Usage notes + +- **Anomaly surfacing.** Rare values in observability data are often signals: a rare error type, a service that barely produces logs, or an unusual severity level can all indicate issues. +- **Rare error types.** Use `rare` on error message fields to find unusual errors that might be masked by high-volume common errors. +- **Rare service names.** A service that appears rarely in logs might be failing to start, experiencing intermittent connectivity, or newly deployed. +- **Rare status codes.** Uncommon HTTP status codes or gRPC error codes can reveal edge cases in your application logic. +- **Use `by` clause for per-group rare values.** Find what is unusual within each group -- for example, the rarest severity level per service. +- **Returns up to 10 results.** The `rare` command returns at most 10 results per group-by combination. Unlike `top`, there is no parameter to increase this limit. 
+ +## Examples + +### Rarest severity levels + +Find the least common log severity levels across all services: + +```sql +| rare severityText +``` + +Try in playground → + +### Rarest services by log volume + +Find the services that produce the fewest logs -- these may be failing or newly deployed: + +```sql +| rare `resource.attributes.service.name` +``` + +Try in playground → + +### Rare severity levels by service + +Find the rarest severity levels within each service. A service that rarely produces ERROR logs suddenly showing them is noteworthy: + +```sql +| rare showcount=false severityText by `resource.attributes.service.name` +``` + +Try in playground → + +### Hide the count column + +Return just the rare values without frequency counts: + +```sql +| rare showcount=false severityText +``` + +Try in playground → + +### Rename the count column + +Use a custom name for the count field: + +```sql +| rare countfield='occurrences' `resource.attributes.service.name` +``` + +Try in playground → + +## Extended examples + +### Rare service-severity combinations in OTel logs + +Find unusual combinations of service and severity level. Combinations that appear rarely may indicate new failure modes: + +```sql +| rare `resource.attributes.service.name`, severityText +``` + +Try in playground → + +### Rare span operations per OTel service + +Find the least frequently executed operations in each service from trace data. Rare operations can indicate code paths that are only hit under unusual conditions -- potential sources of untested behavior: + +```sql +source = otel-v1-apm-span-* +| rare name by serviceName +``` + +This is especially useful after a deployment: if a new operation name appears in `rare` output that was not there before, it may indicate new functionality or an unexpected code path being triggered. 
+ +## See also + +- [top](/docs/ppl/commands/top/) - The inverse of `rare`: find the most common values +- [dedup](/docs/ppl/commands/dedup/) - Deduplicate to get unique values with sample documents +- [stats](/docs/ppl/commands/stats/) - For more detailed frequency analysis with custom aggregations +- [patterns](/docs/ppl/commands/patterns/) - Automatically discover and cluster log patterns +- [PPL Command Reference](/docs/ppl/commands/) - All PPL commands diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/rename.md b/docs/starlight-docs/src/content/docs/ppl/commands/rename.md new file mode 100644 index 00000000..76c8ca43 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/rename.md @@ -0,0 +1,121 @@ +--- +title: "rename" +description: "Rename fields in search results - simplify long OTel attribute names for readability." +--- + +import { Tabs, TabItem, Aside } from '@astrojs/starlight/components'; + + + +The `rename` command renames one or more fields in your search results. It is especially useful for simplifying the long, dot-delimited attribute names common in OpenTelemetry data (e.g., `resource.attributes.service.name`) into shorter, readable aliases. + +## Syntax + +```sql +rename <source-field> AS <target-field> [, <source-field> AS <target-field>]... +``` + +## Arguments + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `<source-field>` | Yes | The current field name. Supports wildcard patterns using `*`. | +| `<target-field>` | Yes | The new name for the field. Must contain the same number of wildcards as the source. | + +## Usage notes + +- **Multiple renames** can be specified in a single command, separated by commas. +- **Wildcard patterns** (`*`) match any sequence of characters. Both the source and target must have the same number of wildcards. For example, `*Name` matches `serviceName` and `traceGroupName`, and renaming to `*_name` produces `service_name` and `traceGroup_name`. 
+- **Renaming to an existing field** removes the original target field and replaces it with the source field's values. +- **Renaming a non-existent field to an existing field** removes the target field from results. +- **Renaming a non-existent field to a non-existent field** has no effect. +- The `rename` command executes on the coordinating node and is **not pushed down** to the query DSL. +- Literal `*` characters in field names cannot be escaped -- the asterisk is always treated as a wildcard. + +## Examples + +### Rename a single field + +```sql +source = otel-v1-apm-span-* +| rename serviceName as service +| head 20 +``` + +### Rename multiple fields + +```sql +source = otel-v1-apm-span-* +| rename serviceName as service, durationInNanos as duration_ns +| head 20 +``` + +### Rename with wildcards + +Match all fields ending in `Name` and replace with `_name`: + +```sql +source = otel-v1-apm-service-map-* +| rename *Name as *_name +| head 20 +``` + +### Multiple wildcard patterns + +Combine several wildcard renames in one command: + +```sql +source = otel-v1-apm-span-* +| rename *Name as *_name, *Id as *_id +| head 20 +``` + +### Rename an existing field to another existing field + +The target field is replaced by the source field's values: + +```sql +source = otel-v1-apm-span-* +| rename serviceName as name +| head 20 +``` + +The `name` column now contains the original `serviceName` values. + +## Extended examples + +### Simplify OTel attribute names for log analysis + +OpenTelemetry log fields have long, dot-delimited names. 
Rename them for readability before analysis: + +```sql +source = logs-otel-v1* +| rename `resource.attributes.service.name` as service, + `resource.attributes.telemetry.sdk.language` as language, + `resource.attributes.host.name` as host +| where severityText = 'ERROR' +| stats count() as errors by service, language, host +| sort - errors +``` + +Try in playground → + +### Rename span fields for dashboard readability + +Shorten trace span attribute names for cleaner output in dashboards: + +```sql +source = otel-v1-apm-span-* +| rename serviceName as service, durationInNanos as duration_ns +| eval duration_ms = duration_ns / 1000000 +| sort - duration_ms +| head 20 +``` + +## See also + +- [fields](/docs/ppl/commands/fields/) - select or exclude fields +- [eval](/docs/ppl/commands/eval/) - create computed fields +- [Command Reference](/docs/ppl/commands/) - all PPL commands diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/rex.md b/docs/starlight-docs/src/content/docs/ppl/commands/rex.md new file mode 100644 index 00000000..42eb3886 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/rex.md @@ -0,0 +1,188 @@ +--- +title: "rex" +description: "Extract or substitute fields using regex - with support for sed-mode text replacement and multiple matches." +--- + +import { Aside } from '@astrojs/starlight/components'; + +The `rex` command is a more powerful alternative to `parse` for extracting fields from text using Java regular expressions. In addition to standard extraction, `rex` supports **sed mode** for text substitution, **multiple match extraction**, and **offset tracking** to record match positions. + + + +## Syntax + +```sql +rex [mode=<mode>] field=<field> <pattern> [max_match=<int>] [offset_field=<string>] +``` + +## Arguments + +| Argument | Required | Default | Description | +|----------|----------|---------|-------------| +| `field` | Yes | -- | The text field to extract data from. Must be a string field. 
| +| `<pattern>` | Yes | -- | In **extract** mode: a Java regex with named capture groups `(?<name>pattern)`. Group names must start with a letter and contain only letters and digits (no underscores). In **sed** mode: a sed-style pattern (see [Sed mode syntax](#sed-mode-syntax)). | +| `mode` | No | `extract` | `extract` creates new fields from named capture groups. `sed` performs text substitution on the field in place. | +| `max_match` | No | `1` | Maximum number of matches to extract. When greater than 1, extracted fields are returned as arrays. Set to `0` for unlimited matches (capped by the configured system limit, default `10`). | +| `offset_field` | No | -- | Valid in `extract` mode only. Name of an output field that records the character offset positions of each match. | + +### Sed mode syntax + +In sed mode, the pattern uses one of the following forms: + +| Syntax | Description | +|--------|-------------| +| `s/<regex>/<replacement>/` | Substitute the first match of `<regex>` with `<replacement>`. | +| `s/<regex>/<replacement>/g` | Substitute all matches (global flag). | +| `y/<from>/<to>/` | Transliterate characters (like `tr`). | + +Backreferences (`\1`, `\2`, etc.) are supported in the replacement string. + +### rex vs. parse + +| Feature | `rex` | `parse` | +|---------|-------|---------| +| Named capture groups | Yes | Yes | +| Multiple named groups per pattern | Yes | Yes | +| Multiple matches (`max_match`) | Yes | No | +| Text substitution (sed mode) | Yes | No | +| Offset tracking | Yes | No | +| Requires full-string match | No | Yes | + +## Usage notes + +- In extract mode, each named capture group creates a new string field. When `max_match > 1`, fields become arrays. +- Unlike `parse`, `rex` performs partial matching -- the pattern does not need to match the entire string. +- Group names cannot contain underscores or special characters due to Java regex limitations. Use `(?<userName>...)` not `(?<user_name>...)`. +- Non-matching patterns return an empty string for the extracted fields. Use `where length(field) > 0` to filter non-matches. 
+- Multiple `rex` commands can be chained to extract from different fields in the same query. +- The `max_match` system limit defaults to `10` and can be configured via the `plugins.ppl.rex.max_match.limit` cluster setting. Requesting more than the limit results in an error. + +## Basic examples + +### Extract HTTP method and path from Envoy access logs + +Use two named capture groups to extract the HTTP method and request path from frontend-proxy (Envoy) log bodies: + +```sql +source=logs-otel-v1* +| rex field=body "(?<method>GET|POST|PUT|DELETE|PATCH)\s+(?<path>/[^\s]+)" +| where length(method) > 0 +| head 20 +``` + +| body | method | path | +|------|--------|------| +| [2026-02-26T18:04:21.634Z] "GET /api/data HTTP/1.1" 200 ... | GET | /api/data | +| [2026-02-26T18:04:23.059Z] "POST /api/product-ask-ai-assistant/0PUK6V6EV0 HTTP/1.1" 200 ... | POST | /api/product-ask-ai-assistant/0PUK6V6EV0 | +| [2026-02-26T18:04:27.084Z] "GET /api/products/6E92ZMYYFZ HTTP/1.1" 200 ... | GET | /api/products/6E92ZMYYFZ | + +Try in playground → + +### Replace text using sed mode + +Mask IP addresses in Envoy access log bodies by substituting them with a placeholder: + +```sql +source=logs-otel-v1* +| where like(body, '%HTTP/1.1"%') +| rex field=body mode=sed "s/\d+\.\d+\.\d+\.\d+/[REDACTED]/g" +| head 2 +``` + +| body | +|------| +| [[REDACTED]] "GET /api/data/ HTTP/1.1" 308 - via_upstream - "-" 0 9 3 2 "-" "python-requests/2.32.5" ... "[REDACTED]" frontend [REDACTED] ... | +| [[REDACTED]] "GET /api/data HTTP/1.1" 200 - via_upstream - "-" 0 211 140 140 "-" "python-requests/2.32.5" ... "[REDACTED]" frontend [REDACTED] ... 
| + +Try in playground → + +### Extract Kafka broker component and ID + +Pull out the component name and broker ID from Kafka log bodies with bracketed prefixes: + +```sql +source=logs-otel-v1* +| where `resource.attributes.service.name` = 'kafka' +| rex field=body "\[(?<component>\w+) id=(?<brokerId>\d+)\]" +| where length(component) > 0 +| head 5 +``` + +| body | component | brokerId | +|------|-----------|----------| +| [Broker id=1] Creating new partition __consumer_offsets-33 ... | Broker | 1 | +| [RaftManager id=1] Completed transition to Leader ... | RaftManager | 1 | +| [QuorumController id=1] The request from broker 1 ... | QuorumController | 1 | + +Try in playground → + +### Track match positions with offset_field + +Record where each capture group matched within the Envoy access log body: + +```sql +source=logs-otel-v1* +| rex field=body "(?<method>GET|POST|PUT|DELETE).*(?<statusCode>\d{3})" offset_field=matchpos +| where length(method) > 0 +| head 2 +``` + +| body | method | statusCode | matchpos | +|------|--------|------------|----------| +| [2026-02-26T18:04:21.634Z] "GET /api/data HTTP/1.1" 200 ... | GET | 200 | method=29-31&statusCode=50-52 | +| [2026-02-26T18:04:23.059Z] "POST /api/product-ask-ai-assistant/... | POST | 200 | method=29-32&statusCode=81-83 | + +Try in playground → + +## Extended examples + +### Chain rex commands to extract from multiple fields + +Extract the first character of the severity text and the HTTP method/path from the body in a single query: + +```sql +source=logs-otel-v1* +| rex field=severityText "(?<severityChar>^.)" +| rex field=body "(?<method>GET|POST|PUT|DELETE|PATCH)\s+(?<path>/\S+)" +| where length(method) > 0 +| head 3 +``` + +| severityText | body | severityChar | method | path | +|-------------|------|-------------|--------|------| +| INFO | [2026-02-26T18:04:21.634Z] "GET /api/data HTTP/1.1" 200 ... | I | GET | /api/data | +| INFO | [2026-02-26T18:04:23.059Z] "POST /api/product-ask-ai-assistant/... 
| I | POST | /api/product-ask-ai-assistant/0PUK6V6EV0 | +| INFO | [2026-02-26T18:04:24.766Z] "GET / HTTP/1.1" 200 ... | I | GET | / | + +Try in playground → + +### Aggregate endpoint traffic from Envoy access logs + +Use `rex` to extract method and path from frontend-proxy log bodies, then aggregate to find the busiest endpoints: + +```sql +source=logs-otel-v1* +| where `resource.attributes.service.name` = 'frontend-proxy' +| rex field=body "(?<method>GET|POST|PUT|DELETE|PATCH)\s+(?<path>/\S+)" +| where length(method) > 0 +| stats count() as requests by method, path +| sort - requests +| head 20 +``` + +This extracts HTTP method and path from Envoy access log bodies, then counts requests per endpoint. + +Try in playground → + + + +## See also + +- [parse](/docs/ppl/commands/parse/) -- simpler regex extraction when you need a single capture group +- [grok](/docs/ppl/commands/grok/) -- extract fields using predefined grok patterns for common formats +- [patterns](/docs/ppl/commands/patterns/) -- automatically discover log patterns without writing regex +- [PPL Functions Reference](/docs/ppl/functions/) -- `regexp_match` and other string functions for regex filtering diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/search.md b/docs/starlight-docs/src/content/docs/ppl/commands/search.md new file mode 100644 index 00000000..fd651ca6 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/search.md @@ -0,0 +1,139 @@ +--- +title: "search" +description: "Retrieve documents from an index - the starting point of every PPL query." +--- + +## Description + +The `search` command retrieves documents from an index. It is the **starting point of every PPL query** and must always be the first command in the pipeline. Every PPL query begins with `search` (or its shorthand `source=`) to specify which index to query. + +The `search` keyword itself can be omitted - `source=<index>` is equivalent to `search source=<index>`. 
An optional boolean expression filters results at the search level before any pipeline processing occurs. + +**In the Discover UI**, the dataset selector automatically sets the source index. Queries in the query bar start with a pipe character (`|`) and do not need a `source=` clause. + +## Syntax + +```sql +search source=[<cluster>:]<index> [<boolean-expression>] +``` + +Shorthand (omitting the `search` keyword): + +```sql +source=<index> [<boolean-expression>] +``` + +## Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `<index>` | Yes | The name of the index to query. Supports wildcard patterns (e.g., `logs-otel-v1*`). | +| `<boolean-expression>` | No | A filter expression applied at search time. Supports field comparisons (`=`, `!=`, `>`, `<`, `>=`, `<=`), Boolean operators (`AND`, `OR`, `NOT`), `IN`, wildcards (`*`, `?`), full-text search, and time modifiers (`earliest`, `latest`). | +| `<cluster>` | No | The name of a remote cluster for cross-cluster search. Prefixed to the index name with a colon (e.g., `remote:logs-otel-v1*`). | + +## Usage notes + +- **Always first**: `search` must be the first command in any PPL query. Exactly one `search` (or `source=`) is allowed per query. +- **Omitting the keyword**: The `search` keyword is optional. Writing `source=logs-otel-v1*` is the most common form. +- **Discover UI queries**: When using PPL in Discover, the source index is set by the dataset selector. Your query starts with `|` followed by pipeline commands (e.g., `| where severityText = 'ERROR' | fields body`). +- **Search expression vs. where**: The boolean expression in `search` is converted to an OpenSearch query string query and executes at the search layer. For more complex filtering with functions and eval expressions, use the [`where`](/docs/ppl/commands/where/) command after the pipe. +- **Cross-cluster search**: To query an index on a remote cluster, prefix the index name with the cluster name and a colon. Cross-cluster search must be configured at the OpenSearch level. 
+- **Full-text search**: Unquoted terms search across all fields (or the configured default field). Multiple terms are combined with `AND` by default. Use quotes for phrase matching. +- **Wildcard patterns in index names**: Index names support `*` wildcards (e.g., `source=logs-*`), which is common for querying across time-based index patterns. +- **Operator precedence**: Boolean operators in the search expression follow this precedence: `Parentheses > NOT > OR > AND`. Note that this is PPL-specific and differs from SQL and Splunk SPL, where `AND` binds tighter than `OR`. In PPL, `a OR b AND c` is evaluated as `(a OR b) AND c`, not `a OR (b AND c)`. Use explicit parentheses to avoid ambiguity. +- **`NOT` vs. `!=`**: The `!=` operator excludes documents with null or missing fields, while `NOT` includes them. See the extended examples for details. + +## Basic examples + +### Retrieve all documents + +Fetch every document from an index with no filter. Useful for exploring data or verifying ingestion. 
+ +```sql +source=logs-otel-v1* +``` + +Try in playground → + +### Filter with a boolean expression + +Return only documents where `severityText` is `ERROR`: + +```sql +source=logs-otel-v1* severityText="ERROR" +``` + +Try in playground → + +### Full-text search + +Search across all fields for documents containing the term `timeout`: + +```sql +search timeout source=logs-otel-v1* +``` + +Try in playground → + +### Multi-value match with IN + +Match documents where `severityText` is one of several values: + +```sql +source=logs-otel-v1* severityText IN ("ERROR", "WARN", "FATAL") +``` + +Try in playground → + +### Search across trace data + +Query OTel trace spans with a filter to find error spans: + +```sql +source=otel-v1-apm-span-* status.code=2 +| head 20 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%3Dotel-v1-apm-span-%2A%20status.code%3D2%0A%7C%20head%2020')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +## Extended examples + +### Filter OTel logs by service and severity + +Find error logs from a specific service using OTel semantic convention fields. Backticks are required for dotted field names. + +```sql +source=logs-otel-v1* + severityText="ERROR" + AND `resource.attributes.service.name`="cart" +| head 20 +``` + +Try in playground → + +### Discover-style query (no source clause) + +In the Discover UI, the dataset selector sets the index. 
Your query starts with `|`: + +```sql +| where severityText = 'ERROR' +| head 50 +``` + +Try in playground → + +### Cross-cluster search + +Query an index on a remote cluster named `us-west`: + +```sql +source=us-west:logs-otel-v1* severityText="ERROR" +| stats count() as error_count by `resource.attributes.service.name` +| sort - error_count +``` + +## See also + +- [`where`](/docs/ppl/commands/where/) - Filter results using boolean expressions after the pipe +- [`fields`](/docs/ppl/commands/fields/) - Select or exclude specific fields from the output +- [PPL Commands](/docs/ppl/commands/) - Full command reference diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/sort.md b/docs/starlight-docs/src/content/docs/ppl/commands/sort.md new file mode 100644 index 00000000..9099e71c --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/sort.md @@ -0,0 +1,159 @@ +--- +title: "sort" +description: "Sort search results by one or more fields in ascending or descending order." +--- + +## Description + +The `sort` command orders search results by one or more fields. It supports ascending and descending order, multiple sort keys, null value ordering, and type-specific sorting functions. Use it to find top-N results, order events chronologically, or rank aggregated data. + +PPL supports two notation styles for specifying sort direction -- prefix notation (`+ field` / `- field`) and suffix notation (`field asc` / `field desc`). Both produce identical results; choose whichever reads more clearly for your query. You must use one notation style consistently within a single `sort` command. + +--- + +## Syntax + +### Prefix notation + +```sql +sort [] [+|-] [, [+|-] ]... +``` + +### Suffix notation + +```sql +sort [] [asc|desc|a|d] [, [asc|desc|a|d]]... +``` + +--- + +## Arguments + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `` | Yes | The field to sort by. 
Multiple fields are comma-separated; earlier fields take priority. Use `auto(field)`, `str(field)`, `ip(field)`, or `num(field)` to control how values are interpreted. | +| `+` / `-` | No | **Prefix notation only.** `+` for ascending (default), `-` for descending. | +| `asc` / `desc` | No | **Suffix notation only.** `asc` (or `a`) for ascending (default), `desc` (or `d`) for descending. | +| `` | No | Maximum number of results to return. `0` or omitted returns all results. Equivalent to piping through `head`. | + +--- + +## Usage notes + +- **Default order is ascending**: If you omit the direction indicator, results are sorted in ascending order (smallest/earliest first). + +- **Null and missing values**: Null values sort first in ascending order and last in descending order. This is important when sorting fields that may not exist on every document. + +- **Type-specific sort functions**: Control how field values are compared: + - `auto(field)` -- automatic type detection (default behavior). + - `str(field)` -- sort as strings (lexicographic). Useful for sorting numeric fields as text (e.g. `str(severityNumber)` makes `"17"` come before `"9"`). + - `num(field)` -- sort as numbers. + - `ip(field)` -- sort as IP addresses. + +- **Count parameter for top-N queries**: `sort 10 - durationInNanos` returns only the 10 spans with the highest duration. This is more efficient than `sort - durationInNanos | head 10` because it can optimize internally. + +- **Multi-field sorting**: Fields are evaluated left to right. If two records tie on the first field, the second field breaks the tie, and so on. + +- **Performance**: Sorting large result sets is memory-intensive because all matching documents must be held and compared. For large datasets, combine `sort` with `stats` aggregation or use `head` to limit results. Sorting after `stats` (which typically produces fewer rows) is much cheaper than sorting raw events. 
+ +- **Do not mix notations**: Use either prefix or suffix notation within a single `sort` command -- mixing `- severityNumber, serviceName desc` in one command is not supported. + +--- + +## Basic examples + +### Sort ascending (default) + +```sql +source = logs-otel-v1* +| sort severityNumber +``` + +Try in playground → + +### Sort descending with prefix notation + +```sql +source = logs-otel-v1* +| sort - severityNumber +``` + +Try in playground → + +### Multi-field sort + +Sort by service name ascending, then by severity descending: + +```sql +source = logs-otel-v1* +| sort + `resource.attributes.service.name`, - severityNumber +``` + +Try in playground → + +This is equivalent in suffix notation: + +```sql +source = logs-otel-v1* +| sort `resource.attributes.service.name` asc, severityNumber desc +``` + +Try in playground → + +### Limit results with count + +Return only the 2 most recent log entries: + +```sql +source = logs-otel-v1* +| sort 2 - time +``` + +Try in playground → + +### Lexicographic sort with `str()` + +Sort numeric severity as strings (lexicographic order): + +```sql +source = logs-otel-v1* +| sort str(severityNumber) +``` + +Try in playground → + +--- + +## Extended examples + +### OTel: Most recent error logs + +Retrieve the 20 most recent error logs across all services, sorted by timestamp descending. + +```sql +| where severityText = 'ERROR' +| sort - time +| head 20 +``` + +Try in playground → + +### OTel: Services with the most log volume + +Aggregate log counts by service, then sort to find the noisiest services. 
+ +```sql +| stats count() as log_count by `resource.attributes.service.name` +| sort - log_count +``` + +Try in playground → + +--- + +## See also + +- [head](/docs/ppl/commands/head/) -- limit the number of returned results +- [stats](/docs/ppl/commands/stats/) -- aggregate before sorting for better performance +- [eval](/docs/ppl/commands/eval/) -- compute fields to sort by +- [where](/docs/ppl/commands/where/) -- filter before sorting diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/spath.md b/docs/starlight-docs/src/content/docs/ppl/commands/spath.md new file mode 100644 index 00000000..3e738643 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/spath.md @@ -0,0 +1,177 @@ +--- +title: "spath" +description: "Extract fields from structured JSON data - parse nested JSON within log bodies without re-indexing." +--- + +import { Aside } from '@astrojs/starlight/components'; + + + +The `spath` command extracts fields from structured JSON data stored in a text field. It operates in two modes: + +- **Path-based mode** -- When `path` is specified, extracts a single value at the given JSON path. +- **Auto-extract mode** -- When `path` is omitted, extracts all fields from the JSON into a map. + +This is ideal for semi-structured log bodies that contain JSON payloads -- you can extract and query nested fields without re-indexing. + + + +## Syntax + +```sql +spath input= [output=] [[path=]] +``` + +## Arguments + +### Required + +| Argument | Description | +|----------|-------------| +| `input=` | The field containing JSON data to parse. Must be a string field. | + +### Optional + +| Argument | Default | Description | +|----------|---------|-------------| +| `output=` | Value of `path` (path mode) or `input` (auto-extract) | Destination field for the extracted data. | +| `path=` | -- | The JSON path identifying data to extract. When omitted, runs in auto-extract mode. 
The `path=` keyword is optional; you can specify the path as a positional argument. | + +## JSON path syntax + +| Syntax | Description | Example | +|--------|-------------|---------| +| `field` | Top-level field | `status` | +| `parent.child` | Dot notation for nested fields | `error.message` | +| `list{0}` | Array element by index | `tags{0}` | +| `list{}` | All array elements | `items{}` | +| `"['special.name']"` | Escaped field names with dots or spaces | `"['a.b.c']"` | + +## Usage notes + +- The `spath` command always returns extracted values as **strings**. Use `eval` with `cast()` to convert to numeric types for aggregation. +- The input field must contain a valid **JSON string**. Struct or map fields from the index schema cannot be used directly -- you must first convert them to a string representation. +- In auto-extract mode, nested objects produce dotted keys (`user.name`), arrays produce `{}` suffix keys (`tags{}`), and all values are stringified. +- Empty JSON objects (`{}`) return an empty map. Malformed JSON returns partial results from any fields parsed before the error. +- In auto-extract mode, access individual values via dotted path navigation on the output field (e.g., `doc.user.name`). For keys containing `{}`, use backtick quoting. + +## Examples + +### Extract a field from a JSON string + +Extract the `status` field from a JSON string. 
This example uses `eval` to create a JSON string for demonstration, but in practice you would use this on a body field that already contains JSON: + +```sql +source=logs-otel-v1* +| head 1 +| eval jsonStr = '{"status": 200, "service": "frontend", "latency": 45}' +| spath input=jsonStr path=status output=httpStatus +``` + +| httpStatus | +|------------| +| 200 | + +Try in playground → + +### Extract nested object fields + +Traverse multiple levels of nesting using dot notation to extract deeply nested values: + +```sql +source=logs-otel-v1* +| head 1 +| eval jsonStr = '{"error": {"type": "timeout", "message": "upstream timed out"}}' +| spath input=jsonStr path=error.message output=errorMsg +``` + +| errorMsg | +|----------| +| upstream timed out | + +Try in playground → + +### Extract array elements + +Extract the first element and all elements from an array within JSON data: + +```sql +source=logs-otel-v1* +| head 1 +| eval jsonStr = '{"tags": ["frontend", "v2", "canary"]}' +| spath input=jsonStr path=tags{0} output=firstTag +| spath input=jsonStr path=tags{} output=allTags +``` + +| firstTag | allTags | +|----------|---------| +| frontend | ["frontend","v2","canary"] | + +Try in playground → + +### Cast extracted values for aggregation + +Extracted values are strings. 
Cast them before performing numeric operations: + +```sql +source=logs-otel-v1* +| head 1 +| eval jsonStr = '{"status": 200, "service": "frontend", "latency": 45}' +| spath input=jsonStr path=latency output=latency +| eval latency = cast(latency as double) +``` + +Try in playground → + +### Auto-extract all fields from JSON + +Extract all fields from a JSON string into a map, then access individual values: + +```sql +source=logs-otel-v1* +| head 1 +| eval jsonStr = '{"status": 200, "service": "frontend"}' +| spath input=jsonStr output=parsed +``` + +| parsed | +|--------| +| {service: frontend, status: 200} | + +Try in playground → + +## Extended examples + +### Extract multiple error fields from a JSON payload + +Chain multiple `spath` commands to extract several fields from a nested error payload: + +```sql +source=logs-otel-v1* +| head 1 +| eval jsonStr = '{"error": {"type": "timeout", "message": "upstream timed out", "code": 504}}' +| spath input=jsonStr path=error.type output=errorType +| spath input=jsonStr path=error.message output=errorMsg +| spath input=jsonStr path=error.code output=errorCode +``` + +| errorType | errorMsg | errorCode | +|-----------|----------|-----------| +| timeout | upstream timed out | 504 | + +Try in playground → + + + +## See also + +- [parse](/docs/ppl/commands/parse/) -- extract fields using regex named capture groups +- [grok](/docs/ppl/commands/grok/) -- extract fields using grok patterns +- [rex](/docs/ppl/commands/rex/) -- regex extraction with sed-mode substitution +- [eval](/docs/ppl/commands/eval/) -- create computed fields and type conversions diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/stats.md b/docs/starlight-docs/src/content/docs/ppl/commands/stats.md new file mode 100644 index 00000000..996c7d50 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/stats.md @@ -0,0 +1,181 @@ +--- +title: "stats" +description: "Calculate aggregate statistics over search results - counts, averages, 
percentiles, and more with grouping." +--- + +## Description + +The `stats` command is the primary aggregation command in PPL. It calculates statistics such as `count`, `sum`, `avg`, `min`, `max`, `percentile`, and more across your search results. Use it whenever you need to summarize data -- whether that means counting error logs per service, computing average response latency, tracking token usage over time, or building the numbers behind a dashboard panel. + +Results can be grouped using the `by` clause with one or more fields, and time-series bucketing is supported through the `span()` expression. When no `by` clause is provided, `stats` returns a single row representing the aggregation over the entire result set. + +`stats` is the workhorse command for dashboards, alerting thresholds, and investigative queries. + +--- + +## Syntax + +```sql +stats [bucket_nullable=<bool>] <aggregation> [, <aggregation>]... [by <field-list>, <span-expression>] +``` + +--- + +## Arguments + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `<aggregation>` | Yes | One or more aggregation functions (see table below). Multiple aggregations are comma-separated. Use `as` to alias the output field name. | +| `by <field-list>` | No | Groups results by one or more fields. Without a `by` clause, stats returns one row aggregating all results. | +| `by <span-expression>` | No | Splits a field into time or numeric buckets. Syntax: `span(field, interval)`. At most one span expression per query. The span is always treated as the first grouping key regardless of position. | +| `bucket_nullable` | No | When `false`, excludes records where the group-by field is null, improving performance. Default depends on `plugins.ppl.syntax.legacy.preferred`. | + +### Supported aggregation functions + +| Function | Description | +|----------|-------------| +| `count()` | Count of all events (including nulls). Alias: `c()`. | +| `count(<field>)` | Count of events where field is not null. | +| `sum(<field>)` | Sum of numeric values. | +| `avg(<field>)` | Average of numeric values. 
| +| `max(<field>)` | Maximum value. | +| `min(<field>)` | Minimum value. | +| `var_samp(<field>)` | Sample variance. | +| `var_pop(<field>)` | Population variance. | +| `stddev_samp(<field>)` | Sample standard deviation. | +| `stddev_pop(<field>)` | Population standard deviation. | +| `distinct_count(<field>)` | Approximate count of distinct values. Alias: `dc()`. | +| `percentile(<field>, <percent>)` | Percentile value (e.g. `percentile(duration, 95)`). Alias: `percentile_approx()`. | +| `median(<field>)` | 50th percentile (shorthand for `percentile(field, 50)`). | +| `first(<field>)` | First non-null value encountered. | +| `last(<field>)` | Last non-null value encountered. | +| `list(<field>)` | Collects all values into an array, preserving duplicates and order. | +| `values(<field>)` | Collects unique values into a sorted array (duplicates removed). | +| `take(<field>, <n>)` | Returns a list of up to `n` original values. | +| `earliest(<field>)` | Earliest value by timestamp. | +| `latest(<field>)` | Latest value by timestamp. | + +--- + +## Usage notes + +- **Multiple aggregations in a single `stats`**: Separate them with commas. Each produces its own output column. + ```sql + | stats count() as total, avg(duration) as avg_dur, max(duration) as max_dur by service + ``` + +- **Naming output fields with `as`**: Without an alias, the column name is the function call itself (e.g. `avg(severityNumber)`). Always alias for readability. + +- **`count()` vs `count(field)`**: `count()` counts all events including those where a field is null. `count(field)` only counts events where `field` is non-null. + +- **Span intervals -- numeric**: `span(severityNumber, 4)` creates buckets of width 4 whose lower bounds are multiples of the width (0, 4, 8, 12, ...). Each value is assigned to the bucket starting at the largest multiple of 4 that does not exceed it. + +- **Span intervals -- time**: `span(time, 1h)` creates hourly buckets. Valid time units: `ms` (millisecond), `s` (second), `m` (minute), `h` (hour), `d` (day), `w` (week), `M` (month), `q` (quarter), `y` (year). + +- **Span is always the first grouping key**: Even if you write `by severityText, span(time, 5m)`, the span is promoted to the first position in the output. 
+ +- **Null handling in group-by**: By default, null values in group-by fields produce a null bucket. Set `bucket_nullable=false` to exclude null groups for cleaner output and faster performance. + +- **Eval expressions inside aggregations**: You can embed expressions directly, e.g. `sum(durationInNanos / 1000000)`. + +- **High-cardinality fields**: Aggregations over fields with many distinct values (like URLs or trace IDs) use approximate bucket counts. Results may be approximate for the long tail. + +- **Ascending doc_count sort caveat**: When sorting by count in ascending order on high-cardinality fields, globally rare terms may be missed due to shard-level approximation. + +--- + +## Basic examples + +### Count all log events + +```sql +source = logs-otel-v1* +| stats count() +``` + +Try in playground → + +### Average severity by service + +```sql +source = logs-otel-v1* +| stats avg(severityNumber) by `resource.attributes.service.name` +``` + +Try in playground → + +### Multiple aggregations + +```sql +source = logs-otel-v1* +| stats avg(severityNumber) as avg_severity, max(severityNumber) as max_severity, count() as cnt by `resource.attributes.service.name` +``` + +Try in playground → + +### Time bucketing with span + +```sql +source = logs-otel-v1* +| stats count() as log_count by span(time, 10m) as time_bucket +``` + +Try in playground → + +### Percentile calculation + +```sql +source = otel-v1-apm-span-* +| stats percentile(durationInNanos, 90) as p90 by serviceName +``` + +[Try in 
Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20stats%20percentile%28durationInNanos%2C%2090%29%20as%20p90%20by%20serviceName')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### First and last occurrence + +Find the first and last error timestamp per service, along with the total error count: + +```sql +source = logs-otel-v1* +| where severityText = 'ERROR' +| stats earliest(time) as first_error, latest(time) as last_error, count() as total by `resource.attributes.service.name` +| sort first_error +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20logs-otel-v1%2A%0A%7C%20where%20severityText%20%3D%20!%27ERROR!%27%0A%7C%20stats%20earliest%28time%29%20as%20first_error%2C%20latest%28time%29%20as%20last_error%2C%20count%28%29%20as%20total%20by%20%60resource.attributes.service.name%60%0A%7C%20sort%20first_error')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +--- + +## Extended examples + +### OTel: Error rate by service + +Calculate the error rate per service by comparing error-severity log counts to total log counts. 
+ +```sql +| stats count() as total, + sum(case(severityText = 'ERROR' or severityText = 'FATAL', 1 else 0)) as errors + by `resource.attributes.service.name` +| eval error_rate = errors * 100.0 / total +``` + +Try in playground → + +### OTel: Log volume over time + +Track how many log events arrive per 5-minute window, broken down by severity. + +```sql +| stats count() as log_count by span(time, 5m) as time_bucket, severityText +``` + +Try in playground → + +--- + +## See also + +- [eval](/docs/ppl/commands/eval/) -- create computed fields from aggregation results +- [sort](/docs/ppl/commands/sort/) -- order aggregated results +- [where](/docs/ppl/commands/where/) -- filter before aggregating +- [head](/docs/ppl/commands/head/) -- limit output rows diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/streamstats.md b/docs/starlight-docs/src/content/docs/ppl/commands/streamstats.md new file mode 100644 index 00000000..f9354a54 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/streamstats.md @@ -0,0 +1,151 @@ +--- +title: "streamstats" +description: "Calculate cumulative and rolling window statistics - running totals, moving averages, and trend detection." +--- + +import { Tabs, TabItem, Aside } from '@astrojs/starlight/components'; + + + +The `streamstats` command calculates cumulative or rolling statistics as events are processed in order. Unlike `stats` (which collapses to an aggregation table) or `eventstats` (which computes over the entire dataset at once), `streamstats` processes events incrementally -- each row's statistics reflect only the events seen so far. + +This makes `streamstats` ideal for running totals, moving averages, trend detection, and any analysis that depends on the sequence of events. + +## Syntax + +```sql +streamstats [bucket_nullable=] [current=] [window=] [global=] + [reset_before="()"] [reset_after="()"] + ... 
[by ] +``` + +## Arguments + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `` | Yes | One or more aggregation functions (e.g., `avg(field)`, `sum(field)`, `count()`). | +| `current` | No | Include the current event in the calculation. Default: `true`. Set to `false` to use only previous events. | +| `window` | No | Number of events in the sliding window. Default: `0` (all previous and current events). | +| `global` | No | When `window` is set, determines whether a single window is used across all rows (`true`) or separate windows per `by` group (`false`). Default: `true`. | +| `reset_before` | No | Reset all accumulated statistics **before** processing an event when the expression evaluates to `true`. Syntax: `reset_before="()"`. | +| `reset_after` | No | Reset all accumulated statistics **after** processing an event when the expression evaluates to `true`. The expression can reference fields produced by `streamstats`. Syntax: `reset_after="()"`. | +| `bucket_nullable` | No | Whether `null` values form their own group in `by` aggregations. Default is controlled by `plugins.ppl.syntax.legacy.preferred`. | +| `` | No | Group results by one or more fields. Each group gets its own running calculation. Syntax: `by [span-expression,] [field,]...`. 
| + +## Comparing stats, eventstats, and streamstats + +| Aspect | `stats` | `eventstats` | `streamstats` | +|--------|---------|--------------|---------------| +| Output | Aggregation table only | Original events + aggregate fields | Original events + running aggregate fields | +| Scope | All events (or per group) | All events (or per group), result added to every row | Incremental -- each row reflects events seen so far | +| Use case | Summary reports | Compare individual events to group totals | Running totals, moving averages, trend detection | + +## Supported aggregation functions + +`COUNT`, `SUM`, `AVG`, `MAX`, `MIN`, `VAR_SAMP`, `VAR_POP`, `STDDEV_SAMP`, `STDDEV_POP`, `DISTINCT_COUNT` / `DC`, `EARLIEST`, `LATEST`. + +## Usage notes + +- **Sort your data first.** `streamstats` processes events in the order they arrive. For time-series analysis, pipe through `sort` before `streamstats`. +- **`window` controls the sliding window size.** Use `window=10` to compute statistics over the last 10 events. Without `window`, statistics accumulate over all events seen so far. +- **`current=false`** excludes the current event from the calculation, so the first row always has `null` statistics. This is useful for comparing each event to the state *before* it arrived. +- **`global=true` vs `global=false`** matters when combining `window` with `by`. With `global=true`, the window slides across all rows but aggregation is per-group. With `global=false`, each group gets its own independent window. +- **Reset conditions** let you restart accumulation based on data patterns -- useful for session boundaries or partition changes. 
+ +## Examples + +### Running average, sum, and count by group + +```sql +source = otel-v1-apm-span-* +| sort startTime +| streamstats avg(durationInNanos) as running_avg, sum(durationInNanos) as running_sum, count() as running_count by serviceName +| head 50 +``` + +Each row shows the running statistics computed from spans seen so far within its service group. + +### Rolling maximum over a 2-row window + +Compute the maximum latency from the previous 2 spans (excluding the current span): + +```sql +source = otel-v1-apm-span-* +| sort startTime +| streamstats current=false window=2 max(durationInNanos) as prev_max_latency +| head 50 +``` + +The first row has `null` because no previous events exist. + +### Global vs group-specific windows + +With `global=true`, the window slides across all rows but aggregation respects the `by` group: + +```sql +source = otel-v1-apm-span-* +| sort startTime +| streamstats window=5 global=true avg(durationInNanos) as running_avg by serviceName +| head 50 +``` + +With `global=false`, each `by` group gets its own independent window: + +```sql +source = otel-v1-apm-span-* +| sort startTime +| streamstats window=5 global=false avg(durationInNanos) as running_avg by serviceName +| head 50 +``` + +The difference is visible when services are interleaved. With `global=false`, each service's window only counts spans from that service. + +### Conditional reset + +Reset running statistics when latency crosses a threshold: + +```sql +source = otel-v1-apm-span-* +| sort startTime +| streamstats current=false reset_before="(durationInNanos > 10000000000)" reset_after="(durationInNanos < 1000000)" avg(durationInNanos) as avg_latency by serviceName +| head 50 +``` + +Statistics reset **before** processing any span with latency above 10 seconds, and **after** processing any span with latency below 1 millisecond. 
+ +## Extended examples + +### Rolling average latency per service + +Sort spans by time, then compute a 10-span rolling average latency for each service: + +```sql +source = otel-v1-apm-span-* +| sort startTime +| streamstats window=10 avg(durationInNanos) as rolling_avg_latency by serviceName +| eval rolling_avg_ms = rolling_avg_latency / 1000000 +| head 50 +``` + +### Cumulative error count per service + +Track the running count of error logs per service over time: + +```sql +source = logs-otel-v1* +| where severityText = 'ERROR' +| sort time +| streamstats count() as cumulative_errors by `resource.attributes.service.name` +| head 100 +``` + +Try in playground → + +## See also + +- [eventstats](/docs/ppl/commands/eventstats/) - add group-level statistics to every event +- [trendline](/docs/ppl/commands/trendline/) - simple and weighted moving averages +- [stats](/docs/ppl/commands/stats/) - aggregate and collapse rows +- [Command Reference](/docs/ppl/commands/) - all PPL commands diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/timechart.md b/docs/starlight-docs/src/content/docs/ppl/commands/timechart.md new file mode 100644 index 00000000..0187aad8 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/timechart.md @@ -0,0 +1,137 @@ +--- +title: "timechart" +description: "Create time-based aggregations and charts - the go-to command for time-series visualization." +--- + +import { Aside } from '@astrojs/starlight/components'; + + + +The `timechart` command creates time-based aggregations by grouping data into time intervals, optionally splitting by a field, and applying an aggregation function to each bucket. Results are returned in an unpivoted format with separate rows for each time-field combination, making them ideal for dashboard panels and trend analysis. 
+ +## Syntax + +```sql +timechart [timefield=<field_name>] [span=<time_interval>] [limit=<number>] [useother=<boolean>] [usenull=<boolean>] [nullstr=<string>] <aggregation_function> [by <field>] +``` + +## Arguments + +### Required + +| Argument | Description | +|----------|-------------| +| `<aggregation_function>` | The aggregation function to apply to each time bucket. Only a single aggregation function is supported per `timechart` command. Supports all [stats](/docs/ppl/commands/stats/) aggregation functions plus the timechart-specific rate functions (`per_second`, `per_minute`, `per_hour`, `per_day`). | + +### Optional + +| Argument | Default | Description | +|----------|---------|-------------| +| `timefield=<field_name>` | `@timestamp` | The timestamp field to use for time-based grouping. For OTel log indices, use `timefield=time`. | +| `span=<time_interval>` | `1m` | Time interval for grouping. Supported units: `ms`, `s`, `m` (minute), `h`, `d`, `w`, `M` (month), `q`, `y`. Note: `m` and `M` are case-sensitive. | +| `limit=<number>` | `10` | Maximum number of distinct values shown when using `by`. Values beyond the limit are grouped into `OTHER`. Set to `0` for unlimited. | +| `useother=<boolean>` | `true` | Whether to create an `OTHER` category for values beyond the `limit`. Only applies with `by`. | +| `usenull=<boolean>` | `true` | Whether to group documents with null `by` field values into a `NULL` category. When `false`, null-valued documents are excluded. | +| `nullstr=<string>` | `"NULL"` | The category name for documents with null `by` field values. Only applies when `usenull=true`. | +| `by <field>` | -- | Groups results by the specified field in addition to time intervals. | + +## Usage notes + +- Results only include time-field combinations that have data. Empty buckets are omitted rather than showing null or zero. +- The top N values for `limit` are selected based on the **sum** of aggregation values across all time intervals. +- Only a single aggregation function is supported per `timechart`. Use multiple `timechart` commands joined with `appendcol` if you need multiple aggregations. 
+- The timechart-specific rate functions calculate normalized rates: `per_second(field) = sum(field) / span_in_seconds`, `per_minute(field) = sum(field) * 60 / span_in_seconds`, and so on. +- In the Discover UI, the source index is set by the selected dataset, so start your query with `| timechart ...`. + +## Examples + +### Log volume per 5 minutes + +Count all log events in 5-minute windows: + +```sql +source = logs-otel-v1* +| timechart timefield=time span=5m count() +``` + +Try in playground → + +### Log volume by service over time + +Break down log volume by service name in 5-minute buckets: + +```sql +source = logs-otel-v1* +| timechart timefield=time span=5m count() by `resource.attributes.service.name` +``` + +Try in playground → + +### Error count over time by service + +Count only error logs per service in 5-minute windows: + +```sql +source = logs-otel-v1* +| where severityText = 'ERROR' +| timechart timefield=time span=5m count() by `resource.attributes.service.name` +``` + +Try in playground → + +### Top 3 services with the rest grouped as OTHER + +Limit the breakdown to the top 3 services by volume, grouping remaining services into `OTHER`: + +```sql +source = logs-otel-v1* +| timechart timefield=time span=5m limit=3 count() by `resource.attributes.service.name` +``` + +Try in playground → + +### Exclude the OTHER category + +Show only the top 5 services without an `OTHER` bucket: + +```sql +source = logs-otel-v1* +| timechart timefield=time span=5m limit=5 useother=false count() by `resource.attributes.service.name` +``` + +Try in playground → + +## Extended examples + +### Average request latency over time (OTel traces) + +Calculate average span duration per minute, broken down by service, to visualize latency trends: + +```sql +source = otel-v1-apm-span-* +| timechart timefield=startTime span=1m avg(durationInNanos) by serviceName +``` + +[Try in
Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20timechart%20timefield%3DstartTime%20span%3D1m%20avg%28durationInNanos%29%20by%20serviceName')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +This produces a time-series suitable for a dashboard line chart where each line represents a service's average latency over time. + +### Per-second event rate by severity (OTel logs) + +Use the `per_second` rate function to normalize event counts across different time windows, grouped by severity level: + +```sql +source = logs-otel-v1* +| timechart timefield=time span=1m per_second(severityNumber) by severityText +``` + +Try in playground → + +## See also + +- [stats](/docs/ppl/commands/stats/) -- general aggregation and grouping +- [chart](/docs/ppl/commands/) -- row/column split aggregation for non-time-based charts +- [trendline](/docs/ppl/commands/trendline/) -- moving averages over ordered data +- [bin](/docs/ppl/commands/) -- bucket numeric or time values into intervals diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/top.md b/docs/starlight-docs/src/content/docs/ppl/commands/top.md new file mode 100644 index 00000000..b482b20a --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/top.md @@ -0,0 +1,121 @@ +--- +title: "top" +description: "Find the most common values of a field - quickly identify dominant patterns in your data." +--- + +import { Aside } from '@astrojs/starlight/components'; + +The `top` command finds the most common values (or combinations of values) for the specified fields. 
It automatically counts occurrences and returns results sorted from most to least frequent. An optional `by` clause groups the results so you can find the top values within each group. + +`top` is a fast way to profile your data and answer questions like "which services produce the most logs?" or "what are the most common error messages?" + + + +## Syntax + +```sql +top [<N>] [top-options] <field-list> [by <group-field>] +``` + +## Arguments + +| Argument | Required | Type | Default | Description | +|----------|----------|------|---------|-------------| +| `<N>` | No | Integer | `10` | The number of most-frequent values to return. | +| `<field-list>` | Yes | Comma-delimited field names | -- | The fields to find top values for. When multiple fields are specified, `top` finds the most common combinations. | +| `by <group-field>` | No | Field name(s) | -- | One or more fields to group the results by. Top values are computed separately within each group. | +| `showcount` | No | Boolean | `true` | When `true`, includes a count column showing the frequency of each value. Set to `false` for cleaner output when counts are not needed. | +| `countfield` | No | String | `count` | The name of the count column in the output. Only applies when `showcount=true`. | + +## Usage notes + +- **Fast data profiling.** `top` is the quickest way to understand the distribution of values in a field. Use it early in an investigation to orient yourself. +- **`showcount=false` for clean output.** When you only need the values and not the frequencies, use `showcount=false` to remove the count column. +- **Multiple fields find top combinations.** Specifying more than one field returns the most common value tuples. For example, `top service, severity` returns the most frequent (service, severity) pairs. +- **Use `by` clause for per-group analysis.** The `by` clause is powerful for comparative profiling, such as finding the top error message for each service. 
+- **`countfield` renames the count column.** Use `countfield='frequency'` or similar to give the count column a descriptive name for downstream processing. + +## Examples + +### Top services by log volume + +Find the services producing the most logs: + +```sql +| top `resource.attributes.service.name` +``` + +Try in playground → + +### Top 5 severity levels + +Return only the 5 most common severity levels: + +```sql +| top 5 severityText +``` + +Try in playground → + +### Top severity by service + +Find the most common severity level for each service: + +```sql +| top 1 showcount=false severityText by `resource.attributes.service.name` +``` + +Try in playground → + +### Hide the count column + +Return just the values without frequency counts: + +```sql +| top showcount=false severityText +``` + +Try in playground → + +### Rename the count column + +Use a custom name for the count field: + +```sql +| top countfield='frequency' `resource.attributes.service.name` +``` + +Try in playground → + +## Extended examples + +### Top service-severity combinations in OTel logs + +Find the most common combinations of service and severity. This reveals which services are noisiest and at what severity level: + +```sql +| top 10 `resource.attributes.service.name`, severityText +``` + +Try in playground → + +### Top span operations per OTel service + +Find the most frequently executed operations in each service from trace data: + +```sql +source = otel-v1-apm-span-* +| top 3 name by serviceName +``` + +This helps identify hot paths in your microservices architecture -- the operations that execute most frequently are often the best candidates for optimization. 
+ +## See also + +- [rare](/docs/ppl/commands/rare/) - The inverse of `top`: find the least common values +- [stats](/docs/ppl/commands/stats/) - For more complex aggregations beyond simple frequency counts +- [dedup](/docs/ppl/commands/dedup/) - Deduplicate to get unique values with sample documents +- [head](/docs/ppl/commands/head/) - Limit the number of results returned +- [PPL Command Reference](/docs/ppl/commands/) - All PPL commands diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/trendline.md b/docs/starlight-docs/src/content/docs/ppl/commands/trendline.md new file mode 100644 index 00000000..93429dff --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/trendline.md @@ -0,0 +1,125 @@ +--- +title: "trendline" +description: "Calculate moving averages - simple (SMA) and weighted (WMA) for trend analysis and smoothing." +--- + +import { Tabs, TabItem, Aside } from '@astrojs/starlight/components'; + + + +The `trendline` command calculates moving averages over a sorted sequence of events. It supports two types: + +- **SMA (Simple Moving Average)** - all data points in the window are weighted equally. +- **WMA (Weighted Moving Average)** - more recent data points receive higher weight, making the average more responsive to recent changes. + +Use `trendline` to smooth noisy time-series data, reveal underlying trends in latency or throughput, and detect gradual shifts in system behavior. + +## Syntax + +```sql +trendline [sort [+|-] <sort-field>] + (sma | wma)(<number-of-datapoints>, <field>) [as <alias>] + [(sma | wma)(<number-of-datapoints>, <field>) [as <alias>]]... +``` + +## Arguments + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `sort [+\|-] <sort-field>` | No | The field used to order data before calculating the moving average. `+` for ascending (default, nulls first), `-` for descending (nulls last). If omitted, data is processed in its current order. | +| `sma \| wma` | Yes | The type of moving average. 
`sma` = simple moving average (equal weight); `wma` = weighted moving average (recent values weighted more). | +| `<number-of-datapoints>` | Yes | The window size -- number of data points used to calculate each average. Must be greater than zero. | +| `<field>` | Yes | The numeric field to compute the moving average over. | +| `<alias>` | No | Name for the output column. Default: `<field>_trendline`. | + +## SMA vs WMA + +**Simple Moving Average (SMA):** The arithmetic mean of the last *n* values. Given values `[v1, v2, ..., vn]`, the SMA is `(v1 + v2 + ... + vn) / n`. + +**Weighted Moving Average (WMA):** More recent values receive proportionally higher weight. Given values `[v1, v2, ..., vn]` where `vn` is the most recent, the WMA is `(1*v1 + 2*v2 + ... + n*vn) / (1 + 2 + ... + n)`. + +WMA reacts faster to changes, making it better for detecting recent shifts. SMA is more stable and resistant to short-term noise. + +## Usage notes + +- **The first `n-1` rows will have `null`** for the moving average because there are not yet enough data points to fill the window. +- **Null field values cause the row to be excluded** from the trendline output. +- **Multiple trendlines** can be calculated in a single command -- for example, a short-window and long-window SMA on the same field to detect crossovers. +- **Sort your data** before applying `trendline`, or use the built-in `sort` parameter. For time-series data, sort by timestamp. 
+ +## Examples + +### Simple moving average over 5 data points + +Smooth span latency with a 5-point SMA: + +```sql +source = otel-v1-apm-span-* +| trendline sort startTime sma(5, durationInNanos) as latency_trend +| head 50 +``` + +### Multiple trendlines in one command + +Compute both an SMA and WMA on latency simultaneously to compare smoothing behavior: + +```sql +source = otel-v1-apm-span-* +| trendline sort startTime sma(5, durationInNanos) as sma_latency wma(5, durationInNanos) as wma_latency +| head 50 +``` + +### Default alias + +When no alias is specified, the output column is named `<field>_trendline` (here, `durationInNanos_trendline`): + +```sql +source = otel-v1-apm-span-* +| trendline sort startTime sma(5, durationInNanos) +| head 50 +``` + +### Weighted moving average + +WMA gives more weight to recent values, producing a trend that reacts faster: + +```sql +source = otel-v1-apm-span-* +| trendline sort startTime wma(5, durationInNanos) +| head 50 +``` + +## Extended examples + +### Latency trend for trace spans + +Sort spans by time and calculate a 5-span SMA and WMA of duration to smooth out latency noise: + +```sql +source = otel-v1-apm-span-* +| trendline sort startTime sma(5, durationInNanos) as latency_sma wma(5, durationInNanos) as latency_wma +| eval sma_ms = latency_sma / 1000000, wma_ms = latency_wma / 1000000 +| head 50 +``` + +Both SMA and WMA are computed side by side, making it easy to compare the smoother SMA with the more reactive WMA. 
+ +### Token usage trend for AI agents + +Track the moving average of token consumption over time for a generative AI service: + +```sql +source = otel-v1-apm-span-* +| where `attributes.gen_ai.usage.output_tokens` > 0 +| trendline sort startTime sma(10, `attributes.gen_ai.usage.output_tokens`) as token_trend +| head 100 +``` + +## See also + +- [streamstats](/docs/ppl/commands/streamstats/) - cumulative and rolling window statistics with more control +- [eventstats](/docs/ppl/commands/eventstats/) - add group-level aggregates to every event +- [stats](/docs/ppl/commands/stats/) - aggregate and collapse rows +- [Command Reference](/docs/ppl/commands/) - all PPL commands diff --git a/docs/starlight-docs/src/content/docs/ppl/commands/where.md b/docs/starlight-docs/src/content/docs/ppl/commands/where.md new file mode 100644 index 00000000..a6ecbe2d --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/commands/where.md @@ -0,0 +1,207 @@ +--- +title: "where" +description: "Filter search results using boolean expressions - the primary filtering command in PPL." +--- + +## Description + +The `where` command filters search results to only those rows where the specified boolean expression evaluates to `true`. It is the **primary filtering command** in PPL and can appear anywhere in the pipeline after the `search` (or `source=`) command. + +`where` supports all comparison operators, logical operators, pattern matching with `LIKE`, set membership with `IN`, range checks with `BETWEEN`, null testing with `IS NULL` / `IS NOT NULL`, and nested conditions with parentheses. You can also use built-in functions and `eval` expressions inline within the boolean expression. + +## Syntax + +```sql +where <boolean-expression> +``` + +## Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `<boolean-expression>` | Yes | The condition used to filter results. Only rows where this evaluates to `true` are returned. 
| + +### Supported operators + +| Category | Operators | +|----------|-----------| +| **Comparison** | `=`, `!=`, `<>`, `>`, `<`, `>=`, `<=` | +| **Logical** | `AND`, `OR`, `NOT` | +| **Pattern matching** | `LIKE(field, pattern)` - `%` matches zero or more characters, `_` matches exactly one character | +| **Set membership** | `IN (value1, value2, ...)` | +| **Range** | `BETWEEN value1 AND value2` | +| **Null testing** | `IS NULL`, `IS NOT NULL`, `ISNULL(field)`, `ISNOTNULL(field)` | +| **Grouping** | Parentheses `( )` for controlling evaluation order | + +## Usage notes + +- **Multiple where commands**: You can chain multiple `where` commands in a single pipeline. Each successive `where` further narrows the result set, equivalent to combining them with `AND`. +- **Eval expressions inline**: You can use functions and expressions directly in the boolean condition (e.g., `where length(body) > 100` or `where LIKE(body, '%timeout%')`). +- **Null handling**: Comparisons with `null` values follow SQL semantics - a comparison involving `null` evaluates to `null` (not `true` or `false`), so the row is excluded. Use `IS NULL` or `ISNULL()` to explicitly test for null values. +- **String values**: Enclose string literals in single quotes (`'value'`). Double-quoted text is also treated as a string literal, not as a field name, so do not use double quotes to quote field names. +- **Backtick field names**: OTel fields with dots in their names (e.g., `resource.attributes.service.name`) must be enclosed in backticks to prevent them from being interpreted as nested field access. +- **Performance**: Filters applied earlier in the pipeline reduce the amount of data processed by subsequent commands. Place your most selective `where` conditions as early as possible. +- **vs. search expression**: The `search` command also supports inline boolean expressions, but `where` is more flexible - it supports functions, `LIKE`, `BETWEEN`, and computed expressions that `search` does not. 
+ +## Basic examples + +### Simple comparison + +Return log entries with a severity number greater than 9 (above DEBUG level): + +```sql +source=logs-otel-v1* +| where severityNumber > 9 +| head 20 +``` + +Try in playground → + +### Combine conditions with AND / OR + +Return error logs from the checkout service: + +```sql +source=logs-otel-v1* +| where severityText = 'ERROR' AND `resource.attributes.service.name` = 'checkout' +| head 20 +``` + +Try in playground → + +Return logs that are either errors or from the payment service: + +```sql +source=logs-otel-v1* +| where severityText = 'ERROR' OR `resource.attributes.service.name` = 'payment' +| head 20 +``` + +Try in playground → + +### Pattern matching with LIKE + +Find logs whose body contains the word `connection`: + +```sql +source=logs-otel-v1* +| where LIKE(body, '%connection%') +| head 20 +``` + +Try in playground → + +Find service names starting with `product-`: + +```sql +source=logs-otel-v1* +| where LIKE(`resource.attributes.service.name`, 'product-%') +| head 20 +``` + +Try in playground → + +### Set membership with IN + +Return logs matching specific severity levels: + +```sql +source=logs-otel-v1* +| where severityText IN ('ERROR', 'WARN', 'FATAL') +| head 20 +``` + +Try in playground → + +### Filter by numeric range + +Return logs with severity numbers in the error range (17 through 21) using `BETWEEN`: + +```sql +source = logs-otel-v1* +| where severityNumber BETWEEN 17 AND 21 +| stats count() as logs by severityText +``` + +[Try in 
Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20logs-otel-v1%2A%0A%7C%20where%20severityNumber%20BETWEEN%2017%20AND%2021%0A%7C%20stats%20count%28%29%20as%20logs%20by%20severityText')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Null testing + +Find log entries where the trace ID is empty (logs not correlated to a trace): + +```sql +source=logs-otel-v1* +| where traceId = '' +| head 20 +``` + +Try in playground → + +Find log entries that have a span ID (logs correlated to a specific span): + +```sql +source=logs-otel-v1* +| where ISNOTNULL(spanId) +| head 20 +``` + +Try in playground → + +### Grouped conditions + +Combine multiple conditions with parentheses to control evaluation order: + +```sql +source=logs-otel-v1* +| where (severityText = 'ERROR' OR severityText = 'FATAL') AND `resource.attributes.service.name` = 'weather-agent' +| head 20 +``` + +Try in playground → + +## Extended examples + +### Filter error logs by service + +Find ERROR and FATAL logs from the weather agent service. This is a common starting point for incident triage. + +```sql +source=logs-otel-v1* +| where severityText = 'ERROR' OR severityText = 'FATAL' +| where `resource.attributes.service.name` = 'weather-agent' +| head 50 +``` + +Try in playground → + +### Compound GenAI attribute filter + +Filter logs for a specific GenAI agent operation, useful for investigating AI agent invocation failures or high-latency completions. 
+ +```sql +source=logs-otel-v1* +| where `attributes.gen_ai.operation.name` = 'invoke_agent' +| where `resource.attributes.service.name` = 'weather-agent' +| where severityNumber >= 17 +| head 20 +``` + +Try in playground → + +### Filter logs containing a keyword pattern + +Find logs whose body contains the word "timeout" using `LIKE` with wildcard characters: + +```sql +source=logs-otel-v1* +| where LIKE(body, '%timeout%') +| head 20 +``` + +Try in playground → + +## See also + +- [`search`](/docs/ppl/commands/search/) - The starting point of every PPL query, also supports inline boolean expressions +- [`fields`](/docs/ppl/commands/fields/) - Select or exclude specific fields from the output +- [PPL Commands](/docs/ppl/commands/) - Full command reference diff --git a/docs/starlight-docs/src/content/docs/ppl/examples.md b/docs/starlight-docs/src/content/docs/ppl/examples.md new file mode 100644 index 00000000..5638e5a7 --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/examples.md @@ -0,0 +1,690 @@ +--- +title: "PPL Observability Examples" +description: "Real-world PPL queries for OpenTelemetry logs, traces, and AI agent observability - with live playground links to try each query instantly." +--- + +import { Tabs, TabItem, Aside } from '@astrojs/starlight/components'; + +These examples use real OpenTelemetry data from the Observability Stack. Each query runs against the live [playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t)) - click "Try in playground" to run any query instantly. 
+ + + +## Index patterns + +The Observability Stack uses these OpenTelemetry index patterns: + +| Signal | Index Pattern | Key Fields | +|--------|--------------|------------| +| **Logs** | `logs-otel-v1*` | `time`, `body`, `severityText`, `severityNumber`, `traceId`, `spanId`, `resource.attributes.service.name` | +| **Traces** | `otel-v1-apm-span-*` | `traceId`, `spanId`, `parentSpanId`, `serviceName`, `name`, `durationInNanos`, `startTime`, `endTime`, `status.code` | +| **Service Map** | `otel-v2-apm-service-map-*` | `serviceName`, `destination.domain`, `destination.resource`, `traceGroupName` | + + + +--- + +## Log investigation + +### Count logs by service + +See which services are generating the most logs. + +```sql +| stats count() as log_count by `resource.attributes.service.name` +| sort - log_count +``` + +Try in playground → + +### Find error and fatal logs + +Filter for high-severity logs across all services. + +```sql +| where severityText = 'ERROR' or severityText = 'FATAL' +| sort - time +``` + +Try in playground → + +### Error rate by service + +Calculate the error percentage for each service. + +```sql +| stats count() as total, + sum(case(severityText = 'ERROR' or severityText = 'FATAL', 1 else 0)) as errors + by `resource.attributes.service.name` +| eval error_rate = round(errors * 100.0 / total, 2) +| sort - error_rate +``` + +Try in playground → + +### Log volume over time + +Time-bucketed log volume - great for spotting traffic spikes. + +```sql +| stats count() as volume by span(time, 5m) as time_bucket +``` + +Try in playground → + +### Severity breakdown by service + +Distribution of log levels per service. + +```sql +| stats count() as cnt by `resource.attributes.service.name`, severityText +| sort `resource.attributes.service.name`, - cnt +``` + +Try in playground → + +### Top log-producing services + +Quick view of the noisiest services. 
+ +```sql +| top 10 `resource.attributes.service.name` +``` + +Try in playground → + +### Discover log patterns + +Automatically cluster similar log messages - no regex required. + +```sql +| patterns body +``` + +Try in playground → + +### Deduplicate logs by service + +Get one representative log per service. + +```sql +| dedup `resource.attributes.service.name` +``` + +Try in playground → + +--- + +## Trace analysis + +### Slowest traces + +Find the operations with the highest latency. + +```sql +source = otel-v1-apm-span-* +| eval duration_ms = durationInNanos / 1000000 +| sort - duration_ms +| head 20 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20eval%20duration_ms%20%3D%20durationInNanos%20%2F%201000000%0A%7C%20sort%20-%20duration_ms%0A%7C%20head%2020')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Error spans + +Find all spans with error status. 
+ +```sql +source = otel-v1-apm-span-* +| where status.code = 2 +| sort - startTime +| head 20 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20where%20status.code%20%3D%202%0A%7C%20sort%20-%20startTime%0A%7C%20head%2020')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Latency percentiles by service + +P50, P95, P99 latency for each service. + +```sql +source = otel-v1-apm-span-* +| stats avg(durationInNanos) as avg_ns, + percentile(durationInNanos, 50) as p50_ns, + percentile(durationInNanos, 95) as p95_ns, + percentile(durationInNanos, 99) as p99_ns, + count() as span_count + by serviceName +| eval p50_ms = round(p50_ns / 1000000, 1), + p95_ms = round(p95_ns / 1000000, 1), + p99_ms = round(p99_ns / 1000000, 1) +| sort - p99_ms +``` + +[Try in 
Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20stats%20avg%28durationInNanos%29%20as%20avg_ns%2C%0A%20%20%20%20%20%20%20%20percentile%28durationInNanos%2C%2050%29%20as%20p50_ns%2C%0A%20%20%20%20%20%20%20%20percentile%28durationInNanos%2C%2095%29%20as%20p95_ns%2C%0A%20%20%20%20%20%20%20%20percentile%28durationInNanos%2C%2099%29%20as%20p99_ns%2C%0A%20%20%20%20%20%20%20%20count%28%29%20as%20span_count%0A%20%20by%20serviceName%0A%7C%20eval%20p50_ms%20%3D%20round%28p50_ns%20%2F%201000000%2C%201%29%2C%0A%20%20%20%20%20%20%20p95_ms%20%3D%20round%28p95_ns%20%2F%201000000%2C%201%29%2C%0A%20%20%20%20%20%20%20p99_ms%20%3D%20round%28p99_ns%20%2F%201000000%2C%201%29%0A%7C%20sort%20-%20p99_ms')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Service error rates + +Error rate calculated from span status codes. 
+ +```sql +source = otel-v1-apm-span-* +| stats count() as total, + sum(case(status.code = 2, 1 else 0)) as errors + by serviceName +| eval error_rate = round(errors * 100.0 / total, 2) +| sort - error_rate +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20stats%20count%28%29%20as%20total%2C%0A%20%20%20%20%20%20%20%20sum%28case%28status.code%20%3D%202%2C%201%20else%200%29%29%20as%20errors%0A%20%20by%20serviceName%0A%7C%20eval%20error_rate%20%3D%20round%28errors%20%2A%20100.0%20%2F%20total%2C%202%29%0A%7C%20sort%20-%20error_rate')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Trace fan-out analysis + +How many spans does each trace produce? High fan-out can indicate N+1 queries or excessive tool calls. 
+ +```sql +source = otel-v1-apm-span-* +| stats count() as span_count by traceId +| sort - span_count +| head 20 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20stats%20count%28%29%20as%20span_count%20by%20traceId%0A%7C%20sort%20-%20span_count%0A%7C%20head%2020')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Operations by service + +What operations does each service perform? + +```sql +source = otel-v1-apm-span-* +| stats count() as invocations, avg(durationInNanos) as avg_latency by serviceName, name +| sort serviceName, - invocations +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20stats%20count%28%29%20as%20invocations%2C%20avg%28durationInNanos%29%20as%20avg_latency%20by%20serviceName%2C%20name%0A%7C%20sort%20serviceName%2C%20-%20invocations')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +--- + +## AI agent observability + +These queries leverage the [OpenTelemetry GenAI Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/) attributes that the Observability Stack captures for AI agent telemetry. 
+ +### GenAI operations breakdown + +See what types of AI operations are occurring. + +```sql +| stats count() as operations by `resource.attributes.service.name`, `attributes.gen_ai.operation.name` +``` + +Try in playground → + +### Token usage by agent + +Track LLM token consumption across agents. + +```sql +source = otel-v1-apm-span-* +| where isnotnull(`attributes.gen_ai.usage.input_tokens`) +| stats sum(`attributes.gen_ai.usage.input_tokens`) as input_tokens, + sum(`attributes.gen_ai.usage.output_tokens`) as output_tokens, + count() as calls + by serviceName +| eval total_tokens = input_tokens + output_tokens +| sort - total_tokens +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20where%20isnotnull%28%60attributes.gen_ai.usage.input_tokens%60%29%0A%7C%20stats%20sum%28%60attributes.gen_ai.usage.input_tokens%60%29%20as%20input_tokens%2C%0A%20%20%20%20%20%20%20%20sum%28%60attributes.gen_ai.usage.output_tokens%60%29%20as%20output_tokens%2C%0A%20%20%20%20%20%20%20%20count%28%29%20as%20calls%0A%20%20by%20serviceName%0A%7C%20eval%20total_tokens%20%3D%20input_tokens%20%2B%20output_tokens%0A%7C%20sort%20-%20total_tokens')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Token usage over time + +Monitor token consumption trends. 
+ +```sql +source = otel-v1-apm-span-* +| where isnotnull(`attributes.gen_ai.usage.input_tokens`) +| stats sum(`attributes.gen_ai.usage.input_tokens`) as input_tokens, + sum(`attributes.gen_ai.usage.output_tokens`) as output_tokens + by span(startTime, 5m) as time_bucket +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20where%20isnotnull%28%60attributes.gen_ai.usage.input_tokens%60%29%0A%7C%20stats%20sum%28%60attributes.gen_ai.usage.input_tokens%60%29%20as%20input_tokens%2C%0A%20%20%20%20%20%20%20%20sum%28%60attributes.gen_ai.usage.output_tokens%60%29%20as%20output_tokens%0A%20%20by%20span%28startTime%2C%205m%29%20as%20time_bucket')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### AI system usage breakdown + +Which AI systems are being used and how often? 
+ +```sql +source = otel-v1-apm-span-* +| where isnotnull(`attributes.gen_ai.system`) +| stats count() as requests, + sum(`attributes.gen_ai.usage.input_tokens`) as input_tokens, + sum(`attributes.gen_ai.usage.output_tokens`) as output_tokens + by `attributes.gen_ai.system` +| sort - requests +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20where%20isnotnull%28%60attributes.gen_ai.system%60%29%0A%7C%20stats%20count%28%29%20as%20requests%2C%0A%20%20%20%20%20%20%20%20sum%28%60attributes.gen_ai.usage.input_tokens%60%29%20as%20input_tokens%2C%0A%20%20%20%20%20%20%20%20sum%28%60attributes.gen_ai.usage.output_tokens%60%29%20as%20output_tokens%0A%20%20by%20%60attributes.gen_ai.system%60%0A%7C%20sort%20-%20requests')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Tool execution analysis + +See which tools agents are calling and their performance. 
+ +```sql +source = otel-v1-apm-span-* +| where `attributes.gen_ai.operation.name` = 'execute_tool' +| stats count() as executions, + avg(durationInNanos) as avg_latency, + max(durationInNanos) as max_latency + by `attributes.gen_ai.tool.name`, serviceName +| eval avg_ms = round(avg_latency / 1000000, 1) +| sort - executions +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20where%20%60attributes.gen_ai.operation.name%60%20%3D%20!%27execute_tool!%27%0A%7C%20stats%20count%28%29%20as%20executions%2C%0A%20%20%20%20%20%20%20%20avg%28durationInNanos%29%20as%20avg_latency%2C%0A%20%20%20%20%20%20%20%20max%28durationInNanos%29%20as%20max_latency%0A%20%20by%20%60attributes.gen_ai.tool.name%60%2C%20serviceName%0A%7C%20eval%20avg_ms%20%3D%20round%28avg_latency%20%2F%201000000%2C%201%29%0A%7C%20sort%20-%20executions')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Agent invocation latency + +End-to-end latency for agent invocations. 
+ +```sql +source = otel-v1-apm-span-* +| where `attributes.gen_ai.operation.name` = 'invoke_agent' +| eval duration_ms = durationInNanos / 1000000 +| stats avg(duration_ms) as avg_ms, + percentile(duration_ms, 95) as p95_ms, + count() as invocations + by serviceName, `attributes.gen_ai.agent.name` +| sort - p95_ms +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20where%20%60attributes.gen_ai.operation.name%60%20%3D%20!%27invoke_agent!%27%0A%7C%20eval%20duration_ms%20%3D%20durationInNanos%20%2F%201000000%0A%7C%20stats%20avg%28duration_ms%29%20as%20avg_ms%2C%0A%20%20%20%20%20%20%20%20percentile%28duration_ms%2C%2095%29%20as%20p95_ms%2C%0A%20%20%20%20%20%20%20%20count%28%29%20as%20invocations%0A%20%20by%20serviceName%2C%20%60attributes.gen_ai.agent.name%60%0A%7C%20sort%20-%20p95_ms')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Failed agent operations + +Find agent operations that resulted in errors. 
+ +```sql +source = otel-v1-apm-span-* +| where isnotnull(`attributes.gen_ai.operation.name`) and status.code = 2 +| sort - startTime +| head 20 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20where%20isnotnull%28%60attributes.gen_ai.operation.name%60%29%20and%20status.code%20%3D%202%0A%7C%20sort%20-%20startTime%0A%7C%20head%2020')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +--- + +## SRE incident response + +### Error rate percentage over time + +Track the overall error rate trend - spot the moment things started going wrong. + +```sql +| stats count() as total, + sum(case(severityText = 'ERROR' or severityText = 'FATAL', 1 else 0)) as errors + by span(time, 5m) as time_bucket +| eval error_pct = round(errors * 100.0 / total, 2) +| sort time_bucket +``` + +Try in playground → + +### First error occurrence per service + +Find when each service first started erroring - pinpoint the origin of an incident. + +```sql +| where severityText = 'ERROR' +| stats earliest(time) as first_seen, count() as total_errors by `resource.attributes.service.name` +| sort first_seen +``` + +Try in playground → + +### Error spike by service (timechart) + +Visualize error spikes per service over time - the Splunk-style `timechart` equivalent. + +```sql +| where severityText = 'ERROR' +| timechart span=5m count() by `resource.attributes.service.name` +``` + +Try in playground → + +### P95 latency timeseries by service + +Track latency degradation over time - the core SRE golden signal. 
+ +```sql +source = otel-v1-apm-span-* +| stats percentile(durationInNanos, 95) as p95_ns by span(startTime, 5m) as time_bucket, serviceName +| eval p95_ms = round(p95_ns / 1000000, 1) +| sort time_bucket +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20stats%20percentile%28durationInNanos%2C%2095%29%20as%20p95_ns%20by%20span%28startTime%2C%205m%29%20as%20time_bucket%2C%20serviceName%0A%7C%20eval%20p95_ms%20%3D%20round%28p95_ns%20%2F%201000000%2C%201%29%0A%7C%20sort%20time_bucket')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Slowest operations by service + +Find the most expensive operations to target for optimization. 
+ +```sql +source = otel-v1-apm-span-* +| stats avg(durationInNanos) as avg_ns, percentile(durationInNanos, 95) as p95_ns, count() as calls by serviceName, name +| eval avg_ms = round(avg_ns / 1000000, 1), p95_ms = round(p95_ns / 1000000, 1) +| sort - p95_ms +| head 20 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20stats%20avg%28durationInNanos%29%20as%20avg_ns%2C%20percentile%28durationInNanos%2C%2095%29%20as%20p95_ns%2C%20count%28%29%20as%20calls%20by%20serviceName%2C%20name%0A%7C%20eval%20avg_ms%20%3D%20round%28avg_ns%20%2F%201000000%2C%201%29%2C%20p95_ms%20%3D%20round%28p95_ns%20%2F%201000000%2C%201%29%0A%7C%20sort%20-%20p95_ms%0A%7C%20head%2020')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +--- + +## Cross-signal correlation + +### Logs for a specific trace + +Jump from a trace to its associated logs using the traceId. Replace `<trace-id>` with the trace ID you are investigating. + +```sql +source = logs-otel-v1* +| where traceId = '<trace-id>' +| sort time +``` + +### Services with both high error logs and slow traces + +Combine log and trace signals to find the most problematic services. 
+ +```sql +source = logs-otel-v1* +| where severityText = 'ERROR' +| stats count() as error_logs by `resource.attributes.service.name` +| where error_logs > 10 +| sort - error_logs +``` + +Try in playground → + +Then investigate trace latency for those services: + +```sql +source = otel-v1-apm-span-* +| where serviceName = '<service-name>' +| stats percentile(durationInNanos, 95) as p95, count() as spans by name +| eval p95_ms = round(p95 / 1000000, 1) +| sort - p95_ms +``` + +--- + +## Dashboard-ready queries + +These queries produce results well-suited for dashboard visualizations. + +### Service health summary (data table) + +```sql +source = otel-v1-apm-span-* +| stats count() as total_spans, + sum(case(status.code = 2, 1 else 0)) as error_spans, + avg(durationInNanos) as avg_latency_ns + by serviceName +| eval error_rate = round(error_spans * 100.0 / total_spans, 2), + avg_latency_ms = round(avg_latency_ns / 1000000, 1) +| sort - error_rate +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20stats%20count%28%29%20as%20total_spans%2C%0A%20%20%20%20%20%20%20%20sum%28case%28status.code%20%3D%202%2C%201%20else%200%29%29%20as%20error_spans%2C%0A%20%20%20%20%20%20%20%20avg%28durationInNanos%29%20as%20avg_latency_ns%0A%20%20by%20serviceName%0A%7C%20eval%20error_rate%20%3D%20round%28error_spans%20%2A%20100.0%20%2F%20total_spans%2C%202%29%2C%0A%20%20%20%20%20%20%20avg_latency_ms%20%3D%20round%28avg_latency_ns%20%2F%201000000%2C%201%29%0A%7C%20sort%20-%20error_rate')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +### Log volume 
heatmap (by service and hour) + +```sql +| eval hour = hour(time) +| stats count() as volume by `resource.attributes.service.name`, hour +| sort `resource.attributes.service.name`, hour +``` + +Try in playground → + +### Top error messages + +```sql +| where severityText = 'ERROR' +| top 20 body +``` + +Try in playground → + +--- + +## Advanced analytics + +### Outlier detection with eventstats + +Use `eventstats` to compute per-group aggregates without collapsing rows, then flag outliers that deviate significantly from their service's baseline. + +```sql +source = otel-v1-apm-span-* +| eventstats avg(durationInNanos) as svc_avg by serviceName +| eval deviation = durationInNanos - svc_avg +| where deviation > svc_avg * 2 +| sort - deviation +| head 20 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20eventstats%20avg%28durationInNanos%29%20as%20svc_avg%20by%20serviceName%0A%7C%20eval%20deviation%20%3D%20durationInNanos%20-%20svc_avg%0A%7C%20where%20deviation%20%3E%20svc_avg%20%2A%202%0A%7C%20sort%20-%20deviation%0A%7C%20head%2020')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +Find spans that take more than 3x the service average -- surface hidden performance outliers that percentile queries miss. + +### Rolling window analysis with streamstats + +Use `streamstats` to compute sliding-window aggregates over ordered events, ideal for detecting real-time latency regressions. 
+ +```sql +source = otel-v1-apm-span-* +| sort startTime +| streamstats window=20 avg(durationInNanos) as rolling_avg by serviceName +| eval current_ms = durationInNanos / 1000000, avg_ms = rolling_avg / 1000000 +| where durationInNanos > rolling_avg * 3 +| head 20 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20sort%20startTime%0A%7C%20streamstats%20window%3D20%20avg%28durationInNanos%29%20as%20rolling_avg%20by%20serviceName%0A%7C%20eval%20current_ms%20%3D%20durationInNanos%20%2F%201000000%2C%20avg_ms%20%3D%20rolling_avg%20%2F%201000000%0A%7C%20where%20durationInNanos%20%3E%20rolling_avg%20%2A%203%0A%7C%20head%2020')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +Flag spans that exceed 3x the rolling 20-span average per service -- catch latency spikes as they happen. + +### Smoothed latency trends with trendline + +Use `trendline` to compute simple moving averages over sorted data, making it easy to spot sustained performance shifts versus momentary noise. 
+ +```sql +source = otel-v1-apm-span-* +| trendline sort startTime sma(5, durationInNanos) as short_trend sma(20, durationInNanos) as long_trend +| eval short_ms = short_trend / 1000000, long_ms = long_trend / 1000000 +| eval trend = if(short_ms > long_ms, 'degrading', 'improving') +| head 50 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20trendline%20sort%20startTime%20sma%285%2C%20durationInNanos%29%20as%20short_trend%20sma%2820%2C%20durationInNanos%29%20as%20long_trend%0A%7C%20eval%20short_ms%20%3D%20short_trend%20%2F%201000000%2C%20long_ms%20%3D%20long_trend%20%2F%201000000%0A%7C%20eval%20trend%20%3D%20if%28short_ms%20%3E%20long_ms%2C%20!%27degrading!%27%2C%20!%27improving!%27%29%0A%7C%20head%2050')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +Compare short-term (5-span) versus long-term (20-span) moving averages to classify whether latency is degrading or improving. + +--- + +## Masterclass pipelines + +These multi-command pipelines combine several PPL features to solve real observability problems in a single query. + +### Service health scorecard + +A complete service health dashboard in one query -- error rates, latency percentiles, and automated health classification. 
+ +```sql +source = otel-v1-apm-span-* +| stats count() as total_spans, + sum(case(status.code = 2, 1 else 0)) as error_spans, + avg(durationInNanos) as avg_latency_ns, + percentile(durationInNanos, 95) as p95_ns, + percentile(durationInNanos, 99) as p99_ns + by serviceName +| eval error_rate = round(error_spans * 100.0 / total_spans, 2), + avg_ms = round(avg_latency_ns / 1000000, 1), + p95_ms = round(p95_ns / 1000000, 1), + p99_ms = round(p99_ns / 1000000, 1), + health = case( + error_rate > 5, 'CRITICAL', + error_rate > 1, 'DEGRADED', + p99_ms > 5000, 'SLOW' + else 'HEALTHY') +| sort - error_rate +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20stats%20count%28%29%20as%20total_spans%2C%0A%20%20%20%20%20%20%20%20sum%28case%28status.code%20%3D%202%2C%201%20else%200%29%29%20as%20error_spans%2C%0A%20%20%20%20%20%20%20%20avg%28durationInNanos%29%20as%20avg_latency_ns%2C%0A%20%20%20%20%20%20%20%20percentile%28durationInNanos%2C%2095%29%20as%20p95_ns%2C%0A%20%20%20%20%20%20%20%20percentile%28durationInNanos%2C%2099%29%20as%20p99_ns%0A%20%20by%20serviceName%0A%7C%20eval%20error_rate%20%3D%20round%28error_spans%20%2A%20100.0%20%2F%20total_spans%2C%202%29%2C%0A%20%20%20%20%20%20%20avg_ms%20%3D%20round%28avg_latency_ns%20%2F%201000000%2C%201%29%2C%0A%20%20%20%20%20%20%20p95_ms%20%3D%20round%28p95_ns%20%2F%201000000%2C%201%29%2C%0A%20%20%20%20%20%20%20p99_ms%20%3D%20round%28p99_ns%20%2F%201000000%2C%201%29%2C%0A%20%20%20%20%20%20%20health%20%3D%20case%28%0A%20%20%20%20%20%20%20%20%20%20%20error_rate%20%3E%205%2C%20!%27CRITICAL!%27%2C%0A%20%20%20%20%20%20%20%20%20%20%20error_rate%20%3E%201%2C%20!%27DEGRADED!%27%2C%0A%20%20%20%20%20%20%20%20%20%20%20p99_ms%20%3E%205000%2C
%20!%27SLOW!%27%0A%20%20%20%20%20%20%20%20%20%20%20else%20!%27HEALTHY!%27%29%0A%7C%20sort%20-%20error_rate')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +Combines `stats`, `eval`, and `case` to produce a single-query health scorecard across all services. Use this as a starting point for service-level dashboards. + +### GenAI agent cost and performance analysis + +Complete GenAI observability: latency, token usage, failure rate, and per-operation breakdown across all AI agents. + +```sql +source = otel-v1-apm-span-* +| where isnotnull(`attributes.gen_ai.operation.name`) +| eval duration_ms = durationInNanos / 1000000, + input_tokens = `attributes.gen_ai.usage.input_tokens`, + output_tokens = `attributes.gen_ai.usage.output_tokens`, + total_tokens = input_tokens + output_tokens +| stats count() as operations, + avg(duration_ms) as avg_latency_ms, + percentile(duration_ms, 95) as p95_ms, + sum(total_tokens) as total_tokens, + sum(case(status.code = 2, 1 else 0)) as failures + by serviceName, `attributes.gen_ai.operation.name`, `attributes.gen_ai.system` +| eval failure_rate = round(failures * 100.0 / operations, 2), + tokens_per_op = round(total_tokens / operations, 0) +| sort - total_tokens +``` + +[Try in 
Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20otel-v1-apm-span-%2A%0A%7C%20where%20isnotnull%28%60attributes.gen_ai.operation.name%60%29%0A%7C%20eval%20duration_ms%20%3D%20durationInNanos%20%2F%201000000%2C%0A%20%20%20%20%20%20%20input_tokens%20%3D%20%60attributes.gen_ai.usage.input_tokens%60%2C%0A%20%20%20%20%20%20%20output_tokens%20%3D%20%60attributes.gen_ai.usage.output_tokens%60%2C%0A%20%20%20%20%20%20%20total_tokens%20%3D%20input_tokens%20%2B%20output_tokens%0A%7C%20stats%20count%28%29%20as%20operations%2C%0A%20%20%20%20%20%20%20%20avg%28duration_ms%29%20as%20avg_latency_ms%2C%0A%20%20%20%20%20%20%20%20percentile%28duration_ms%2C%2095%29%20as%20p95_ms%2C%0A%20%20%20%20%20%20%20%20sum%28total_tokens%29%20as%20total_tokens%2C%0A%20%20%20%20%20%20%20%20sum%28case%28status.code%20%3D%202%2C%201%20else%200%29%29%20as%20failures%0A%20%20by%20serviceName%2C%20%60attributes.gen_ai.operation.name%60%2C%20%60attributes.gen_ai.system%60%0A%7C%20eval%20failure_rate%20%3D%20round%28failures%20%2A%20100.0%20%2F%20operations%2C%202%29%2C%0A%20%20%20%20%20%20%20tokens_per_op%20%3D%20round%28total_tokens%20%2F%20operations%2C%200%29%0A%7C%20sort%20-%20total_tokens')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +Breaks down every GenAI operation by service, operation type, and AI system. Use this to track cost drivers, identify high-failure operations, and compare AI provider performance. + +### Envoy access log analysis + +Parse raw Envoy access logs into an API traffic dashboard -- method, path, and status class breakdown. 
+ +```sql +source = logs-otel-v1* +| where `resource.attributes.service.name` = 'frontend-proxy' +| grok body '\[%{GREEDYDATA:timestamp}\] "%{WORD:method} %{URIPATH:path} HTTP/%{NUMBER}" %{POSINT:status}' +| where isnotnull(method) +| eval status_class = case( + cast(status as int) < 200, '1xx', + cast(status as int) < 300, '2xx', + cast(status as int) < 400, '3xx', + cast(status as int) < 500, '4xx' + else '5xx') +| stats count() as requests by method, path, status_class +| sort - requests +| head 30 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20logs-otel-v1%2A%0A%7C%20where%20%60resource.attributes.service.name%60%20%3D%20!%27frontend-proxy!%27%0A%7C%20grok%20body%20!%27%5C%5B%25%7BGREEDYDATA%3Atimestamp%7D%5C%5D%20%22%25%7BWORD%3Amethod%7D%20%25%7BURIPATH%3Apath%7D%20HTTP%2F%25%7BNUMBER%7D%22%20%25%7BPOSINT%3Astatus%7D!%27%0A%7C%20where%20isnotnull%28method%29%0A%7C%20eval%20status_class%20%3D%20case%28%0A%20%20%20%20%20%20%20cast%28status%20as%20int%29%20%3C%20200%2C%20!%271xx!%27%2C%0A%20%20%20%20%20%20%20cast%28status%20as%20int%29%20%3C%20300%2C%20!%272xx!%27%2C%0A%20%20%20%20%20%20%20cast%28status%20as%20int%29%20%3C%20400%2C%20!%273xx!%27%2C%0A%20%20%20%20%20%20%20cast%28status%20as%20int%29%20%3C%20500%2C%20!%274xx!%27%0A%20%20%20%20%20%20%20else%20!%275xx!%27%29%0A%7C%20stats%20count%28%29%20as%20requests%20by%20method%2C%20path%2C%20status_class%0A%7C%20sort%20-%20requests%0A%7C%20head%2030')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +Uses `grok` to extract structured fields from unstructured proxy logs, then 
aggregates into an API traffic summary. Adapt the grok pattern for other log formats. + +### Automatic error pattern discovery + +Cluster error messages into patterns per service with zero regex -- PPL's killer feature for incident triage. + +```sql +source = logs-otel-v1* +| where severityText = 'ERROR' +| patterns body method=brain mode=aggregation by `resource.attributes.service.name` +| sort - pattern_count +| head 20 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20logs-otel-v1%2A%0A%7C%20where%20severityText%20%3D%20!%27ERROR!%27%0A%7C%20patterns%20body%20method%3Dbrain%20mode%3Daggregation%20by%20%60resource.attributes.service.name%60%0A%7C%20sort%20-%20pattern_count%0A%7C%20head%2020')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +The `patterns` command with `method=brain` uses ML-based clustering to group similar error messages. During an incident, run this first to see the shape of the problem without writing a single regex. + +### Cross-signal correlation: logs meet traces + +Correlate error logs with error spans across indices -- find which trace operations cause which log errors. 
+ +```sql +source = logs-otel-v1* +| where severityText = 'ERROR' +| where traceId != '' +| left join left=l right=r on l.traceId = r.traceId [ + source = otel-v1-apm-span-* + | where status.code = 2 + | eval span_duration_ms = durationInNanos / 1000000 + | sort - span_duration_ms + | head 1000 + ] +| where isnotnull(r.serviceName) +| stats count() as correlated_errors by l.`resource.attributes.service.name`, r.serviceName, r.name +| sort - correlated_errors +| head 20 +``` + +[Try in Playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'source%20%3D%20logs-otel-v1%2A%0A%7C%20where%20severityText%20%3D%20!%27ERROR!%27%0A%7C%20where%20traceId%20%21%3D%20!%27!%27%0A%7C%20left%20join%20left%3Dl%20right%3Dr%20on%20l.traceId%20%3D%20r.traceId%20%5B%0A%20%20%20%20source%20%3D%20otel-v1-apm-span-%2A%0A%20%20%20%20%7C%20where%20status.code%20%3D%202%0A%20%20%20%20%7C%20eval%20span_duration_ms%20%3D%20durationInNanos%20%2F%201000000%0A%20%20%20%20%7C%20sort%20-%20span_duration_ms%0A%20%20%20%20%7C%20head%201000%0A%20%20%5D%0A%7C%20where%20isnotnull%28r.serviceName%29%0A%7C%20stats%20count%28%29%20as%20correlated_errors%20by%20l.%60resource.attributes.service.name%60%2C%20r.serviceName%2C%20r.name%0A%7C%20sort%20-%20correlated_errors%0A%7C%20head%2020')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) + +Joins error logs with error spans using `traceId` to reveal which span operations are responsible for which log errors. This cross-index join is one of PPL's most powerful capabilities for root cause analysis. 
+ +--- + +## Query tips + +### Backtick field names with dots + +OpenTelemetry attributes contain dots. Wrap them in backticks: + +```sql +| where isnotnull(`resource.attributes.service.name`) +``` + +### Combine stats with eval for computed metrics + +```sql +| stats count() as total, sum(case(severityText = 'ERROR', 1 else 0)) as errors by service +| eval error_pct = round(errors * 100.0 / total, 2) +``` + +### Use span() for time bucketing + +```sql +| stats count() by span(time, 1m) as minute +``` + +### Use head to limit during exploration + +Always add `| head` while exploring to avoid scanning all data: + +```sql +| where severityText = 'ERROR' +| head 50 +``` + +### Sort with - for descending + +```sql +| sort - durationInNanos +``` + +## Further reading + +- **[PPL Language Overview](/docs/ppl/)** - Why PPL and how it compares +- **[Command Reference](/docs/ppl/commands/)** - Full syntax for all commands +- **[Function Reference](/docs/ppl/functions/)** - 200+ built-in functions +- **[Discover Logs](/docs/investigate/discover-logs/)** - Using PPL in the Logs UI +- **[Discover Traces](/docs/investigate/discover-traces/)** - Using PPL in the Traces UI diff --git a/docs/starlight-docs/src/content/docs/ppl/functions.md b/docs/starlight-docs/src/content/docs/ppl/functions.md new file mode 100644 index 00000000..255c592a --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/functions.md @@ -0,0 +1,351 @@ +--- +title: "PPL Function Reference" +description: "Complete reference for PPL built-in functions - aggregations, string manipulation, date/time, math, conditionals, JSON, IP, collections, and more." +--- + +import { Aside } from '@astrojs/starlight/components'; + +PPL includes **200+ built-in functions** across 13 categories. Functions are used within commands like `eval`, `where`, `stats`, and `fields` to transform, filter, and aggregate data. 
+ +## Aggregation functions + +Used with `stats`, `eventstats`, and `streamstats` to calculate summary values across rows. + +| Function | Description | +|----------|-------------| +| `count()` | Count the number of values | +| `sum(<expr>)` | Sum of expression values | +| `avg(<expr>)` | Average (mean) value | +| `max(<expr>)` | Maximum value | +| `min(<expr>)` | Minimum value | +| `var_samp(<expr>)` | Sample variance | +| `var_pop(<expr>)` | Population variance | +| `stddev_samp(<expr>)` | Sample standard deviation | +| `stddev_pop(<expr>)` | Population standard deviation | +| `distinct_count(<expr>)` | Approximate distinct count (HyperLogLog++) | +| `percentile(<expr>, <percent>)` | Approximate percentile at given percentage | +| `median(<expr>)` | Median (50th percentile) | +| `first(<field>)` | First non-null value | +| `last(<field>)` | Last non-null value | +| `earliest(<field>)` | Earliest value by timestamp | +| `latest(<field>)` | Latest value by timestamp | +| `take(<field>, <size>)` | Collect up to N original values | +| `list(<expr>)` | Collect all values into array (with duplicates) | +| `values(<expr>)` | Collect all unique values into sorted array | + +**Example - Error rate with percentile latency:** +```sql +source = otel-v1-apm-span-* +| stats count() as total, + sum(case(status.code = 2, 1 else 0)) as errors, + percentile(durationInNanos, 95) as p95_latency, + percentile(durationInNanos, 99) as p99_latency + by serviceName +``` + +--- + +## Condition functions + +Conditional logic and null handling. 
+ +| Function | Description | +|----------|-------------| +| `isnull(<field>)` | Returns true if field is null | +| `isnotnull(<field>)` | Returns true if field is not null | +| `ifnull(<field>, <default>)` | Returns default if field is null | +| `nullif(<expr1>, <expr2>)` | Returns null if expressions are equal | +| `if(<condition>, <then>, <else>)` | Conditional expression | +| `case(<condition>, <value>, ..., else <default>)` | Multi-branch conditional | +| `coalesce(<expr1>, <expr2>, ...)` | First non-null value from list | +| `isblank(<field>)` | True if null, empty, or whitespace-only | +| `isempty(<field>)` | True if null or empty string | +| `contains(<field>, <substring>)` | True if field contains substring (case-insensitive) | +| `regexp_match(<field>, <pattern>)` | True if regex matches | + +**Example - Categorize log severity:** +```sql +| eval severity_group = case( + severityNumber >= 17, 'error', + severityNumber >= 9, 'warning', + else 'info' + ) +| stats count() as log_count by severity_group +``` + +Try in playground → + +**Example - Safe division with null handling:** +```sql +| stats count() as total, sum(case(severityText = 'ERROR', 1 else 0)) as errors + by `resource.attributes.service.name` +| eval error_rate = if(total > 0, errors * 100.0 / total, 0) +``` + +Try in playground → + +--- + +## String functions + +Text manipulation and pattern matching. 
+ +| Function | Description | +|----------|-------------| +| `concat(<str1>, <str2>, ...)` | Concatenate up to 9 strings | +| `concat_ws(<sep>, <str1>, ...)` | Concatenate with separator | +| `length(<str>)` | String length in bytes | +| `lower(<str>)` | Convert to lowercase | +| `upper(<str>)` | Convert to uppercase | +| `trim(<str>)` | Remove leading and trailing spaces | +| `ltrim(<str>)` | Remove leading spaces | +| `rtrim(<str>)` | Remove trailing spaces | +| `substring(<str>, <start>, <length>)` | Extract substring | +| `replace(<str>, <pattern>, <replacement>)` | Replace occurrences (supports regex) | +| `regexp_replace(<str>, <pattern>, <replacement>)` | Regex-based replacement | +| `locate(<substr>, <str>)` | Position of first occurrence | +| `position(<substr> IN <str>)` | Position of first occurrence | +| `reverse(<str>)` | Reverse a string | +| `right(<str>, <n>)` | Last N characters | +| `like(<str>, <pattern>)` | Wildcard pattern match (`%`, `_`) | +| `ilike(<str>, <pattern>)` | Case-insensitive wildcard match | + +**Example - Extract service name prefix:** +```sql +| eval service_prefix = substring(`resource.attributes.service.name`, 0, locate('-', `resource.attributes.service.name`)) +| stats count() by service_prefix +``` + +Try in playground → + +--- + +## Date and time functions + +Date arithmetic, extraction, formatting, and conversion. All operations use UTC. 
+ +| Function | Description | +|----------|-------------| +| `now()` | Current date and time | +| `curdate()` / `current_date()` | Current date | +| `curtime()` / `current_time()` | Current time | +| `date(<expr>)` | Create DATE from string | +| `time(<expr>)` | Create TIME from string | +| `timestamp(<expr>)` | Create TIMESTAMP from string | +| `date_add(<date>, INTERVAL <n> <unit>)` | Add interval to date | +| `date_sub(<date>, INTERVAL <n> <unit>)` | Subtract interval from date | +| `datediff(<date1>, <date2>)` | Difference in days | +| `timestampdiff(<unit>, <start>, <end>)` | Difference in specified units | +| `date_format(<date>, <format>)` | Format date as string | +| `str_to_date(<str>, <format>)` | Parse string to date | +| `year(<date>)` | Extract year | +| `month(<date>)` | Extract month | +| `day(<date>)` / `dayofmonth(<date>)` | Extract day of month | +| `hour(<time>)` | Extract hour | +| `minute(<time>)` | Extract minute | +| `second(<time>)` | Extract second | +| `dayofweek(<date>)` | Day of week (1=Sunday) | +| `dayofyear(<date>)` | Day of year | +| `week(<date>)` | Week number | +| `quarter(<date>)` | Quarter of year | +| `unix_timestamp(<date>)` | Convert to Unix timestamp | +| `from_unixtime(<epoch>)` | Convert Unix timestamp to date | +| `last_day(<date>)` | Last day of month | +| `extract(<part> FROM <date>)` | Extract date part | + +**Example - Log volume by hour of day:** +```sql +| eval hour = hour(time) +| stats count() as volume by hour +| sort hour +``` + +Try in playground → + +--- + +## Math functions + +Numeric operations and mathematical calculations. 
+ +| Function | Description | +|----------|-------------| +| `abs(<x>)` | Absolute value | +| `ceil(<x>)` / `ceiling(<x>)` | Ceiling (round up) | +| `floor(<x>)` | Floor (round down) | +| `round(<x>, <decimals>)` | Round to decimal places | +| `sqrt(<x>)` | Square root | +| `cbrt(<x>)` | Cube root | +| `pow(<base>, <exponent>)` / `power(...)` | Exponentiation | +| `exp(<x>)` | e raised to power | +| `ln(<x>)` | Natural logarithm | +| `log(<x>)` | Natural logarithm | +| `log2(<x>)` | Base-2 logarithm | +| `log10(<x>)` | Base-10 logarithm | +| `mod(<n>, <m>)` | Modulo (remainder) | +| `sign(<x>)` | Sign of value (-1, 0, 1) | +| `rand()` | Random float [0, 1) | +| `pi()` | Pi constant | +| `e()` | Euler's number | +| `sin(<x>)`, `cos(<x>)`, `tan(<x>)` | Trigonometric functions | +| `asin(<x>)`, `acos(<x>)`, `atan(<x>)` | Inverse trigonometric | +| `degrees(<x>)` | Radians to degrees | +| `radians(<x>)` | Degrees to radians | +| `conv(<value>, <from_base>, <to_base>)` | Base conversion | +| `crc32(<str>)` | CRC32 checksum | + +**Example - Convert nanoseconds to milliseconds and round:** +```sql +source = otel-v1-apm-span-* +| eval duration_ms = round(durationInNanos / 1000000.0, 2) +| sort - duration_ms +| head 20 +``` + +--- + +## Collection functions + +Create, manipulate, and analyze arrays and multivalue fields. 
+ +| Function | Description | +|----------|-------------| +| `array(<value1>, <value2>, ...)` | Create an array | +| `array_length(<array>)` | Length of array | +| `forall(<array>, <lambda>)` | True if all elements satisfy condition | +| `exists(<array>, <lambda>)` | True if any element satisfies condition | +| `filter(<array>, <lambda>)` | Filter array elements by condition | +| `transform(<array>, <lambda>)` | Transform each element | +| `reduce(<array>, <initial>, <lambda>)` | Reduce array to single value | +| `split(<str>, <delimiter>)` | Split string into array | +| `mvjoin(<array>, <delimiter>)` | Join array into string | +| `mvappend(<array1>, <array2>, ...)` | Concatenate arrays | +| `mvdedup(<array>)` | Remove duplicate array values | +| `mvfind(<array>, <regex>)` | Find first matching element index | +| `mvindex(<array>, <start>, <end>)` | Slice array by index | +| `mvmap(<array>, <expr>)` | Map expression over array | +| `mvzip(<array1>, <array2>, <delimiter>)` | Zip two arrays element-wise | + +--- + +## JSON functions + +Parse, create, and manipulate JSON data. + +| Function | Description | +|----------|-------------| +| `json(<str>)` | Validate and parse JSON string | +| `json_valid(<str>)` | Check if string is valid JSON | +| `json_object(<key>, <value>, ...)` | Create JSON object | +| `json_array(<value1>, <value2>, ...)` | Create JSON array | +| `json_array_length(<json>)` | Count array elements | +| `json_extract(<json>, <path>...)` | Extract values by path | +| `json_delete(<json>, <path>...)` | Delete values by path | +| `json_set(<json>, <path>, <value>)` | Set value at path | +| `json_append(<json>, <path>, <value>)` | Append to array at path | +| `json_keys(<json>)` | Get object keys | + +**Example - Parse JSON from log body:** +```sql +| where json_valid(body) +| eval parsed = json_extract(body, '$.error.type') +| where isnotnull(parsed) +| stats count() by parsed +``` + +Try in playground → + +--- + +## IP address functions + +IP matching and geolocation. 
+ +| Function | Description | +|----------|-------------| +| `cidrmatch(<ip>, <cidr>)` | Check if IP is within CIDR range | +| `geoip(<ip>)` | Look up IP geolocation | + +**Example - Exclude internal IPs:** +```sql +| where not cidrmatch(client_ip, '10.0.0.0/8') + and not cidrmatch(client_ip, '172.16.0.0/12') +``` + +--- + +## Cryptographic functions + +Hashing for data integrity and anonymization. + +| Function | Description | +|----------|-------------| +| `md5(<str>)` | MD5 hash (32-char hex) | +| `sha1(<str>)` | SHA-1 hash | +| `sha2(<str>, <bits>)` | SHA-2 hash (224, 256, 384, 512) | + +--- + +## Relevance functions + +Full-text search using the OpenSearch query engine. + +| Function | Description | +|----------|-------------| +| `match(<field>, <query>)` | Full-text match query | +| `match_phrase(<field>, <phrase>)` | Exact phrase match | +| `match_phrase_prefix(<field>, <prefix>)` | Phrase prefix match | +| `multi_match(<fields>, <query>)` | Search across multiple fields | +| `simple_query_string(<fields>, <query>)` | Flexible query string | +| `query_string(<fields>, <query>)` | Full query string syntax | + +**Example - Full-text search in log bodies:** +```sql +| where match(body, 'connection timeout') +| head 20 +``` + +Try in playground → + +--- + +## Type conversion functions + +Convert between data types. + +| Function | Description | +|----------|-------------| +| `cast(<expr> AS <type>)` | Cast to specified type | +| `tostring(<value>, <format>)` | Convert to string (formats: binary, hex, commas, duration) | +| `tonumber(<str>, <base>)` | Convert string to number (base 2-36) | + +**Example:** +```sql +| eval duration_str = tostring(durationInNanos / 1000000, 'commas') +``` + +--- + +## System functions + +Utilities for type inspection and diagnostics. 
+ +| Function | Description | +|----------|-------------| +| `typeof(<expr>)` | Returns the data type of an expression | + +**Example - Inspect field types:** +```sql +| eval body_type = typeof(body), severity_type = typeof(severityNumber) +| head 1 +``` + +Try in playground → + +--- + +## Further reading + +- **[Command Reference](/docs/ppl/commands/)** - All 50+ PPL commands +- **[Observability Examples](/docs/ppl/examples/)** - Real-world OTel queries +- **[PPL function source docs](https://github.com/opensearch-project/sql/tree/main/docs/user/ppl/functions)** - Detailed parameter docs for every function diff --git a/docs/starlight-docs/src/content/docs/ppl/index.md b/docs/starlight-docs/src/content/docs/ppl/index.md new file mode 100644 index 00000000..463ec7be --- /dev/null +++ b/docs/starlight-docs/src/content/docs/ppl/index.md @@ -0,0 +1,143 @@ +--- +title: "Piped Processing Language (PPL)" +description: "PPL is the native query language for OpenSearch Observability - a pipe-based, human-readable language for exploring logs, traces, and telemetry at scale." +--- + +Piped Processing Language (PPL) is the **native query language** of the OpenSearch Observability Stack. Every log query, every trace investigation, and every pattern analysis flows through PPL - a pipe-delimited language designed for the way operators and engineers actually think about data. + + + +**Try PPL now in the live playground →** + + + +```sql +source = logs-otel-v1* +| where severityNumber >= 17 +| stats count() as errors by `resource.attributes.service.name` +| sort - errors +``` + +Try in playground → + +That query finds every error log, counts them by service, and sorts the services by error count, highest first - all in four lines you can read aloud. No JSON nesting, no callback syntax, no query DSL to memorize. + +## Why PPL? + +### Think in pipelines, not in trees + +PPL follows the natural mental model of data investigation: start with a data source, progressively filter, transform, and aggregate. 
Each pipe (`|`) represents a single, composable operation. Reading a PPL query from top to bottom tells you exactly what happens at every step. + +```sql +source = otel-v1-apm-span-* +| where serviceName = 'frontend' +| where durationInNanos > 2000000000 +| stats avg(durationInNanos) as avg_latency by name +| sort - avg_latency +| head 10 +``` + +### One language across logs and traces + +Unlike platforms that require different query dialects for different signal types, PPL works across **both logs and traces** in the Observability Stack. The same syntax, the same commands, the same muscle memory - whether you are triaging an incident in logs or profiling latency in traces. + +| Signal | Query Language | Index Pattern | +|---------|---------------|-------------------------| +| Logs | **PPL** | `logs-otel-v1*` | +| Traces | **PPL** | `otel-v1-apm-span-*` | +| Metrics | PromQL | Prometheus time-series | + +### Designed for observability at scale + +PPL is not a general-purpose query language bolted onto a search engine. 
It was designed from the ground up for the workflows observability engineers perform daily: + +- **Pattern discovery** - the `patterns` command automatically extracts log patterns and clusters similar entries, replacing hours of manual regex work +- **Field extraction on the fly** - `parse`, `grok`, and `rex` let you extract structured fields from unstructured log text without re-indexing +- **Statistical analysis** - `stats`, `eventstats`, `streamstats`, and `trendline` cover everything from simple counts to rolling window calculations +- **Deduplication and ranking** - `dedup`, `top`, and `rare` surface the signal in noisy data +- **Machine learning built in** - `ml` and `kmeans` run anomaly detection and clustering directly in your query pipeline +- **Join and correlate** - `join`, `lookup`, `append`, and `subquery` combine data from multiple indices for cross-signal investigation + +### 50+ commands, 200+ functions + +PPL provides a comprehensive command set that covers the full spectrum of data exploration: + +| Category | Commands | +|----------|----------| +| **Search & filter** | `search`, `where`, `regex`, `subquery` | +| **Field selection** | `fields`, `table`, `rename`, `eval` | +| **Aggregation** | `stats`, `eventstats`, `streamstats`, `chart`, `timechart`, `bin` | +| **Sorting & limiting** | `sort`, `head`, `reverse` | +| **Dedup & ranking** | `dedup`, `top`, `rare` | +| **Text extraction** | `parse`, `grok`, `rex`, `spath`, `patterns` | +| **Data enrichment** | `join`, `lookup`, `append`, `appendcol`, `multisearch` | +| **Transformation** | `fillnull`, `expand`, `flatten`, `replace`, `convert` | +| **Trend & ML** | `trendline`, `ml`, `kmeans` | +| **Metadata** | `describe`, `explain`, `showdatasources` | + +For the full command reference, see [Commands](/docs/ppl/commands/). For function reference, see [Functions](/docs/ppl/functions/). 
+ +## PPL in the Observability Stack + +PPL is woven into every investigation surface of the stack: + +### Discover + +The **Discover** interface for [Logs](/docs/investigate/discover-logs/) and [Traces](/docs/investigate/discover-traces/) uses PPL as its primary query language. Type PPL directly in the query bar, with autocomplete for field names, commands, and functions. + +### Claude Code plugin + +The [Claude Code observability plugin](/docs/claude-code/) generates PPL queries from natural language. Ask "show me the slowest traces from the frontend service" and the plugin produces the PPL query, runs it, and returns results - all powered by PPL templates. + +### Alerting and anomaly detection + +PPL queries can drive [alerts](/docs/alerting/) and [anomaly detection](/docs/anomaly-detection/) monitors. Define alert conditions using the same query language you use for ad-hoc investigation. + +### Dashboards + +PPL query results can be [saved to dashboards](/docs/dashboards/) as live visualizations - line charts, bar charts, heatmaps, and more - all driven by PPL. + +## Getting started with PPL + +The fastest way to learn PPL is to use it. 
Open the [live playground](https://observability.playground.opensearch.org/w/19jD-R/app/explore/logs/#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-6h,to:now))&_q=(dataset:(id:d1f424b0-2655-11f1-8baa-d5b726b04d73,timeFieldName:time,title:'logs-otel-v1*',type:INDEX_PATTERN),language:PPL,query:'')&_a=(legacy:(columns:!(body,severityText,resource.attributes.service.name),interval:auto,isDirty:!f,sort:!()),tab:(logs:(),patterns:(usingRegexPatterns:!f)),ui:(activeTabId:logs,showHistogram:!t))) and try these queries against live OpenTelemetry data: + +**Count logs by service:** +```sql +| stats count() as log_count by `resource.attributes.service.name` +``` +Try in playground → + +**Find error logs:** +```sql +| where severityText = 'ERROR' or severityText = 'FATAL' +``` +Try in playground → + +**GenAI operations breakdown:** +```sql +| stats count() as operations by `resource.attributes.service.name`, `attributes.gen_ai.operation.name` +``` +Try in playground → + +## How PPL compares + +PPL belongs to the family of pipe-based query languages used in modern observability platforms. 
If you have experience with Kusto Query Language (KQL) or Elastic Event Query Language (EQL), PPL will feel immediately familiar - with key advantages: + +| Capability | PPL | KQL (Kusto) | EQL (Elastic) | +|-----------|-----|-------------|----------------| +| Pipe-based syntax | Yes | Yes | Limited | +| Log pattern discovery | Built-in (`patterns`) | Requires external tooling | No | +| Machine learning in-query | `ml`, `kmeans` | Separate service | Separate service | +| Field extraction on-the-fly | `parse`, `grok`, `rex` | `parse`, `extract` | Limited | +| Join across indices | `join`, `lookup`, `subquery` | `join`, `lookup` | Sequence only | +| Rolling statistics | `streamstats`, `trendline` | `scan`, window functions | No | +| Open source | Fully open source (Apache 2.0) | Proprietary | Proprietary (SSPL) | +| OpenTelemetry-native | First-class OTel support | Via connector | Via integration | + +## Next steps + +- **[Command Reference](/docs/ppl/commands/)** - Syntax and examples for all 50+ PPL commands +- **[Function Reference](/docs/ppl/functions/)** - 200+ built-in functions across 13 categories +- **[Observability Examples](/docs/ppl/examples/)** - Real-world PPL queries for OTel logs, traces, and AI agent data +- **[Discover Logs](/docs/investigate/discover-logs/)** - Using PPL in the Logs Discover interface +- **[Discover Traces](/docs/investigate/discover-traces/)** - Using PPL in the Traces Discover interface diff --git a/docs/starlight-docs/src/content/docs/send-data/ai-agents/index.md b/docs/starlight-docs/src/content/docs/send-data/ai-agents/index.md index 78bcf7c8..1f723b1a 100644 --- a/docs/starlight-docs/src/content/docs/send-data/ai-agents/index.md +++ b/docs/starlight-docs/src/content/docs/send-data/ai-agents/index.md @@ -3,7 +3,7 @@ title: "AI Agents" description: "Instrument AI agent applications with the GenAI Observability SDKs" --- -The GenAI Observability SDKs provide purpose-built instrumentation for AI agent applications. 
They handle the gap between general-purpose OpenTelemetry and what agent developers actually need: tracing orchestration logic, capturing GenAI semantic attributes, and scoring agent quality — all through the standard OTLP pipeline. +The GenAI Observability SDKs provide purpose-built instrumentation for AI agent applications. They handle the gap between general-purpose OpenTelemetry and what agent developers actually need: tracing orchestration logic, capturing GenAI semantic attributes, and scoring agent quality - all through the standard OTLP pipeline. ## Why use the SDK? @@ -11,12 +11,12 @@ General OTel instrumentation (covered in the [Python](/docs/send-data/applicatio The GenAI SDKs add: -- **One-line OTEL setup** — `register()` configures the tracer provider, exporter, and auto-instrumentation in one call -- **`@observe` decorator** — trace agents, tools, and LLM calls with GenAI semantic convention attributes automatically -- **`enrich()`** — set model, tokens, provider, and other GenAI attributes on the active span without manual `set_attribute()` calls -- **Auto-instrumentation** — OpenAI, Anthropic, Bedrock, LangChain, and 20+ libraries traced with zero code changes -- **Evaluation scoring** — `score()` attaches quality metrics to traces through the same OTLP pipeline -- **AWS SigV4** — production-ready signing for OpenSearch Ingestion and OpenSearch Service +- **One-line OTEL setup** - `register()` configures the tracer provider, exporter, and auto-instrumentation in one call +- **`@observe` decorator** - trace agents, tools, and LLM calls with GenAI semantic convention attributes automatically +- **`enrich()`** - set model, tokens, provider, and other GenAI attributes on the active span without manual `set_attribute()` calls +- **Auto-instrumentation** - OpenAI, Anthropic, Bedrock, LangChain, and 20+ libraries traced with zero code changes +- **Evaluation scoring** - `score()` attaches quality metrics to traces through the same OTLP pipeline +- **AWS 
SigV4** - production-ready signing for OpenSearch Ingestion and OpenSearch Service ## Available SDKs @@ -43,10 +43,10 @@ def agent(question: str) -> str: return summarize(results) ``` -This produces a trace with `gen_ai.operation.name`, `gen_ai.agent.name`, `gen_ai.tool.name`, input/output capture, and token usage — all with standard OTel semantic conventions. +This produces a trace with `gen_ai.operation.name`, `gen_ai.agent.name`, `gen_ai.tool.name`, input/output capture, and token usage - all with standard OTel semantic conventions. ## Next steps -- [Python SDK reference](/docs/send-data/ai-agents/python/) — full API documentation -- [AI Observability — Getting Started](/docs/ai-observability/getting-started/) — end-to-end walkthrough from install to seeing traces -- [Evaluation & Scoring](/docs/ai-observability/evaluation/) — score traces, run experiments, compare agent versions +- [Python SDK reference](/docs/send-data/ai-agents/python/) - full API documentation +- [AI Observability - Getting Started](/docs/ai-observability/getting-started/) - end-to-end walkthrough from install to seeing traces +- [Evaluation & Scoring](/docs/ai-observability/evaluation/) - score traces, run experiments, compare agent versions diff --git a/docs/starlight-docs/src/content/docs/send-data/ai-agents/integrations.mdx b/docs/starlight-docs/src/content/docs/send-data/ai-agents/integrations.mdx index abdc077a..a1b20e2a 100644 --- a/docs/starlight-docs/src/content/docs/send-data/ai-agents/integrations.mdx +++ b/docs/starlight-docs/src/content/docs/send-data/ai-agents/integrations.mdx @@ -9,7 +9,7 @@ Each example below shows a complete, runnable integration. All examples assume t ## Strands Agents -[Strands Agents](https://github.com/strands-agents/sdk-python) is a model-driven agent framework from AWS. The SDK's auto-instrumentation captures Bedrock/OpenAI calls automatically — add `@observe` for agent-level visibility. 
+[Strands Agents](https://github.com/strands-agents/sdk-python) is a model-driven agent framework from AWS. The SDK's auto-instrumentation captures Bedrock/OpenAI calls automatically - add `@observe` for agent-level visibility. ```bash pip install "opensearch-genai-observability-sdk-py[bedrock]" strands-agents strands-agents-bedrock @@ -61,7 +61,7 @@ from strands import Agent from strands.models.bedrock import BedrockModel from strands.telemetry import StrandsTelemetry -# Initialize Strands' built-in telemetry — automatically creates spans for: +# Initialize Strands' built-in telemetry - automatically creates spans for: # invoke_agent, execute_tool, chat (LLM calls) telemetry = StrandsTelemetry() @@ -73,7 +73,7 @@ telemetry.tracer_provider.add_span_processor(BatchSpanProcessor(exporter)) @observe(op=Op.EXECUTE_TOOL) def fetch_hotel_ratings(city: str) -> str: """Fetch hotel ratings from an external API.""" - # Custom API call — not covered by Strands auto-instrumentation + # Custom API call - not covered by Strands auto-instrumentation return f"4.5 stars average in {city}" model = BedrockModel(model_id="us.anthropic.claude-sonnet-4-20250514-v1:0") @@ -265,6 +265,6 @@ run("What is OpenSearch?") ## Related links -- [Python SDK reference](/docs/send-data/ai-agents/python/) — full API documentation for `register`, `observe`, `enrich` -- [Auto-instrumentation](/docs/send-data/ai-agents/python/#auto-instrumentation) — supported providers and extras -- [Evaluation & Scoring](/docs/ai-observability/evaluation/) — score and evaluate your agent outputs +- [Python SDK reference](/docs/send-data/ai-agents/python/) - full API documentation for `register`, `observe`, `enrich` +- [Auto-instrumentation](/docs/send-data/ai-agents/python/#auto-instrumentation) - supported providers and extras +- [Evaluation & Scoring](/docs/ai-observability/evaluation/) - score and evaluate your agent outputs diff --git a/docs/starlight-docs/src/content/docs/send-data/ai-agents/python.mdx 
b/docs/starlight-docs/src/content/docs/send-data/ai-agents/python.mdx index acffb8e2..40f81712 100644 --- a/docs/starlight-docs/src/content/docs/send-data/ai-agents/python.mdx +++ b/docs/starlight-docs/src/content/docs/send-data/ai-agents/python.mdx @@ -1,6 +1,6 @@ --- title: "Python SDK" -description: "Reference for opensearch-genai-observability-sdk-py — instrument AI agent applications with OpenTelemetry" +description: "Reference for opensearch-genai-observability-sdk-py - instrument AI agent applications with OpenTelemetry" --- import { Aside } from '@astrojs/starlight/components'; @@ -42,10 +42,10 @@ The SDK exports these functions and classes. This page covers the instrumentatio | `enrich()` | Set GenAI attributes on active span | [This page](#enrich) | | `score()` | Attach evaluation scores to traces | [This page](#score) | | `AWSSigV4OTLPExporter` | SigV4-signed OTLP exporter | [This page](#aws-authentication) | -| `evaluate()` | Run agent against dataset with scorers | [Evaluation & Scoring](/docs/ai-observability/evaluation/#evaluate--run-experiments) | -| `Experiment` | Upload pre-computed eval results | [Evaluation & Scoring](/docs/ai-observability/evaluation/#experiment--upload-pre-computed-results) | +| `evaluate()` | Run agent against dataset with scorers | [Evaluation & Scoring](/docs/ai-observability/evaluation/#evaluate---run-experiments) | +| `Experiment` | Upload pre-computed eval results | [Evaluation & Scoring](/docs/ai-observability/evaluation/#experiment---upload-pre-computed-results) | | `EvalScore` | Scorer return type | [Evaluation & Scoring](/docs/ai-observability/evaluation/#evalscore-dataclass) | -| `OpenSearchTraceRetriever` | Query stored traces from OpenSearch | [Evaluation & Scoring](/docs/ai-observability/evaluation/#opensearchtraceretriever--query-stored-traces) | +| `OpenSearchTraceRetriever` | Query stored traces from OpenSearch | [Evaluation & 
Scoring](/docs/ai-observability/evaluation/#opensearchtraceretriever---query-stored-traces) | ## Quick start @@ -135,17 +135,17 @@ The unified tracing primitive. Works as a **decorator** (sync, async, generator, ### Usage forms ```python -# Bare decorator — span name = function qualname +# Bare decorator - span name = function qualname @observe def my_function(): ... -# Parameterized — set operation type, name, span kind +# Parameterized - set operation type, name, span kind @observe(name="weather_agent", op=Op.INVOKE_AGENT) def run_agent(query: str) -> str: ... -# Context manager — for inline tracing blocks +# Context manager - for inline tracing blocks with observe("llm_call", op=Op.CHAT) as span: response = llm.chat(messages) ``` @@ -181,7 +181,7 @@ In decorator mode, `observe()` automatically: - **Captures input** as `gen_ai.input.messages` (or `gen_ai.tool.call.arguments` for tools). Skips `self`/`cls`. - **Captures output** as `gen_ai.output.messages` (or `gen_ai.tool.call.result` for tools). Won't overwrite if already set. - **Records errors** as span status `ERROR` with an exception event. -- **Sets entity attributes** — `gen_ai.agent.name` for agents, `gen_ai.tool.name` + `gen_ai.tool.type="function"` for tools. +- **Sets entity attributes** - `gen_ai.agent.name` for agents, `gen_ai.tool.name` + `gen_ai.tool.type="function"` for tools. All values truncated at 10,000 characters. @@ -189,8 +189,8 @@ All values truncated at 10,000 characters. ```mermaid flowchart TD - A["@observe op=INVOKE_AGENT"] --> B["@observe op=CHAT — LLM call"] - A --> C["@observe op=EXECUTE_TOOL — tool call"] + A["@observe op=INVOKE_AGENT"] --> B["@observe op=CHAT - LLM call"] + A --> C["@observe op=EXECUTE_TOOL - tool call"] C --> D["LLM call (auto-instrumented)"] ``` @@ -261,7 +261,7 @@ All parameters are optional. Only provided values are set. ## Auto-instrumentation -`register()` discovers and activates installed instrumentor packages. 
Install the extra for your provider — no code changes needed. +`register()` discovers and activates installed instrumentor packages. Install the extra for your provider - no code changes needed. | Provider / framework | Extra | |---|---| @@ -285,7 +285,7 @@ register(auto_instrument=False) # to disable ## `score()` -Attaches an evaluation score to a trace or span. Scores are emitted as OTEL spans through the same OTLP pipeline — no separate client or index needed. +Attaches an evaluation score to a trace or span. Scores are emitted as OTEL spans through the same OTLP pipeline - no separate client or index needed. ```python from opensearch_genai_observability_sdk_py import score @@ -351,8 +351,8 @@ SigV4 + gRPC is not supported. Use `https://` for AWS endpoints. ## Related links -- [AI Observability — Getting Started](/docs/ai-observability/getting-started/) — end-to-end walkthrough -- [Evaluation & Scoring](/docs/ai-observability/evaluation/) — score traces, run experiments -- [Trace Retrieval](/docs/ai-observability/evaluation/#opensearchtraceretriever--query-stored-traces) — query stored traces from OpenSearch -- [Agent Traces](/docs/ai-observability/agent-tracing/) — viewing traces in OpenSearch Dashboards -- [GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/) — OTel spec reference +- [AI Observability - Getting Started](/docs/ai-observability/getting-started/) - end-to-end walkthrough +- [Evaluation & Scoring](/docs/ai-observability/evaluation/) - score traces, run experiments +- [Trace Retrieval](/docs/ai-observability/evaluation/#opensearchtraceretriever---query-stored-traces) - query stored traces from OpenSearch +- [Agent Traces](/docs/ai-observability/agent-tracing/) - viewing traces in OpenSearch Dashboards +- [GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/) - OTel spec reference diff --git a/docs/starlight-docs/src/content/docs/send-data/ai-agents/typescript.md 
b/docs/starlight-docs/src/content/docs/send-data/ai-agents/typescript.md index cfe6edd2..9d59b3c4 100644 --- a/docs/starlight-docs/src/content/docs/send-data/ai-agents/typescript.md +++ b/docs/starlight-docs/src/content/docs/send-data/ai-agents/typescript.md @@ -17,6 +17,6 @@ The SDK is being developed at [github.com/opensearch-project/genai-observability For JavaScript/TypeScript applications, you can use standard OpenTelemetry instrumentation with GenAI semantic conventions: -- [Node.js OpenTelemetry guide](/docs/send-data/applications/nodejs/) — manual and auto-instrumentation setup -- [Manual instrumentation](/docs/send-data/opentelemetry/manual-instrumentation/) — creating spans with `gen_ai.*` attributes -- [GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/) — the OTel spec for AI attributes +- [Node.js OpenTelemetry guide](/docs/send-data/applications/nodejs/) - manual and auto-instrumentation setup +- [Manual instrumentation](/docs/send-data/opentelemetry/manual-instrumentation/) - creating spans with `gen_ai.*` attributes +- [GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/) - the OTel spec for AI attributes diff --git a/docs/starlight-docs/src/content/docs/send-data/applications/index.md b/docs/starlight-docs/src/content/docs/send-data/applications/index.md index 4d5ef35f..3652ee15 100644 --- a/docs/starlight-docs/src/content/docs/send-data/applications/index.md +++ b/docs/starlight-docs/src/content/docs/send-data/applications/index.md @@ -3,7 +3,7 @@ title: "Instrument Applications" description: "Add OpenTelemetry instrumentation to your application code to send traces, metrics, and logs to the observability stack" --- -Application instrumentation is the process of adding observability signals — traces, metrics, and logs — to your application code. 
The OpenSearch Observability Stack uses OpenTelemetry (OTel) as its standard instrumentation framework, giving you a vendor-neutral way to collect telemetry from any language. +Application instrumentation is the process of adding observability signals - traces, metrics, and logs - to your application code. The OpenSearch Observability Stack uses OpenTelemetry (OTel) as its standard instrumentation framework, giving you a vendor-neutral way to collect telemetry from any language. ## How it works @@ -42,7 +42,7 @@ Manual instrumentation gives you full control over what gets traced and measured ### Combining both -Most production applications use both approaches together — auto-instrumentation for framework-level coverage and manual instrumentation for business-specific observability. +Most production applications use both approaches together - auto-instrumentation for framework-level coverage and manual instrumentation for business-specific observability. :::tip[Upstream documentation] For a complete list of supported languages and their instrumentation status, see the [OpenTelemetry language APIs & SDKs](https://opentelemetry.io/docs/languages/). @@ -60,9 +60,9 @@ All OpenTelemetry SDKs respect a standard set of environment variables. 
You can | `OTEL_TRACES_EXPORTER` | Traces exporter (`otlp`, `none`) | `otlp` | | `OTEL_METRICS_EXPORTER` | Metrics exporter (`otlp`, `none`) | `otlp` | | `OTEL_LOGS_EXPORTER` | Logs exporter (`otlp`, `none`) | `otlp` | -| `OTEL_RESOURCE_ATTRIBUTES` | Comma-separated key=value resource attributes | — | +| `OTEL_RESOURCE_ATTRIBUTES` | Comma-separated key=value resource attributes | - | | `OTEL_TRACES_SAMPLER` | Sampler type (`always_on`, `traceidratio`, `parentbased_traceidratio`) | `parentbased_always_on` | -| `OTEL_TRACES_SAMPLER_ARG` | Sampler argument (e.g., ratio `0.1`) | — | +| `OTEL_TRACES_SAMPLER_ARG` | Sampler argument (e.g., ratio `0.1`) | - | | `OTEL_PROPAGATORS` | Context propagation formats | `tracecontext,baggage` | Example using environment variables: diff --git a/docs/starlight-docs/src/content/docs/send-data/applications/nodejs.md b/docs/starlight-docs/src/content/docs/send-data/applications/nodejs.md index ee5a6e1d..b4e49536 100644 --- a/docs/starlight-docs/src/content/docs/send-data/applications/nodejs.md +++ b/docs/starlight-docs/src/content/docs/send-data/applications/nodejs.md @@ -250,7 +250,7 @@ export async function register() { ## Related links -- [TypeScript SDK](/docs/send-data/ai-agents/typescript/) — purpose-built instrumentation for AI agent applications +- [TypeScript SDK](/docs/send-data/ai-agents/typescript/) - purpose-built instrumentation for AI agent applications - [Applications overview](/docs/send-data/applications/) - [Auto-instrumentation](/docs/send-data/opentelemetry/auto-instrumentation/) - [Manual instrumentation](/docs/send-data/opentelemetry/manual-instrumentation/) diff --git a/docs/starlight-docs/src/content/docs/send-data/applications/python.md b/docs/starlight-docs/src/content/docs/send-data/applications/python.md index 97b8ffcb..bc64f1ee 100644 --- a/docs/starlight-docs/src/content/docs/send-data/applications/python.md +++ b/docs/starlight-docs/src/content/docs/send-data/applications/python.md @@ -267,7 +267,7 @@ 
with tracer.start_as_current_span("llm.chat") as span: ## Related links -- [Python SDK](/docs/send-data/ai-agents/python/) — purpose-built instrumentation for AI agent applications +- [Python SDK](/docs/send-data/ai-agents/python/) - purpose-built instrumentation for AI agent applications - [Applications overview](/docs/send-data/applications/) - [Auto-instrumentation](/docs/send-data/opentelemetry/auto-instrumentation/) - [Manual instrumentation](/docs/send-data/opentelemetry/manual-instrumentation/)