diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..723ef36 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea \ No newline at end of file diff --git a/README.md b/README.md index 10827d1..b9234de 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ -# Temporal Grafana Dashboards +# Temporal Dashboards + +This repository contains community-driven [Grafana](https://grafana.com/docs/grafana/latest/dashboards/) and [DataDog](https://docs.datadoghq.com/dashboards/) dashboards that can be used for monitoring +[Temporal Cloud](https://temporal.io/cloud), [Temporal Server](https://github.com/temporalio/temporal), and [Temporal SDK](https://docs.temporal.io/develop) metrics. -This repository contains community-driven Grafana [dashboards](https://grafana.com/docs/grafana/latest/dashboards/) that can be used for monitoring -Temporal Server and SDK metrics. We welcome contributions to existing as well as new dashboards that can help the community. @@ -10,10 +11,27 @@ We welcome contributions to existing as well as new dashboards that can help the ## Directory structure -* [`server/`](server): Dashboards for Temporal Server metrics +* [`cloud/`](cloud): Dashboards for Temporal Cloud metrics. +* [`server/`](server): Dashboards for Temporal Server metrics. * [`sdk/`](sdk): Dashboards for Temporal SDK metrics. 
* [`misc/`](misc): Server metrics dashboards that have not been fully tested yet or need improvements +## Setup +* [Temporal Cloud](https://docs.temporal.io/cloud/metrics/) +* [Temporal Server](https://docs.temporal.io/self-hosted-guide/monitoring) +* _Temporal SDK_ + * [Golang](https://docs.temporal.io/develop/go/observability) + * [Java](https://docs.temporal.io/develop/java/observability) + * [Python](https://docs.temporal.io/develop/python/observability) + * [TypeScript](https://docs.temporal.io/develop/typescript/observability) + * [.NET](https://docs.temporal.io/develop/dotnet/observability) + * [PHP](https://docs.temporal.io/develop/php/observability) + +## Available metrics +* [Temporal Cloud metrics](https://docs.temporal.io/production-deployment/cloud/metrics/reference) +* [Temporal Server metrics](https://docs.temporal.io/references/cluster-metrics) +* [Temporal SDK metrics](https://docs.temporal.io/references/sdk-metrics) + ## Usage Our [default helm chart](https://github.com/temporalio/helm-charts) installs Grafana and will provision the dashboards from this repo automatically. If you would like to try these dashboards on your own Grafana instance you can import them. Unfortunately Grafana does not allow importing by URL aside from those hosted on the Grafana website, so the JSON of the dashboard needs to be copy/pasted into your Grafana instance. To do this: diff --git a/cloud/README.md b/cloud/README.md new file mode 100644 index 0000000..221826e --- /dev/null +++ b/cloud/README.md @@ -0,0 +1,13 @@ +# Temporal Cloud Dashboards + +## Setup +* [Temporal Cloud](https://docs.temporal.io/cloud/metrics/) + +## Available metrics +* [Temporal Cloud metrics](https://docs.temporal.io/production-deployment/cloud/metrics/reference) + +## Dashboards +* **Grafana** [here](temporal_cloud.json) +* **DataDog** integration details and Dashboard access are found [here](https://docs.datadoghq.com/integrations/temporal-cloud/). 
+ * Related [Blog post](https://temporal.io/blog/temporal-cloud-metrics-in-datadog). + diff --git a/sdk/README.md b/sdk/README.md index e22d16c..13366cb 100644 --- a/sdk/README.md +++ b/sdk/README.md @@ -1,10 +1,24 @@ -# Temporal Grafana SDK Dashboards +# Temporal SDK Dashboards -This repository contains the following dashboards: +## Setup + +* [Golang](https://docs.temporal.io/develop/go/observability) +* [Java](https://docs.temporal.io/develop/java/observability) +* [Python](https://docs.temporal.io/develop/python/observability) +* [TypeScript](https://docs.temporal.io/develop/typescript/observability) +* [.NET](https://docs.temporal.io/develop/dotnet/observability) +* [PHP](https://docs.temporal.io/develop/php/observability) + +## Available metrics +[Temporal SDK metrics](https://docs.temporal.io/references/sdk-metrics) + +## Grafana dashboards - [temporal-go-java-sdks-tally.json](temporal-go-java-sdks-tally.json) for [Go](https://github.com/temporalio/sdk-go) and [Java](https://github.com/temporalio/sdk-java) SDKs using Uber Tally to emits metrics. - [temporal-go-sdk-otel.json](temporal-go-sdk-otel.json) for [Go](https://github.com/temporalio/sdk-go) SDK using OpenTelemetry to emit metrics. -- [temporal-core-sdks-otel.json](temporal-core-sdks-otel.json) for [Core](https://github.com/temporalio/sdk-core) based SDKs. In Core based SDKs, metrics of the type Histogram -are measured in milliseconds by default, so the dashboard is configured accordingly to display them in milliseconds. +- [temporal-core-sdks-otel.json](temporal-core-sdks-otel.json) for [Core](https://github.com/temporalio/sdk-core) based SDKs. In Core based SDKs, metrics of the type Histogram + are measured in milliseconds by default, so the dashboard is configured accordingly to display them in milliseconds. +## DataDog dashboards +DataDog dashboards and related configuration are [here](datadog). 
diff --git a/sdk/datadog/README.md b/sdk/datadog/README.md new file mode 100644 index 0000000..b1f7b0a --- /dev/null +++ b/sdk/datadog/README.md @@ -0,0 +1,22 @@ +# Temporal DataDog SDK Dashboards + +The [Dashboard](temporal_sdk_dashboard.json) here works with the DataDog collector and [this OpenMetrics configuration](openmetrics.h_conf.yaml). + +### Prerequisites + +1. A way to push metrics to DataDog, typically an installed [DataDog Agent](https://docs.datadoghq.com/getting_started/agent/). +2. Advanced `Percentile` configuration for Distribution metrics. + 1. Which metrics? Any metric listed [here](https://docs.temporal.io/references/sdk-metrics) that is a `Histogram` and for which you want to report percentiles (p95/p99). + +### Put it all together +1. If using the DataDog Agent: + a. See the [OpenMetrics integration documentation](https://docs.datadoghq.com/integrations/openmetrics/) for details about configuring your DataDog Agent. + b. Drop this [conf.yaml](openmetrics.h_conf.yaml) into your Agent's `openmetrics.d` config directory. +2. Import the [Dashboard](temporal_sdk_dashboard.json). +3. Be sure to enable the `Percentile` configuration for the relevant metrics. 
+ +> **Example for configuring `temporal_request_latency`**: +> * Visit https://app.datadoghq.com/metric/summary?metric=temporal_request_latency +> * Set `Advanced > Percentiles > Configure > ON` + + diff --git a/sdk/datadog/openmetrics.h_conf.yaml b/sdk/datadog/openmetrics.h_conf.yaml new file mode 100644 index 0000000..a38dd11 --- /dev/null +++ b/sdk/datadog/openmetrics.h_conf.yaml @@ -0,0 +1,52 @@ +## OpenMetrics Configuration - CONVERT BUCKETS TO DISTRIBUTIONS + +init_config: + +instances: + - openmetrics_endpoint: "http://localhost:9464/metrics" + tags: + - "service:temporal-worker" + - "env:production" + + metrics: + - "temporal_*" + + # KEY: Convert histogram buckets to distributions + histogram_buckets_as_distributions: true # Convert to distributions + collect_histogram_buckets: true + + # Send buckets as distributions instead of individual metrics + send_distribution_buckets: true # Send as distributions + send_distribution_counts_as_monotonic: true # For proper count handling + send_histograms_buckets: false + + # Include all labels including "le" for bucket conversion + include_labels: + - "le" # NEEDED for bucket conversion + - "namespace" + - "operation" + - "workflow_type" + - "activity_type" + - "task_queue" + - "worker_type" + - "service_name" + - "error_type" + - "poller_type" + - "failure_reason" + + exclude_labels: + - "job" + - "instance" + - "__name__" + + timeout: 30 + min_collection_interval: 15 + + # This will create distribution metrics that you can then enable percentiles for + # in the Datadog Metrics Summary page: + # temporal_request_latency (distribution) + # temporal_workflow_endtoend_latency (distribution) + # etc. + # Example: You want to enable the `p95` percentile for `temporal_request_latency`: + # 1. Visit https://app.datadoghq.com/metric/summary?metric=temporal_request_latency + # 2. 
Set `Advanced > Percentiles > Configure > ON` diff --git a/sdk/datadog/temporal_sdk_dashboard.json b/sdk/datadog/temporal_sdk_dashboard.json new file mode 100644 index 0000000..1817f85 --- /dev/null +++ b/sdk/datadog/temporal_sdk_dashboard.json @@ -0,0 +1,1970 @@ +{ + "title": "Temporal SDK Metrics 🎉", + "description": "Dashboard using distribution metrics with enabled percentiles", + "widgets": [ + { + "id": 1, + "definition": { + "type": "note", + "content": "**Note:** You must enable percentiles (p95, p99) for distribution metrics in Datadog Metrics Summary page first.", + "background_color": "vivid_yellow", + "font_size": "12", + "text_align": "left", + "vertical_align": "top", + "show_tick": true, + "tick_pos": "75%", + "tick_edge": "bottom", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 12, + "height": 1 + } + }, + { + "id": 1443440509451425, + "definition": { + "title": "Compute", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 6912933675528841, + "definition": { + "title": "CPU Usage %", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.cpu.user{$host}", + "aggregator": "avg" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "avg:system.cpu.system{$host}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1 + query2" + } + ] + } + ], + "autoscale": true, + "precision": 1 + }, + "layout": { + "x": 0, + "y": 0, + "width": 4, + "height": 2 + } + }, + { + "id": 1762990522750473, + "definition": { + "title": "Memory Usage %", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.mem.pct_usable{$host}", + 
"aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ] + } + ], + "autoscale": true, + "precision": 1 + }, + "layout": { + "x": 4, + "y": 0, + "width": 4, + "height": 2 + } + }, + { + "id": 7395104907317546, + "definition": { + "title": "Load Average (1m)", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.load.1{$host}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ] + } + ], + "autoscale": true, + "precision": 2 + }, + "layout": { + "x": 8, + "y": 0, + "width": 4, + "height": 2 + } + }, + { + "id": 6857600481409383, + "definition": { + "title": "CPU Usage Over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "time": {}, + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.cpu.user{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.cpu.system{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.cpu.iowait{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + 
}, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 2, + "width": 4, + "height": 2 + } + }, + { + "id": 2714990955524408, + "definition": { + "title": "Memory Usage Over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "time": {}, + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.mem.used{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "area" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.mem.free{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "area" + } + ] + }, + "layout": { + "x": 4, + "y": 2, + "width": 4, + "height": 2 + } + }, + { + "id": 2393682765506983, + "definition": { + "title": "Load Average", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "time": {}, + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.load.1{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.load.5{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + 
"style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.load.15{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 8, + "y": 2, + "width": 4, + "height": 2 + } + }, + { + "id": 1947197773803764, + "definition": { + "title": "Disk I/O Operations", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "time": {}, + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.io.r_s{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.io.w_s{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 4, + "width": 4, + "height": 2 + } + }, + { + "id": 4419979897212949, + "definition": { + "title": "Network Traffic", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "time": {}, + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": 
"avg:system.net.bytes_rcvd{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:system.net.bytes_sent{$host}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 4, + "width": 4, + "height": 2 + } + } + ] + }, + "layout": { + "x": 0, + "y": 1, + "width": 12, + "height": 1 + } + }, + { + "id": 8597427824673889, + "definition": { + "title": "Network", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 4807520225196443, + "definition": { + "title": "Request Success Rate", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_request.count{namespace:$namespace}.as_rate()" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:temporal_request_failure{namespace:$namespace}.as_rate()" + }, + { + "data_source": "metrics", + "name": "query3", + "query": "sum:temporal_request.count{namespace:$namespace}.as_rate()" + } + ], + "formulas": [ + { + "formula": "((query1 - query2) / query3) * 100", + "alias": "Success Rate (%)" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ], + "yaxis": { + "scale": "linear", + "min": "0", + "max": "100" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 4765020064671431, + "definition": { + 
"title": "RPC Long-Poll Latency (p95 & p99) - Distribution", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "p95:temporal_long_request_latency{namespace:$namespace,service_name:$service_name} by {namespace,operation}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "purple" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "p99:temporal_long_request_latency{namespace:$namespace,service_name:$service_name} by {namespace,operation}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "red" + }, + "display_type": "line" + } + ], + "yaxis": { + "label": "milliseconds" + } + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 3, + "definition": { + "title": "RPC Latency (p95 & p99) - Distribution", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "p95:temporal_request_latency{namespace:$namespace,service_name:$service_name} by {namespace,operation}" + } + ], + "style": { + "palette": "purple" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "p99:temporal_request_latency{namespace:$namespace,service_name:$service_name} by {namespace,operation}" + } + ], + "style": { + "palette": "red" + }, + "display_type": "line" + } + ], + "yaxis": { + "label": "milliseconds" + } + }, + "layout": { + "x": 0, + "y": 3, + 
"width": 6, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 2, + "width": 12, + "height": 1 + } + }, + { + "id": 3583867315774251, + "definition": { + "title": "Workflows", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 7, + "definition": { + "title": "Workflow Task Schedule-to-Start Latency (p95) - Distribution", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "p95:temporal_workflow_task_schedule_to_start_latency{namespace:$namespace,service_name:$service_name} by {namespace,task_queue}" + } + ], + "formulas": [ + { + "formula": "query1", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + } + } + ], + "style": { + "palette": "purple" + }, + "display_type": "line" + } + ], + "yaxis": { + "label": "milliseconds" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 6, + "definition": { + "title": "Workflow Task Throughput", + "show_legend": true, + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_workflow_task_queue_poll_succeed.count{namespace:$namespace,service_name:$service_name} by {namespace,task_queue}.as_rate()" + } + ], + "style": { + "palette": "dog_classic" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 5, + "definition": { + "title": "Workflow End-to-End Latency (p95) - Distribution", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + 
"queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "p95:temporal_workflow_endtoend_latency{namespace:$namespace,service_name:$service_name} by {namespace,workflow_type}" + } + ], + "formulas": [ + { + "formula": "query1", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + } + } + ], + "style": { + "palette": "purple" + }, + "display_type": "line" + } + ], + "yaxis": { + "label": "milliseconds" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 6, + "height": 3 + } + }, + { + "id": 7348670239811961, + "definition": { + "title": "Workflow Task Failed", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_workflow_task_execution_failed.count{namespace:$namespace,service_name:$service_name,failure_reason:workflowerror} by {namespace,workflow_type}.as_rate()" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "green" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 6, + "y": 3, + "width": 6, + "height": 3 + } + }, + { + "id": 5799738924949078, + "definition": { + "title": "Workflow Task Replay Latency", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_workflow_task_replay_latency{namespace:$namespace,service_name:$service_name} by {namespace,task_queue}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "purple" + }, + "display_type": "line" + } + ], + "yaxis": { + "label": "milliseconds" + } + }, + "layout": { + "x": 0, + "y": 6, + "width": 
6, + "height": 3 + } + }, + { + "id": 721448255054598, + "definition": { + "title": "Workflow Failures", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_workflow_failed.count{namespace:$namespace,service_name:$service_name} by {namespace,workflow_type}.as_rate()" + } + ], + "style": { + "palette": "green" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 6, + "y": 6, + "width": 6, + "height": 3 + } + }, + { + "id": 4, + "definition": { + "title": "Workflow Completions", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_workflow_completed.count{namespace:$namespace,service_name:$service_name} by {namespace,workflow_type}.as_rate()" + } + ], + "style": { + "palette": "green" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 9, + "width": 6, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 3, + "width": 12, + "height": 13 + } + }, + { + "id": 4824282029865285, + "definition": { + "title": "Activities", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 8416682883665696, + "definition": { + "title": "Activity Task Schedule-to-Start Latency (p95) - Distribution", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": 
"p95:temporal_activity_schedule_to_start_latency{namespace:$namespace,service_name:$service_name} by {namespace,task_queue}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "purple" + }, + "display_type": "line" + } + ], + "yaxis": { + "label": "milliseconds" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 6638251645640149, + "definition": { + "title": "Activity Success End-to-End Latency (p95) - Distribution", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_activity_succeed_endtoend_latency{namespace:$namespace,service_name:$service_name} by {namespace,task_queue}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "purple" + }, + "display_type": "line" + } + ], + "yaxis": { + "label": "milliseconds" + } + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 9, + "definition": { + "title": "Activity Execution Latency (p95) - Distribution", + "show_legend": true, + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "p95:temporal_activity_execution_latency{namespace:$namespace,service_name:$service_name} by {namespace,activity_type}" + } + ], + "style": { + "palette": "purple" + }, + "display_type": "line" + } + ], + "yaxis": { + "label": "milliseconds" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 6, + "height": 3 + } + }, + { + "id": 8, + "definition": { + "title": "Activity Execution Failures", + "show_legend": true, + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + 
"query": "sum:temporal_activity_execution_failed.count{namespace:$namespace,service_name:$service_name} by {namespace,activity_type}.as_rate()" + } + ], + "style": { + "palette": "warm" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 6, + "y": 3, + "width": 6, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 16, + "width": 12, + "height": 7, + "is_column_break": true + } + }, + { + "id": 5568899104334129, + "definition": { + "title": "Local Activities", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 539489417759370, + "definition": { + "title": "Local Activity Execution Latency (p95) - Distribution", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "p95:temporal_local_activity_execution_latency{namespace:$namespace,service_name:$service_name} by {namespace, activity_type}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "purple" + }, + "display_type": "line" + } + ], + "yaxis": { + "label": "milliseconds" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 118517933928889, + "definition": { + "title": "Local Activity Success End-to-End Latency (p95) - Distribution", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_local_activity_succeeded_endtoend_latency{namespace:$namespace,service_name:$service_name} by {namespace,task_queue,activity_type}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "purple" + }, + 
"display_type": "line" + } + ], + "yaxis": { + "label": "milliseconds" + } + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 4458058935779362, + "definition": { + "title": "Local Activity Total", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_local_activity_total{namespace:$namespace,service_name:$service_name} by {namespace,task_queue,activity_type}.as_rate()" + } + ], + "style": { + "palette": "warm" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 3, + "width": 6, + "height": 3 + } + }, + { + "id": 6837569975744609, + "definition": { + "title": "Local Activity Execution Failures", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_local_activity_execution_failed.count{namespace:$namespace,service_name:$service_name} by {namespace,activity_type}.as_rate()" + } + ], + "style": { + "palette": "warm" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 6, + "y": 3, + "width": 6, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 23, + "width": 12, + "height": 1 + } + }, + { + "id": 7544979003156379, + "definition": { + "title": "Worker Capacity", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 10, + "definition": { + "title": "Worker Task Slots Available - Current", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": 
"timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "temporal_worker_task_slots_available{namespace:$namespace,service_name:$service_name, worker_type:workflowworker} by {namespace,worker_type}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "purple" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query0", + "query": "avg:temporal_worker_task_slots_available{namespace:$namespace,service_name:$service_name,worker_type:activityworker} by {namespace,worker_type}" + } + ], + "formulas": [ + { + "formula": "query0" + } + ], + "style": { + "palette": "orange", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query0", + "query": "avg:temporal_worker_task_slots_available{namespace:$namespace,service_name:$service_name,worker_type:localactivityworker} by {namespace,worker_type}" + } + ], + "formulas": [ + { + "formula": "query0" + } + ], + "style": { + "palette": "orange", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 11, + "definition": { + "title": "Worker Task Slots Used", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "temporal_worker_task_slots_used{namespace:$namespace,service_name:$service_name, worker_type:workflowworker} by {namespace,worker_type}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "purple" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + 
"queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "temporal_worker_task_slots_used{namespace:$namespace,service_name:$service_name, worker_type:activityworker} by {namespace,worker_type}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "orange" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 4787740781475934, + "definition": { + "title": "Average Worker Task Slots Available (10m)", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_worker_task_slots_available{namespace:$namespace,worker_type:workflowworker}.rollup(avg, 600)" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "purple", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_worker_task_slots_available{namespace:$namespace,worker_type:activityworker}.rollup(avg, 600)" + } + ], + "style": { + "palette": "orange", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ], + "yaxis": { + "scale": "linear", + "include_zero": true, + "min": "0" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 6, + "height": 3 + } + }, + { + "id": 402798388361549, + "definition": { + "title": "Poller Count", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": 
"sum:temporal_num_pollers{namespace:$namespace} by {namespace}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "cool" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 6, + "y": 3, + "width": 6, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 24, + "width": 12, + "height": 1 + } + }, + { + "id": 260049159538739, + "definition": { + "title": "Sticky Cache", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 3742246126534320, + "definition": { + "title": "Sticky Cache Size", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_sticky_cache_size{namespace:$namespace,service_name:$service_name} by {namespace}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "cool" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 299370783628567, + "definition": { + "title": "Sticky Cache Forced Eviction", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_sticky_cache_total_forced_eviction.count{namespace:$namespace,service_name:$service_name} by {namespace,service_name}.as_count()" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "warm" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 12, + "definition": { + "title": "Sticky Cache Hits vs Misses", + "show_legend": true, + 
"type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_sticky_cache_hit.count{namespace:$namespace,service_name:$service_name} by {namespace}.as_rate()" + } + ], + "style": { + "palette": "green" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_sticky_cache_miss.count{namespace:$namespace,service_name:$service_name} by {namespace}.as_rate()" + } + ], + "style": { + "palette": "orange" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 3, + "width": 6, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 25, + "width": 12, + "height": 7 + } + }, + { + "id": 2847786625838236, + "definition": { + "title": "Deployment", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 1999300951699894, + "definition": { + "title": "NonDeterminism Errors Count", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_workflow_task_execution_failed.count{namespace:$namespace,failure_reason:nondeterminismerror} by {namespace,workflow_type,failure_reason}.as_count()" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "red", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ], + "yaxis": { + "scale": "linear", + "min": "0" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 3720169879467709, + "definition": { + "title": "Unregistered Activity Invocations (Non-Core SDKs)", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + 
"avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:temporal_unregistered_activity_invocation.count{namespace:$namespace,service_name:$service_name} by {namespace, task_queue}" + } + ], + "style": { + "palette": "purple", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ], + "yaxis": { + "scale": "linear", + "min": "0" + } + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 32, + "width": 12, + "height": 1 + } + } + ], + "template_variables": [ + { + "name": "namespace", + "prefix": "namespace", + "available_values": [], + "default": "*" + }, + { + "name": "service_name", + "prefix": "service_name", + "available_values": [], + "default": "*" + }, + { + "name": "host", + "prefix": "host", + "available_values": [], + "default": "*" + } + ], + "layout_type": "ordered", + "notify_list": [], + "reflow_type": "fixed" +} diff --git a/server/README.md b/server/README.md new file mode 100644 index 0000000..b1e77ac --- /dev/null +++ b/server/README.md @@ -0,0 +1,7 @@ +# Temporal Server Dashboards + +## Setup +* [Temporal Server](https://docs.temporal.io/self-hosted-guide/monitoring) + +## Available metrics +* [Temporal Server metrics](https://docs.temporal.io/references/cluster-metrics)