diff --git a/docs/modules/trino/pages/usage-guide/monitoring.adoc b/docs/modules/trino/pages/usage-guide/monitoring.adoc
index ad96afe8..0740b22a 100644
--- a/docs/modules/trino/pages/usage-guide/monitoring.adoc
+++ b/docs/modules/trino/pages/usage-guide/monitoring.adoc
@@ -3,3 +3,38 @@
 
 The managed Trino instances are automatically configured to export Prometheus metrics.
 See xref:operators:monitoring.adoc[] for more details.
+
+== Metrics
+
+Trino automatically exposes built-in Prometheus metrics on coordinators and workers. The metrics are available on the `http` (`8080/metrics`) or
+`https` (`8443/metrics`) port, depending on the TLS settings.
+
+The following `ServiceMonitor` example demonstrates how the metrics could be scraped using the https://prometheus-operator.dev/[Prometheus Operator].
+
+[source,yaml]
+----
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: scrape-label
+spec:
+  endpoints:
+    - port: https # or http
+      scheme: https # or http
+      path: /metrics
+      basicAuth: # <1>
+        username:
+          name: trino-user-secret
+          key: username
+        password:
+          name: trino-user-secret
+          key: password
+  jobLabel: app.kubernetes.io/instance
+  namespaceSelector:
+    any: true
+  selector:
+    matchLabels:
+      prometheus.io/scrape: "true"
+----
+
+<1> Add user information if Trino is configured to use authentication.
diff --git a/tests/templates/kuttl/commons/check-metrics.py b/tests/templates/kuttl/commons/check-metrics.py
new file mode 100644
index 00000000..9a4a42c5
--- /dev/null
+++ b/tests/templates/kuttl/commons/check-metrics.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+"""Check that the Trino coordinator and worker expose their metrics endpoints."""
+import argparse
+import sys
+import time
+
+import requests
+
+
+def print_request_error_and_sleep(message, err, retry_count):
+    """Print a failed request attempt and pause before the next one."""
+    print(f"[{retry_count}] {message}", err)
+    time.sleep(5)
+
+
+def try_get(url):
+    """Return a successful GET response for url, or exit after 3 failed tries."""
+    # URLs containing "coordinator" are requested with basic auth credentials;
+    # auth=None is the requests default, i.e. no authentication.
+    auth = ("admin", "admin") if "coordinator" in url else None
+    retries = 3
+    for i in range(retries):
+        try:
+            r = requests.get(
+                url,
+                timeout=5,
+                headers={"x-trino-user": "admin"},
+                auth=auth,
+                verify=False,
+            )
+            r.raise_for_status()
+            return r
+        except requests.exceptions.HTTPError as errh:
+            print_request_error_and_sleep("Http Error: ", errh, i)
+        except requests.exceptions.ConnectionError as errc:
+            print_request_error_and_sleep("Error Connecting: ", errc, i)
+        except requests.exceptions.Timeout as errt:
+            print_request_error_and_sleep("Timeout Error: ", errt, i)
+        except requests.exceptions.RequestException as err:
+            print_request_error_and_sleep("Error: ", err, i)
+
+    # raise_for_status() never succeeded, so report the failure here: callers
+    # only ever receive successful responses.
+    print(f"Error for [{url}]: could not access monitoring")
+    sys.exit(-1)
+
+
+def check_monitoring(hosts):
+    """Check the JMX exporter and native metrics endpoints of every host."""
+    for host in hosts:
+        # test for the jmx exporter metrics (try_get exits on failure)
+        try_get(f"http://{host}:8081/metrics")
+
+        # test for the native metrics
+        url = f"https://{host}:8443/metrics"
+        response = try_get(url)
+
+        # arbitrary metric was chosen to test if metrics are present in the response
+        if "io_airlift_node_name_NodeInfo_StartTime" not in response.text:
+            print(f"Error for [{url}]: missing metrics")
+            sys.exit(-1)
+
+
+if __name__ == "__main__":
+    all_args = argparse.ArgumentParser(description="Test Trino metrics.")
+    all_args.add_argument(
+        "-n", "--namespace", help="The namespace to run in", required=True
+    )
+    args = vars(all_args.parse_args())
+    namespace = args["namespace"]
+
+    host_coordinator_0 = f"trino-coordinator-default-0.trino-coordinator-default.{namespace}.svc.cluster.local"
+    host_worker_0 = (
+        f"trino-worker-default-0.trino-worker-default.{namespace}.svc.cluster.local"
+    )
+
+    hosts = [host_coordinator_0, host_worker_0]
+
+    check_monitoring(hosts)
+
+    print("Test check-metrics.py succeeded!")
diff --git a/tests/templates/kuttl/smoke/21-assert.yaml b/tests/templates/kuttl/smoke/21-assert.yaml
index 600736ce..b3f78bfc 100644
--- a/tests/templates/kuttl/smoke/21-assert.yaml
+++ b/tests/templates/kuttl/smoke/21-assert.yaml
@@ -6,3 +6,4 @@ commands:
   - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-active-workers.py -u admin -p admin -n $NAMESPACE -w 1
   - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-opa.py -n $NAMESPACE
   - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-s3.py -n $NAMESPACE
+  - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-metrics.py -n $NAMESPACE
diff --git a/tests/templates/kuttl/smoke/21-copy-scripts.yaml b/tests/templates/kuttl/smoke/21-copy-scripts.yaml
index f38f3274..fc51e8f8 100644
--- a/tests/templates/kuttl/smoke/21-copy-scripts.yaml
+++ b/tests/templates/kuttl/smoke/21-copy-scripts.yaml
@@ -5,3 +5,4 @@ commands:
   - script: kubectl cp -n $NAMESPACE ./check-active-workers.py trino-test-helper-0:/tmp || true
   - script: kubectl cp -n $NAMESPACE ./check-opa.py trino-test-helper-0:/tmp || true
   - script: kubectl cp -n $NAMESPACE ./check-s3.py trino-test-helper-0:/tmp || true
+  - script: kubectl cp -n $NAMESPACE ../../../../templates/kuttl/commons/check-metrics.py trino-test-helper-0:/tmp || true
diff --git a/tests/templates/kuttl/smoke/31-assert.yaml b/tests/templates/kuttl/smoke/31-assert.yaml
index 0690b385..fa6250c7 100644
--- a/tests/templates/kuttl/smoke/31-assert.yaml
+++ b/tests/templates/kuttl/smoke/31-assert.yaml
@@ -6,3 +6,4 @@ commands:
   - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-active-workers.py -u admin -p admin -n $NAMESPACE -w 2
   - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-opa.py -n $NAMESPACE
   - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-s3.py -n $NAMESPACE
+  - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-metrics.py -n $NAMESPACE
diff --git a/tests/templates/kuttl/smoke_aws/21-assert.yaml b/tests/templates/kuttl/smoke_aws/21-assert.yaml
index 600736ce..b3f78bfc 100644
--- a/tests/templates/kuttl/smoke_aws/21-assert.yaml
+++ b/tests/templates/kuttl/smoke_aws/21-assert.yaml
@@ -6,3 +6,4 @@ commands:
   - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-active-workers.py -u admin -p admin -n $NAMESPACE -w 1
   - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-opa.py -n $NAMESPACE
   - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-s3.py -n $NAMESPACE
+  - script: kubectl exec -n $NAMESPACE trino-test-helper-0 -- python /tmp/check-metrics.py -n $NAMESPACE
diff --git a/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml b/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml
index f38f3274..fc51e8f8 100644
--- a/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml
+++ b/tests/templates/kuttl/smoke_aws/21-copy-scripts.yaml
@@ -5,3 +5,4 @@ commands:
   - script: kubectl cp -n $NAMESPACE ./check-active-workers.py trino-test-helper-0:/tmp || true
   - script: kubectl cp -n $NAMESPACE ./check-opa.py trino-test-helper-0:/tmp || true
   - script: kubectl cp -n $NAMESPACE ./check-s3.py trino-test-helper-0:/tmp || true
+  - script: kubectl cp -n $NAMESPACE ../../../../templates/kuttl/commons/check-metrics.py trino-test-helper-0:/tmp || true