diff --git a/metrics-collector/README.md b/metrics-collector/README.md index 19e5eedf9..9abb6844a 100644 --- a/metrics-collector/README.md +++ b/metrics-collector/README.md @@ -2,6 +2,8 @@ Code Engine job that demonstrates how to collect resource metrics (CPU, memory and disk usage) of running Code Engine apps, jobs, and builds +![Dashboard overview](./images/icl-dashboard-overview.png) + ## Installation ### Capture metrics every n seconds @@ -17,11 +19,11 @@ $ ibmcloud ce job create \ --wait ``` -* Submit a daemon job that collects metrics in an endless loop. The daemon job queries the Metrics API every 10 seconds +* Submit a daemon job that collects metrics in an endless loop. The daemon job queries the Metrics API every 30 seconds ``` $ ibmcloud ce jobrun submit \ --job metrics-collector \ - --env INTERVAL=10 + --env INTERVAL=30 ``` @@ -57,6 +59,81 @@ One can use the environment variable `COLLECT_DISKUSAGE=true` to also collect th Once your IBM Cloud Code Engine project has detected a corresponding IBM Cloud Logs instance, which is configured to receive platform logs, you can consume the resource metrics in IBM Cloud Logs. Use the filter `metric:instance-resources` to filter for log lines that print resource metrics for each detected IBM Cloud Code Engine instance that is running in a project. +### Custom dashboard + +Follow the steps below to create a custom dashboard in your IBM Cloud Logs instance, to gain insights into resource consumption metrics. 
+ +![Dashboard overview](./images/icl-dashboard-overview.png) + +**Setup instructions:** + +* Navigate to the "Custom dashboards" view, hover over the "New" button, and click "Import dashboard" + +![New dashboard](./images/icl-dashboard-new.png) + +* In the "Import" modal, select the file [./setup/dashboard-code_engine_resource_consumption_metrics.json](./setup/dashboard-code_engine_resource_consumption_metrics.json) located in this repository, and click "Import" + +![Import modal](./images/icl-dashboard-import.png) + +* Confirm the import by clicking "Import" again + +![Import confirmation](./images/icl-dashboard-import-confirm.png) + + +### Logs view + +Follow the steps below to create a Logs view in your IBM Cloud Logs instance that allows you to drill into individual instance-resources log lines. + +![Logs overview](./images/icl-logs-view-overview.png) + +**Setup instructions:** + +* Show only the log lines that contain the collected resource metrics, by filtering for the following query +``` +app:"codeengine" AND message.metric:"instance-resources" +``` + +![Query](./images/icl-logs-view-query.png) + +* In the left bar, click "Add Filter" and add the following filters + * `Application` + * `App` + * `Label.Project` + * `Message.Component_name` + +![Filters](./images/icl-logs-view-filters.png) + +* In the top-right corner, click on "Columns" and configure the following columns: + * `Timestamp` + * `label.Project` + * `message.component_type` + * `message.component_name` + * `message.message` + * `Text` + +![Columns](./images/icl-logs-view-columns.png) + +* Once applied, adjust the column widths appropriately + +* In the top-right corner, select `1-line` as view mode + +![View](./images/icl-logs-view-mode.png) + +* In the graph title it says "**Count** all grouped by **Severity**". Click on `Severity` and select `message.component_name` instead. 
Furthermore, select `Max` as aggregation metric and choose `message.memory.usage` as aggregation field + +![Graph](./images/icl-logs-view-graph.png) + +* Save the view + +![Save](./images/icl-logs-view-save.png) + +* Utilize the custom logs view to drill into the resource metrics of individual instances + +![Logs overview](./images/icl-logs-view-overview.png) + + +## IBM Log Analysis setup (deprecated) + ### Log lines Along with a human readable message, like `Captured metrics of app instance 'load-generator-00001-deployment-677d5b7754-ktcf6': 3m vCPU, 109 MB memory, 50 MB ephemeral storage`, each log line passes specific resource utilization details in a structured way allowing to apply advanced filters on them. diff --git a/metrics-collector/images/icl-dashboard-import-confirm.png b/metrics-collector/images/icl-dashboard-import-confirm.png new file mode 100644 index 000000000..890132c42 Binary files /dev/null and b/metrics-collector/images/icl-dashboard-import-confirm.png differ diff --git a/metrics-collector/images/icl-dashboard-import.png b/metrics-collector/images/icl-dashboard-import.png new file mode 100644 index 000000000..17e25ac43 Binary files /dev/null and b/metrics-collector/images/icl-dashboard-import.png differ diff --git a/metrics-collector/images/icl-dashboard-new.png b/metrics-collector/images/icl-dashboard-new.png new file mode 100644 index 000000000..baef751fc Binary files /dev/null and b/metrics-collector/images/icl-dashboard-new.png differ diff --git a/metrics-collector/images/icl-dashboard-overview.png b/metrics-collector/images/icl-dashboard-overview.png new file mode 100644 index 000000000..a55bdb551 Binary files /dev/null and b/metrics-collector/images/icl-dashboard-overview.png differ diff --git a/metrics-collector/images/icl-logs-view-columns.png b/metrics-collector/images/icl-logs-view-columns.png new file mode 100644 index 000000000..b26dc2666 Binary files /dev/null and b/metrics-collector/images/icl-logs-view-columns.png differ diff --git 
a/metrics-collector/images/icl-logs-view-filters.png b/metrics-collector/images/icl-logs-view-filters.png new file mode 100644 index 000000000..00a23011c Binary files /dev/null and b/metrics-collector/images/icl-logs-view-filters.png differ diff --git a/metrics-collector/images/icl-logs-view-graph.png b/metrics-collector/images/icl-logs-view-graph.png new file mode 100644 index 000000000..1a36d5f7a Binary files /dev/null and b/metrics-collector/images/icl-logs-view-graph.png differ diff --git a/metrics-collector/images/icl-logs-view-overview.png b/metrics-collector/images/icl-logs-view-overview.png new file mode 100644 index 000000000..f00e2a316 Binary files /dev/null and b/metrics-collector/images/icl-logs-view-overview.png differ diff --git a/metrics-collector/images/icl-logs-view-query.png b/metrics-collector/images/icl-logs-view-query.png new file mode 100644 index 000000000..f9a7654b0 Binary files /dev/null and b/metrics-collector/images/icl-logs-view-query.png differ diff --git a/metrics-collector/images/icl-logs-view-save.png b/metrics-collector/images/icl-logs-view-save.png new file mode 100644 index 000000000..eaedf46e0 Binary files /dev/null and b/metrics-collector/images/icl-logs-view-save.png differ diff --git a/metrics-collector/main.go b/metrics-collector/main.go index 7a4ccb0e7..63a2ba867 100644 --- a/metrics-collector/main.go +++ b/metrics-collector/main.go @@ -34,9 +34,12 @@ func main() { } // If the 'INTERVAL' env var is set then sleep for that many seconds - sleepDuration := 10 + sleepDuration := 30 if t := os.Getenv("INTERVAL"); t != "" { sleepDuration, _ = strconv.Atoi(t) + if sleepDuration < 30 { + sleepDuration = 30 + } } // In daemon mode, collect resource metrics in an endless loop @@ -111,10 +114,10 @@ func collectInstanceMetrics() { // fetches all pods pods := getAllPods(coreClientset, namespace, config) - + // fetch all pod metrics podMetrics := getAllPodMetrics(namespace, config) - + var wg sync.WaitGroup for _, metric := range 
*podMetrics { @@ -258,7 +261,7 @@ func getAllPods(coreClientset *kubernetes.Clientset, namespace string, config *r // Helper function to retrieve all pods from the Kube API func obtainDiskUsage(coreClientset *kubernetes.Clientset, namespace string, pod string, container string, config *rest.Config) float64 { - + // per default, we do not collect disk space statistics if os.Getenv("COLLECT_DISKUSAGE") != "true" { return 0 @@ -304,12 +307,16 @@ func obtainDiskUsage(coreClientset *kubernetes.Clientset, namespace string, pod // Render captured system error messages, in case the stdout stream did not receive any valid content if err != nil { - fmt.Println("obtainDiskUsage of pod:" + pod + "/container:" + container + " failed with a stream err - " + err.Error() + " - stderr: '" + errBuf.String() + "'") + if err.Error() == "Internal error occurred: failed calling webhook \"validating.webhook.pod-exec-auth-check.codeengine.cloud.ibm.com\": failed to call webhook: Post \"https://validating-webhook-serving.ibm-cfn-system.svc:443/validate/pod-exec?timeout=5s\": EOF" { + // Do nothing and silently ignore this issue as it is most likely related to pod terminations + } else { + fmt.Println("obtainDiskUsage of pod:" + pod + "/container:" + container + " failed with a stream err - " + err.Error() + " - stderr: '" + errBuf.String() + "'") + } } return float64(0) } - + // Parse the output "4000 /" by splitting the words diskUsageOutput := strings.Fields(strings.TrimSuffix(diskUsageOutputStr, "\n")) if len(diskUsageOutput) > 2 { diff --git a/metrics-collector/setup/dashboard-code_engine_resource_consumption_metrics.json b/metrics-collector/setup/dashboard-code_engine_resource_consumption_metrics.json new file mode 100644 index 000000000..6dd9bb0ce --- /dev/null +++ b/metrics-collector/setup/dashboard-code_engine_resource_consumption_metrics.json @@ -0,0 +1,573 @@ +{ + "id": "RQac8BNYDtzOIAaz9PXVF", + "name": "Code Engine - Resource consumption metrics", + "layout": { + "sections": [ 
+ { + "id": { + "value": "1eeb5f84-2fa5-4967-8b15-7e8a15691555" + }, + "rows": [ + { + "id": { + "value": "dd094835-c83a-49ad-a95f-e1b623fe8c31" + }, + "appearance": { + "height": 19 + }, + "widgets": [ + { + "id": { + "value": "f7790a61-0de6-47d0-905d-ee386b0d4a11" + }, + "title": "CPU usage (in %)", + "definition": { + "lineChart": { + "legend": { + "isVisible": true, + "columns": [], + "groupByQuery": true, + "placement": "LEGEND_PLACEMENT_AUTO" + }, + "tooltip": { + "showLabels": false, + "type": "TOOLTIP_TYPE_ALL" + }, + "queryDefinitions": [ + { + "id": "76071e2e-b28e-4a14-bf47-ca0b3dfea660", + "query": { + "logs": { + "luceneQuery": { + "value": "app:\"codeengine\" AND message.metric:\"instance-resources\"" + }, + "groupBy": [], + "aggregations": [ + { + "max": { + "observationField": { + "keypath": [ + "message", + "cpu", + "usage" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + } + } + ], + "filters": [], + "groupBys": [ + { + "keypath": [ + "message", + "component_type" + ], + "scope": "DATASET_SCOPE_USER_DATA" + }, + { + "keypath": [ + "message", + "name" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + ] + } + }, + "seriesCountLimit": "20", + "unit": "UNIT_UNSPECIFIED", + "scaleType": "SCALE_TYPE_LINEAR", + "name": "Query 1", + "isVisible": true, + "colorScheme": "classic", + "resolution": { + "bucketsPresented": 96 + }, + "dataModeType": "DATA_MODE_TYPE_HIGH_UNSPECIFIED", + "customUnit": "", + "decimal": 2 + } + ], + "stackedLine": "STACKED_LINE_UNSPECIFIED" + } + } + }, + { + "id": { + "value": "8b3b1968-aaa9-4ab2-a5ea-94f222a24e58" + }, + "title": "CPU usage (in vCore millis)", + "definition": { + "lineChart": { + "legend": { + "isVisible": true, + "columns": [], + "groupByQuery": true, + "placement": "LEGEND_PLACEMENT_AUTO" + }, + "tooltip": { + "showLabels": false, + "type": "TOOLTIP_TYPE_ALL" + }, + "queryDefinitions": [ + { + "id": "69a80b40-c5e4-462b-8067-28c0d7e0fc75", + "query": { + "logs": { + "luceneQuery": { + "value": "app:\"codeengine\" 
AND message.metric:\"instance-resources\"" + }, + "groupBy": [], + "aggregations": [ + { + "max": { + "observationField": { + "keypath": [ + "message", + "cpu", + "current" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + } + } + ], + "filters": [], + "groupBys": [ + { + "keypath": [ + "message", + "component_type" + ], + "scope": "DATASET_SCOPE_USER_DATA" + }, + { + "keypath": [ + "message", + "name" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + ] + } + }, + "seriesCountLimit": "20", + "unit": "UNIT_UNSPECIFIED", + "scaleType": "SCALE_TYPE_LINEAR", + "name": "Query 1", + "isVisible": true, + "colorScheme": "classic", + "resolution": { + "bucketsPresented": 96 + }, + "dataModeType": "DATA_MODE_TYPE_HIGH_UNSPECIFIED", + "customUnit": "", + "decimal": 2 + } + ], + "stackedLine": "STACKED_LINE_UNSPECIFIED" + } + } + } + ] + } + ], + "options": { + "custom": { + "name": "CPU consumption", + "collapsed": false, + "color": { + "predefined": "SECTION_PREDEFINED_COLOR_UNSPECIFIED" + } + } + } + }, + { + "id": { + "value": "469c61d5-9a4b-4193-a477-33759997ff54" + }, + "rows": [ + { + "id": { + "value": "cba8703d-9074-427d-be10-3c18465a74a1" + }, + "appearance": { + "height": 19 + }, + "widgets": [ + { + "id": { + "value": "9244e0ea-fe55-4308-8d5e-846266bc503b" + }, + "title": "Memory usage (in %)", + "definition": { + "lineChart": { + "legend": { + "isVisible": true, + "columns": [], + "groupByQuery": true, + "placement": "LEGEND_PLACEMENT_AUTO" + }, + "tooltip": { + "showLabels": false, + "type": "TOOLTIP_TYPE_ALL" + }, + "queryDefinitions": [ + { + "id": "65f68702-6215-4972-b18a-a36e3663bccc", + "query": { + "logs": { + "luceneQuery": { + "value": "app:\"codeengine\" AND message.metric:\"instance-resources\"" + }, + "groupBy": [], + "aggregations": [ + { + "max": { + "observationField": { + "keypath": [ + "message", + "memory", + "usage" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + } + } + ], + "filters": [], + "groupBys": [ + { + "keypath": [ + "message", + 
"component_type" + ], + "scope": "DATASET_SCOPE_USER_DATA" + }, + { + "keypath": [ + "message", + "name" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + ] + } + }, + "seriesCountLimit": "20", + "unit": "UNIT_UNSPECIFIED", + "scaleType": "SCALE_TYPE_LINEAR", + "name": "Query 1", + "isVisible": true, + "colorScheme": "classic", + "resolution": { + "bucketsPresented": 96 + }, + "dataModeType": "DATA_MODE_TYPE_HIGH_UNSPECIFIED", + "customUnit": "", + "decimal": 2 + } + ], + "stackedLine": "STACKED_LINE_UNSPECIFIED" + } + } + }, + { + "id": { + "value": "ea14669a-36da-4958-8503-1a00f7d9d5ed" + }, + "title": "Memory usage (in MB)", + "definition": { + "lineChart": { + "legend": { + "isVisible": true, + "columns": [], + "groupByQuery": true, + "placement": "LEGEND_PLACEMENT_AUTO" + }, + "tooltip": { + "showLabels": false, + "type": "TOOLTIP_TYPE_ALL" + }, + "queryDefinitions": [ + { + "id": "b461bcb4-58d3-4c85-af14-4c9a89474dc2", + "query": { + "logs": { + "luceneQuery": { + "value": "app:\"codeengine\" AND message.metric:\"instance-resources\"" + }, + "groupBy": [], + "aggregations": [ + { + "max": { + "observationField": { + "keypath": [ + "message", + "memory", + "current" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + } + } + ], + "filters": [], + "groupBys": [ + { + "keypath": [ + "message", + "component_type" + ], + "scope": "DATASET_SCOPE_USER_DATA" + }, + { + "keypath": [ + "message", + "name" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + ] + } + }, + "seriesCountLimit": "20", + "unit": "UNIT_UNSPECIFIED", + "scaleType": "SCALE_TYPE_LINEAR", + "name": "Query 1", + "isVisible": true, + "colorScheme": "classic", + "resolution": { + "bucketsPresented": 96 + }, + "dataModeType": "DATA_MODE_TYPE_HIGH_UNSPECIFIED", + "customUnit": "", + "decimal": 2 + } + ], + "stackedLine": "STACKED_LINE_UNSPECIFIED" + } + } + } + ] + } + ], + "options": { + "custom": { + "name": "Memory consumption", + "collapsed": false, + "color": { + "predefined": 
"SECTION_PREDEFINED_COLOR_UNSPECIFIED" + } + } + } + }, + { + "id": { + "value": "33fa5523-5d8a-4145-a7a6-70e4d2d2b1f4" + }, + "rows": [ + { + "id": { + "value": "652c4e29-a22f-4930-bd67-b61b3e89232d" + }, + "appearance": { + "height": 19 + }, + "widgets": [ + { + "id": { + "value": "763356d2-690c-4d41-906b-0f7c6e583a23" + }, + "title": "Disk usage (in MB)", + "definition": { + "lineChart": { + "legend": { + "isVisible": true, + "columns": [], + "groupByQuery": true, + "placement": "LEGEND_PLACEMENT_AUTO" + }, + "tooltip": { + "showLabels": false, + "type": "TOOLTIP_TYPE_ALL" + }, + "queryDefinitions": [ + { + "id": "3f5be437-9aad-4f0e-a944-884ebfab3a0c", + "query": { + "logs": { + "luceneQuery": { + "value": "app:\"codeengine\" AND message.metric:\"instance-resources\"" + }, + "groupBy": [], + "aggregations": [ + { + "max": { + "observationField": { + "keypath": [ + "message", + "disk_usage", + "current" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + } + } + ], + "filters": [], + "groupBys": [ + { + "keypath": [ + "message", + "component_type" + ], + "scope": "DATASET_SCOPE_USER_DATA" + }, + { + "keypath": [ + "message", + "name" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + ] + } + }, + "seriesCountLimit": "20", + "unit": "UNIT_UNSPECIFIED", + "scaleType": "SCALE_TYPE_LINEAR", + "name": "Query 1", + "isVisible": true, + "colorScheme": "classic", + "resolution": { + "bucketsPresented": 96 + }, + "dataModeType": "DATA_MODE_TYPE_HIGH_UNSPECIFIED", + "customUnit": "", + "decimal": 2 + } + ], + "stackedLine": "STACKED_LINE_UNSPECIFIED" + } + } + } + ] + } + ], + "options": { + "custom": { + "name": "Disk usage", + "collapsed": false, + "color": { + "predefined": "SECTION_PREDEFINED_COLOR_UNSPECIFIED" + } + } + } + } + ] + }, + "variables": [], + "filters": [ + { + "source": { + "logs": { + "operator": { + "equals": { + "selection": { + "list": { + "values": [] + } + } + } + }, + "observationField": { + "keypath": [ + "label", + "Project" + ], + "scope": 
"DATASET_SCOPE_USER_DATA" + } + } + }, + "enabled": true, + "collapsed": false + }, + { + "source": { + "logs": { + "operator": { + "equals": { + "selection": { + "list": { + "values": [] + } + } + } + }, + "observationField": { + "keypath": [ + "message", + "component_name" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + } + }, + "enabled": true, + "collapsed": false + }, + { + "source": { + "logs": { + "operator": { + "equals": { + "selection": { + "list": { + "values": [] + } + } + } + }, + "observationField": { + "keypath": [ + "message", + "parent" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + } + }, + "enabled": true, + "collapsed": false + }, + { + "source": { + "logs": { + "operator": { + "equals": { + "selection": { + "list": { + "values": [] + } + } + } + }, + "observationField": { + "keypath": [ + "message", + "name" + ], + "scope": "DATASET_SCOPE_USER_DATA" + } + } + }, + "enabled": true, + "collapsed": false + } + ], + "relativeTimeFrame": "86400s", + "annotations": [], + "off": {} +} \ No newline at end of file