Merge pull request #401 from Azure/AddNodespaceDefaultAlert
Add proactive and reactive disk usage alerts for node
Sohamdg081992 authored Mar 1, 2023
2 parents 60d3c96 + e75c010 commit c3877f9
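
This commit adds two node filesystem alert rules to DefaultAlerts.json: a proactive rule, NodeFilesystemSpaceFillingUp, which fires when a filesystem is below 40% free and a 6-hour linear extrapolation predicts it will run out of space within 24 hours, and a reactive rule, NodeFilesystemSpaceUsageFull85Pct, which fires once usage exceeds 85%. The remaining additions fill in description annotations for the existing default rules. The sketch below is a simplified restatement of the two new expressions with the avg by (...) aggregations dropped for readability; the authoritative forms are the ones in the diff.

```promql
# Proactive (NodeFilesystemSpaceFillingUp): below 40% free, a 6h linear
# extrapolation of free space crosses zero within 24h, and the filesystem
# is not read-only.
  node_filesystem_avail_bytes{job="node",fstype!=""}
/ node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 40
and predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 24*60*60) < 0
and node_filesystem_readonly{job="node",fstype!=""} == 0

# Reactive (NodeFilesystemSpaceUsageFull85Pct): more than 85% of the filesystem is used.
1 - node_filesystem_avail_bytes{job="node"} / node_filesystem_size_bytes{job="node"} > 0.85
```

Either expression can be sanity-checked against a reachable Prometheus endpoint, for example with `promtool query instant http://localhost:9090 '<expression>'`, before relying on the alert.
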
Showing 1 changed file with 90 additions and 0 deletions.
GeneratedMonitoringArtifacts/Default/DefaultAlerts.json
@@ -45,10 +45,55 @@
"clusterName": "[parameters('clusterName')]",
"interval": "PT1M",
"rules": [
{
"alert": "NodeFilesystemSpaceFillingUp",
"expression": "avg by (namespace,cluster,job,device,instance,mountpoint)(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}) / avg by (namespace,cluster,job,device,instance,mountpoint)(node_filesystem_size_bytes{job=\"node\",fstype!=\"\"}) * 100 < 40 and avg by (namespace,cluster,job,device,instance,mountpoint)(predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60)) < 0 and avg by (namespace,cluster,job,device,instance,mountpoint)(node_filesystem_readonly{job=\"node\",fstype!=\"\"}) == 0",
"for": "PT15M",
"annotations": {
"description": "An extrapolation algorithm predicts that disk space usage for node {{ $labels.instance }} on device {{ $labels.device }} in {{ $labels.cluster}} will run out of space within the upcoming 24 hours. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/node/NodeFilesystemSpaceFillingUp.md)."
},
"labels": {
"severity": "warning"
},
"severity": 3,
"resolveConfiguration": {
"autoResolved": true,
"timeToResolve": "PT10M"
},
"actions": [
{
"actionGroupId": "[parameters('actionGroupResourceId')]"
}
]
},
{
"alert": "NodeFilesystemSpaceUsageFull85Pct",
"expression": "1 - avg by (namespace,cluster,job,device,instance,mountpoint)(node_filesystem_avail_bytes{job=\"node\"}) / avg by (namespace,cluster,job,device,instance,mountpoint)(node_filesystem_size_bytes{job=\"node\"}) > .85",
"for": "PT15M",
"annotations": {
"description": "Disk space usage for node {{ $labels.instance }} on device {{ $labels.device }} in {{ $labels.cluster}} is greater than 85%. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/node/NodeFilesystemAlmostOutOfSpace.md)."
},
"labels": {
"severity": "warning"
},
"severity": 3,
"resolveConfiguration": {
"autoResolved": true,
"timeToResolve": "PT10M"
},
"actions": [
{
"actionGroupId": "[parameters('actionGroupResourceId')]"
}
]
},
{
"alert": "KubePodCrashLooping",
"expression": "max_over_time(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", job=\"kube-state-metrics\"}[5m]) >= 1",
"for": "PT15M",
"annotations": {
"description": "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) in {{ $labels.cluster}} is restarting {{ printf \"%.2f\" $value }} / second. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePodCrashLooping.md)."
},
"labels": {
"severity": "warning"
},
@@ -67,6 +112,9 @@
"alert": "KubePodNotReady",
"expression": "sum by (namespace, pod, cluster) ( max by(namespace, pod, cluster) ( kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"} ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) ( 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!=\"Job\"}) )) > 0",
"for": "PT15M",
"annotations": {
"description": "{{ $labels.namespace }}/{{ $labels.pod }} in {{ $labels.cluster}} is not ready. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePodNotReady.md)."
},
"labels": {
"severity": "warning"
},
@@ -85,6 +133,9 @@
"alert": "KubeDeploymentReplicasMismatch",
"expression": "( kube_deployment_spec_replicas{job=\"kube-state-metrics\"} > kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}) and ( changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m]) == 0)",
"for": "PT15M",
"annotations": {
"description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeDeploymentReplicasMismatch.md)."
},
"labels": {
"severity": "warning"
},
@@ -103,6 +154,9 @@
"alert": "KubeStatefulSetReplicasMismatch",
"expression": "( kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\"}) and ( changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m]) == 0)",
"for": "PT15M",
"annotations": {
"description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeStatefulSetReplicasMismatch.md)."
},
"labels": {
"severity": "warning"
},
@@ -120,6 +174,9 @@
{
"alert": "KubeJobNotCompleted",
"expression": "time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job=\"kube-state-metrics\"} and kube_job_status_active{job=\"kube-state-metrics\"} > 0) > 43200",
"annotations": {
"description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} is taking more than 12 hours to complete. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeJobCompletion.md)."
},
"labels": {
"severity": "warning"
},
@@ -138,6 +195,9 @@
"alert": "KubeJobFailed",
"expression": "kube_job_failed{job=\"kube-state-metrics\"} > 0",
"for": "PT15M",
"annotations": {
"description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} failed to complete. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeJobFailed.md)."
},
"labels": {
"severity": "warning"
},
@@ -156,6 +216,9 @@
"alert": "KubeHpaReplicasMismatch",
"expression": "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"} !=kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}) and(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"} >kube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"}) and(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"} <kube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"}) and changes(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}[15m]) == 0",
"for": "PT15M",
"annotations": {
"description": "Horizontal Pod Autoscaler in {{ $labels.cluster}} has not matched the desired number of replicas for longer than 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeHpaReplicasMismatch.md)."
},
"labels": {
"severity": "warning"
},
@@ -174,6 +237,9 @@
"alert": "KubeHpaMaxedOut",
"expression": "kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"} ==kube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"}",
"for": "PT15M",
"annotations": {
"description": "Horizontal Pod Autoscaler in {{ $labels.cluster}} has been running at max replicas for longer than 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeHpaMaxedOut.md)."
},
"labels": {
"severity": "warning"
},
@@ -192,6 +258,9 @@
"alert": "KubeCPUQuotaOvercommit",
"expression": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(cpu|requests.cpu)\"})) /sum(kube_node_status_allocatable{resource=\"cpu\", job=\"kube-state-metrics\"}) > 1.5",
"for": "PT5M",
"annotations": {
"description": "Cluster {{ $labels.cluster}} has overcommitted CPU resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeCPUQuotaOvercommit.md)."
},
"labels": {
"severity": "warning"
},
@@ -210,6 +279,9 @@
"alert": "KubeMemoryQuotaOvercommit",
"expression": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(memory|requests.memory)\"})) /sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) > 1.5",
"for": "PT5M",
"annotations": {
"description": "Cluster {{ $labels.cluster}} has overcommitted memory resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeMemoryQuotaOvercommit.md)."
},
"labels": {
"severity": "warning"
},
@@ -228,6 +300,9 @@
"alert": "KubeQuotaAlmostFull",
"expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type)(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) > 0.9 < 1",
"for": "PT15M",
"annotations": {
"description": "{{ $value | humanizePercentage }} usage of {{ $labels.resource }} in namespace {{ $labels.namespace }} in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeQuotaAlmostFull.md)."
},
"labels": {
"severity": "info"
},
@@ -246,6 +321,9 @@
"alert": "KubeVersionMismatch",
"expression": "count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"git_version\",\"$1\",\"git_version\",\"(v[0-9]*.[0-9]*).*\"))) > 1",
"for": "PT15M",
"annotations": {
"description": "There are {{ $value }} different versions of Kubernetes components running in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeVersionMismatch.md)."
},
"labels": {
"severity": "warning"
},
@@ -264,6 +342,9 @@
"alert": "KubeNodeNotReady",
"expression": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0",
"for": "PT15M",
"annotations": {
"description": "{{ $labels.node }} in {{ $labels.cluster}} has been unready for more than 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeNotReady.md)."
},
"labels": {
"severity": "warning"
},
@@ -282,6 +363,9 @@
"alert": "KubeNodeUnreachable",
"expression": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1",
"for": "PT15M",
"annotations": {
"description": "{{ $labels.node }} in {{ $labels.cluster}} is unreachable and some workloads may be rescheduled. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeUnreachable.md)."
},
"labels": {
"severity": "warning"
},
@@ -300,6 +384,9 @@
"alert": "KubeletTooManyPods",
"expression": "count by(cluster, node) ( (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"}))/max by(cluster, node) ( kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1) > 0.95",
"for": "PT15M",
"annotations": {
"description": "Kubelet '{{ $labels.node }}' in {{ $labels.cluster}} is running at {{ $value | humanizePercentage }} of its Pod capacity. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeletTooManyPods.md)."
},
"labels": {
"severity": "info"
},
@@ -318,6 +405,9 @@
"alert": "KubeNodeReadinessFlapping",
"expression": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (cluster, node) > 2",
"for": "PT15M",
"annotations": {
"description": "The readiness status of node {{ $labels.node }} in {{ $labels.cluster}} has changed more than 2 times in the last 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeReadinessFlapping.md)."
},
"labels": {
"severity": "warning"
},
