These Kubernetes resources deploy a monitoring and logging stack on Microsoft Azure AKS.

- monitoring: Prometheus & Grafana
- logging: Elasticsearch & Fluentd & Kibana
- start Azure AKS
  - select a VM series, such as the Dsv3-series, that supports Premium Storage
  ```bash
  $ az group create --name k8s --location japaneast
  $ az aks create --resource-group k8s --name k8saks --node-count 3 --ssh-key-value $HOME/.ssh/azure.pub --node-vm-size Standard_D2s_v3 --kubernetes-version 1.11.1
  $ az aks get-credentials --resource-group k8s --name k8saks
  ```
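  As an extra sanity check (not part of the original steps), you can confirm that the three nodes are registered and `Ready` before continuing:
  ```bash
  $ kubectl get nodes
  ```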
- create a ServiceAccount named `tiller` which has the `cluster-admin` ClusterRole
  ```bash
  $ kubectl apply -f rbac/tiller-rbac.yaml
  ```
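  The content of `rbac/tiller-rbac.yaml` is not reproduced here; a typical Tiller RBAC manifest for Helm 2 looks like the sketch below, though the actual file in this repository may differ:
  ```yaml
  # Sketch of a typical Helm 2 Tiller RBAC manifest (the actual rbac/tiller-rbac.yaml may differ):
  # a ServiceAccount named tiller bound to the cluster-admin ClusterRole.
  apiVersion: v1
  kind: ServiceAccount
  metadata:
    name: tiller
    namespace: kube-system
  ---
  apiVersion: rbac.authorization.k8s.io/v1
  kind: ClusterRoleBinding
  metadata:
    name: tiller
  roleRef:
    apiGroup: rbac.authorization.k8s.io
    kind: ClusterRole
    name: cluster-admin
  subjects:
  - kind: ServiceAccount
    name: tiller
    namespace: kube-system
  ```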
- install Helm according to its install guide
- initialize helm using the `tiller` ServiceAccount
  ```bash
  $ helm init --service-account tiller
  $ helm repo update
  ```
- enable the helm charts of coreos
  ```bash
  $ helm repo add coreos https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  ```
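  To check that the repository was added (an optional step, not in the original instructions), you can list its charts:
  ```bash
  $ helm search coreos/
  ```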
- confirm that tiller is launched successfully
  ```bash
  $ kubectl get pod --namespace kube-system -l app=helm -l name=tiller
  ```
- install coreos/prometheus-operator
  ```bash
  $ helm install coreos/prometheus-operator --name pg-op --namespace monitoring
  ```
- confirm that prometheus-operator has been launched
  ```bash
  $ kubectl get jobs --namespace monitoring -l app=prometheus-operator -l release=pg-op
  $ kubectl get pods --namespace monitoring -l app=prometheus-operator -l release=pg-op
  ```
- edit `monitoring/kube-prometheus-azure.yaml`
  - change the persistent volume size, storageClass, and so on if needed (a sketch of such settings follows this step)
  ```bash
  $ vi monitoring/kube-prometheus-azure.yaml
  ```
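  As an illustration, the coreos/kube-prometheus chart accepts a `storageSpec` for Prometheus and Alertmanager in its values file. The storage class and sizes below are assumptions for illustration, not necessarily what this repository's file uses:
  ```yaml
  # Illustrative storageSpec override for the coreos/kube-prometheus chart.
  # The storage class and sizes are assumptions; adjust them to your cluster.
  prometheus:
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: managed-premium   # AKS storage class backed by Premium Storage
          resources:
            requests:
              storage: 30Gi
  alertmanager:
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: managed-premium
          resources:
            requests:
              storage: 8Gi
  ```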
- install Prometheus & Grafana
  ```bash
  $ helm install coreos/kube-prometheus --name pg --namespace monitoring -f monitoring/kube-prometheus-azure.yaml
  ```
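  Optionally (not part of the original steps), check that both releases are reported as DEPLOYED:
  ```bash
  $ helm list
  ```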
- confirm that Prometheus and Grafana are launched
  - confirm that AlertManager is launched successfully
    ```bash
    $ kubectl get persistentvolumeclaims --namespace monitoring -l app=alertmanager
    $ kubectl get pods -n monitoring -l app=alertmanager
    ```
  - confirm that Prometheus is launched successfully
    ```bash
    $ kubectl get persistentvolumeclaims --namespace monitoring -l app=prometheus
    $ kubectl get pods --namespace monitoring -l app=prometheus
    ```
  - confirm that Grafana is launched successfully
    ```bash
    $ kubectl get pods --namespace monitoring -l app=pg-grafana
    ```
  - confirm that node-exporter is launched successfully on each node
    ```bash
    $ kubectl get daemonsets --namespace monitoring
    $ kubectl get pods --namespace monitoring -l app=pg-exporter-node -o wide
    ```
- patch `kube-dns-v20`, since the default kube-dns of Azure AKS does not export DNS metrics
  ```bash
  $ kubectl patch deployment -n kube-system kube-dns-v20 --patch "$(cat monitoring/kube-dns-metrics-patch.yaml)"
  ```
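  The patch file itself is not reproduced here. As a hypothetical sketch only, a strategic merge patch that exposes DNS metrics to Prometheus could look like the following; the container name, port, and annotations are assumptions, and the actual `monitoring/kube-dns-metrics-patch.yaml` may differ:
  ```yaml
  # Hypothetical sketch of a strategic merge patch exposing kube-dns metrics.
  # Container name, port number, and annotations are assumptions; the real
  # monitoring/kube-dns-metrics-patch.yaml in this repository may differ.
  spec:
    template:
      metadata:
        annotations:
          prometheus.io/scrape: "true"
          prometheus.io/port: "10055"
      spec:
        containers:
        - name: kubedns
          ports:
          - name: metrics
            containerPort: 10055
            protocol: TCP
  ```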
- patch the ServiceMonitor resource of `pg-exporter-kubelets` to look for the http endpoints
  ```bash
  $ kubectl get servicemonitors pg-exporter-kubelets --namespace monitoring -o yaml | sed 's/https/http/' | kubectl replace -f -
  ```
- delete the ServiceMonitor of apiserver, because the apiserver of Azure AKS does not allow connecting to it directly
  ```bash
  $ kubectl delete servicemonitor pg-exporter-kubernetes --namespace monitoring
  ```
- delete `alert: DeadMansSwitch`
  ```bash
  $ kubectl edit prometheusrule pg-kube-prometheus --namespace monitoring
  ```
  ```diff
         for: 10m
         labels:
           severity: warning
  -    - alert: DeadMansSwitch
  -      annotations:
  -        description: This is a DeadMansSwitch meant to ensure that the entire Alerting
  -          pipeline is functional.
  -        summary: Alerting DeadMansSwitch
  -      expr: vector(1)
  -      labels:
  -        severity: none
       - expr: process_open_fds / process_max_fds
         record: fd_utilization
       - alert: FdExhaustionClose
  ```
- delete `alert: K8SApiserverDown`
  ```bash
  $ kubectl edit prometheusrule pg-exporter-kubernetes --namespace monitoring
  ```
  ```diff
         for: 10m
         labels:
           severity: critical
  -    - alert: K8SApiserverDown
  -      annotations:
  -        description: No API servers are reachable or all have disappeared from service
  -          discovery
  -        summary: No API servers are reachable
  -      expr: absent(up{job="apiserver"} == 1)
  -      for: 20m
  -      labels:
  -        severity: critical
       - alert: K8sCertificateExpirationNotice
         annotations:
           description: Kubernetes API Certificate is expiring soon (less than 7 days)
  ```
- delete `alert: K8SControllerManagerDown`
  ```bash
  $ kubectl edit prometheusrule pg-exporter-kube-controller-manager --namespace monitoring
  ```
  ```diff
   spec:
     groups:
     - name: kube-controller-manager.rules
  -    rules:
  -    - alert: K8SControllerManagerDown
  -      annotations:
  -        description: There is no running K8S controller manager. Deployments and replication
  -          controllers are not making progress.
  -        runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
  -        summary: Controller manager is down
  -      expr: absent(up{job="kube-controller-manager"} == 1)
  -      for: 5m
  -      labels:
  -        severity: critical
  +    rules: []
  ```
- delete `alert: K8SSchedulerDown`
  ```bash
  $ kubectl edit prometheusrule pg-exporter-kube-scheduler --namespace monitoring
  ```
  ```diff
         labels:
           quantile: "0.5"
         record: cluster:scheduler_binding_latency_seconds:quantile
  -    - alert: K8SSchedulerDown
  -      annotations:
  -        description: There is no running K8S scheduler. New pods are not being assigned
  -          to nodes.
  -        runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
  -        summary: Scheduler is down
  -      expr: absent(up{job="kube-scheduler"} == 1)
  -      for: 5m
  -      labels:
  -        severity: critical
  ```
- port forward to Prometheus
  ```bash
  $ kubectl port-forward $(kubectl get pod --namespace monitoring -l prometheus=kube-prometheus -l app=prometheus -o template --template "{{(index .items 0).metadata.name}}") --namespace monitoring 9090:9090
  ```
- open http://localhost:9090/targets and confirm that no `State` is down
- open http://localhost:9090/alerts and confirm that no `Alert` is fired
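- if you prefer the command line, the same target health can be read from the Prometheus HTTP API (an optional check, not part of the original steps; assumes `jq` is installed):
  ```bash
  $ curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'
  ```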
- port forward to Grafana
  ```bash
  $ kubectl port-forward $(kubectl get pod --namespace monitoring -l app=pg-grafana -o template --template "{{(index .items 0).metadata.name}}") --namespace monitoring 3000:3000
  ```
- log in to Grafana (the initial username/password is admin/admin)
- show the dashboards
  - if you see no graphs, reconfigure the Prometheus datasource (URL: `http://pg-prometheus:9090/`)
  - unfortunately, the status of the Control Plane is N/A because the exporters of the Kubernetes control plane were deleted above
- add a dashboard to show the capacity of persistent volumes
  - import `monitoring/dashboard_persistent_volumes.json`
- install Elasticsearch
  ```bash
  $ kubectl apply -f logging/es-statefulset.yaml
  $ kubectl apply -f logging/es-service.yaml
  ```
- confirm that Elasticsearch is launched
  - confirm that a StatefulSet is running successfully
    ```bash
    $ kubectl get statefulsets --namespace monitoring -l k8s-app=elasticsearch-logging
    ```
  - confirm that two Pods are running successfully
    ```bash
    $ kubectl get pods --namespace monitoring -l k8s-app=elasticsearch-logging
    ```
  - confirm that two PersistentVolumeClaims are bound successfully
    ```bash
    $ kubectl get persistentvolumeclaims --namespace monitoring -l k8s-app=elasticsearch-logging
    ```
  - confirm that a Service is running successfully
    ```bash
    $ kubectl get services --namespace monitoring -l k8s-app=elasticsearch-logging
    ```
- enable routing allocation of the Elasticsearch cluster
  ```bash
  $ kubectl exec -it elasticsearch-logging-0 --namespace monitoring -- curl -H "Content-Type: application/json" -X PUT http://elasticsearch-logging:9200/_cluster/settings -d '{"transient": {"cluster.routing.allocation.enable":"all"}}'
  ```
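  To verify the cluster afterwards (an extra check, not part of the original steps), query the cluster health API in the same way; the status should eventually become green:
  ```bash
  $ kubectl exec -it elasticsearch-logging-0 --namespace monitoring -- curl http://elasticsearch-logging:9200/_cluster/health?pretty
  ```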
- install Fluentd
  ```bash
  $ kubectl apply -f logging/fluentd-es-configmap.yaml
  $ kubectl apply -f logging/fluentd-es-ds.yaml
  ```
- confirm that a DaemonSet is running and a pod is running successfully on each node
  ```bash
  $ kubectl get daemonsets --namespace monitoring -l k8s-app=fluentd-es
  $ kubectl get pods --namespace monitoring -l k8s-app=fluentd-es -o wide
  ```
- install Kibana
  ```bash
  $ kubectl apply -f logging/kibana-deployment.yaml
  $ kubectl apply -f logging/kibana-service.yaml
  ```
- confirm that Kibana is launched successfully
  - confirm that a Deployment is running successfully
    ```bash
    $ kubectl get deployments --namespace monitoring -l k8s-app=kibana-logging
    ```
  - confirm that a Pod is running successfully
    ```bash
    $ kubectl get pods --namespace monitoring -l k8s-app=kibana-logging
    ```
  - confirm that a Service is running successfully
    ```bash
    $ kubectl get services --namespace monitoring -l k8s-app=kibana-logging
    ```
- install Curator
  ```bash
  $ kubectl apply -f logging/curator-configmap.yaml
  $ kubectl apply -f logging/curator-cronjob.yaml
  ```
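  The ConfigMap content is not shown here. For illustration, a Curator action file that deletes old `logstash-*` indices typically looks like the sketch below; the retention period and filter details are assumptions, and the actual `logging/curator-configmap.yaml` may differ:
  ```yaml
  # Illustrative Curator action file that deletes logstash-* indices older than 7 days.
  # The retention period and filters are assumptions; the actual ConfigMap may differ.
  actions:
    1:
      action: delete_indices
      description: Delete logstash indices older than 7 days
      options:
        ignore_empty_list: True
      filters:
      - filtertype: pattern
        kind: prefix
        value: logstash-
      - filtertype: age
        source: name
        direction: older
        timestring: '%Y.%m.%d'
        unit: days
        unit_count: 7
  ```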
- confirm that a CronJob of Curator is registered
  ```bash
  $ kubectl get cronjobs --namespace monitoring -l k8s-app=elasticsearch-curator
  ```
- port forward to Kibana
  ```bash
  $ kubectl port-forward $(kubectl get pod --namespace monitoring -l k8s-app=kibana-logging -o template --template "{{(index .items 0).metadata.name}}") --namespace monitoring 5601:5601
  ```
- set up an index pattern in Kibana from "Management -> Index Patterns"
  - Index Pattern: `logstash-*`
  - Time filter field: `@timestamp`
- open "Configuration -> Data Source" in Grafana and add a DataSource connecting to Elasticsearch
  - Name: `elasticsearch`
  - Type: `Elasticsearch`
  - URL: `http://elasticsearch-logging:9200/`
  - Access: `Server(Default)`
  - Index name: `logstash-*`
  - Time field name: `@timestamp`
  - Version: `5.6+`
- import `monitoring/dashboard_elasticsearch.json` to add the `Elasticsearch` dashboard
  - when you open the `Elasticsearch` dashboard, you can see the raw Elasticsearch log table and the log count graph
Copyright (c) 2018 Nobuyuki Matsui [email protected]