diff --git a/cluster-init/Makefile b/cluster-init/Makefile index 5445519..03f61b1 100644 --- a/cluster-init/Makefile +++ b/cluster-init/Makefile @@ -2,11 +2,12 @@ overlay := non-prod .PHONY: default default: ## Deploy cluster management tools - cd ../kube-state-metrics/ && make cd ../sealed-secrets && make overlay=$(overlay) cd ../cert-manager && make overlay=$(overlay) cd ../istio && make overlay=$(overlay) - cd ../observability/ && make + cd ../observability/kube-state-metrics/ && make + cd ../observability/prometheus && make + cd ../observability/oms-agent && make cd ../egress && make overlay=$(overlay) cd ../argo-cd && make overlay=$(overlay) @@ -14,11 +15,12 @@ default: ## Deploy cluster management tools delete: ## Remove cluster management tools cd ../argo-cd && make delete overlay=$(overlay) || true cd ../egress && make delete overlay=$(overlay) || true - cd ../observability/ && make delete || true + cd ../observability/oms-agent && make delete || true + cd ../observability/prometheus && make delete || true + cd ../observability/kube-state-metrics/ && make delete || true cd ../istio && make delete overlay=$(overlay) || true cd ../cert-manager && make delete overlay=$(overlay) || true cd ../sealed-secrets && make delete overlay=$(overlay) || true - cd ../kube-state-metrics/ && make delete || true .PHONY: help help: ## Display this help screen diff --git a/istio/Makefile b/istio/Makefile index f1eed03..6da47aa 100644 --- a/istio/Makefile +++ b/istio/Makefile @@ -37,6 +37,17 @@ init1: ## Install SSL certs and Istio profile init2: ## Install custom manifests kustomize build overlays-2/$(overlay) | kubectl apply -f - +.PHONY: restart_proxies +restart_proxies: ## Restarts all istio dataplane proxies, can be used when rolling out upgrade + kubectl rollout restart deployment/argocd-application-controller -n argocd + kubectl rollout restart deployment/argocd-dex-server -n argocd + kubectl rollout restart deployment/argocd-redis -n argocd + kubectl rollout restart 
deployment/argocd-repo-server -n argocd + kubectl rollout restart deployment/argocd-server -n argocd + kubectl rollout restart deployment/doc-index-updater -n doc-index-updater + kubectl rollout restart deployment/medicines-api -n medicines-api + cd ../observability/prometheus && make + .PHONY: delete delete: ## Remove Istio kubectl delete istiooperators.install.istio.io -n istio-system istiocontrolplane --ignore-not-found || true diff --git a/istio/init-1/profile.yaml b/istio/init-1/profile.yaml index 796a92c..7d8428f 100644 --- a/istio/init-1/profile.yaml +++ b/istio/init-1/profile.yaml @@ -7,15 +7,6 @@ spec: meshConfig: outboundTrafficPolicy: mode: REGISTRY_ONLY - addonComponents: - kiali: - enabled: true - grafana: - enabled: false - prometheus: - enabled: true - tracing: - enabled: true components: pilot: enabled: true @@ -31,24 +22,6 @@ spec: patches: - path: spec.minReplicas value: 2 - telemetry: - enabled: true - k8s: - resources: - requests: - cpu: "200m" - memory: "500M" - overlays: - - kind: Deployment - name: istio-telemetry - patches: - - path: spec.replicas - value: 2 - - kind: HorizontalPodAutoscaler - name: istio-telemetry - patches: - - path: spec.minReplicas - value: 2 ingressGateways: - name: istio-ingressgateway enabled: true @@ -58,9 +31,3 @@ spec: values: sidecarInjectorWebhook: rewriteAppHTTPProbe: true - telemetry: - enabled: true - v1: - enabled: false - v2: - enabled: true diff --git a/kube-state-metrics/base/cluster-role-binding.yaml b/kube-state-metrics/base/cluster-role-binding.yaml deleted file mode 100644 index 7363054..0000000 --- a/kube-state-metrics/base/cluster-role-binding.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.5 - name: kube-state-metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: kube-state-metrics -subjects: - - kind: ServiceAccount 
- name: kube-state-metrics - namespace: kube-system diff --git a/kube-state-metrics/base/cluster-role.yaml b/kube-state-metrics/base/cluster-role.yaml deleted file mode 100644 index 67fd5b7..0000000 --- a/kube-state-metrics/base/cluster-role.yaml +++ /dev/null @@ -1,117 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.5 - name: kube-state-metrics -rules: - - apiGroups: - - "" - resources: - - configmaps - - secrets - - nodes - - pods - - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - - endpoints - verbs: - - list - - watch - - apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - - ingresses - verbs: - - list - - watch - - apiGroups: - - apps - resources: - - statefulsets - - daemonsets - - deployments - - replicasets - verbs: - - list - - watch - - apiGroups: - - batch - resources: - - cronjobs - - jobs - verbs: - - list - - watch - - apiGroups: - - autoscaling - resources: - - horizontalpodautoscalers - verbs: - - list - - watch - - apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create - - apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create - - apiGroups: - - policy - resources: - - poddisruptionbudgets - verbs: - - list - - watch - - apiGroups: - - certificates.k8s.io - resources: - - certificatesigningrequests - verbs: - - list - - watch - - apiGroups: - - storage.k8s.io - resources: - - storageclasses - - volumeattachments - verbs: - - list - - watch - - apiGroups: - - admissionregistration.k8s.io - resources: - - mutatingwebhookconfigurations - - validatingwebhookconfigurations - verbs: - - list - - watch - - apiGroups: - - networking.k8s.io - resources: - - networkpolicies - verbs: - - list - - watch - - apiGroups: - - coordination.k8s.io - 
resources: - - leases - verbs: - - list - - watch diff --git a/kube-state-metrics/base/deployment.yaml b/kube-state-metrics/base/deployment.yaml deleted file mode 100644 index cd21903..0000000 --- a/kube-state-metrics/base/deployment.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.5 - name: kube-state-metrics -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: kube-state-metrics - template: - metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.5 - spec: - containers: - - image: quay.io/coreos/kube-state-metrics:v1.9.5 - livenessProbe: - httpGet: - path: /healthz - port: 8080 - initialDelaySeconds: 5 - timeoutSeconds: 5 - name: kube-state-metrics - ports: - - containerPort: 8080 - name: http-metrics - - containerPort: 8081 - name: telemetry - readinessProbe: - httpGet: - path: / - port: 8081 - initialDelaySeconds: 5 - timeoutSeconds: 5 - securityContext: - runAsUser: 65534 - nodeSelector: - kubernetes.io/os: linux - serviceAccountName: kube-state-metrics diff --git a/kube-state-metrics/base/kustomization.yaml b/kube-state-metrics/base/kustomization.yaml deleted file mode 100644 index df94033..0000000 --- a/kube-state-metrics/base/kustomization.yaml +++ /dev/null @@ -1,8 +0,0 @@ -namespace: kube-system - -resources: - - cluster-role-binding.yaml - - cluster-role.yaml - - deployment.yaml - - service-account.yaml - - service.yaml diff --git a/kube-state-metrics/base/service-account.yaml b/kube-state-metrics/base/service-account.yaml deleted file mode 100644 index 2357dc2..0000000 --- a/kube-state-metrics/base/service-account.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.5 - name: kube-state-metrics diff --git a/kube-state-metrics/base/service.yaml 
b/kube-state-metrics/base/service.yaml deleted file mode 100644 index e6346ba..0000000 --- a/kube-state-metrics/base/service.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.5 - name: kube-state-metrics -spec: - clusterIP: None - ports: - - name: http-metrics - port: 8080 - targetPort: http-metrics - - name: telemetry - port: 8081 - targetPort: telemetry - selector: - app.kubernetes.io/name: kube-state-metrics diff --git a/observability/README.md b/observability/README.md new file mode 100644 index 0000000..63a250d --- /dev/null +++ b/observability/README.md @@ -0,0 +1,63 @@ +# Monitoring + +## AKS + +Azure Kubernetes Service (AKS) provides good high-level monitoring of the cluster, such as the CPU and memory usage of each node in the cluster. To view this find the cluster in the Azure portal and then click on the "Insights" tab. + +## Custom dashboards + +We have custom dashboards for the doc-index-updater that can be found by searching for "Shared Dashboards" in the Azure Portal. + +They are set up in the following way: + +- [Prometheus](https://prometheus.io/) scrapes metrics from different pods in the cluster (such as [Istio](https://istio.io/) and [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics#overview)). +- [Azure's OMS agent](https://docs.microsoft.com/en-us/azure/azure-monitor/platform/log-analytics-agent) scrapes this data and adds it to the logs analytics workspace for the cluster. +- The Azure Monitor dashboard runs queries against the log analytics workspace and plots the results. + +### Prometheus + +Prometheus is no longer installed by Istio, so we have a set of [manifests](./prometheus) for that. 
+ +There are two parts to the [config](./prometheus/overlay/prometheus-cm.yaml): + +- `prometheus.yml` specifies what pods to scrape and other general settings +- `prometheus.rules.yml` specifies some [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/), basically each rule is a query that Prometheus runs regularly and stores the results as a new metric. These are what we export to the Azure Monitor (by setting the `azure_monitor: true` label for each rule, see the Azure OMS agent section below). + +Prometheus [stores its data locally on disk](https://prometheus.io/docs/prometheus/latest/storage/). This means that if the Prometheus pod is deleted then Prometheus's database is deleted as well. **This happens if you run `make` in the deployments repo** in order to force Prometheus to refresh its config. It is possible to make [Prometheus reload its config whilst still running](https://prometheus.io/docs/prometheus/latest/configuration/configuration/) if you enable the `--web.enable-lifecycle` flag but I haven't figured out how to inject that into the Istio profile yet. + +### Azure OMS agent + +The OMS agent pulls logs and metrics from the Kubernetes cluster and adds them to a log analytics workspace. + +This is configured by the `oms_agent` block in terraform (in the [products](https://github.com/MHRA/products) repo): + +```terraform +resource "azurerm_kubernetes_cluster" "cluster" { + # ...other properties... + + addon_profile { + oms_agent { + enabled = true + log_analytics_workspace_id = azurerm_log_analytics_workspace.cluster.id + } + } +} +``` + +The configuration for the OMS agent lives [here](./oms-agent/container-azm-ms-agentconfig.yaml). 
+ +In this configuration we tell the OMS agent to only scrape Prometheus metrics which have the label `azure_monitor: true` by setting the scrape URLs in `prometheus-data-collection-settings` to: + +```yaml +urls = [ +"http://prometheus.istio-system.svc.cluster.local:9090/federate?match[]={azure_monitor=%22true%22}" +] +``` + +(This uses [Prometheus federation](https://prometheus.io/docs/prometheus/latest/federation/)). + +### Azure Monitor Dashboard + +The code for the dashboard lives in terraform in [modules/cluster/dashboard.tf](../modules/cluster/dashboard.tf). The JSON code for the dashboard is pretty gnarly so if you want to make changes I would recommend making them in the UI, then exporting the dashboard as JSON and popping that into terraform (and don't forget to parametrise things like the subscription id etc). + +The queries for the Azure Monitor dashboard are written using Azure's [Kusto Query Language (KQL)](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/concepts/). 
diff --git a/observability/kube-state-metrics/.gitignore b/observability/kube-state-metrics/.gitignore new file mode 100644 index 0000000..6eb86f1 --- /dev/null +++ b/observability/kube-state-metrics/.gitignore @@ -0,0 +1 @@ +install.yaml diff --git a/kube-state-metrics/Makefile b/observability/kube-state-metrics/Makefile similarity index 69% rename from kube-state-metrics/Makefile rename to observability/kube-state-metrics/Makefile index 06ef988..ab6d756 100644 --- a/kube-state-metrics/Makefile +++ b/observability/kube-state-metrics/Makefile @@ -1,10 +1,10 @@ .PHONY: default default: ## Deploy using Kustomize - kustomize build ./base | kubectl apply -f - + kustomize build ./overlay | kubectl apply -f - .PHONY: delete delete: ## Deploy using Kustomize - kustomize build ./base | kubectl delete --ignore-not-found -f - || true + kustomize build ./overlay | kubectl delete --ignore-not-found -f - || true .PHONY: help help: ## Display this help screen diff --git a/observability/kube-state-metrics/README.md b/observability/kube-state-metrics/README.md new file mode 100644 index 0000000..82cfdcb --- /dev/null +++ b/observability/kube-state-metrics/README.md @@ -0,0 +1,18 @@ +# kube-state-metrics install + +The `generated` folder contains the vanilla installation manifests for `kube-state-metrics`. 
To refresh generated manifests, fetch the helm repo (see below) and split them into the `generated` folder, like this: + +```bash + +helm repo add kube-state-metrics https://kubernetes.github.io/kube-state-metrics +helm repo update + +helm template kube-state-metrics kube-state-metrics/kube-state-metrics >install.yaml + +# install https://github.com/mogensen/kubernetes-split-yaml +go get -v github.com/mogensen/kubernetes-split-yaml + +# splits the yaml into resource oriented manifests and stores them in the `generated` folder +~/go/bin/kubernetes-split-yaml install.yaml + +``` diff --git a/observability/kube-state-metrics/generated/kube-state-metrics-cr.yaml b/observability/kube-state-metrics/generated/kube-state-metrics-cr.yaml new file mode 100755 index 0000000..1f96287 --- /dev/null +++ b/observability/kube-state-metrics/generated/kube-state-metrics-cr.yaml @@ -0,0 +1,146 @@ +# Source: kube-state-metrics/templates/role.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + helm.sh/chart: kube-state-metrics-2.13.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: kube-state-metrics + name: kube-state-metrics +rules: + +- apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - configmaps + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - cronjobs + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - daemonsets + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - deployments + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - endpoints + verbs: ["list", "watch"] + +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + 
resources: + - jobs + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - limitranges + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - namespaces + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumeclaims + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumes + verbs: ["list", "watch"] + +- apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - pods + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - replicasets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - replicationcontrollers + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - resourcequotas + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - secrets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - services + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - validatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - volumeattachments + verbs: ["list", "watch"] diff --git a/observability/kube-state-metrics/generated/kube-state-metrics-crb.yaml b/observability/kube-state-metrics/generated/kube-state-metrics-crb.yaml new file mode 100755 index 0000000..47d58d0 --- /dev/null +++ b/observability/kube-state-metrics/generated/kube-state-metrics-crb.yaml @@ -0,0 +1,18 @@ +# Source: 
kube-state-metrics/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + helm.sh/chart: kube-state-metrics-2.13.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: kube-state-metrics + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: kube-system diff --git a/observability/kube-state-metrics/generated/kube-state-metrics-deployment.yaml b/observability/kube-state-metrics/generated/kube-state-metrics-deployment.yaml new file mode 100755 index 0000000..a903dec --- /dev/null +++ b/observability/kube-state-metrics/generated/kube-state-metrics-deployment.yaml @@ -0,0 +1,103 @@ +# Source: kube-state-metrics/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: kube-system + labels: + app.kubernetes.io/name: kube-state-metrics + helm.sh/chart: "kube-state-metrics-2.13.2" + app.kubernetes.io/instance: "kube-state-metrics" + app.kubernetes.io/managed-by: "Helm" + app.kubernetes.io/version: "1.9.8" +spec: + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics + replicas: 1 + template: + metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: "kube-state-metrics" + spec: + hostNetwork: false + serviceAccountName: kube-state-metrics + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsUser: 65534 + containers: + - name: kube-state-metrics + args: + - --collectors=certificatesigningrequests + + - --collectors=configmaps + + - --collectors=cronjobs + + - --collectors=daemonsets + + - --collectors=deployments + + - --collectors=endpoints + + - --collectors=horizontalpodautoscalers + + - --collectors=ingresses + + - --collectors=jobs + + - --collectors=limitranges + + - 
--collectors=mutatingwebhookconfigurations + + - --collectors=namespaces + + - --collectors=networkpolicies + + - --collectors=nodes + + - --collectors=persistentvolumeclaims + + - --collectors=persistentvolumes + + - --collectors=poddisruptionbudgets + + - --collectors=pods + + - --collectors=replicasets + + - --collectors=replicationcontrollers + + - --collectors=resourcequotas + + - --collectors=secrets + + - --collectors=services + + - --collectors=statefulsets + + - --collectors=storageclasses + + - --collectors=validatingwebhookconfigurations + + - --collectors=volumeattachments + + - --telemetry-port=8081 + imagePullPolicy: IfNotPresent + image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v1.9.8" + ports: + - containerPort: 8080 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 diff --git a/observability/kube-state-metrics/generated/kube-state-metrics-sa.yaml b/observability/kube-state-metrics/generated/kube-state-metrics-sa.yaml new file mode 100755 index 0000000..7d2951d --- /dev/null +++ b/observability/kube-state-metrics/generated/kube-state-metrics-sa.yaml @@ -0,0 +1,14 @@ +--- +# Source: kube-state-metrics/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + helm.sh/chart: kube-state-metrics-2.13.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: kube-state-metrics + name: kube-state-metrics + namespace: kube-system +imagePullSecrets: + [] diff --git a/observability/kube-state-metrics/generated/kube-state-metrics-svc.yaml b/observability/kube-state-metrics/generated/kube-state-metrics-svc.yaml new file mode 100755 index 0000000..20d5d7a --- /dev/null +++ b/observability/kube-state-metrics/generated/kube-state-metrics-svc.yaml @@ -0,0 +1,25 @@ +# Source: kube-state-metrics/templates/service.yaml 
+apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: kube-system + labels: + app.kubernetes.io/name: kube-state-metrics + helm.sh/chart: "kube-state-metrics-2.13.2" + app.kubernetes.io/instance: "kube-state-metrics" + app.kubernetes.io/managed-by: "Helm" + annotations: + prometheus.io/scrape: "true" +spec: + type: "ClusterIP" + clusterIP: None + ports: + - name: "http" + protocol: TCP + port: 8080 + targetPort: 8080 + + selector: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: kube-state-metrics diff --git a/observability/kube-state-metrics/generated/kustomization.yaml b/observability/kube-state-metrics/generated/kustomization.yaml new file mode 100644 index 0000000..60a92ba --- /dev/null +++ b/observability/kube-state-metrics/generated/kustomization.yaml @@ -0,0 +1,6 @@ +resources: + - kube-state-metrics-cr.yaml + - kube-state-metrics-crb.yaml + - kube-state-metrics-deployment.yaml + - kube-state-metrics-sa.yaml + - kube-state-metrics-svc.yaml diff --git a/observability/kube-state-metrics/overlay/kube-state-metrics-deployment.yaml b/observability/kube-state-metrics/overlay/kube-state-metrics-deployment.yaml new file mode 100755 index 0000000..78790b1 --- /dev/null +++ b/observability/kube-state-metrics/overlay/kube-state-metrics-deployment.yaml @@ -0,0 +1,20 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: kube-system + # labels: + # app.kubernetes.io/version: "2.0.0" +spec: + template: + spec: + containers: + - name: kube-state-metrics + # image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.0.0" + resources: + limits: + cpu: 500m + memory: 500Mi + requests: + cpu: 200m + memory: 200Mi diff --git a/observability/kube-state-metrics/overlay/kube-state-metrics-svc.yaml b/observability/kube-state-metrics/overlay/kube-state-metrics-svc.yaml new file mode 100755 index 0000000..d2f2eb2 --- /dev/null +++ 
b/observability/kube-state-metrics/overlay/kube-state-metrics-svc.yaml @@ -0,0 +1,8 @@ +# Source: kube-state-metrics/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: kube-system +spec: + clusterIP: None diff --git a/observability/kube-state-metrics/overlay/kustomization.yaml b/observability/kube-state-metrics/overlay/kustomization.yaml new file mode 100644 index 0000000..5a7f2dd --- /dev/null +++ b/observability/kube-state-metrics/overlay/kustomization.yaml @@ -0,0 +1,6 @@ +resources: + - ../generated + +patchesStrategicMerge: + - kube-state-metrics-deployment.yaml + - kube-state-metrics-svc.yaml diff --git a/observability/oms-agent/Makefile b/observability/oms-agent/Makefile new file mode 100644 index 0000000..b36b4dc --- /dev/null +++ b/observability/oms-agent/Makefile @@ -0,0 +1,15 @@ + +.PHONY: default +default: apply ## Create resources + +.PHONY: apply +apply: ## Apply kubernetes manifests + kustomize build . | kubectl apply -f - + +.PHONY: delete +delete: ## Delete resources + kustomize build . 
| kubectl delete --ignore-not-found -f - || true + +.PHONY: help +help: ## Display this help screen + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' diff --git a/observability/container-azm-ms-agentconfig.yaml b/observability/oms-agent/container-azm-ms-agentconfig.yaml similarity index 100% rename from observability/container-azm-ms-agentconfig.yaml rename to observability/oms-agent/container-azm-ms-agentconfig.yaml diff --git a/observability/kustomization.yaml b/observability/oms-agent/kustomization.yaml similarity index 62% rename from observability/kustomization.yaml rename to observability/oms-agent/kustomization.yaml index 90c5901..4b396dd 100644 --- a/observability/kustomization.yaml +++ b/observability/oms-agent/kustomization.yaml @@ -1,3 +1,2 @@ resources: - container-azm-ms-agentconfig.yaml - - prometheus-configmap.yaml diff --git a/observability/prometheus-configmap.yaml b/observability/prometheus-configmap.yaml deleted file mode 100644 index b793df3..0000000 --- a/observability/prometheus-configmap.yaml +++ /dev/null @@ -1,356 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - annotations: - install.operator.istio.io/chart-owner: AddonComponents - labels: - app: prometheus - install.operator.istio.io/owner-group: install.istio.io - install.operator.istio.io/owner-kind: IstioOperator - install.operator.istio.io/owner-name: istiocontrolplane - release: istio - name: prometheus - namespace: istio-system -data: - prometheus.rules.yml: | - groups: - - name: traffic - rules: - - record: job:incoming_requests_per_second_per_pod:mean - expr: | - sum by (namespace, pod_name) (rate(istio_requests_total[1m])) - labels: - azure_monitor: true - - - name: request_latency - rules: - - record: job:success_response_latency_milliseconds_per_pod:mean - expr: | - sum by (namespace, pod_name) (rate(istio_request_duration_milliseconds_sum{response_code!~"5.."}[1m])) - / - sum by 
(namespace, pod_name) (rate(istio_request_duration_milliseconds_count{response_code!~"5.."}[1m])) - labels: - azure_monitor: true - - - record: job:error_response_latency_milliseconds_per_pod:mean - expr: | - sum by (namespace, pod_name) (rate(istio_request_duration_milliseconds_sum{response_code=~"5.."}[1m])) - / - sum by (namespace, pod_name) (rate(istio_request_duration_milliseconds_count{response_code=~"5.."}[1m])) - labels: - azure_monitor: true - - - name: request_error_rate - rules: - - record: job:request_error_rate_per_pod:mean - expr: | - sum by (namespace, pod_name) (rate(istio_request_duration_milliseconds_count{response_code=~"5.."}[1m])) - / - sum by (namespace, pod_name) (rate(istio_request_duration_milliseconds_count{response_code!~"5.."}[1m])) - labels: - azure_monitor: true - - - name: saturation - rules: - - record: job:cpu_usage_seconds_per_pod:mean - expr: | - sum by (namespace, pod) (rate(container_cpu_usage_seconds_total{image!=""}[1m])) - labels: - azure_monitor: true - - - record: job:memory_usage_percent_per_pod:mean - expr: | - sum(container_memory_working_set_bytes) by (namespace, pod) - / - sum(label_join(kube_pod_container_resource_limits_memory_bytes, "pod", "", "pod")) by (namespace, pod) - labels: - azure_monitor: true - - prometheus.yml: |- - global: - scrape_interval: 15s - - rule_files: - - "prometheus.rules.yml" - - scrape_configs: - - # Mixer scrapping. Defaults to Prometheus and mixer on same namespace. 
- # - - job_name: 'istio-mesh' - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - istio-system - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-telemetry;prometheus - - # Scrape config for envoy stats - - job_name: 'envoy-stats' - metrics_path: /stats/prometheus - kubernetes_sd_configs: - - role: pod - - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_port_name] - action: keep - regex: '.*-envoy-prom' - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:15090 - target_label: __address__ - - action: labeldrop - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod_name - - - job_name: 'istio-policy' - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - istio-system - - - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-policy;http-policy-monitoring - - - job_name: 'istio-telemetry' - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - istio-system - - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-telemetry;http-monitoring - - - job_name: 'pilot' - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - istio-system - - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-pilot;http-monitoring - - - job_name: 'galley' - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - istio-system - - relabel_configs: - - source_labels: 
[__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-galley;http-monitoring - - - job_name: 'citadel' - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - istio-system - - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-citadel;http-monitoring - - - job_name: 'sidecar-injector' - - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - istio-system - - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-sidecar-injector;http-monitoring - - # scrape config for API servers - - job_name: 'kubernetes-apiservers' - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - default - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: kubernetes;https - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - # scrape config for nodes (kubelet) - - job_name: 'kubernetes-nodes' - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics - - # Scrape config for Kubelet cAdvisor. 
- # - # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics - # (those whose names begin with 'container_') have been removed from the - # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to - # retrieve those metrics. - # - # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor - # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics" - # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with - # the --cadvisor-port=0 Kubelet flag). - # - # This job is not necessary and should be removed in Kubernetes 1.6 and - # earlier versions, or it will cause the metrics to be scraped twice. - - job_name: 'kubernetes-cadvisor' - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - - # scrape config for service endpoints. - - job_name: 'kubernetes-service-endpoints' - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) 
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name - - - job_name: 'kubernetes-pods' - kubernetes_sd_configs: - - role: pod - relabel_configs: # If first two labels are present, pod should be scraped by the istio-secure job. - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_sidecar_istio_io_status] - action: drop - regex: (.+) - - source_labels: [__meta_kubernetes_pod_annotation_istio_mtls] - action: drop - regex: (true) - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod_name - - # ^ Above is istio default config - # Below are our additions - - - job_name: 'doc-index-updater' - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - doc-index-updater - relabel_configs: # If first two labels are present, pod should be scraped by the istio-secure job. 
- - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_sidecar_istio_io_status] - action: drop - regex: (.+) - - source_labels: [__meta_kubernetes_pod_annotation_istio_mtls] - action: drop - regex: (true) - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod_name - - - job_name: 'kube-state-metrics' - static_configs: - - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080'] diff --git a/observability/prometheus/.gitignore b/observability/prometheus/.gitignore new file mode 100644 index 0000000..6eb86f1 --- /dev/null +++ b/observability/prometheus/.gitignore @@ -0,0 +1 @@ +install.yaml diff --git a/observability/Makefile b/observability/prometheus/Makefile similarity index 66% rename from observability/Makefile rename to observability/prometheus/Makefile index f054a4d..e2705c2 100644 --- a/observability/Makefile +++ b/observability/prometheus/Makefile @@ -1,10 +1,9 @@ -.PHONY: default -default: apply restart-prometheus ## Create resources - .PHONY: apply apply: ## Apply kubernetes manifests - kustomize build . 
| kubectl apply -f - + kustomize build overlay \ + | istioctl kube-inject -f - \ + | kubectl apply -f - .PHONY: restart-prometheus restart-prometheus: ## Restart prometheus so config changes take effect @@ -12,7 +11,9 @@ restart-prometheus: ## Restart prometheus so config changes take effect .PHONY: delete delete: ## Delete resources - kustomize build . | kubectl delete --ignore-not-found -f - || true + kustomize build overlay \ + | istioctl kube-inject -f - \ + | kubectl delete --ignore-not-found -f - || true .PHONY: help help: ## Display this help screen diff --git a/observability/prometheus/README.md b/observability/prometheus/README.md new file mode 100644 index 0000000..ed97dae --- /dev/null +++ b/observability/prometheus/README.md @@ -0,0 +1,16 @@ +# Prometheus install + +The `generated` folder contains the vanilla installation manifests for Prometheus from the Istio repository. To refresh generated manifests, fetch the relevant source yaml (see below) and split it into the `generated` folder, like this: + +```bash + +# the url will probably change for newer versions of Istio +curl https://raw.githubusercontent.com/istio/istio/release-1.9/samples/addons/prometheus.yaml -o install.yaml + +# install https://github.com/mogensen/kubernetes-split-yaml +go get -v github.com/mogensen/kubernetes-split-yaml + +# splits the yaml into resource oriented manifests and stores them in the `generated` folder +~/go/bin/kubernetes-split-yaml install.yaml + +``` diff --git a/observability/prometheus/generated/kustomization.yaml b/observability/prometheus/generated/kustomization.yaml new file mode 100644 index 0000000..e00efe4 --- /dev/null +++ b/observability/prometheus/generated/kustomization.yaml @@ -0,0 +1,7 @@ +resources: + - prometheus-cm.yaml + - prometheus-cr.yaml + - prometheus-crb.yaml + - prometheus-deployment.yaml + - prometheus-sa.yaml + - prometheus-svc.yaml diff --git a/observability/prometheus/generated/prometheus-cm.yaml 
b/observability/prometheus/generated/prometheus-cm.yaml new file mode 100755 index 0000000..b45acbf --- /dev/null +++ b/observability/prometheus/generated/prometheus-cm.yaml @@ -0,0 +1,274 @@ +# Source: prometheus/templates/server/cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + component: "server" + app: prometheus + release: prometheus + chart: prometheus-11.16.2 + heritage: Helm + name: prometheus + namespace: istio-system +data: + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 15s + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + 
- role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: kubernetes_name + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: kubernetes_node + - job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: kubernetes_name + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: kubernetes_node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: kubernetes_name + - job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: 
replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_pod_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: kubernetes_pod_name + - action: drop + regex: Pending|Succeeded|Failed + source_labels: + - __meta_kubernetes_pod_phase + - job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_pod_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: kubernetes_pod_name + - action: drop + regex: Pending|Succeeded|Failed + source_labels: + - __meta_kubernetes_pod_phase + scrape_interval: 5m + scrape_timeout: 30s + recording_rules.yml: | + {} + rules: | + {} diff --git a/observability/prometheus/generated/prometheus-cr.yaml b/observability/prometheus/generated/prometheus-cr.yaml new file mode 100755 index 0000000..d8eca84 --- /dev/null +++ b/observability/prometheus/generated/prometheus-cr.yaml @@ -0,0 +1,41 @@ +# Source: 
prometheus/templates/server/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + component: "server" + app: prometheus + release: prometheus + chart: prometheus-11.16.2 + heritage: Helm + name: prometheus +rules: + - apiGroups: + - "" + resources: + - nodes + - nodes/proxy + - nodes/metrics + - services + - endpoints + - pods + - ingresses + - configmaps + verbs: + - get + - list + - watch + - apiGroups: + - "extensions" + - "networking.k8s.io" + resources: + - ingresses/status + - ingresses + verbs: + - get + - list + - watch + - nonResourceURLs: + - "/metrics" + verbs: + - get diff --git a/observability/prometheus/generated/prometheus-crb.yaml b/observability/prometheus/generated/prometheus-crb.yaml new file mode 100755 index 0000000..0346d19 --- /dev/null +++ b/observability/prometheus/generated/prometheus-crb.yaml @@ -0,0 +1,19 @@ +# Source: prometheus/templates/server/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + component: "server" + app: prometheus + release: prometheus + chart: prometheus-11.16.2 + heritage: Helm + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: istio-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus diff --git a/observability/prometheus/generated/prometheus-deployment.yaml b/observability/prometheus/generated/prometheus-deployment.yaml new file mode 100755 index 0000000..d4c8edf --- /dev/null +++ b/observability/prometheus/generated/prometheus-deployment.yaml @@ -0,0 +1,93 @@ +# Source: prometheus/templates/server/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + component: "server" + app: prometheus + release: prometheus + chart: prometheus-11.16.2 + heritage: Helm + name: prometheus + namespace: istio-system +spec: + selector: + matchLabels: + component: "server" + app: prometheus + release: prometheus + replicas: 1 + 
template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + labels: + component: "server" + app: prometheus + release: prometheus + chart: prometheus-11.16.2 + heritage: Helm + spec: + serviceAccountName: prometheus + containers: + - name: prometheus-server-configmap-reload + image: "jimmidyson/configmap-reload:v0.4.0" + imagePullPolicy: "IfNotPresent" + args: + - --volume-dir=/etc/config + - --webhook-url=http://127.0.0.1:9090/-/reload + resources: {} + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "prom/prometheus:v2.21.0" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=15d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + initialDelaySeconds: 0 + periodSeconds: 5 + timeoutSeconds: 30 + failureThreshold: 3 + successThreshold: 1 + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 30 + failureThreshold: 3 + successThreshold: 1 + resources: {} + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: prometheus + - name: storage-volume + emptyDir: {} diff --git a/observability/prometheus/generated/prometheus-sa.yaml b/observability/prometheus/generated/prometheus-sa.yaml new file mode 100755 index 0000000..7176fed --- /dev/null +++ b/observability/prometheus/generated/prometheus-sa.yaml @@ -0,0 +1,14 @@ +--- +# Source: prometheus/templates/server/serviceaccount.yaml 
+apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + component: "server" + app: prometheus + release: prometheus + chart: prometheus-11.16.2 + heritage: Helm + name: prometheus + namespace: istio-system + annotations: {} diff --git a/observability/prometheus/generated/prometheus-svc.yaml b/observability/prometheus/generated/prometheus-svc.yaml new file mode 100755 index 0000000..5c288af --- /dev/null +++ b/observability/prometheus/generated/prometheus-svc.yaml @@ -0,0 +1,24 @@ +# Source: prometheus/templates/server/service.yaml +apiVersion: v1 +kind: Service +metadata: + labels: + component: "server" + app: prometheus + release: prometheus + chart: prometheus-11.16.2 + heritage: Helm + name: prometheus + namespace: istio-system +spec: + ports: + - name: http + port: 9090 + protocol: TCP + targetPort: 9090 + selector: + component: "server" + app: prometheus + release: prometheus + sessionAffinity: None + type: "ClusterIP" diff --git a/observability/prometheus/overlay/kustomization.yaml b/observability/prometheus/overlay/kustomization.yaml new file mode 100644 index 0000000..d4687d7 --- /dev/null +++ b/observability/prometheus/overlay/kustomization.yaml @@ -0,0 +1,6 @@ +resources: + - ../generated + +patchesStrategicMerge: + - prometheus-cm.yaml + - prometheus-deployment.yaml diff --git a/observability/prometheus/overlay/prometheus-cm.yaml b/observability/prometheus/overlay/prometheus-cm.yaml new file mode 100755 index 0000000..aacf408 --- /dev/null +++ b/observability/prometheus/overlay/prometheus-cm.yaml @@ -0,0 +1,357 @@ +# Source: prometheus/templates/server/cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + component: "server" + app: prometheus + release: prometheus + chart: prometheus-11.16.2 + heritage: Helm + name: prometheus + namespace: istio-system +data: + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 15s + scrape_timeout: 10s + rule_files: + - 
/etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - 
__meta_kubernetes_service_annotation_prometheus_io_scrape + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: kubernetes_name + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: kubernetes_node + - job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: kubernetes_name + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: kubernetes_node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: kubernetes_name + - job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: 
replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_pod_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: kubernetes_pod_name + - action: drop + regex: Pending|Succeeded|Failed + source_labels: + - __meta_kubernetes_pod_phase + - job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_pod_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: kubernetes_namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: kubernetes_pod_name + - action: drop + regex: Pending|Succeeded|Failed + source_labels: + - __meta_kubernetes_pod_phase + scrape_interval: 5m + scrape_timeout: 30s + + - job_name: 'istiod' + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - istio-system + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istiod;http-monitoring + + - job_name: 'envoy-stats' + metrics_path: /stats/prometheus + kubernetes_sd_configs: + 
- role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + action: keep + regex: '.*-envoy-prom' + scheme: https + tls_config: + ca_file: /etc/prom-certs/root-cert.pem + cert_file: /etc/prom-certs/cert-chain.pem + key_file: /etc/prom-certs/key.pem + insecure_skip_verify: true # Prometheus does not support Istio security naming, thus skip verifying target pod certificate + + + - job_name: 'kube-state-metrics' + static_configs: + - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080'] + + recording_rules.yml: | + {} + rules: | + groups: + - name: traffic + rules: + - record: job:incoming_requests_per_second_per_pod:mean + expr: | + sum by (namespace, pod_name) (rate(istio_requests_total[1m])) + labels: + azure_monitor: true + + - name: request_latency + rules: + - record: job:success_response_latency_milliseconds_per_pod:mean + expr: | + sum by (namespace, pod_name) (rate(istio_request_duration_milliseconds_sum{response_code!~"5.."}[1m])) + / + sum by (namespace, pod_name) (rate(istio_request_duration_milliseconds_count{response_code!~"5.."}[1m])) + labels: + azure_monitor: true + + - record: job:error_response_latency_milliseconds_per_pod:mean + expr: | + sum by (namespace, pod_name) (rate(istio_request_duration_milliseconds_sum{response_code=~"5.."}[1m])) + / + sum by (namespace, pod_name) (rate(istio_request_duration_milliseconds_count{response_code=~"5.."}[1m])) + labels: + azure_monitor: true + + - name: request_error_rate + rules: + - record: job:request_error_rate_per_pod:mean + expr: | + sum by (namespace, pod_name) (rate(istio_request_duration_milliseconds_count{response_code=~"5.."}[1m])) + / + sum by (namespace, pod_name) (rate(istio_request_duration_milliseconds_count{response_code!~"5.."}[1m])) + labels: + azure_monitor: true + + - name: saturation + rules: + - record: job:cpu_usage_seconds_per_pod:mean + expr: | + sum by (namespace, pod) (rate(container_cpu_usage_seconds_total{image!=""}[1m])) + labels: 
+ azure_monitor: true + + - record: job:memory_usage_percent_per_pod:mean + expr: | + sum(container_memory_working_set_bytes) by (namespace, pod) + / + sum(label_join(kube_pod_container_resource_limits_memory_bytes, "pod", "", "pod")) by (namespace, pod) + labels: + azure_monitor: true diff --git a/observability/prometheus/overlay/prometheus-deployment.yaml b/observability/prometheus/overlay/prometheus-deployment.yaml new file mode 100644 index 0000000..6c37da5 --- /dev/null +++ b/observability/prometheus/overlay/prometheus-deployment.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + namespace: istio-system +spec: + template: + metadata: + annotations: + prometheus.io/path: /stats/prometheus + prometheus.io/port: "15020" + prometheus.io/scrape: "true" + sidecar.istio.io/inject: "true" + sidecar.istio.io/status: '{"initContainers":["istio-init"],"containers":["istio-proxy"],"volumes":["istio-envoy","istio-data","istio-podinfo","istio-token","istiod-ca-cert"],"imagePullSecrets":null}' + traffic.sidecar.istio.io/includeInboundPorts: "" # do not intercept any inbound ports + traffic.sidecar.istio.io/includeOutboundIPRanges: "" # do not intercept any outbound traffic + # configure an env variable `OUTPUT_CERTS` to write certificates to the given folder + proxy.istio.io/config: | + proxyMetadata: + OUTPUT_CERTS: /etc/istio-output-certs + sidecar.istio.io/userVolumeMount: '[{"name": "istio-certs", "mountPath": "/etc/istio-output-certs"}]' # mount the shared volume at sidecar proxy + spec: + containers: + - name: prometheus-server + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + runAsGroup: 1337 + runAsNonRoot: true + runAsUser: 1337 + volumeMounts: + - mountPath: /etc/prom-certs/ + name: istio-certs + volumes: + - emptyDir: + medium: Memory + name: istio-certs diff --git a/sealed-secrets/Makefile b/sealed-secrets/Makefile index 
0eb5b40..696c178 100644 --- a/sealed-secrets/Makefile +++ b/sealed-secrets/Makefile @@ -10,7 +10,7 @@ default: ## Create resources [ $$(kubectl get secret \ -n kube-system \ --selector "sealedsecrets.bitnami.com/sealed-secrets-key=active" \ - --no-headers | wc -l) -eq 1 ] || ( \ + --no-headers | wc -l) -gt 0 ] || ( \ az keyvault secret show \ --vault-name $(vault-$(overlay)) \ --name sealing-key \