diff --git a/ansible/deploy_swarm_monitoring.yml b/ansible/deploy_swarm_monitoring.yml
new file mode 100644
index 0000000..d5e11f1
--- /dev/null
+++ b/ansible/deploy_swarm_monitoring.yml
@@ -0,0 +1,18 @@
+#
+# Copyright StackHPC, 2018
+#
+---
+- name: Deploy Swarm cluster monitoring  # agents and exporters on every cluster host
+  hosts: cluster
+  become: true
+  roles:
+    - role: monasca_agent_docker
+    - role: monasca_fluentd
+    - role: prometheus-docker-node
+
+- name: Deploy Prometheus server  # Prometheus and Grafana containers on the master group
+  hosts: master
+  become: true
+  roles:
+    - role: prometheus-server
+
diff --git a/ansible/group_vars/all/all b/ansible/group_vars/all/all
index 078c77c..a6964d5 100644
--- a/ansible/group_vars/all/all
+++ b/ansible/group_vars/all/all
@@ -15,7 +15,7 @@ alaska_cloud: alaska
alaska_homedir: /alaska
alaska_softiron: 10.4.99.101
-# OpenStack fully qualified project name
+# OpenStack fully qualified project name (used for Grafana with domain support)
project_name: p3@default
# Virtual IP address of the controller node
@@ -28,6 +28,24 @@ alaska_monitoring_server: 10.60.253.3
monasca_agent_p3_username: p3-monasca-agent
monasca_agent_p3_password: "{{ vault_monasca_agent_password }}"
+# Monasca Fluentd config: log API and Keystone endpoints on the controller VIP
+monasca_fluentd_log_api_uri: "http://{{ controller_vip }}:5607"
+monasca_fluentd_keystone_uri: "http://{{ controller_vip }}:5000"
+monasca_fluentd_username: "{{ monasca_agent_p3_username }}"
+monasca_fluentd_password: "{{ monasca_agent_p3_password }}"
+monasca_fluentd_project_domain_id: default
+monasca_fluentd_project_name: p3
+
+# Monasca Agent (Docker) config: reuses the p3 Monasca agent credentials
+monasca_agent_docker_forwarder_port: 17120
+monasca_agent_docker_log_level: INFO
+monasca_agent_docker_api_uri: "http://{{ controller_vip }}:8082/v2.0"
+monasca_agent_docker_keystone_uri: "http://{{ controller_vip }}:5000/v3"
+monasca_agent_docker_username: "{{ monasca_agent_p3_username }}"
+monasca_agent_docker_password: "{{ monasca_agent_p3_password }}"
+monasca_agent_docker_project_name: p3
+
+
# Local Grafana admin account for configuring Grafana
grafana_admin_username: grafana-admin
grafana_admin_password: "{{ vault_grafana_admin_password }}"
diff --git a/ansible/roles/monasca_agent_docker/tasks/main.yml b/ansible/roles/monasca_agent_docker/tasks/main.yml
new file mode 100644
index 0000000..46ec759
--- /dev/null
+++ b/ansible/roles/monasca_agent_docker/tasks/main.yml
@@ -0,0 +1,116 @@
+#
+# Copyright StackHPC, 2018
+#
+---
+- name: Start monasca-agent-forwarder container  # receives metrics from the collector and statsd containers
+  docker_container:
+    name: monasca-agent-forwarder
+    pull: true
+    image: stackhpc/agent-forwarder:latest
+    state: started
+    volumes:
+      - plugins:/etc/monasca/agent/conf.d:ro  # NOTE(review): "plugins" has no leading slash, so this looks like a named volume rather than the host conf.d — confirm
+    ports:
+      - "{{ monasca_agent_docker_forwarder_port }}:{{ monasca_agent_docker_forwarder_port }}"
+    env:
+      LOG_LEVEL: "{{ monasca_agent_docker_log_level }}"
+      OS_AUTH_URL: "{{ monasca_agent_docker_keystone_uri }}"
+      OS_USERNAME: "{{ monasca_agent_docker_username }}"
+      OS_PASSWORD: "{{ monasca_agent_docker_password }}"
+      OS_USER_DOMAIN_NAME: Default
+      OS_PROJECT_NAME: "{{ monasca_agent_docker_project_name }}"
+      OS_PROJECT_DOMAIN_NAME: Default
+      MONASCA_URL: "{{ monasca_agent_docker_api_uri }}"
+      SERVICE_TYPE: monitoring
+      ENDPOINT_TYPE: public
+      REGION_NAME: RegionOne
+      AGENT_HOSTNAME: "{{ ansible_hostname }}"
+      FORWARDER_URL: "http://monasca-agent-forwarder:{{ monasca_agent_docker_forwarder_port }}"  # host must equal the container name (hyphens only)
+      FORWARDER_PORT: "{{ monasca_agent_docker_forwarder_port }}"
+
+- name: Create Monasca collector plugin directory  # host dir later bind-mounted into the collector container
+  file:
+    path: /etc/monasca/agent/conf.d/
+    state: directory
+    owner: root
+    group: root
+    mode: "0755"
+
+- name: Template Monasca collector plugins  # renders <item>.j2 into conf.d/<item> for each plugin below
+  template:
+    src: "{{ item }}.j2"
+    dest: "/etc/monasca/agent/conf.d/{{ item }}"
+    owner: root
+    mode: "0644"
+  with_items:
+    - cpu.yaml
+    - docker.yaml
+    - disk.yaml
+    - ib_network.yaml
+    - load.yaml
+    - memory.yaml
+    - network.yaml
+
+- name: Wait for forwarder service  # block until the forwarder port accepts connections
+  wait_for:
+    delay: 1
+    port: "{{ monasca_agent_docker_forwarder_port }}"
+
+- name: Start monasca-agent-collector container  # runs the plugins templated into conf.d
+  docker_container:
+    name: monasca-agent-collector
+    pull: true
+    links:
+      - monasca-agent-forwarder
+    image: stackhpc/agent-collector:latest
+    state: started
+    env:
+      DOCKER: "True"  # quoted so the value stays a string rather than a YAML boolean
+      LOG_LEVEL: "{{ monasca_agent_docker_log_level }}"
+      OS_AUTH_URL: "{{ monasca_agent_docker_keystone_uri }}"
+      OS_USERNAME: "{{ monasca_agent_docker_username }}"
+      OS_PASSWORD: "{{ monasca_agent_docker_password }}"
+      OS_USER_DOMAIN_NAME: Default
+      OS_PROJECT_NAME: "{{ monasca_agent_docker_project_name }}"
+      OS_PROJECT_DOMAIN_NAME: Default
+      MONASCA_URL: "{{ monasca_agent_docker_api_uri }}"
+      SERVICE_TYPE: monitoring
+      ENDPOINT_TYPE: public
+      REGION_NAME: RegionOne
+      AGENT_HOSTNAME: "{{ ansible_hostname }}"
+      FORWARDER_URL: "http://monasca-agent-forwarder:{{ monasca_agent_docker_forwarder_port }}"
+      FORWARDER_PORT: "{{ monasca_agent_docker_forwarder_port }}"
+    volumes:
+      - "/:/rootfs"  # NOTE(review): host root is mounted read-write here; the cAdvisor task uses :ro — confirm
+      - "/var/run:/var/run:rw"
+      - "/sys:/sys:ro"
+      - "/var/lib/docker/:/var/lib/docker:ro"
+      - "/dev/disk/:/dev/disk:ro"
+      - "/etc/monasca/agent/conf.d:/etc/monasca/agent/conf.d:ro"
+
+- name: Start monasca-agent-statsd container  # publishes UDP 8125 and links to the forwarder
+  docker_container:
+    name: monasca-agent-statsd
+    pull: true
+    links:
+      - monasca-agent-forwarder
+    image: stackhpc/agent-statsd:latest
+    state: started
+    ports:
+      - "8125:8125/udp"
+    env:
+      DOCKER: "True"  # quoted so the value stays a string rather than a YAML boolean
+      LOG_LEVEL: "{{ monasca_agent_docker_log_level }}"
+      OS_AUTH_URL: "{{ monasca_agent_docker_keystone_uri }}"
+      OS_USERNAME: "{{ monasca_agent_docker_username }}"
+      OS_PASSWORD: "{{ monasca_agent_docker_password }}"
+      OS_USER_DOMAIN_NAME: Default
+      OS_PROJECT_NAME: "{{ monasca_agent_docker_project_name }}"
+      OS_PROJECT_DOMAIN_NAME: Default
+      MONASCA_URL: "{{ monasca_agent_docker_api_uri }}"
+      SERVICE_TYPE: monitoring
+      ENDPOINT_TYPE: public
+      REGION_NAME: RegionOne
+      AGENT_HOSTNAME: "{{ ansible_hostname }}"
+      FORWARDER_URL: "http://monasca-agent-forwarder:{{ monasca_agent_docker_forwarder_port }}"
+      FORWARDER_PORT: "{{ monasca_agent_docker_forwarder_port }}"
diff --git a/ansible/roles/monasca_agent_docker/templates/cpu.yaml.j2 b/ansible/roles/monasca_agent_docker/templates/cpu.yaml.j2
new file mode 100644
index 0000000..2aa6a36
--- /dev/null
+++ b/ansible/roles/monasca_agent_docker/templates/cpu.yaml.j2
@@ -0,0 +1,4 @@
+init_config: null
+instances:
+- built_by: System
+ name: cpu_stats
diff --git a/ansible/roles/monasca_agent_docker/templates/disk.yaml.j2 b/ansible/roles/monasca_agent_docker/templates/disk.yaml.j2
new file mode 100644
index 0000000..a4bb1dd
--- /dev/null
+++ b/ansible/roles/monasca_agent_docker/templates/disk.yaml.j2
@@ -0,0 +1,6 @@
+init_config: null
+instances:
+- built_by: System
+ device_blacklist_re: .*freezer_backup_snap.*
+ ignore_filesystem_types: iso9660,tmpfs
+ name: disk_stats
diff --git a/ansible/roles/monasca_agent_docker/templates/docker.yaml.j2 b/ansible/roles/monasca_agent_docker/templates/docker.yaml.j2
new file mode 100644
index 0000000..2f3f3eb
--- /dev/null
+++ b/ansible/roles/monasca_agent_docker/templates/docker.yaml.j2
@@ -0,0 +1,5 @@
+init_config:
+ docker_root: '/'
+
+instances:
+ - url: 'unix://var/run/docker.sock'
diff --git a/ansible/roles/monasca_agent_docker/templates/ib_network.yaml.j2 b/ansible/roles/monasca_agent_docker/templates/ib_network.yaml.j2
new file mode 100644
index 0000000..7ffd569
--- /dev/null
+++ b/ansible/roles/monasca_agent_docker/templates/ib_network.yaml.j2
@@ -0,0 +1,4 @@
+init_config: null
+instances:
+- built_by: IBNetworkDetect
+ name: ib_network_stats
diff --git a/ansible/roles/monasca_agent_docker/templates/load.yaml.j2 b/ansible/roles/monasca_agent_docker/templates/load.yaml.j2
new file mode 100644
index 0000000..c5ce0ca
--- /dev/null
+++ b/ansible/roles/monasca_agent_docker/templates/load.yaml.j2
@@ -0,0 +1,4 @@
+init_config: null
+instances:
+- built_by: System
+ name: load_stats
diff --git a/ansible/roles/monasca_agent_docker/templates/memory.yaml.j2 b/ansible/roles/monasca_agent_docker/templates/memory.yaml.j2
new file mode 100644
index 0000000..c9854a3
--- /dev/null
+++ b/ansible/roles/monasca_agent_docker/templates/memory.yaml.j2
@@ -0,0 +1,4 @@
+init_config: null
+instances:
+- built_by: System
+ name: memory_stats
diff --git a/ansible/roles/monasca_agent_docker/templates/network.yaml.j2 b/ansible/roles/monasca_agent_docker/templates/network.yaml.j2
new file mode 100644
index 0000000..5188355
--- /dev/null
+++ b/ansible/roles/monasca_agent_docker/templates/network.yaml.j2
@@ -0,0 +1,5 @@
+init_config: null
+instances:
+- built_by: System
+ excluded_interface_re: lo.*|vnet.*|tun.*|ovs.*|br.*|tap.*|qbr.*|qvb.*|qvo.*
+ name: network_stats
diff --git a/ansible/roles/monasca_fluentd/handlers/main.yml b/ansible/roles/monasca_fluentd/handlers/main.yml
new file mode 100644
index 0000000..e2f583c
--- /dev/null
+++ b/ansible/roles/monasca_fluentd/handlers/main.yml
@@ -0,0 +1,6 @@
+---
+- name: Restart Fluentd  # notified by the "Generate Fluentd config" task
+  docker_container:
+    name: fluentd
+    restart: true
+  when: fluentd_config.changed  # fluentd_config is registered by the config template task
diff --git a/ansible/roles/monasca_fluentd/tasks/main.yml b/ansible/roles/monasca_fluentd/tasks/main.yml
new file mode 100644
index 0000000..803f653
--- /dev/null
+++ b/ansible/roles/monasca_fluentd/tasks/main.yml
@@ -0,0 +1,39 @@
+#
+# Copyright StackHPC, 2018
+#
+---
+- name: Create Fluentd user  # owner of /etc/fluentd and the rendered config
+  user:
+    state: present
+    name: fluent
+
+- name: Create Fluentd config directory  # mounted read-only into the fluentd container
+  file:
+    path: /etc/fluentd
+    state: directory
+    owner: fluent
+    group: fluent
+    mode: "0755"
+
+- name: Generate Fluentd config  # a change here triggers the Restart Fluentd handler
+  template:
+    src: fluentd.conf.j2
+    dest: /etc/fluentd/fluentd.conf
+    owner: fluent
+    group: fluent
+    mode: "0644"
+  register: fluentd_config
+  notify: Restart Fluentd
+
+- name: Start Fluentd container  # listens on 24224 (TCP and UDP) for forwarded logs
+  docker_container:
+    image: stackhpc/monasca-fluentd:latest
+    name: fluentd
+    state: started
+    env:
+      FLUENTD_CONF: "fluentd.conf"
+    volumes:
+      - "/etc/fluentd/:/fluentd/etc:ro"
+    ports:
+      - "24224:24224"
+      - "24224:24224/udp"
diff --git a/ansible/roles/monasca_fluentd/templates/fluentd.conf.j2 b/ansible/roles/monasca_fluentd/templates/fluentd.conf.j2
new file mode 100644
index 0000000..609a003
--- /dev/null
+++ b/ansible/roles/monasca_fluentd/templates/fluentd.conf.j2
@@ -0,0 +1,45 @@
+# Accept logs from Docker Fluentd log driver
+<source>
+  @type forward
+  port 24224
+  bind 0.0.0.0
+</source>
+
+# Add a timestamp dimension to all logs to record the event time. The
+# event time is the time extracted from the log message in all cases
+# where the time_key is set, and the time the record entered fluentd
+# if no time_key is set.
+# NOTE(review): the '**' (match every tag) patterns below are assumed - confirm.
+<filter **>
+  @type record_transformer
+  <record>
+    timestamp ${time}
+  </record>
+</filter>
+
+# Docker saves all logs under the 'log' field. The fluentd-monasca
+# plugin assumes that they are saved under the 'message' field. Here
+# we map the 'log' field to the 'message' field for all logs.
+<filter **>
+  @type record_transformer
+  enable_ruby true
+  <record>
+    message ${record["log"]}
+  </record>
+  remove_keys log
+</filter>
+
+<match **>
+  @type copy
+  <store>
+    @type monasca
+    keystone_url {{ monasca_fluentd_keystone_uri }}
+    monasca_log_api {{ monasca_fluentd_log_api_uri }}
+    monasca_log_api_version v3.0
+    username {{ monasca_fluentd_username }}
+    password {{ monasca_fluentd_password }}
+    domain_id {{ monasca_fluentd_project_domain_id }}
+    project_name {{ monasca_fluentd_project_name }}
+  </store>
+</match>
+
diff --git a/ansible/roles/prometheus-docker-node/tasks/main.yml b/ansible/roles/prometheus-docker-node/tasks/main.yml
new file mode 100644
index 0000000..ff6d255
--- /dev/null
+++ b/ansible/roles/prometheus-docker-node/tasks/main.yml
@@ -0,0 +1,23 @@
+---
+- name: Add Prometheus node-exporter  # scraped on port 9100 by the 'node' job
+  docker_container:
+    name: prom-node-exporter
+    image: prom/node-exporter
+    network_mode: host  # TODO should access host fs too
+    ports:  # NOTE(review): publishing is presumably redundant with host networking — confirm
+      - "9100:9100"
+
+- name: Add cAdvisor  # scraped on port 8080 by the 'cAdvisor' job
+  docker_container:
+    name: cAdvisor
+    image: google/cadvisor:latest
+    privileged: true
+    detach: true
+    ports:
+      - "8080:8080"
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:rw
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+      - /dev/disk/:/dev/disk:ro
diff --git a/ansible/roles/prometheus-server/defaults/main.yml b/ansible/roles/prometheus-server/defaults/main.yml
new file mode 100644
index 0000000..72c8632
--- /dev/null
+++ b/ansible/roles/prometheus-server/defaults/main.yml
@@ -0,0 +1,3 @@
+---
+
+prometheus_home: /home/centos/prometheus  # TODO - better default? Host dir for prometheus.yml, alerts.rules and data/
diff --git a/ansible/roles/prometheus-server/files/grafana_dashboard.json b/ansible/roles/prometheus-server/files/grafana_dashboard.json
new file mode 100644
index 0000000..5d16730
--- /dev/null
+++ b/ansible/roles/prometheus-server/files/grafana_dashboard.json
@@ -0,0 +1,406 @@
+{
+ "annotations": {
+ "list": []
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 1,
+ "hideControls": false,
+ "id": 1,
+ "links": [],
+ "refresh": false,
+ "rows": [
+ {
+ "collapse": false,
+ "height": "250px",
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "id": 1,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_bytes{job='node',device='eno1'}[5m])",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} - {{device}}",
+ "refId": "A",
+ "step": 2
+ },
+ {
+ "expr": "irate(node_infiniband_port_data_received_bytes{job='node',device=\"mlx5_0\"}[5m])",
+ "format": "time_series",
+ "hide": true,
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} - {{device}}",
+ "refId": "C",
+ "step": 2
+ },
+ {
+ "expr": "irate(node_infiniband_port_data_transmitted_bytes{job='node',device=\"mlx5_0\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} - {{device}}",
+ "refId": "D",
+ "step": 2
+ },
+ {
+ "expr": "irate(node_network_transmit_bytes{job='node',device='eno1'}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} - {{device}}",
+ "refId": "E",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Network",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_io_time_ms{job='node',device!~'^(md\\\\d+$|dm-)'}[5m]) / 1000 * 100",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} - {{device}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Disk bandwidth",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "100 - (avg by (instance) (irate(node_cpu{job=\"node\",mode=\"idle\"}[5m])) * 100)",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{ instance }}",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "CPU",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "id": 3,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_Active{job=\"node\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Memory Usage",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ }
+ ],
+ "schemaVersion": 14,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-30m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Spark",
+ "version": 11
+}
diff --git a/ansible/roles/prometheus-server/tasks/main.yml b/ansible/roles/prometheus-server/tasks/main.yml
new file mode 100644
index 0000000..02db899
--- /dev/null
+++ b/ansible/roles/prometheus-server/tasks/main.yml
@@ -0,0 +1,101 @@
+---
+- name: Create prometheus data dir  # bind-mounted into the prometheus container
+  file:
+    path: "{{ prometheus_home }}/data"
+    state: directory
+    mode: "0755"
+
+- name: Configure prometheus targets  # scrape targets are built from the inventory groups
+  template:
+    dest: "{{ prometheus_home }}/prometheus.yml"
+    src: prometheus.yml  # TODO - add ip addresses properly
+
+- name: Configure prometheus alerts  # rendered next to prometheus.yml
+  template:
+    dest: "{{ prometheus_home }}/alerts.rules"
+    src: alerts.rules
+
+- name: Add Prometheus server  # config, rules and data all come from prometheus_home on the host
+  docker_container:
+    name: prometheus
+    image: prom/prometheus
+    ports:
+      - "9090:9090"
+    volumes:
+      - "{{ prometheus_home }}/prometheus.yml:/etc/prometheus/prometheus.yml"
+      - "{{ prometheus_home }}/alerts.rules:/etc/prometheus/alerts.rules"
+      - "{{ prometheus_home }}/data:/prometheus"  # container path must be just /prometheus, with no image name appended
+
+- name: Create Grafana dir  # host-side directory for Grafana state
+  file:
+    path: "/var/lib/grafana"
+    state: directory
+    mode: "0755"
+
+- name: Add Grafana  # UI and HTTP API on port 3000, used by the uri tasks below
+  docker_container:
+    name: grafana
+    image: grafana/grafana
+    ports:
+      - "3000:3000"  # TODO - add more config, github OAuth?
+    volumes:
+      - /var/lib/grafana  # NOTE(review): no host path given, so the host /var/lib/grafana created above is presumably not bind-mounted — confirm intent
+
+- name: check datasource  # list the datasources Grafana already knows about
+  uri:
+    method: GET
+    url: "http://{{ ansible_host}}:3000/api/datasources"
+    force_basic_auth: true
+    user: admin
+    password: admin
+    return_content: true
+  register: datasources
+
+- name: Add datasource  # register Prometheus (port 9090 on this host) as the default datasource
+  uri:
+    url: "http://{{ ansible_host}}:3000/api/datasources"
+    method: POST
+    user: admin
+    password: admin
+    force_basic_auth: yes
+    status_code: 200
+    body_format: json
+    body: '{"name": "Prometheus",
+            "type": "prometheus",
+            "access": "proxy",
+            "url": "http://{{ ansible_host }}:9090",
+            "password": "",
+            "user": "",
+            "basicAuth": false,
+            "basicAuthUser": "",
+            "basicAuthPassword": "",
+            "isDefault": true,
+            "jsonData": null }'
+  when: datasources.json == []  # only when the earlier GET returned an empty list
+
+- name: check dashboard  # search Grafana for an existing "Spark" dashboard
+  uri:
+    url: "http://{{ ansible_host }}:3000/api/search?query=Spark"
+    method: GET
+    user: admin
+    password: admin
+    force_basic_auth: true
+    return_content: true
+  register: dashboards
+
+- debug:  # print the search result that gates the dashboard upload
+    msg: "Return value: {{ dashboards.json }}"
+
+- name: Add dashboard  # POST the bundled dashboard to Grafana
+  uri:
+    url: "http://{{ ansible_host}}:3000/api/dashboards/db"
+    method: POST
+    user: admin
+    password: admin
+    force_basic_auth: yes
+    status_code: 200
+    body_format: json
+    body:
+      dashboard: '{{ lookup("file","grafana_dashboard.json") }}'  # contents of the role's files/grafana_dashboard.json
+      overwrite: true
+  when: dashboards.json == []  # skipped once the "Spark" search returns any match
diff --git a/ansible/roles/prometheus-server/templates/alerts.rules b/ansible/roles/prometheus-server/templates/alerts.rules
new file mode 100644
index 0000000..a4f0a33
--- /dev/null
+++ b/ansible/roles/prometheus-server/templates/alerts.rules
@@ -0,0 +1,20 @@
+# {{ ansible_managed }}
+
+{% raw %}
+ALERT InstanceDown
+  IF up{job="node"} == 0
+  FOR 5m
+  LABELS { severity = "page" }
+  ANNOTATIONS {
+    summary = "Instance {{ $labels.instance }} down",
+    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
+  }
+
+ALERT DummyInfiniBandTrafficHigh
+  IF irate(node_infiniband_port_data_received_bytes{job="node"}[5m]) > 1400
+  FOR 30s
+  LABELS { severity = "low" }
+  ANNOTATIONS {
+    summary = "Instance {{ $labels.instance }} high traffic on {{ $labels.device }}"
+  }
+{% endraw %}
diff --git a/ansible/roles/prometheus-server/templates/prometheus.yml b/ansible/roles/prometheus-server/templates/prometheus.yml
new file mode 100644
index 0000000..75538b4
--- /dev/null
+++ b/ansible/roles/prometheus-server/templates/prometheus.yml
@@ -0,0 +1,51 @@
+# {{ ansible_managed }}
+
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+  external_labels:
+    monitor: 'sahara-test'
+
+rule_files: []  # explicit empty list: no rule files are loaded yet
+  # TODO - "alerts.rules"
+
+scrape_configs:
+  - job_name: 'prometheus'  # the Prometheus server itself, on each host in the master group
+
+    static_configs:
+      - targets:
+{% for node in groups['master'] %}
+        - '{{ node }}:9090'
+{% endfor %}
+
+  - job_name: 'node'  # node-exporter (port 9100) on every master and slave host
+    scrape_interval: 15s
+    static_configs:
+      - targets:
+{% for node in groups['master'] %}
+        - '{{ node }}:9100'
+{% endfor %}
+{% for node in groups['slave'] %}
+        - '{{ node }}:9100'
+{% endfor %}
+
+    params:
+      collect[]:  # presumably restricts the scrape to these node-exporter collectors — confirm
+        - cpu
+        - meminfo
+        - diskstats
+        - netdev
+        - netstat
+        - infiniband
+
+  - job_name: 'cAdvisor'  # cAdvisor (port 8080) on every master and slave host
+    scrape_interval: 15s
+    static_configs:
+      - targets:
+{% for node in groups['master'] %}
+        - '{{ node }}:8080'
+{% endfor %}
+{% for node in groups['slave'] %}
+        - '{{ node }}:8080'
+{% endfor %}