Skip to content
This repository has been archived by the owner on Sep 1, 2022. It is now read-only.

Commit

Permalink
after mlhep'16
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrey Ustyuzhanin committed Aug 16, 2016
1 parent 5570bc8 commit 1d4ec81
Show file tree
Hide file tree
Showing 11 changed files with 379 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
srv/*
59 changes: 59 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Makefile for building & starting everware-containers
# arguments can be supplied by -e:
#
# IMAGE -- name of image to use
#

# Docker client used on the head host.
DOCKER_CMD=docker
# Docker client used on the cluster nodes (daemon TCP socket, port 2375).
DOCKER_CMD_NODE=sudo docker -H tcp://0.0.0.0:2375
# parallel-ssh invocation shared by every cluster-wide target.
PSSH=parallel-ssh -O StrictHostKeyChecking=no
# Hosts file handed to parallel-ssh through its -h option.
CLUSTER_HOSTS=etc/cluster.txt
IMAGE ?= yandex/rep:0.6.5
# Directory holding this Makefile, computed with make built-ins rather than by
# forking `dirname`. Keep this above the include lines: it relies on this file
# being the last word of MAKEFILE_LIST.
HERE:=$(patsubst %/,%,$(dir $(realpath $(lastword $(MAKEFILE_LIST)))))
HEADHOST=head.haze.yandex.net
SRVDIR=srv


include swarm.makefile
include docker.makefile
include system.makefile
include bosun.makefile

# The include lines precede this rule, so without an explicit default goal a
# bare `make` would run the first target of the first included makefile.
.DEFAULT_GOAL := help

# Prints the documented variables and the targets that carry a trailing
# double-hash description. The last pipeline drops its own source line by
# filtering on the literal text "grep -hF".
.PHONY: help
help:
	@echo Usage: make [-e VARIABLE=VALUE] targets
	@echo "variables:"
	@grep -h "#\s\+\w\+ -- " $(MAKEFILE_LIST) |sed "s/#\s//"
	@echo
	@echo targets and corresponding dependencies:
	@grep -hF "##" $(MAKEFILE_LIST) | grep -vF "grep -hF" | sed -e 's/\\$$//' -e 's/^/ /' | sed -e 's/##//'


# Local scratch directory; the etcd and scollector downloads are stored in it.
$(SRVDIR): ## create srv for etcd & scollector
	mkdir -p $@

# Command targets, not files: declared phony so that a stray file with the same
# name that is newer than the hosts file cannot silently disable them.
.PHONY: uptime pull ps-user-containers count-user-containers images

uptime: ${CLUSTER_HOSTS} ## uptime cluster
	${PSSH} -h ${CLUSTER_HOSTS} -i uptime

# -t 0 disables the parallel-ssh timeout; image pulls can take a long time.
pull: ${CLUSTER_HOSTS} ## pull image to cluster nodes
	${PSSH} -h ${CLUSTER_HOSTS} -i -t 0 ${DOCKER_CMD_NODE} pull ${IMAGE}

ps-user-containers: ${CLUSTER_HOSTS} ## list container running on the cluster
	${PSSH} -h ${CLUSTER_HOSTS} -i '${DOCKER_CMD_NODE} ps -a'

# `ps -q` prints one container id per line and no header row to strip.
count-user-containers: ${CLUSTER_HOSTS} ## count container running on the cluster
	${PSSH} -h ${CLUSTER_HOSTS} -i '${DOCKER_CMD_NODE} ps -q | wc -l'

images: ${CLUSTER_HOSTS} ## list images created at clusters
	${PSSH} -h ${CLUSTER_HOSTS} -i '${DOCKER_CMD_NODE} images'

.PHONY: rm-images rm-user-containers

# --no-run-if-empty keeps xargs from calling `docker rmi` with no arguments
# (an error) on nodes that hold no images; rm-user-containers already does this.
rm-images: ${CLUSTER_HOSTS} ## remove all images
	${PSSH} -h ${CLUSTER_HOSTS} -i '${DOCKER_CMD_NODE} images -q | xargs --no-run-if-empty ${DOCKER_CMD_NODE} rmi'

# Force-removes every container on the nodes, running or stopped.
rm-user-containers: ${CLUSTER_HOSTS} ## stop & remove user containers
	${PSSH} -h ${CLUSTER_HOSTS} --timeout=0 -i '${DOCKER_CMD_NODE} ps -aq|xargs --no-run-if-empty ${DOCKER_CMD_NODE} rm -f'

# `df` and `mdu` are commands, not files; keep them runnable even if a file of
# the same name appears in the working directory.
.PHONY: df mdu

df: ${CLUSTER_HOSTS} ## check disk free space on cluster nodes
	${PSSH} -h ${CLUSTER_HOSTS} -i df -h /

# Runs locally: per-directory usage (in MiB) of the shared mount.
mdu: ## mfs du
	du -m --max-depth 1 /mnt/shared
48 changes: 48 additions & 0 deletions bosun.makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@

BOSUN_IMAGE = stackexchange/bosun
SCOLLECTOR_SHARED = /mnt/shared/scollector

# Every target below drives the local container named "bosun" through
# ${DOCKER_CMD}, so overriding the docker client affects all of them alike.
.PHONY: bosun-start bosun-rm bosun-stop bosun-restart bosun-exec bosun-update-conf

# Publishes 4242 and 8070; scollector on the nodes reports to port 8070.
bosun-start: ## start monitoring (bosun)
	${DOCKER_CMD} run -d -p 4242:4242 -p 8070:8070 --name bosun ${BOSUN_IMAGE}

bosun-rm:
	${DOCKER_CMD} rm -f bosun

bosun-stop:
	${DOCKER_CMD} stop bosun

bosun-restart:
	${DOCKER_CMD} restart bosun

bosun-exec:
	${DOCKER_CMD} exec -ti bosun bash

# Copies the tracked configuration into the container, then restarts it so the
# new file is read.
bosun-update-conf:
	${DOCKER_CMD} cp etc/bosun.conf bosun:/data/bosun.conf
	${DOCKER_CMD} restart bosun

# Downloads scollector once, publishes it with its configuration on the shared
# mount, fills in the placeholders, and installs the supervisor program
# definition on every node and on the head host.
#
# The download goes to a temporary name and is renamed only after wget and
# chmod both succeed; a failed download can therefore never leave an empty
# file that later runs would mistake for the binary.
.PHONY: scollector-install
scollector-install: ${CLUSTER_HOSTS} | ${SRVDIR}
	[ -f ${SRVDIR}/scollector-linux ] || ( \
		wget https://github.com/bosun-monitor/bosun/releases/download/0.5.0/scollector-linux-386 \
			-O ${SRVDIR}/scollector-linux.part && \
		chmod +x ${SRVDIR}/scollector-linux.part && \
		mv -f ${SRVDIR}/scollector-linux.part ${SRVDIR}/scollector-linux \
	)
	sudo mkdir -p ${SCOLLECTOR_SHARED}
	sudo cp ${SRVDIR}/scollector-linux etc/scollector_supervisord.conf etc/scollector.toml ${SCOLLECTOR_SHARED}
	sudo cp -r scollector_metrics ${SCOLLECTOR_SHARED}
	sudo sed -i -e "s/#HEAD#/${HEADHOST}/" -e "s|#BASE#|${SCOLLECTOR_SHARED}|" \
		${SCOLLECTOR_SHARED}/scollector_supervisord.conf ${SCOLLECTOR_SHARED}/scollector.toml
	${PSSH} -h ${CLUSTER_HOSTS} -H ${HEADHOST} -i 'sudo cp ${SCOLLECTOR_SHARED}/scollector_supervisord.conf /etc/supervisor/conf.d; \
		sudo supervisorctl reload'

# supervisor-driven control of scollector on every node plus the head host.
.PHONY: scollector-reload scollector-start scollector-check scollector-stop

scollector-reload: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -H ${HEADHOST} -i "sudo supervisorctl reload"

scollector-start: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -H ${HEADHOST} -i "sudo supervisorctl start scollector"

# pgrep exits non-zero on hosts where no scollector process is found.
scollector-check: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -H ${HEADHOST} -i 'pgrep -f scollector-linux'

scollector-stop: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -H ${HEADHOST} -i 'sudo supervisorctl stop scollector'
39 changes: 39 additions & 0 deletions docker.makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

DOCKER_OPTS = --ipv6 -H tcp://[::]:2375 -H unix:///var/run/docker.sock --fixed-cidr-v6=fc00::/64
DOCKER_CONFIG = /etc/default/docker
DOCKER_VERSION = 1.11.2-0~trusty

.PHONY: install_docker pdocker-repo pdocker-install pdocker-version

# The three steps must run in the listed order (repo, install, version check).
# Make honours that order only when run serially, so do not use -j here.
install_docker: pdocker-repo pdocker-install pdocker-version

# Registers the docker apt repository on every node and refreshes the index.
pdocker-repo: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -i 'echo "deb https://apt.dockerproject.org/repo ubuntu-trusty main" | sudo tee /etc/apt/sources.list.d/docker.list; cat /etc/apt/sources.list.d/docker.list'
	${PSSH} -h ${CLUSTER_HOSTS} 'sudo apt-get update'

# Installs the pinned docker-engine version.
pdocker-install: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -i 'sudo apt-get install -y --force-yes docker-engine=${DOCKER_VERSION}'

pdocker-version: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -i "${DOCKER_CMD_NODE} version"

.PHONY: pdocker-config pdocker-restart pdocker-stop pdocker-start \
	pdocker-remove pdocker-check pdocker-clean-key

# Drops any existing DOCKER_OPTS line from the daemon defaults file and appends
# a fresh one. The escaped single quotes reach the remote shell, so the value
# is written to the file wrapped in single quotes.
pdocker-config: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -i "sudo sed -i '/^DOCKER_OPTS/d' ${DOCKER_CONFIG} ; echo DOCKER_OPTS=\'${DOCKER_OPTS}\' | sudo tee -a ${DOCKER_CONFIG}"

pdocker-restart: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -i "sudo service docker restart"

pdocker-stop: ${CLUSTER_HOSTS} ## stop dockers on cluster
	${PSSH} -h ${CLUSTER_HOSTS} -i "sudo service docker stop"

pdocker-start: ${CLUSTER_HOSTS} ## start dockers on cluster
	${PSSH} -h ${CLUSTER_HOSTS} -i "sudo service docker start"

# Destructive: wipes all images, containers and volumes stored by docker.
pdocker-remove: ${CLUSTER_HOSTS} ## remove docker directories
	${PSSH} -h ${CLUSTER_HOSTS} -i sudo rm -rf /var/lib/docker

pdocker-check: ${CLUSTER_HOSTS} ## check docker version
	${PSSH} -h ${CLUSTER_HOSTS} -i "dpkg -l | grep docker && ps ax|grep 'docker daemon'|grep -v grep"

pdocker-clean-key: ${CLUSTER_HOSTS} ## remove docker key (should be different for different nodes, otherwise swarm doesn't fly)
	${PSSH} -h ${CLUSTER_HOSTS} -i 'sudo rm -f /etc/docker/key.json && sudo service docker restart'


108 changes: 108 additions & 0 deletions etc/bosun.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Global bosun settings.
# Time-series backend queried by the alert expressions below; port 4242 is also
# published by the bosun-start make target.
tsdbHost = localhost:4242
# Where bosun persists its state inside the container's /data directory.
stateFile = /data/bosun.state
ledisBindAddr = 0.0.0.0:9565
# Outgoing mail: relay host and sender address used for notifications.
smtpHost = mx.yandex-team.ru:25
emailFrom = [email protected]


# Shared notification template used by every alert in this file.
# The subject carries the last status, the alert name and the host tag; the body
# lists each computation with its value, graphs the alert's $metric variable
# (so every alert using this template must define $metric), and dumps the
# group's tag key/value pairs.
template test {
subject = {{.Last.Status}}: {{.Alert.Name}} on {{.Group.host}}
body = `<p>Alert: {{.Alert.Name}} triggered on {{.Group.host}}
<hr>
<p><strong>Computation</strong>
<table>
{{range .Computations}}
<tr><td><a href="{{$.Expr .Text}}">{{.Text}}</a></td><td>{{.Value}}</td></tr>
{{end}}
</table>
<hr>
{{ .Graph .Alert.Vars.metric }}
<hr>
<p><strong>Relevant Tags</strong>
<table>
{{range $k, $v := .Group}}
<tr><td>{{$k}}</td><td>{{$v}}</td></tr>
{{end}}
</table>`
}


# Email the address below; because `next` points back at this notification with
# a one-day timeout, the mail is repeated daily until the alert is acknowledged.
notification default {
email = [email protected]
next = default
timeout = 1d
}

# Alert definitions.
#
# Each alert sets both warnNotification and critNotification: bosun selects the
# notification list by the state that was reached, so an alert that only sets
# warnNotification stays silent when it goes critical.
#
# Every alert defines $metric because the shared template graphs it.
# The "contatiners" spelling in the docker metric names is deliberate here: it
# matches the names emitted by scollector_metrics/30/docker_counter.sh.

# Average CPU usage per host over the last hour.
alert cpu.is.too.high {
    warnNotification = default
    critNotification = default
    template = test
    $metric = q("sum:rate{counter,,1}:os.cpu{host=*}", "1h", "")
    $avgcpu = avg($metric)
    crit = $avgcpu > 80
    warn = $avgcpu > 50
}

# Average number of sockets in use per host over the last hour.
alert sockets.is.too.high {
    warnNotification = default
    critNotification = default
    template = test
    $metric = q("sum:linux.net.sockets.used{host=*}", "1h", "")
    $avgsockets = avg($metric)
    crit = $avgsockets > 30000
    warn = $avgsockets > 10000
}

# Spread (deviation across series) of the active container count.
alert docker.containers.active.dev.high {
    warnNotification = default
    critNotification = default
    template = test
    $metric = q("dev:3m-avg:docker.contatiners.active", "1h", "")
    $maxdev = max($metric)
    warn = $maxdev > 2
    crit = $maxdev > 3
}

# Peak number of active containers per host over the last 30 minutes.
alert docker.containers.more.than.cpus {
    warnNotification = default
    critNotification = default
    template = test
    $metric = q("max:docker.contatiners.active{host=*}", "30m", "")
    $maxcontainers = max($metric)
    warn = $maxcontainers > 12
    crit = $maxcontainers > 16
}

# Lowest free disk percentage per host over the last hour.
alert disk.free.space.too.low {
    warnNotification = default
    critNotification = default
    template = test
    $metric = q("min:os.disk.fs.percent_free{host=*}", "1h", "")
    $mindiskfree = min($metric)
    warn = $mindiskfree < 50
    crit = $mindiskfree < 10
}

# Lowest free memory percentage per host over the last 10 minutes.
alert mem.free.too.low {
    warnNotification = default
    critNotification = default
    template = test
    $metric = q("min:os.mem.percent_free{host=*}", "10m", "")
    $minmemfree = min($metric)
    warn = $minmemfree < 10
    crit = $minmemfree < 5
}

# Peak inbound IP traffic rate per host over the last hour.
alert traffic.in.is.too.high {
    warnNotification = default
    critNotification = default
    template = test
    $metric = q("sum:rate:linux.net.stat.ip.inoctets{host=*}", "1h", "")
    $maxrate = max($metric)
    crit = $maxrate > 200000000
    warn = $maxrate > 10000000
}

# Peak outbound IP traffic rate per host over the last hour.
alert traffic.out.is.too.high {
    warnNotification = default
    critNotification = default
    template = test
    $metric = q("sum:rate:linux.net.stat.ip.outoctets{host=*}", "1h", "")
    $maxrate = max($metric)
    crit = $maxrate > 200000000
    warn = $maxrate > 10000000
}

3 changes: 3 additions & 0 deletions etc/cluster.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
w-1.haze.yandex.net
w-2.haze.yandex.net
w-3.haze.yandex.net
1 change: 1 addition & 0 deletions etc/scollector.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ColDir = "#BASE#/scollector_metrics"
6 changes: 6 additions & 0 deletions etc/scollector_supervisord.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
; supervisor program definition for the scollector agent.
; The two placeholders on the command line are substituted by the
; scollector-install make target before this file is copied to
; /etc/supervisor/conf.d on each host.
[program:scollector]
command=#BASE#/scollector-linux -d -h #HEAD#:8070
; Start with supervisor and restart the agent whenever it exits.
autostart=true
autorestart=true
stderr_logfile=/var/log/scollector.err.log
stdout_logfile=/var/log/scollector.out.log
6 changes: 6 additions & 0 deletions scollector_metrics/30/docker_counter.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Prints three docker gauges, one per line, as "<metric> <unix-time> <value>":
# the number of images, of running containers and of all containers.
# The "contatiners" spelling is kept on purpose: etc/bosun.conf queries the
# metrics under exactly these names.

now=$(date +%s)
printf 'docker.images %s %s\n' "$now" "$(docker images -q | wc -l)"
printf 'docker.contatiners.active %s %s\n' "$now" "$(docker ps -q | wc -l)"
printf 'docker.contatiners.all %s %s\n' "$now" "$(docker ps -a -q | wc -l)"
74 changes: 74 additions & 0 deletions swarm.makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@

SWARM_PORT=4000
SWARM_OPT=-H tcp://${HEADHOST}:${SWARM_PORT}
SWARM_IMAGE=swarm:1.2.3
CLUSTER_SWARM=etc/_cluster_swarm.txt
# The etcd release is named once; the archive name and download URL derive from it.
ETCD_VERSION=v2.2.5
ETCD=etcd-$(ETCD_VERSION)-linux-amd64

# Downloads etcd into $(SRVDIR) on first use and starts a single-member cluster
# in the background, then shows the first log lines.
#  * curl -f turns an HTTP error into a failure instead of saving the error
#    page, and && keeps tar from running on a failed download.
#  * 2>&1 sends etcd's stderr to etcd.log as well, including when make is not
#    attached to a terminal.
.PHONY: etcd-start
etcd-start: | $(SRVDIR)
	[ -d $(SRVDIR)/$(ETCD) ] || ( \
		curl -fL https://github.com/coreos/etcd/releases/download/$(ETCD_VERSION)/$(ETCD).tar.gz -o $(SRVDIR)/$(ETCD).tar.gz && \
		tar xzf $(SRVDIR)/$(ETCD).tar.gz -C $(SRVDIR) )
	cd $(SRVDIR)/$(ETCD) && \
		nohup ./etcd \
		-initial-advertise-peer-urls http://${HEADHOST}:2380 \
		-listen-peer-urls="http://0.0.0.0:2380,http://0.0.0.0:7001" \
		-listen-client-urls="http://0.0.0.0:2379,http://0.0.0.0:4001" \
		-advertise-client-urls="http://${HEADHOST}:2379" \
		-initial-cluster-token etcd-01 \
		-initial-cluster="default=http://${HEADHOST}:2380" \
		-initial-cluster-state new > etcd.log 2>&1 &
	sleep 1
	tail $(SRVDIR)/$(ETCD)/etcd.log

.PHONY: etcd-stop etcd-check

# Kills the local etcd process. -x matches the exact process name only.
# pkill exits with 1 when nothing matched; that case is accepted so stopping an
# already stopped etcd does not abort swarm-stop or swarm-restart. Any other
# pkill failure still fails the recipe.
etcd-stop:
	pkill -9 -x etcd || [ $$? -eq 1 ]

# Dumps the whole etcd key space as pretty-printed JSON. The URL is quoted so
# the shell never treats its `?` as a glob character.
etcd-check:
	curl -L -g 'http://${HEADHOST}:2379/v2/keys/?recursive=true' | json_pp

# NOTE(review): the only command of this rule is commented out, so the rule
# never creates its target file. It presumably dates from the file-based swarm
# discovery setup. TODO confirm whether swarm-start-master still needs it.
${CLUSTER_SWARM}: ${CLUSTER_HOSTS}
# cat ${CLUSTER_HOSTS} | sed 's/$$/:2375/' > ${CLUSTER_SWARM}

.PHONY: swarm-check _swarm-check-master-stopped

swarm-check: etcd-check swarm-info
	@echo "OK"

# Guard used by swarm-start-master: fails when a container matching "swarm" is
# already running locally.
# Written with plain `if cmd | grep -q` because make runs recipes with /bin/sh;
# the bash-only [[ ]] test is "not found" under dash, which made the guard pass
# unconditionally.
_swarm-check-master-stopped:
	@if ${DOCKER_CMD} ps | grep -q swarm ; then echo "swarm master is already running" ; exit 1 ; fi

.PHONY: swarm-start-master swarm-stop-master

# Starts the swarm manager container with etcd discovery on the head host.
# The commented command is the earlier file-based discovery variant, kept for
# reference.
swarm-start-master: _swarm-check-master-stopped ${CLUSTER_SWARM} ## start swarm master
# ${DOCKER_CMD} run -v ${HERE}:/cfg -d -p ${SWARM_PORT}:2375 --name=swarm_master ${SWARM_IMAGE} manage --strategy random file:///cfg/${CLUSTER_SWARM}
	${DOCKER_CMD} run -d -p ${SWARM_PORT}:2375 --name=swarm_master ${SWARM_IMAGE} manage --strategy random etcd://${HEADHOST}:2379

# Removes the manager container if it exists (running or not); does nothing and
# still succeeds when it is absent.
swarm-stop-master: ## stop swarm master
	if ${DOCKER_CMD} ps -a | grep swarm_master ; then \
		${DOCKER_CMD} rm -f swarm_master ; \
	fi

.PHONY: swarm-logs swarm-restart-master swarm-stop swarm-start swarm-restart

swarm-logs:
	${DOCKER_CMD} logs swarm_master

# The composite targets below rely on make processing prerequisites from left
# to right, which only holds for serial runs. Do not invoke them with -j.
swarm-restart-master: swarm-stop-master swarm-start-master ## restart swarm master

swarm-stop: swarm-unregister-nodes swarm-stop-master etcd-stop
	@echo Stop OK

swarm-start: etcd-start swarm-start-master swarm-register-nodes
	@echo Start OK

swarm-restart: swarm-stop swarm-start
	@echo Restart OK

.PHONY: swarm-info swarm-ps swarm-psa swarm-register-nodes swarm-unregister-nodes

swarm-info: ## check swarm
	${DOCKER_CMD} ${SWARM_OPT} info

swarm-ps: ## list containers running in swarm
	${DOCKER_CMD} ${SWARM_OPT} ps

swarm-psa: ## list all containers in swarm
	${DOCKER_CMD} ${SWARM_OPT} ps -a

# On every node: resolve the node's own address with `host` (fifth field of its
# output) and start a swarm join container advertising [address]:2375 to etcd.
# The square brackets suggest an IPv6 address is expected. TODO confirm.
swarm-register-nodes: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -i 'MYIP=$$(host `hostname -f`| awk "{print \$$5}") ; docker run --name swarm_node -d ${SWARM_IMAGE} join --advertise=[$$MYIP]:2375 etcd://${HEADHOST}:2379'

# Removes the join container where present; succeeds on nodes that have none.
swarm-unregister-nodes: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -i 'if docker ps -a|grep swarm_node ; then docker rm -f swarm_node ; fi'
34 changes: 34 additions & 0 deletions system.makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@

NAMESERVER = 2a02:6b8:0:3400::1023

# On all nodes and the head host: reconfigure resolvconf non-interactively,
# disable its updates, then rewrite /etc/resolv.conf so that ${NAMESERVER} is
# the only nameserver entry (existing nameserver lines are deleted, the new one
# is appended after the last line).
# On the nodes only: add an IPv6 MASQUERADE rule to the nat POSTROUTING chain
# unless one is already listed.
.PHONY: install-dns
install-dns: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -H ${HEADHOST} -i 'echo "debconf resolvconf/linkify-resolvconf select true" | \
	sudo debconf-set-selections && sudo dpkg-reconfigure -f noninteractive resolvconf ; \
	sudo resolvconf --disable-updates && (sudo resolvconf --updates-are-enabled && echo Hmm || echo OK) ; \
	sudo sed -i -e "\$$ a nameserver ${NAMESERVER}" -e "/^nameserver/ d" /etc/resolv.conf \
	'
	${PSSH} -h ${CLUSTER_HOSTS} -i "sudo ip6tables -t nat -L POSTROUTING | grep MASQ || sudo ip6tables -t nat -I POSTROUTING -j MASQUERADE"

.PHONY: install-nfs-server install-nfs-client

# Runs on the head host. The `mount | grep` line is a precondition: the recipe
# stops unless /mnt/shared is already mounted. The sed call replaces any
# existing /mnt/shared export with a single fresh entry.
install-nfs-server:
	sudo apt-get install -y nfs-kernel-server
	mount | grep /mnt/shared
	sudo sed -i -e "\$$ a /mnt/shared *(rw,sync,no_root_squash)" -e "/^\/mnt\/shared/ d" /etc/exports
	sudo service nfs-kernel-server start

# Runs on every node: (re)writes the fstab entry for the head host's export,
# installs the NFS client and mounts the share. The mountpoint test makes a
# re-run succeed on nodes where the share is already mounted.
install-nfs-client: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -i 'sudo sed -i -e "\$$ a ${HEADHOST}:/mnt/shared /mnt/shared nfs rsize=8192,wsize=8192,timeo=14,intr" \
	-e "/^${HEADHOST}/ d" /etc/fstab ; \
	sudo apt-get install -y nfs-common ; \
	sudo mkdir -p /mnt/shared ; \
	mountpoint -q /mnt/shared || sudo mount /mnt/shared'

.PHONY: install-supervisor system-check

install-supervisor: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -H ${HEADHOST} -i 'sudo apt-get install -y --force-yes supervisor'

# Verifies on every node and the head host, in order: the IPv6 MASQUERADE rule,
# the expected nameserver line, the presence of /mnt/shared/data, and an
# installed supervisor package. The first failing check fails that host.
system-check: ${CLUSTER_HOSTS}
	${PSSH} -h ${CLUSTER_HOSTS} -H ${HEADHOST} -i "sudo ip6tables -t nat -L POSTROUTING | grep MASQ && \
	grep 'nameserver ${NAMESERVER}' /etc/resolv.conf && \
	test -d /mnt/shared/data && \
	dpkg -l | grep supervisor \
	"

0 comments on commit 1d4ec81

Please sign in to comment.