forked from glensc/monitoring-plugin-check_chef_stale
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_chef_stale.rb
executable file
·117 lines (103 loc) · 2.9 KB
/
check_chef_stale.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/ruby
# Based on: http://www.nclouds.com/alerting-for-stale-nodes-on-chef-with-nagios/
#
# Rewritten by Elan Ruusamäe <[email protected]>
# https://github.com/glensc/nagios-plugin-check_chef_stale
require 'optparse'
require 'chef/client'
# Nagios plugin exit codes.
OK_STATE = 0
WARNING_STATE = 1
CRITICAL_STATE = 2
UNKNOWN_STATE = 3

# Parse and validate the plugin's command-line switches.
#
# Thresholds must be decimal integers; a non-numeric value is rejected
# instead of being silently read as 0. Every command-line problem (unknown
# switch, missing or malformed argument, inconsistent thresholds) prints an
# "UNKNOWN: ..." line and exits with UNKNOWN_STATE, so that a misconfigured
# check is not mistaken for a real WARNING by the monitoring system.
#
# @param argv [Array<String>] arguments to parse; recognized switches are
#   removed from it in place
# @param warning [Integer] default warning threshold in hours
# @param critical [Integer] default critical threshold in hours
# @param query [String] base search query; each -q value is appended to it,
#   separated by a space
# @return [Array] the effective [warning, critical, query]
def parse_plugin_options(argv, warning, critical, query)
  OptionParser.new do |opts|
    opts.banner = "Usage: check_chef_stale.rb [options]"

    opts.on("-w", "--warning N", OptionParser::DecimalInteger,
            "Set warning treshold in hours. Default: #{warning}h") do |v|
      warning = v
    end

    opts.on("-c", "--critical N", OptionParser::DecimalInteger,
            "Set critical treshold in hours. Default: #{critical}h") do |v|
      critical = v
    end

    opts.on("-q", "--query QUERY", "Append query to Chef-SOLR search") do |v|
      query = "#{query} #{v}"
    end
  end.parse!(argv)

  # exit raises SystemExit, which the rescue below does not intercept.
  if warning > critical || warning < 0
    puts "UNKNOWN: warning should be less than critical and bigger than zero"
    exit(UNKNOWN_STATE)
  end

  [warning, critical, query]
rescue OptionParser::ParseError => e
  puts "UNKNOWN: #{e.message}"
  exit(UNKNOWN_STATE)
end

# Setup some defaults
# Hours to be alerted upon
critical = 12
warning = 2
# Solr search query to make
query = "ohai_time:*"
# Chef client.rb config files to use (can have multiple chef server configs)
chef_clients = ['/etc/chef/client.rb']

warning, critical, query = parse_plugin_options(ARGV, warning, critical, query)
# Run the search once per configured client.rb and pool every node returned.
all_nodes = []
chef_clients.each do |config_path|
  Chef::Config.from_file(File.expand_path(config_path))
  Chef::Search::Query.new.search('node', query) { |found| all_nodes.push(found) }
end

# Reference time for the age calculation, taken after the searches finish.
now = Time.now.to_i

# When the same node name was returned more than once, keep only the entry
# with the largest ohai_time; on a tie the entry seen first is kept.
latest_by_name = all_nodes.each_with_object({}) do |candidate, latest|
  kept = latest[candidate.name]
  if kept.nil? || kept['ohai_time'].to_i < candidate['ohai_time'].to_i
    latest[candidate.name] = candidate
  end
end
unique_nodes = latest_by_name.values

# Bucket nodes by whole hours elapsed since ohai_time (integer division).
# The critical threshold is checked first, so a node lands in one bucket only.
cnodes = []
wnodes = []
unique_nodes.each do |node|
  age_hours = (now - node['ohai_time'].to_i) / 3600
  if age_hours >= critical
    cnodes.push(node)
  elsif age_hours >= warning
    wnodes.push(node)
  end
end
# Build the status text for nodes that exceeded a staleness threshold.
#
# With exactly one node the message shows that node (via its to_s) and its
# ohai_time formatted in local time; otherwise it shows the node count
# followed by the node names, sorted and comma-separated.
#
# @param nodes [Array] nodes responding to #name, #[] ('ohai_time') and #to_s
# @param hours [Integer] the threshold, in hours, that was exceeded
# @return [String] message text without the Nagios status prefix
def report_fail_nodes(nodes, hours)
  unless nodes.length == 1
    names = nodes.map(&:name).sort.join(', ')
    return "#{nodes.length} nodes did not check in for #{hours} hours: #{names}"
  end

  stale = nodes.first
  last_seen = Time.at(stale['ohai_time']).strftime('%Y-%m-%d %H:%M:%S %z')
  "#{stale} did not check in for #{hours} hours (#{last_seen})"
end
# Build the status text used when no node exceeded a threshold.
#
# With exactly one node the message shows that node (via its to_s) and its
# ohai_time formatted in local time; otherwise only the node count is shown.
#
# @param nodes [Array] nodes responding to #[] ('ohai_time') and #to_s
# @param hours [Integer] the threshold, in hours, the nodes stayed within
# @return [String] message text without the Nagios status prefix
def report_ok_nodes(nodes, hours)
  return "All #{nodes.length} nodes checked in #{hours} hours" unless nodes.length == 1

  fresh = nodes.first
  last_seen = Time.at(fresh['ohai_time']).strftime('%Y-%m-%d %H:%M:%S %z')
  "#{fresh} checked in #{hours} hours (#{last_seen})"
end
# Print the single status line and exit with the matching Nagios code.
# Precedence: no matching nodes at all, then critical nodes, then warning
# nodes, otherwise OK.
if unique_nodes.empty?
  puts "CRITICAL: No nodes match criteria"
  exit(CRITICAL_STATE)
elsif !cnodes.empty?
  puts "CRITICAL: " + report_fail_nodes(cnodes, critical)
  exit(CRITICAL_STATE)
elsif !wnodes.empty?
  puts "WARNING: " + report_fail_nodes(wnodes, warning)
  exit(WARNING_STATE)
else
  # Both cnodes and wnodes are empty here, so no further condition is needed
  # and no other outcome remains.
  puts "OK: " + report_ok_nodes(unique_nodes, warning)
  exit(OK_STATE)
end