Skip to content

Commit a506e62

Browse files
Add retry logic when subscription fails
1 parent 034e2ad commit a506e62

File tree

10 files changed

+925
-277
lines changed

10 files changed

+925
-277
lines changed

redfish-exporter/.env

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,20 @@
1-
UPDATED="2025-01-22"
1+
UPDATED="2024-09-24"
22
DESCRIPTION="Redfish Event Listener/Exporter"
3-
LISTENER_IP="<Listener_IP>"
4-
LISTENER_PORT="<PORT>"
5-
METRICS_PORT="<MERTRICS_PORT>"
3+
LISTENER_IP="10.11.18.55"
4+
LISTENER_PORT="9003"
5+
METRICS_PORT="2112"
66
USE_SSL="false"
77
CERTFILE="path/to/certfile"
88
KEYFILE="path/to/keyfile"
9-
SLURM_CONTROL_NODE="<SLURM_CONTROL_NODE_IP>"
10-
#List of '|' separated reasons for avoiding drain action if there is a match
11-
SLURM_DRAIN_EXCLUDE_REASON_LIST="reason 1|reason 2"
9+
SLURM_CONTROL_NODE="10.235.34.47"
10+
SLURM_DRAIN_EXCLUDE_REASON_LIST="AMD|Pensando|RebootNeeded"
1211
SLURM_SCONTROL_PATH="/usr/bin/scontrol"
12+
TLS_TIMEOUT="15"
1313

14-
# Match RAS events received based on severity and '|' separated list of message fields and perform drain action with the DrainReasonPrefix set as the prefix in the reason
15-
# Message can be left empty if it doesn't need to be matched against, in that case only severity is matched
16-
# only DrainNode action is supported for now
1714
TRIGGER_EVENTS="[\
18-
{\"Severity\":\"Critical\",\"Message\":\"message 1|This is a critical test event\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNeeded\"},\
19-
{\"Severity\":\"Info\",\"Message\":\"message 3\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"},\
20-
{\"Severity\":\"Warning\",\"Message\":\"message 4|This is a test event message\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"}
15+
{\"Severity\":\"Critical\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'|This is an e2e critical test event\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNeeded\"},\
16+
{\"Severity\":\"Info\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"},\
17+
{\"Severity\":\"Warning\",\"Message\":\"Image 'UBB_FPGA' is being verified at 'ERoT'|This is an e2e test event message\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"}
2118
]"
2219

2320
# Subscription (v1.5+)
@@ -33,11 +30,11 @@ TRIGGER_EVENTS="[\
3330

3431
# Deprecated <v1.5
3532
SUBSCRIPTION_PAYLOAD="{\
36-
\"Destination\":\"http://<Listener_IP:Port>\",\
33+
\"Destination\":\"http://10.11.18.55:9003\",\
3734
\"EventTypes\":[\"Alert\"],\
3835
\"Protocol\":\"Redfish\",\
3936
\"Context\":\"YourContextData\",\
40-
\"Oem\":{\"Supermicro\":{\"EnableSubscription\":true}}\
37+
\"Oem\":{\"Supermicro\": {\"EnableSubscription\": true}}\
4138
}"
4239

4340
# Config for setting default labels in Prometheus counter metrics.
@@ -46,5 +43,10 @@ PROMETHEUS_CONFIG="{\
4643
}"
4744

4845
REDFISH_SERVERS="[\
49-
{\"ip\":\"https://<BMC_IP>\",\"username\":\"<username>\",\"password\":\"<password>\",\"loginType\":\"Session\",\"slurmNode\":\"<nodename\"}
50-
]"
46+
{\"ip\":\"https://10.235.37.54\",\"username\":\"ADMIN\",\"password\":\"PHHCJZUHDV\",\"loginType\":\"Session\",\"slurmNode\":\"smc300x-ccs-aus-GPUFCE9\"},
47+
{\"ip\":\"https://10.235.37.48\",\"username\":\"ADMIN\",\"password\":\"PHHCJZUHDV\",\"loginType\":\"Session\",\"slurmNode\":\"smc300x-ccs-aus-GPUFCE9\"}
48+
49+
]"
50+
51+
REDFISH_SERVERS_COMMON_CONFIG="{\
52+
\"hostSuffix\":\"ipmi.cluster\",\"username\":\"<username>\",\"password\":\"<password>\"}"

redfish-exporter/config.go

Lines changed: 136 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,23 @@ package main
1919
import (
2020
"crypto/tls"
2121
"encoding/json"
22+
"fmt"
2223
"log"
24+
"net"
2325
"os"
2426
"strconv"
2527
"strings"
2628

2729
"github.com/joho/godotenv"
30+
"gopkg.in/yaml.v3"
2831
)
2932

3033
// Package-level configuration constants.
// NOTE(review): the Default* values are presumably fallbacks applied when the
// matching setting is unset — confirm against the part of setupConfig that is
// not visible here.
const (
3134
DefaultListenerPort = "8080"
3235
DefaultMetricsPort = "2112"
3336
DefaultUseSSL = "false"
3437
DefaultSeverityConfig = "Fatal,Critical,Informational"
38+
// NodeDrainPolicyFile is the JSON file setupConfig reads to build the
// trigger-event table; being a bare file name, it is resolved relative to
// the process working directory.
NodeDrainPolicyFile = "nodeDrainPolicy.json"
3539
)
3640

3741
type Config struct {
@@ -56,11 +60,19 @@ type Config struct {
5660
SlurmDrainExcludeStr string
5761
SubscriptionPayload SubscriptionPayload
5862
RedfishServers []RedfishServer
59-
TriggerEvents []TriggerEvent
63+
TriggerEvents map[string]map[string][]EventInfo //map[Severity][MessageRegistry.MessageId][]EventInfo
6064
PrometheusConfig PrometheusConfig
6165
context *tls.Config
6266
eventCount int
6367
dataBuffer []byte
68+
TlsTimeOut string
69+
}
70+
71+
// EventInfo is one enabled node-drain policy entry as stored in
// Config.TriggerEvents. setupConfig copies these fields from a
// TriggerEventsInfo record.
type EventInfo struct {
72+
// UniqueString holds '|'-separated substrings; isTriggerEvent checks each
// one against the event message when several entries share a key.
UniqueString string
73+
// Category is the second component of the drain reason built by getDrainReasonPrefix.
Category string
74+
// Subcategory is the third component of the drain reason built by getDrainReasonPrefix.
Subcategory string
75+
// DrainReasonPrefix is the leading component of the drain reason built by getDrainReasonPrefix.
DrainReasonPrefix string
6476
}
6577

6678
type TriggerEvent struct {
@@ -70,11 +82,27 @@ type TriggerEvent struct {
7082
DrainReasonPrefix string `json:"DrainReasonPrefix"`
7183
}
7284

85+
// TriggerEventsInfo is one record of the node drain policy file
// (NodeDrainPolicyFile), which setupConfig unmarshals as a JSON array of
// these records.
type TriggerEventsInfo struct {
86+
// Category is copied into EventInfo.Category.
Category string `json:"Category"`
87+
// Subcategory is copied into EventInfo.Subcategory.
Subcategory string `json:"Subcategory"`
88+
// MessageRegistry, when non-empty, is joined to MessageId with a "." to
// form the lookup key; when empty, MessageId alone is the key.
MessageRegistry string `json:"MessageRegistry"`
89+
// MessageId is the message identifier part of the lookup key.
MessageId string `json:"MessageId"`
90+
// UniqueString is copied into EventInfo.UniqueString.
UniqueString string `json:"UniqueString"`
91+
// Severity is the outer key of the Config.TriggerEvents map.
Severity string `json:"Severity"`
92+
// DrainReasonPrefix is copied into EventInfo.DrainReasonPrefix.
DrainReasonPrefix string `json:"DrainReasonPrefix"`
93+
// Enable must be true for the record to be loaded; setupConfig skips the
// record otherwise.
Enable bool `json:"Enable"`
94+
}
95+
7396
// PrometheusConfig is decoded by setupConfig from the JSON held in the
// PROMETHEUS_CONFIG environment variable.
type PrometheusConfig struct {
7497
// Severity is the list decoded from the "Severity" JSON key.
Severity []string `json:"Severity"`
7598
}
7699

77-
func setupConfig() Config {
100+
// target is one entry of the YAML target file passed to setupConfig, which
// unmarshals the file as a list of these entries.
type target struct {
101+
// Targets lists host names; setupConfig appends the common-config host
// suffix to each, resolves it via DNS, and registers a Redfish server for it.
Targets []string `yaml:"targets"`
102+
// Labels is decoded from the "labels" key; the visible part of setupConfig
// does not read it.
Labels map[string]string `yaml:"labels"`
103+
}
104+
105+
func setupConfig(targetFile string) Config {
78106
// Load .env file
79107
err := godotenv.Load()
80108
if err != nil {
@@ -125,20 +153,13 @@ func setupConfig() Config {
125153
AppConfig.SlurmUser = os.Getenv("SLURM_USER")
126154
AppConfig.SlurmDrainExcludeStr = os.Getenv("SLURM_DRAIN_EXCLUDE_REASON_LIST")
127155
AppConfig.SlurmScontrolPath = os.Getenv("SLURM_SCONTROL_PATH")
156+
AppConfig.TlsTimeOut = os.Getenv("TLS_TIMEOUT")
128157

129158
subscriptionPayloadJSON := os.Getenv("SUBSCRIPTION_PAYLOAD")
130159
if err := json.Unmarshal([]byte(subscriptionPayloadJSON), &AppConfig.SubscriptionPayload); err != nil {
131160
log.Fatalf("Failed to parse SUBSCRIPTION_PAYLOAD: %v", err)
132161
}
133162

134-
triggerEventsJSON := os.Getenv("TRIGGER_EVENTS")
135-
if triggerEventsJSON != "" {
136-
err = json.Unmarshal([]byte(triggerEventsJSON), &AppConfig.TriggerEvents)
137-
if err != nil {
138-
log.Fatalf("Failed to unmarshal TRIGGER_EVENTS: %v", err)
139-
}
140-
}
141-
142163
prometheusConfigJSON := os.Getenv("PROMETHEUS_CONFIG")
143164
if prometheusConfigJSON != "" {
144165
err = json.Unmarshal([]byte(prometheusConfigJSON), &AppConfig.PrometheusConfig)
@@ -154,10 +175,113 @@ func setupConfig() Config {
154175
redfishServersJSON := os.Getenv("REDFISH_SERVERS")
155176
if redfishServersJSON == "" {
156177
log.Println("REDFISH_SERVERS environment variable is not set or is empty")
178+
} else {
179+
if err := json.Unmarshal([]byte(redfishServersJSON), &AppConfig.RedfishServers); err != nil {
180+
log.Fatalf("Failed to parse REDFISH_SERVERS: %v", err)
181+
}
182+
}
183+
184+
// Read the node drain policy config file
185+
nodeDrainPolicyConfig, err := os.ReadFile(NodeDrainPolicyFile)
186+
187+
if err != nil {
188+
log.Fatalf("Failed to read: %v", NodeDrainPolicyFile)
189+
}
190+
191+
triggerEventsInfo := []TriggerEventsInfo{}
192+
err = json.Unmarshal(nodeDrainPolicyConfig, &triggerEventsInfo)
193+
if err != nil {
194+
log.Fatalf("Failed to unmarshal file: %v | err: %v", NodeDrainPolicyFile, err)
195+
}
196+
197+
tInfoMap := map[string]map[string][]EventInfo{}
198+
199+
for _, evt := range triggerEventsInfo {
200+
fmt.Printf("Trigger Event: %+v\n", evt)
201+
if evt.Enable != true {
202+
continue
203+
}
204+
eInfo := EventInfo{}
205+
eInfo.Category = evt.Category
206+
eInfo.Subcategory = evt.Subcategory
207+
eInfo.DrainReasonPrefix = evt.DrainReasonPrefix
208+
eInfo.UniqueString = evt.UniqueString
209+
key := ""
210+
if evt.MessageRegistry == "" {
211+
key = evt.MessageId
212+
} else {
213+
key = evt.MessageRegistry + "." + evt.MessageId
214+
}
215+
if ee, ok := tInfoMap[evt.Severity]; !ok {
216+
eInfoMap := map[string][]EventInfo{}
217+
eInfoMap[key] = []EventInfo{eInfo}
218+
tInfoMap[evt.Severity] = eInfoMap
219+
} else {
220+
ee[key] = append(ee[key], eInfo)
221+
}
222+
}
223+
224+
AppConfig.TriggerEvents = tInfoMap
225+
226+
for kk, tt := range AppConfig.TriggerEvents {
227+
fmt.Println("Severity: ", kk)
228+
for kkk, ttt := range tt {
229+
fmt.Println("key: ", kkk)
230+
fmt.Printf("event: %+v\n", ttt)
231+
}
232+
}
233+
234+
// Read and parse the REDFISH_SERVERS_COMMON_CONFIG environment variable
235+
redfishServersCommonConfigJSON := os.Getenv("REDFISH_SERVERS_COMMON_CONFIG")
236+
if redfishServersCommonConfigJSON == "" {
237+
log.Println("redfishServersCommonConfigJSON environment variable is not set or is empty")
238+
return AppConfig
239+
}
240+
redfishServersCommonConfig := RedfishServersCommongConfig{}
241+
if err := json.Unmarshal([]byte(redfishServersCommonConfigJSON), &redfishServersCommonConfig); err != nil {
242+
log.Fatalf("Failed to parse REDFISH_SERVERS_COMMON_CONFIG: %v", err)
243+
}
244+
245+
if targetFile == "" {
246+
log.Println("No target file provided")
157247
return AppConfig
158248
}
159-
if err := json.Unmarshal([]byte(redfishServersJSON), &AppConfig.RedfishServers); err != nil {
160-
log.Fatalf("Failed to parse REDFISH_SERVERS: %v", err)
249+
250+
targetYamlFile, err := os.ReadFile(targetFile)
251+
252+
if err != nil {
253+
log.Fatalf("Failed to read file: %v", targetFile)
254+
}
255+
256+
targets := []target{}
257+
258+
err = yaml.Unmarshal(targetYamlFile, &targets)
259+
260+
if err != nil {
261+
log.Fatalf("Error parsing target file: %v | err: %v", targetFile, err)
262+
}
263+
264+
for _, t := range targets {
265+
log.Println("target: ", t.Targets)
266+
267+
for _, hostName := range t.Targets {
268+
// add this target to Redfish servers
269+
server := RedfishServer{}
270+
bmcHost := fmt.Sprintf(hostName+".%v", redfishServersCommonConfig.HostSuffix)
271+
ips, err := net.LookupIP(bmcHost)
272+
if err != nil || len(ips) == 0 {
273+
log.Printf("[error] Couldn't get the IP for host: %v | ips: %v | err: %v", bmcHost, ips, err)
274+
continue
275+
}
276+
log.Println("IPs: ", ips)
277+
278+
server.IP = fmt.Sprintf("https://%v", ips[0])
279+
server.LoginType = "Session"
280+
server.Username = redfishServersCommonConfig.UserName
281+
server.Password = redfishServersCommonConfig.Password
282+
server.SlurmNode = hostName
283+
AppConfig.RedfishServers = append(AppConfig.RedfishServers, server)
284+
}
161285
}
162286

163287
return AppConfig

redfish-exporter/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ require (
99
github.com/nod-ai/ADA/redfish-exporter v0.0.0-20241002210630-2ef2d1070d90
1010
github.com/prometheus/client_golang v1.20.4
1111
github.com/stmcginnis/gofish v0.19.0
12+
gopkg.in/yaml.v3 v3.0.1
1213
)
1314

1415
require (

redfish-exporter/go.sum

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,6 @@ golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
2626
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
2727
google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
2828
google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
29+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
30+
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
31+
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

redfish-exporter/listener.go

Lines changed: 49 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ import (
2727
"log"
2828
"net"
2929
"net/http"
30-
"regexp"
3130
"strings"
3231

3332
"github.com/nod-ai/ADA/redfish-exporter/metrics"
@@ -172,6 +171,37 @@ func (s *Server) handleConnection(AppConfig Config, conn net.Conn) {
172171
}
173172
}
174173

174+
func getDrainReasonPrefix(info EventInfo) string {
175+
return info.DrainReasonPrefix + ": " + info.Category + ": " + info.Subcategory
176+
}
177+
178+
func isTriggerEvent(evt Event, config Config) (bool, string) {
179+
tInfoMap := config.TriggerEvents
180+
181+
if eInfoMap, ok := tInfoMap[evt.Severity]; !ok {
182+
return false, ""
183+
} else {
184+
if eInfo, ok1 := eInfoMap[evt.MessageId]; !ok1 {
185+
return false, ""
186+
} else {
187+
if len(eInfo) == 1 {
188+
return true, getDrainReasonPrefix(eInfo[0])
189+
} else {
190+
for _, info := range eInfo {
191+
strs := strings.Split(info.UniqueString, "|")
192+
for _, str := range strs {
193+
if strings.Contains(evt.Message, str) == true {
194+
return true, getDrainReasonPrefix(info)
195+
}
196+
}
197+
198+
}
199+
}
200+
}
201+
}
202+
return false, ""
203+
}
204+
175205
func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Request, eventCount *int, dataBuffer *[]byte) error {
176206
// Extract method, headers, and payload
177207
method := req.Method
@@ -218,38 +248,26 @@ func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Reque
218248
log.Printf("Message ID: %s", messageId)
219249
log.Printf("Message Args: %v", messageArgs)
220250
log.Printf("Origin Of Condition: %s", originOfCondition)
221-
for _, triggerEvent := range AppConfig.TriggerEvents {
222-
if severity == triggerEvent.Severity {
223-
if triggerEvent.Message != "" {
224-
re := regexp.MustCompile(triggerEvent.Message)
225-
match := re.FindAllString(message, -1)
226-
227-
if len(match) == 0 {
228-
continue
229-
}
251+
252+
trigger, drainReason := isTriggerEvent(event, AppConfig)
253+
if trigger == true {
254+
log.Printf("Matched Trigger Event: %s | messageId: %s | message: %s", event.Severity, event.MessageId, event.Message)
255+
// Sending event belongs to redfish_utils. Each server may have different slurm node associated, and redfish_servers has the info/map.
256+
if s.slurmQueue != nil {
257+
redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip)
258+
if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 {
259+
log.Println("failed to get the slurm node name, cannot perform drain action")
260+
continue
230261
}
231-
log.Printf("Matched Trigger Event: %s | message: %s | with action %s", triggerEvent.Severity, triggerEvent.Message, triggerEvent.Action)
232-
// Sending event belongs to redfish_utils. Each server may have different slurm node associated, and redfish_servers has the info/map.
233-
if s.slurmQueue != nil {
234-
redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip)
235-
if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 {
236-
log.Printf("failed to get the slurm node name, cannot perform action: %v", triggerEvent.Action)
237-
break
238-
}
239-
evt := slurm.AddEventReq{
240-
RedfishServerIP: redfishServerInfo.IP,
241-
SlurmNodeName: redfishServerInfo.SlurmNode,
242-
Severity: triggerEvent.Severity,
243-
Action: triggerEvent.Action,
244-
DrainReasonPrefix: triggerEvent.DrainReasonPrefix,
245-
MessageId: messageId,
246-
Message: message,
247-
ExcludeStr: AppConfig.SlurmDrainExcludeStr,
248-
ScontrolPath: AppConfig.SlurmScontrolPath,
249-
}
250-
s.slurmQueue.Add(evt)
262+
evt := slurm.AddEventReq{
263+
RedfishServerIP: redfishServerInfo.IP,
264+
SlurmNodeName: redfishServerInfo.SlurmNode,
265+
Severity: event.Severity,
266+
DrainReason: drainReason,
267+
ExcludeStr: AppConfig.SlurmDrainExcludeStr,
268+
ScontrolPath: AppConfig.SlurmScontrolPath,
251269
}
252-
break
270+
s.slurmQueue.Add(evt)
253271
}
254272
}
255273
}

0 commit comments

Comments
 (0)