Merge pull request #188 from mfreeman451/fix/node_recovery_issue
Fix/node recovery issue
mfreeman451 authored Feb 6, 2025
2 parents 02b6b15 + b8dc49f commit 1b7d7f8
Showing 5 changed files with 210 additions and 208 deletions.
90 changes: 71 additions & 19 deletions pkg/cloud/alerts/webhook.go
@@ -46,20 +46,30 @@ const (
 )
 
 type WebhookAlert struct {
-	Level     AlertLevel     `json:"level"`
-	Title     string         `json:"title"`
-	Message   string         `json:"message"`
-	Timestamp string         `json:"timestamp"`
-	NodeID    string         `json:"node_id"`
-	Details   map[string]any `json:"details,omitempty"`
+	Level       AlertLevel     `json:"level"`
+	Title       string         `json:"title"`
+	Message     string         `json:"message"`
+	Timestamp   string         `json:"timestamp"`
+	NodeID      string         `json:"node_id"`
+	ServiceName string         `json:"service_name,omitempty"`
+	Details     map[string]any `json:"details,omitempty"`
 }
 
+// AlertKey combines nodeID and title to make a unique key for cooldown tracking.
+type AlertKey struct {
+	NodeID      string
+	Title       string
+	ServiceName string
+}
+
 type WebhookAlerter struct {
-	config         WebhookConfig
-	client         *http.Client
-	lastAlertTimes map[string]time.Time
-	mu             sync.RWMutex
-	bufferPool     *sync.Pool
+	config             WebhookConfig
+	client             *http.Client
+	LastAlertTimes     map[AlertKey]time.Time
+	NodeDownStates     map[string]bool
+	ServiceAlertStates map[string]bool
+	Mu                 sync.RWMutex
+	bufferPool         *sync.Pool
 }
 
 func (w *WebhookConfig) UnmarshalJSON(data []byte) error {
@@ -95,7 +105,8 @@ func NewWebhookAlerter(config WebhookConfig) *WebhookAlerter {
 		client: &http.Client{
 			Timeout: 10 * time.Second,
 		},
-		lastAlertTimes: make(map[string]time.Time),
+		LastAlertTimes: make(map[AlertKey]time.Time),
+		NodeDownStates: make(map[string]bool),
 		bufferPool: &sync.Pool{
 			New: func() interface{} {
 				return new(bytes.Buffer)
@@ -104,6 +115,13 @@ func NewWebhookAlerter(config WebhookConfig) *WebhookAlerter {
 	}
 }
 
+func (w *WebhookAlerter) MarkServiceAsRecovered(nodeID string) {
+	w.Mu.Lock()
+	defer w.Mu.Unlock()
+
+	w.ServiceAlertStates[nodeID] = false
+}
+
 func (w *WebhookAlerter) IsEnabled() bool {
 	return w.config.Enabled
 }
@@ -125,13 +143,34 @@ func (w *WebhookAlerter) getTemplateFuncs() template.FuncMap {
 	}
 }
 
+// Alert sends an alert through the webhook.
 func (w *WebhookAlerter) Alert(ctx context.Context, alert *WebhookAlert) error {
 	if !w.IsEnabled() {
 		log.Printf("Webhook alerter disabled, skipping alert: %s", alert.Title)
 
 		return errWebhookDisabled
 	}
 
-	if err := w.checkCooldown(alert.Title); err != nil {
+	// Only check NodeDownStates for "Node Offline" alerts.
+	if alert.Title == "Node Offline" {
+		w.Mu.RLock()
+		if w.NodeDownStates[alert.NodeID] {
+			w.Mu.RUnlock()
+			log.Printf("Skipping duplicate 'Node Offline' alert for node: %s", alert.NodeID)
+
+			return nil // Or return a specific error if you want to track this
+		}
+
+		w.Mu.RUnlock()
+
+		// If we got here, it is a valid down alert.
+		w.Mu.Lock()
+		w.NodeDownStates[alert.NodeID] = true
+		w.Mu.Unlock()
+	}
+
+	// Always check cooldown (using the correct AlertKey, with ServiceName).
+	if err := w.CheckCooldown(alert.NodeID, alert.Title, alert.ServiceName); err != nil {
 		return err
 	}
 
@@ -147,21 +186,34 @@ func (w *WebhookAlerter) Alert(ctx context.Context, alert *WebhookAlert) error {
 	return w.sendRequest(ctx, payload)
 }
 
-func (w *WebhookAlerter) checkCooldown(alertTitle string) error {
+func (w *WebhookAlerter) MarkNodeAsRecovered(nodeID string) {
+	w.Mu.Lock()
+	defer w.Mu.Unlock()
+
+	w.NodeDownStates[nodeID] = false
+
+	log.Printf("Marked Node: %v as recovered in the webhook alerter", nodeID)
+}
+
+// CheckCooldown checks if an alert is within its cooldown period.
+func (w *WebhookAlerter) CheckCooldown(nodeID, alertTitle, serviceName string) error {
 	if w.config.Cooldown <= 0 {
 		return nil
 	}
 
-	w.mu.Lock()
-	defer w.mu.Unlock()
+	w.Mu.Lock()
+	defer w.Mu.Unlock()
 
-	lastAlertTime, exists := w.lastAlertTimes[alertTitle]
+	key := AlertKey{NodeID: nodeID, Title: alertTitle, ServiceName: serviceName}
+
+	lastAlertTime, exists := w.LastAlertTimes[key]
 	if exists && time.Since(lastAlertTime) < w.config.Cooldown {
-		log.Printf("Alert '%s' is within cooldown period, skipping", alertTitle)
+		log.Printf("Alert '%s' for node '%s' is within cooldown period, skipping", alertTitle, nodeID)
 
 		return ErrWebhookCooldown
 	}
 
-	w.lastAlertTimes[alertTitle] = time.Now()
+	w.LastAlertTimes[key] = time.Now()
 
 	return nil
 }
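For orientation, a minimal sketch (not part of the repo) of how the reworked alerter behaves after this change: the first "Node Offline" alert for a node flips NodeDownStates and later duplicates are skipped until MarkNodeAsRecovered clears the flag, while the cooldown is now tracked per AlertKey (node, title, service) rather than per title alone. The node ID is illustrative, and the webhook endpoint configuration is omitted, so only the Enabled and Cooldown fields exercised in the diff are set.

package main

import (
	"context"
	"log"
	"time"

	"github.com/mfreeman451/serviceradar/pkg/cloud/alerts"
)

func main() {
	// Enabled and Cooldown are the config fields used by the code above;
	// the webhook endpoint settings are left out of this sketch.
	alerter := alerts.NewWebhookAlerter(alerts.WebhookConfig{
		Enabled:  true,
		Cooldown: 5 * time.Minute,
	})

	down := &alerts.WebhookAlert{
		Title:   "Node Offline",
		Message: "node stopped reporting",
		NodeID:  "node-1",
	}

	// First call records the node as down (and attempts delivery).
	_ = alerter.Alert(context.Background(), down)

	// Second call is short-circuited by NodeDownStates before cooldown or delivery.
	if err := alerter.Alert(context.Background(), down); err == nil {
		log.Println("duplicate 'Node Offline' alert was suppressed")
	}

	// When the node comes back, clear the down state so the next outage can alert again.
	alerter.MarkNodeAsRecovered("node-1")
}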
15 changes: 0 additions & 15 deletions pkg/cloud/node_recovery.go
@@ -5,7 +5,6 @@ import (
 	"errors"
 	"fmt"
 	"log"
-	"os"
 	"time"
 
 	"github.com/mfreeman451/serviceradar/pkg/cloud/alerts"
@@ -19,20 +18,6 @@ type NodeRecoveryManager struct {
 	getHostname func() string
 }
 
-func newNodeRecoveryManager(d db.Service, alerter alerts.AlertService) *NodeRecoveryManager {
-	return &NodeRecoveryManager{
-		db:      d,
-		alerter: alerter,
-		getHostname: func() string {
-			hostname, err := os.Hostname()
-			if err != nil {
-				return statusUnknown
-			}
-			return hostname
-		},
-	}
-}
-
 func (m *NodeRecoveryManager) processRecovery(ctx context.Context, nodeID string, lastSeen time.Time) error {
 	tx, err := m.db.Begin()
 	if err != nil {
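With newNodeRecoveryManager deleted, the manager is presumably constructed elsewhere in this changeset. Purely as an illustration of what that wiring could look like, a hypothetical helper inside pkg/cloud that reproduces the removed hostname fallback; the package name, the pkg/db import path, and the helper's name are assumptions, not code from this commit.

package cloud

import (
	"os"

	"github.com/mfreeman451/serviceradar/pkg/cloud/alerts"
	"github.com/mfreeman451/serviceradar/pkg/db" // import path assumed
)

// buildRecoveryManager is a hypothetical stand-in for the removed
// newNodeRecoveryManager: it fills the same fields by hand, falling back to
// statusUnknown when the local hostname cannot be resolved.
func buildRecoveryManager(d db.Service, alerter alerts.AlertService) *NodeRecoveryManager {
	return &NodeRecoveryManager{
		db:      d,
		alerter: alerter,
		getHostname: func() string {
			hostname, err := os.Hostname()
			if err != nil {
				return statusUnknown
			}
			return hostname
		},
	}
}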
(The remaining three changed files in this commit are not expanded in this view.)
