Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions performance/config/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,28 @@ groups:
description: "Cache hit rate is {{ $value | humanizePercentage }}, below 80% threshold"
runbook_url: "https://docs.predictiq.com/runbooks/low-cache-hit-rate"

# Critical alert: the cache circuit breaker series labelled state="open"
# has reported a value of 1 continuously for two minutes.
- alert: CacheCircuitBreakerOpen
  expr: cache_circuit_breaker_state{state="open"} == 1
  for: 2m
  labels:
    component: cache
    severity: critical
  annotations:
    summary: 'Redis cache circuit breaker has been open for 2+ minutes'
    description: >-
      The Redis cache circuit breaker is open, causing cache bypass and
      degraded performance. Check Redis connectivity and health.
    runbook_url: 'https://docs.predictiq.com/runbooks/cache-circuit-breaker-open'

# Warning alert: the cache circuit breaker series labelled state="half_open"
# has reported a value of 1 continuously for one minute.
- alert: CacheCircuitBreakerHalfOpen
  expr: cache_circuit_breaker_state{state="half_open"} == 1
  for: 1m
  labels:
    component: cache
    severity: warning
  annotations:
    summary: 'Redis cache circuit breaker is in half-open state'
    description: >-
      The Redis cache circuit breaker is probing for recovery.
      If this persists, Redis may be unstable.
    runbook_url: 'https://docs.predictiq.com/runbooks/cache-circuit-breaker-half-open'

- name: database_performance
interval: 30s
rules:
Expand Down
217 changes: 212 additions & 5 deletions performance/config/grafana-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -242,11 +242,218 @@
"notifications": []
}
},
{
  "id": 11,
  "type": "stat",
  "title": "Cache Circuit Breaker State",
  "gridPos": { "h": 4, "w": 6, "x": 12, "y": 16 },
  "targets": [
    {
      "refId": "A",
      "legendFormat": "Closed",
      "expr": "cache_circuit_breaker_state{state=\"closed\"}"
    },
    {
      "refId": "B",
      "legendFormat": "Open",
      "expr": "cache_circuit_breaker_state{state=\"open\"}"
    },
    {
      "refId": "C",
      "legendFormat": "Half-Open",
      "expr": "cache_circuit_breaker_state{state=\"half_open\"}"
    }
  ],
  "options": {
    "textMode": "value_and_name",
    "colorMode": "background",
    "graphMode": "none",
    "justifyMode": "auto",
    "orientation": "horizontal",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
  },
  "fieldConfig": {
    "defaults": {
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "transparent", "value": null },
          { "color": "green", "value": 1 }
        ]
      },
      "mappings": [
        {
          "type": "value",
          "options": {
            "0": { "color": "transparent", "text": "Inactive" },
            "1": { "color": "green", "text": "Active" }
          }
        }
      ]
    },
    "overrides": [
      {
        "matcher": { "id": "byName", "options": "Open" },
        "properties": [
          {
            "id": "thresholds",
            "value": {
              "mode": "absolute",
              "steps": [
                { "color": "transparent", "value": null },
                { "color": "red", "value": 1 }
              ]
            }
          },
          {
            "id": "mappings",
            "value": [
              {
                "type": "value",
                "options": {
                  "0": { "color": "transparent", "text": "Inactive" },
                  "1": { "color": "red", "text": "OPEN - Cache Bypassed" }
                }
              }
            ]
          }
        ]
      },
      {
        "matcher": { "id": "byName", "options": "Half-Open" },
        "properties": [
          {
            "id": "thresholds",
            "value": {
              "mode": "absolute",
              "steps": [
                { "color": "transparent", "value": null },
                { "color": "yellow", "value": 1 }
              ]
            }
          },
          {
            "id": "mappings",
            "value": [
              {
                "type": "value",
                "options": {
                  "0": { "color": "transparent", "text": "Inactive" },
                  "1": { "color": "yellow", "text": "Half-Open - Probing" }
                }
              }
            ]
          }
        ]
      }
    ]
  }
},
{
  "id": 12,
  "title": "Cache Circuit Breaker State Timeline",
  "type": "graph",
  "gridPos": { "x": 18, "y": 16, "w": 6, "h": 4 },
  "targets": [
    {
      "expr": "cache_circuit_breaker_state{state=\"open\"}",
      "legendFormat": "Open",
      "refId": "A"
    },
    {
      "expr": "cache_circuit_breaker_state{state=\"half_open\"}",
      "legendFormat": "Half-Open",
      "refId": "B"
    }
  ],
  "yaxes": [
    {
      "format": "short",
      "label": "State",
      "max": 1,
      "min": 0
    }
  ],
  "alert": {
    "name": "Cache Circuit Breaker Open",
    "conditions": [
      {
        "evaluator": {
          "params": [0],
          "type": "gt"
        },
        "operator": {
          "type": "and"
        },
        "query": {
          "params": ["A", "2m", "now"]
        },
        "reducer": {
          "params": [],
          "type": "min"
        },
        "type": "query"
      }
    ],
    "executionErrorState": "alerting",
    "frequency": "1m",
    "handler": 1,
    "message": "Redis cache circuit breaker has been open for more than 2 minutes",
    "noDataState": "no_data",
    "notifications": []
  }
},
{
"id": 6,
"title": "Database Query Time (p95)",
"type": "graph",
"gridPos": { "x": 12, "y": 16, "w": 12, "h": 8 },
"gridPos": { "x": 0, "y": 20, "w": 12, "h": 8 },
"targets": [
{
"expr": "histogram_quantile(0.95, rate(db_query_duration_seconds_bucket[5m]))",
Expand Down Expand Up @@ -293,7 +500,7 @@
"id": 7,
"title": "Database Connection Pool — Active vs Idle",
"type": "graph",
"gridPos": { "x": 0, "y": 24, "w": 12, "h": 8 },
"gridPos": { "x": 12, "y": 20, "w": 12, "h": 8 },
"targets": [
{
"expr": "db_pool_connections_active{pool=\"main\"}",
Expand Down Expand Up @@ -345,7 +552,7 @@
"id": 10,
"title": "Database Pool Acquire Duration (p95)",
"type": "graph",
"gridPos": { "x": 12, "y": 24, "w": 12, "h": 8 },
"gridPos": { "x": 0, "y": 28, "w": 12, "h": 8 },
"targets": [
{
"expr": "histogram_quantile(0.95, rate(db_pool_acquire_duration_seconds_bucket{pool=\"main\"}[5m]))",
Expand Down Expand Up @@ -397,7 +604,7 @@
"id": 8,
"title": "Contract Gas Costs",
"type": "graph",
"gridPos": { "x": 12, "y": 24, "w": 12, "h": 8 },
"gridPos": { "x": 12, "y": 28, "w": 12, "h": 8 },
"targets": [
{
"expr": "contract_gas_used{operation=\"create_market\"}",
Expand Down Expand Up @@ -454,7 +661,7 @@
"id": 9,
"title": "System Health Overview",
"type": "stat",
"gridPos": { "x": 0, "y": 32, "w": 24, "h": 4 },
"gridPos": { "x": 0, "y": 36, "w": 24, "h": 4 },
"targets": [
{
"expr": "up{job=\"predictiq-api\"}",
Expand Down
Loading