diff --git a/healthagent/config.py b/healthagent/config.py index ad9357d..6157388 100644 --- a/healthagent/config.py +++ b/healthagent/config.py @@ -25,7 +25,6 @@ class EvalType(StrEnum): NE = "ne" IN = "in" BITMASK = "bitmask" - DELTA_GT = "delta_gt" WINDOW_GT = "window_gt" diff --git a/healthagent/defaults.yaml b/healthagent/defaults.yaml index 707c05a..e5793d9 100644 --- a/healthagent/defaults.yaml +++ b/healthagent/defaults.yaml @@ -118,13 +118,6 @@ gpu: warning: 1.0e-6 category: NVLink msg: "GPU {gpu} NVLink effective BER {value:.2e} exceeds {threshold:.2e}" - DCGM_FI_DEV_PCIE_REPLAY_COUNTER: - eval: delta_gt - warning: 50 - error: 200 - category: PCIe - msg: "GPU {gpu} PCIe replay rate {value:.0f}/min exceeds {threshold}/min" - # ── Systemd module ────────────────────────────────────── systemd: services: diff --git a/healthagent/gpu.py b/healthagent/gpu.py index 20137ee..190cbf5 100644 --- a/healthagent/gpu.py +++ b/healthagent/gpu.py @@ -16,7 +16,6 @@ log = logging.getLogger('healthagent') # Maximum number of samples to retain per field per entity. -# At 1s polling, 300 = 5-minute window for delta_gt rate computation. MAX_KEEP_SAMPLES = 300 # Field-specific enrichments: map field ID -> callable(raw_value, field_values) -> list[str]. @@ -224,8 +223,7 @@ def track_fieldsv2(self): if not samples or samples[0].isBlank: continue - newest, oldest = samples[-1], samples[0] - #log.debug(f"Field {watch['field']} entity {entity_id}: {len(samples)} samples, oldest={oldest.value} ts={oldest.ts}, newest={newest.value} ts={newest.ts}") + newest = samples[-1] severity = None threshold_used = None evaluated = None @@ -235,10 +233,6 @@ def track_fieldsv2(self): continue triggered, evaluated = evaluate( watch["eval"], newest.value, thresh, - prev_value=oldest.value, - prev_time=oldest.ts / 1_000_000, - current_time=newest.ts / 1_000_000, - window=watch.get("window", 60), ) if triggered: severity = level diff --git a/healthagent/util.py b/healthagent/util.py index 6cebf04..8b74e3a 100644 --- a/healthagent/util.py +++ b/healthagent/util.py @@ -182,24 +182,19 @@ def __len__(self): return len(self._samples) -def evaluate(eval_type, value, threshold, *, prev_value=None, prev_time=None, - current_time=None, window=60, samples: TimeSeries = None): +def evaluate(eval_type, value, threshold, *, window=60, samples: TimeSeries = None): """Unified threshold evaluation. Returns (triggered: bool, evaluated_value). - For delta_gt, evaluated_value is the computed rate per window. For window_gt, evaluated_value is the delta within the time window. For bitmask, evaluated_value is the matching bits (value & threshold). For all others, evaluated_value is the input value. Args: - eval_type: Comparison type (gt, lt, ge, le, eq, ne, in, bitmask, delta_gt, window_gt) + eval_type: Comparison type (gt, lt, ge, le, eq, ne, in, bitmask, window_gt) value: Current value to evaluate threshold: Threshold to compare against (list for 'in' eval type) - prev_value: Previous sample value (delta_gt only) - prev_time: Previous sample timestamp in monotonic seconds (delta_gt only) - current_time: Current sample timestamp in monotonic seconds (delta_gt only) - window: Time window in seconds (delta_gt: rate normalization, window_gt: sliding window size) - samples: TimeSeries instance for recording and windowed evaluation (window_gt only) + window: Time window in seconds for window_gt sliding window size (default: 60) + samples: TimeSeries instance for windowed evaluation (window_gt only) """ eval_type = str(eval_type).strip().lower() @@ -220,17 +215,6 @@ def evaluate(eval_type, value, threshold, *, prev_value=None, prev_time=None, elif eval_type == "bitmask": matched = operator.index(value) & operator.index(threshold) return matched != 0, matched - elif eval_type == "delta_gt": - if prev_value is None or prev_time is None or current_time is None: - return False, 0.0 - delta = value - prev_value - if delta < 0: - return False, 0.0 - elapsed = current_time - prev_time - if elapsed <= 0: - return False, 0.0 - rate = (delta * window) / elapsed - return rate > threshold, rate elif eval_type == "window_gt": if samples is None: return False, 0 diff --git a/integration/test_inject.py b/integration/test_inject.py index c62ea91..403f39e 100644 --- a/integration/test_inject.py +++ b/integration/test_inject.py @@ -62,7 +62,6 @@ def _report(name, value_str, ok): FI_RETIRED_PENDING = 392 FI_FABRIC_MANAGER_STATUS = 170 FI_RECOVERY_ACTION = 1523 -FI_PCIE_REPLAY_COUNTER = 202 FI_EFFECTIVE_BER_FLOAT = 1218 FI_FABRIC_HEALTH_MASK = 174 FI_XID_ERRORS = 230 @@ -82,12 +81,6 @@ def test_clocks(gpu_id, duration): inject_loop(gpu_id, FI_CLOCKS_EVENT_REASONS, 0xE8, duration, "CLOCKS_EVENT_REASONS") -def test_pcie_replay(gpu_id, duration): - """Inject high PCIe replay counter — triggers delta_gt.""" - print(f"\n=== GPU {gpu_id}: PCIe replay counter (delta_gt warning > 50, error > 200) ===") - inject_loop(gpu_id, FI_PCIE_REPLAY_COUNTER, 99999, duration, "PCIE_REPLAY_COUNTER") - - def test_persistence_mode(gpu_id, duration): """Inject persistence mode = 0 — triggers ne 1 error.""" print(f"\n=== GPU {gpu_id}: Persistence mode (error if != 1) ===") @@ -167,7 +160,6 @@ def test_clear(gpu_id, duration): (FI_FABRIC_HEALTH_MASK, 0x1AA, "FABRIC_HEALTH_MASK (healthy)"), (FI_RECOVERY_ACTION, 0, "RECOVERY_ACTION"), (FI_EFFECTIVE_BER_FLOAT, 0.0, "EFFECTIVE_BER_FLOAT"), - (FI_PCIE_REPLAY_COUNTER, 0, "PCIE_REPLAY_COUNTER"), (FI_XID_ERRORS, 0, "XID_ERRORS"), ] for field_id, value, name in clears: @@ -180,7 +172,6 @@ def test_clear(gpu_id, duration): TESTS = { "temp": test_temperature, "clocks": test_clocks, - "pcie": test_pcie_replay, "persist": test_persistence_mode, "dbe": test_dbe, "remap": test_row_remap, diff --git a/tests/test_util.py b/tests/test_util.py index b18ab8f..b5319c0 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -119,99 +119,6 @@ def test_bitmask_rejects_float_threshold(self): evaluate("bitmask", 0xFF, 3.14) -class TestEvaluateDeltaGt: - - def test_normal_rate_triggers(self): - # 120 events in 60 seconds = 120/min, threshold 100/min - triggered, rate = evaluate("delta_gt", 620, 100, - prev_value=500, prev_time=0.0, current_time=60.0) - assert triggered is True - assert rate == 120.0 - - def test_normal_rate_below_threshold(self): - # 50 events in 60 seconds = 50/min, threshold 100/min - triggered, rate = evaluate("delta_gt", 550, 100, - prev_value=500, prev_time=0.0, current_time=60.0) - assert triggered is False - assert rate == 50.0 - - def test_exact_threshold_not_triggered(self): - # 100 events in 60 seconds = 100/min, threshold 100/min (not strictly greater) - triggered, rate = evaluate("delta_gt", 600, 100, - prev_value=500, prev_time=0.0, current_time=60.0) - assert triggered is False - assert rate == 100.0 - - def test_negative_delta_ignored(self): - # Counter reset: current < prev - triggered, rate = evaluate("delta_gt", 10, 0, - prev_value=500, prev_time=0.0, current_time=60.0) - assert triggered is False - assert rate == 0.0 - - def test_zero_elapsed_ignored(self): - triggered, rate = evaluate("delta_gt", 600, 100, - prev_value=500, prev_time=10.0, current_time=10.0) - assert triggered is False - assert rate == 0.0 - - def test_missing_prev_value(self): - # First sample — no previous data - triggered, rate = evaluate("delta_gt", 500, 100, - prev_value=None, prev_time=0.0, current_time=60.0) - assert triggered is False - assert rate == 0.0 - - def test_missing_prev_time(self): - triggered, rate = evaluate("delta_gt", 500, 100, - prev_value=400, prev_time=None, current_time=60.0) - assert triggered is False - assert rate == 0.0 - - def test_missing_current_time(self): - triggered, rate = evaluate("delta_gt", 500, 100, - prev_value=400, prev_time=0.0, current_time=None) - assert triggered is False - assert rate == 0.0 - - def test_custom_window_per_hour(self): - # 10 events in 60 seconds = 600/hour, threshold 500/hour - triggered, rate = evaluate("delta_gt", 510, 500, - prev_value=500, prev_time=0.0, current_time=60.0, - window=3600) - assert triggered is True - assert rate == 600.0 - - def test_custom_window_per_second(self): - # 120 events in 60 seconds = 2/sec, threshold 1/sec - triggered, rate = evaluate("delta_gt", 620, 1, - prev_value=500, prev_time=0.0, current_time=60.0, - window=1) - assert triggered is True - assert rate == 2.0 - - def test_threshold_zero_any_increment(self): - # Any new event triggers when threshold is 0 - triggered, rate = evaluate("delta_gt", 501, 0, - prev_value=500, prev_time=0.0, current_time=60.0) - assert triggered is True - assert rate > 0 - - def test_no_change_threshold_zero(self): - # No new events, threshold 0 — should not trigger - triggered, rate = evaluate("delta_gt", 500, 0, - prev_value=500, prev_time=0.0, current_time=60.0) - assert triggered is False - assert rate == 0.0 - - def test_delayed_cycle_rate_normalized(self): - # 250 events over 5 minutes = 50/min, threshold 100/min — should NOT trigger - triggered, rate = evaluate("delta_gt", 750, 100, - prev_value=500, prev_time=0.0, current_time=300.0) - assert triggered is False - assert rate == 50.0 - - class TestEvaluateIn: def test_in_match(self):