diff --git a/comp/anomalydetection/logssource/impl/pipeline.go b/comp/anomalydetection/logssource/impl/pipeline.go index 58a2c99a2c88..572593446a01 100644 --- a/comp/anomalydetection/logssource/impl/pipeline.go +++ b/comp/anomalydetection/logssource/impl/pipeline.go @@ -66,7 +66,7 @@ func (p *observerPipeline) start() { go func() { defer close(p.drainDone) for msg := range p.outputChan { - p.observerHandle.ObserveLog(msg) + p.observerHandle.ObserveLog(&messageLogView{msg: msg}) } }() p.proc.Start() @@ -97,3 +97,17 @@ func (p *observerPipeline) Stop() {} func (p *observerPipeline) Flush(ctx context.Context) { p.proc.Flush(ctx) } + +// messageLogView adapts *message.Message (whose GetContent returns []byte) to +// the observer's LogView interface (whose GetContent returns string). The +// string conversion copies the bytes once at the pipeline boundary; downstream +// extractors then share the resulting immutable string with zero copies. +type messageLogView struct { + msg *message.Message +} + +func (v *messageLogView) GetContent() string { return string(v.msg.GetContent()) } +func (v *messageLogView) GetStatus() string { return v.msg.GetStatus() } +func (v *messageLogView) GetTags() []string { return v.msg.GetTags() } +func (v *messageLogView) GetHostname() string { return v.msg.GetHostname() } +func (v *messageLogView) GetTimestampUnixMilli() int64 { return v.msg.GetTimestampUnixMilli() } diff --git a/comp/anomalydetection/observer/README.md b/comp/anomalydetection/observer/README.md index 09a199d84c06..38141a03d865 100644 --- a/comp/anomalydetection/observer/README.md +++ b/comp/anomalydetection/observer/README.md @@ -225,7 +225,7 @@ type MyExtractor struct{} func (m *MyExtractor) Name() string { return "my_extractor" } func (m *MyExtractor) ProcessLog(log observer.LogView) observer.LogMetricsExtractorOutput { - content := string(log.GetContent()) + content := log.GetContent() // Extract what you need synchronously — don't store the view return observer.LogMetricsExtractorOutput{ Metrics: []observer.MetricOutput{{ diff --git a/comp/anomalydetection/observer/def/types.go b/comp/anomalydetection/observer/def/types.go index f2f5d932eb4c..be57d15edbae 100644 --- a/comp/anomalydetection/observer/def/types.go +++ b/comp/anomalydetection/observer/def/types.go @@ -47,7 +47,7 @@ type MetricView interface { // This interface exists to prevent data races. Implementations must not store // the LogView itself. Copy any needed values synchronously. type LogView interface { - GetContent() []byte + GetContent() string GetStatus() string GetTags() []string GetHostname() string @@ -236,19 +236,19 @@ type LogObserver interface { // The storage keeps full summaries (min/max/sum/count) so aggregation // is specified at read time, not write time. type MetricOutput struct { - Name string - Value float64 - Tags []string - ContextKey string + Name string + Value float64 + Tags []string + Context *MetricContext // optional; stored on the series for anomaly enrichment } // LogMetricsExtractorOutput is what we obtain when we process a log with a log metrics extractor. type LogMetricsExtractorOutput struct { Metrics []MetricOutput Telemetry []ObserverTelemetry - // EvictedContextKeys lists context keys that are no longer valid (e.g. after - // extractor garbage collection). The engine removes matching contextRefs entries. - EvictedContextKeys []string + // EvictedMetricNames lists metric names whose series should be removed from + // storage (e.g. after extractor LRU eviction or garbage collection). + EvictedMetricNames []string } // SeriesDescriptor is the fully resolved identity of a time series. @@ -554,17 +554,6 @@ func AggregateString(agg Aggregate) string { } } -// ContextProvider resolves metric keys back to richer context about their -// origin. Components that synthesize metrics from richer data (e.g. log -// extractors that turn log patterns into count metrics) can implement this -// interface so that downstream consumers (detectors, reporters) can produce -// more descriptive anomaly reports. -type ContextProvider interface { - // GetContextByKey returns contextual information for a previously emitted - // context key, if available. - GetContextByKey(key string) (MetricContext, bool) -} - // MetricContext describes the origin of a synthesized metric. type MetricContext struct { // Pattern is the normalized pattern that generated this metric (e.g. a log signature). diff --git a/comp/anomalydetection/observer/impl/agent_internal_logs_test.go b/comp/anomalydetection/observer/impl/agent_internal_logs_test.go index 50b8476de9c4..1439db91f25e 100644 --- a/comp/anomalydetection/observer/impl/agent_internal_logs_test.go +++ b/comp/anomalydetection/observer/impl/agent_internal_logs_test.go @@ -6,7 +6,6 @@ package observerimpl import ( - "hash/fnv" "testing" "time" @@ -46,11 +45,9 @@ func TestAgentInternalLogsFlowIntoObserver(t *testing.T) { pkglog.Info(msg) // Agent logs are forwarded as structured JSON: {"msg":"..."}. - payload := []byte(`{"msg":"agent internal hello"}`) + payload := `{"msg":"agent internal hello"}` sig := logSignature(payload, 4096) - h := fnv.New64a() - _, _ = h.Write([]byte(sig)) - metricName := "log.pattern." + toHex64(h.Sum64()) + ".count" + metricName := patternCountMetricName(sig) tags := []string{"component:core", "level:info", "observer_source:agent-internal-logs", "source:datadog-agent"} // Poll briefly since observer processes asynchronously. @@ -61,19 +58,3 @@ func TestAgentInternalLogsFlowIntoObserver(t *testing.T) { require.Greater(collect, len(s.Points), 0) }, time.Second*5, time.Millisecond*10) } - -func toHex64(v uint64) string { - const hextable = "0123456789abcdef" - var out [16]byte - for i := 15; i >= 0; i-- { - out[i] = hextable[v&0xF] - v >>= 4 - } - // Mirror fmt.Sprintf("%x", ...) (no leading zeros trimmed? actually %x trims; we keep full width here but it won't match) - // Trim leading zeros for parity with production metric naming (fmt %x). - i := 0 - for i < 15 && out[i] == '0' { - i++ - } - return string(out[i:]) -} diff --git a/comp/anomalydetection/observer/impl/bench_log_ingestion_test.go b/comp/anomalydetection/observer/impl/bench_log_ingestion_test.go index 6271bba60703..c2f02e946f41 100644 --- a/comp/anomalydetection/observer/impl/bench_log_ingestion_test.go +++ b/comp/anomalydetection/observer/impl/bench_log_ingestion_test.go @@ -12,50 +12,50 @@ import ( // diverseLogContent returns distinct line shapes (JSON, kv, syslog, plain) for series s // so neighboring series exercise different tokenizer / pattern signatures. -func diverseLogContent(s int) []byte { +func diverseLogContent(s int) string { switch s % 20 { case 0: - return []byte(fmt.Sprintf(`{"msg":"log from series %d","level":"info"}`, s)) + return fmt.Sprintf(`{"msg":"log from series %d","level":"info"}`, s) case 1: - return []byte(fmt.Sprintf(`{"@timestamp":"2024-01-01T00:00:00Z","message":"evt-%d","severity":"WARN","svc":"api"}`, s)) + return fmt.Sprintf(`{"@timestamp":"2024-01-01T00:00:00Z","message":"evt-%d","severity":"WARN","svc":"api"}`, s) case 2: - return []byte(fmt.Sprintf(`{"trace_id":"%08x","span_id":"%08x","msg":"child span","ok":true}`, s, s+1)) + return fmt.Sprintf(`{"trace_id":"%08x","span_id":"%08x","msg":"child span","ok":true}`, s, s+1) case 3: - return []byte(fmt.Sprintf(`{"nested":{"user":%d,"shard":3},"event":"login","ip":"10.0.0.%d"}`, s, s%255)) + return fmt.Sprintf(`{"nested":{"user":%d,"shard":3},"event":"login","ip":"10.0.0.%d"}`, s, s%255) case 4: - return []byte(fmt.Sprintf(`level=INFO ts=1704067200 series=%d msg="request done" duration_ms=12`, s)) + return fmt.Sprintf(`level=INFO ts=1704067200 series=%d msg="request done" duration_ms=12`, s) case 5: - return []byte(fmt.Sprintf(`level=ERROR logger=com.example req=%d stack=java.lang.Exception`, s)) + return fmt.Sprintf(`level=ERROR logger=com.example req=%d stack=java.lang.Exception`, s) case 6: - return []byte(fmt.Sprintf(`[2024-01-15 14:30:00] INFO worker-%d task=flush completed=true`, s)) + return fmt.Sprintf(`[2024-01-15 14:30:00] INFO worker-%d task=flush completed=true`, s) case 7: - return []byte(fmt.Sprintf(`<134>1 2024-01-15T14:30:00Z host app-%d - - - msg="syslog style"`, s)) + return fmt.Sprintf(`<134>1 2024-01-15T14:30:00Z host app-%d - - - msg="syslog style"`, s) case 8: - return []byte(fmt.Sprintf(`10.1.2.3 - - [15/Jan/2024:14:30:00 +0000] "GET /api/v%d/items HTTP/1.1" 200 4321`, s%50)) + return fmt.Sprintf(`10.1.2.3 - - [15/Jan/2024:14:30:00 +0000] "GET /api/v%d/items HTTP/1.1" 200 4321`, s%50) case 9: - return []byte(fmt.Sprintf(`time="2024-01-15T14:30:00Z" level=debug msg="slow query" series=%d ms=450`, s)) + return fmt.Sprintf(`time="2024-01-15T14:30:00Z" level=debug msg="slow query" series=%d ms=450`, s) case 10: - return []byte(fmt.Sprintf(`{"arr":[%d,2,3],"obj":{"k":"v"},"flag":false}`, s)) + return fmt.Sprintf(`{"arr":[%d,2,3],"obj":{"k":"v"},"flag":false}`, s) case 11: - return []byte(fmt.Sprintf(`ERROR: connection reset by peer series=%d errno=104`, s)) + return fmt.Sprintf(`ERROR: connection reset by peer series=%d errno=104`, s) case 12: - return []byte(fmt.Sprintf(`{"msg":"unicode 测试 %d café","meta":{"region":"eu-west-1"}}`, s)) + return fmt.Sprintf(`{"msg":"unicode 测试 %d café","meta":{"region":"eu-west-1"}}`, s) case 13: - return []byte(fmt.Sprintf(`kafka: topic=logs partition=%d offset=999 key=null`, s)) + return fmt.Sprintf(`kafka: topic=logs partition=%d offset=999 key=null`, s) case 14: - return []byte(fmt.Sprintf(`{"a":1,"b":%d,"c":{"d":[1,2]}}`, s)) + return fmt.Sprintf(`{"a":1,"b":%d,"c":{"d":[1,2]}}`, s) case 15: - return []byte(fmt.Sprintf(`[pid=12345] series=%d action=gc pause_ms=3`, s)) + return fmt.Sprintf(`[pid=12345] series=%d action=gc pause_ms=3`, s) case 16: - return []byte(fmt.Sprintf(`{"http":{"method":"POST","path":"/hook/%d","status":201}}`, s)) + return fmt.Sprintf(`{"http":{"method":"POST","path":"/hook/%d","status":201}}`, s) case 17: - return []byte(fmt.Sprintf(`plain text line series=%d no json here`, s)) + return fmt.Sprintf(`plain text line series=%d no json here`, s) case 18: - return []byte(fmt.Sprintf(`{"double":%d.%d,"scientific":1.23e-4}`, s/10, s%10)) + return fmt.Sprintf(`{"double":%d.%d,"scientific":1.23e-4}`, s/10, s%10) case 19: - return []byte(fmt.Sprintf(`merge key=value series=%d another=42`, s)) + return fmt.Sprintf(`merge key=value series=%d another=42`, s) default: - return []byte(fmt.Sprintf(`{"msg":"log from series %d","level":"info"}`, s)) + return fmt.Sprintf(`{"msg":"log from series %d","level":"info"}`, s) } } @@ -68,7 +68,7 @@ func BenchmarkLogExtraction_SeriesCount(b *testing.B) { logs := make([]*logObs, numSeries) for s := 0; s < numSeries; s++ { logs[s] = &logObs{ - content: []byte(fmt.Sprintf(`{"msg":"log from series %d","level":"info"}`, s)), + content: fmt.Sprintf(`{"msg":"log from series %d","level":"info"}`, s), status: "error", tags: []string{fmt.Sprintf("series:%d", s)}, timestampMs: 0, diff --git a/comp/anomalydetection/observer/impl/context_provider.go b/comp/anomalydetection/observer/impl/context_provider.go index 48342305df8c..604716684275 100644 --- a/comp/anomalydetection/observer/impl/context_provider.go +++ b/comp/anomalydetection/observer/impl/context_provider.go @@ -8,12 +8,12 @@ package observerimpl import ( "fmt" - observer "github.com/DataDog/datadog-agent/comp/anomalydetection/observer/def" + observerdef "github.com/DataDog/datadog-agent/comp/anomalydetection/observer/def" ) // validateUniqueExtractorNames rejects duplicate runtime extractor names since // they are used as namespaces in storage and context lookup. -func validateUniqueExtractorNames(extractors []observer.LogMetricsExtractor) { +func validateUniqueExtractorNames(extractors []observerdef.LogMetricsExtractor) { seen := make(map[string]struct{}, len(extractors)) for _, ext := range extractors { name := ext.Name() @@ -24,20 +24,6 @@ func validateUniqueExtractorNames(extractors []observer.LogMetricsExtractor) { } } -// collectContextProviders discovers ContextProvider implementations among -// instantiated extractors via type assertion. Returns a map keyed by the -// extractor's component name (which is used as the storage namespace for -// its metrics), enabling O(1) lookup during anomaly enrichment. -func collectContextProviders(extractors []observer.LogMetricsExtractor) map[string]observer.ContextProvider { - providers := make(map[string]observer.ContextProvider) - for _, ext := range extractors { - if cp, ok := ext.(observer.ContextProvider); ok { - providers[ext.Name()] = cp - } - } - return providers -} - // truncate shortens s to maxLen, appending "..." if truncated. func truncate(s string, maxLen int) string { runes := []rune(s) diff --git a/comp/anomalydetection/observer/impl/debug.go b/comp/anomalydetection/observer/impl/debug.go index 84b451182bbd..9133db276aa0 100644 --- a/comp/anomalydetection/observer/impl/debug.go +++ b/comp/anomalydetection/observer/impl/debug.go @@ -32,9 +32,9 @@ type DebugView interface { // AddTelemetry writes a data point into the engine's telemetry namespace. // Used by the testbench to store per-detector timing stats for UI display. AddTelemetry(name string, value float64, timestamp int64, tags []string) - // ReplayStoredData resets analysis state (preserving extractor context and - // contextRefs) then replays all data currently in storage through the - // scheduler in chronological order. Call after Flush(). + // ReplayStoredData resets analysis state (preserving extractor context) + // then replays all data currently in storage through the scheduler in + // chronological order. Call after Flush(). ReplayStoredData() // StorageReader returns a read-only view of the engine's time-series storage. // Used by the testbench to compute windowed log rates in change messages. diff --git a/comp/anomalydetection/observer/impl/engine.go b/comp/anomalydetection/observer/impl/engine.go index a2d3e94a8d46..8807d4cda7cf 100644 --- a/comp/anomalydetection/observer/impl/engine.go +++ b/comp/anomalydetection/observer/impl/engine.go @@ -26,11 +26,6 @@ type anomalyDedupKey struct { title string } -type seriesContextRef struct { - namespace string - contextKey string -} - // engine is the shared orchestration core for the observer pipeline. // It encapsulates storage, log extraction, detection, and correlation, // providing a single execution path used by both the live observer and testbench. @@ -44,12 +39,10 @@ type engine struct { // take a write lock; readers (stateView methods) take a read lock. mu sync.RWMutex - storage *timeSeriesStorage - extractors []observerdef.LogMetricsExtractor - detectors []observerdef.Detector - correlators []observerdef.Correlator - contextProviders map[string]observerdef.ContextProvider // namespace → provider - contextRefs map[string]seriesContextRef + storage *timeSeriesStorage + extractors []observerdef.LogMetricsExtractor + detectors []observerdef.Detector + correlators []observerdef.Correlator // scheduler decides when the engine should advance analysis. scheduler schedulerPolicy @@ -83,8 +76,10 @@ type engine struct { correlationMu sync.RWMutex // Accumulated telemetry from detection runs (for StateView access). - accumulatedTelemetry []observerdef.ObserverTelemetry - telemetryMu sync.RWMutex + // Only populated when enableAccumulatedTelemetry is true (testbench mode). + accumulatedTelemetry []observerdef.ObserverTelemetry + telemetryMu sync.RWMutex + enableAccumulatedTelemetry bool // Event subscription management. sinks []eventSink @@ -115,21 +110,37 @@ type engine struct { // confined to the engine run loop. Lock-free via atomic.Pointer to a // copy-on-write map so we don't add a mutex to the hot path. sourceTagCache atomic.Pointer[map[string]string] + + // onProcessingTime is an optional callback for reporting per-component + // processing time directly (e.g. gauge.Set) instead of constructing + // ObserverTelemetry objects. When set, timing telemetry is reported + // through this callback and omitted from returned telemetry slices. + // Live mode sets this; testbench leaves it nil to preserve the + // ObserverTelemetry path needed for UI display. + onProcessingTime func(detectorTag string, nanos float64) + + // detectorTags caches "detector:" strings for each extractor, + // detector, logObserver, and correlator to avoid per-log concatenation. + detectorTags map[string]string } // engineConfig holds the parameters for constructing an engine. type engineConfig struct { - storage *timeSeriesStorage - extractors []observerdef.LogMetricsExtractor - detectors []observerdef.Detector - correlators []observerdef.Correlator - contextProviders map[string]observerdef.ContextProvider // namespace → provider + storage *timeSeriesStorage + extractors []observerdef.LogMetricsExtractor + detectors []observerdef.Detector + correlators []observerdef.Correlator // scheduler is the scheduling policy. If nil, defaults to currentBehaviorPolicy. scheduler schedulerPolicy rawAnomalyWindow int64 maxRawAnomalies int + + // enableAccumulatedTelemetry gates the ever-growing accumulatedTelemetry + // slice. Only testbench mode needs it; live mode skips the append to + // avoid an unbounded memory leak. + enableAccumulatedTelemetry bool } // newEngine creates an engine with the given configuration. @@ -141,16 +152,15 @@ func newEngine(cfg engineConfig) *engine { validateUniqueExtractorNames(cfg.extractors) e := &engine{ - storage: cfg.storage, - extractors: cfg.extractors, - detectors: cfg.detectors, - correlators: cfg.correlators, - contextProviders: cfg.contextProviders, - contextRefs: make(map[string]seriesContextRef), - scheduler: sched, - - rawAnomalyWindow: cfg.rawAnomalyWindow, - maxRawAnomalies: cfg.maxRawAnomalies, + storage: cfg.storage, + extractors: cfg.extractors, + detectors: cfg.detectors, + correlators: cfg.correlators, + scheduler: sched, + + rawAnomalyWindow: cfg.rawAnomalyWindow, + maxRawAnomalies: cfg.maxRawAnomalies, + enableAccumulatedTelemetry: cfg.enableAccumulatedTelemetry, } // Cache log observers from detectors. @@ -160,6 +170,7 @@ func newEngine(cfg engineConfig) *engine { } } + e.rebuildDetectorTags() return e } @@ -282,13 +293,17 @@ func (e *engine) IngestMetric(source string, m *metricObs) []advanceRequest { func (e *engine) IngestLog(source string, l *logObs) ([]advanceRequest, []observerdef.ObserverTelemetry) { sourceTag := e.sourceTagForIngest(source) view := &logView{obs: l} - var logTelemetry = []observerdef.ObserverTelemetry{} + var logTelemetry []observerdef.ObserverTelemetry for _, extractor := range e.extractors { processingStartTime := time.Now() out := extractor.ProcessLog(view) - e.removeContextRefsForEvictedKeys(extractor.Name(), out.EvictedContextKeys) - processingTime := time.Since(processingStartTime) - logTelemetry = append(logTelemetry, newTelemetryGauge([]string{"detector:" + extractor.Name()}, telemetryDetectorProcessingTimeNs, float64(processingTime.Nanoseconds()), l.timestampMs/1000)) + e.removeEvictedMetricSeries(extractor.Name(), out.EvictedMetricNames) + nanos := float64(time.Since(processingStartTime).Nanoseconds()) + if e.onProcessingTime != nil { + e.onProcessingTime(e.detectorTag(extractor.Name()), nanos) + } else { + logTelemetry = append(logTelemetry, newTelemetryGauge([]string{e.detectorTag(extractor.Name())}, telemetryDetectorProcessingTimeNs, nanos, l.timestampMs/1000)) + } for _, m := range out.Metrics { // Avoid copying m.Tags when sourceTag is already present: storage.Add // performs its own deep copy on first-write of a series via @@ -302,17 +317,8 @@ func (e *engine) IngestLog(source string, l *logObs) ([]advanceRequest, []observ tags = append(newTags, sourceTag) } res := e.storage.Add(extractor.Name(), m.Name, m.Value, l.timestampMs/1000, tags) - if m.ContextKey != "" && res.StorageKey != "" { - // Reuse the storage key computed inside storage.Add instead of - // recomputing seriesKey here. seriesKey is hot enough that this - // duplicate accounted for ~14.5 MiB heap-live in the - // quality_gate_container_logs SMP profile (now renamed to - // observer_logs_anomaly_stress; the 'quality_gate_*' prefix is - // reserved for SMP quality-gate cases). - e.contextRefs[res.StorageKey] = seriesContextRef{ - namespace: extractor.Name(), - contextKey: m.ContextKey, - } + if m.Context != nil && res.Ref >= 0 { + e.storage.SetContext(res.Ref, m.Context) } } if len(out.Telemetry) > 0 { @@ -322,10 +328,14 @@ func (e *engine) IngestLog(source string, l *logObs) ([]advanceRequest, []observ for _, lo := range e.logObservers { processingStartTime := time.Now() lo.ProcessLog(view) - processingTime := time.Since(processingStartTime) - logTelemetry = append(logTelemetry, newTelemetryGauge([]string{"detector:" + lo.Name()}, telemetryDetectorProcessingTimeNs, float64(processingTime.Nanoseconds()), l.timestampMs/1000)) + nanos := float64(time.Since(processingStartTime).Nanoseconds()) + if e.onProcessingTime != nil { + e.onProcessingTime(e.detectorTag(lo.Name()), nanos) + } else { + logTelemetry = append(logTelemetry, newTelemetryGauge([]string{e.detectorTag(lo.Name())}, telemetryDetectorProcessingTimeNs, nanos, l.timestampMs/1000)) + } } - if len(logTelemetry) > 0 { + if e.enableAccumulatedTelemetry && len(logTelemetry) > 0 { e.telemetryMu.Lock() e.accumulatedTelemetry = append(e.accumulatedTelemetry, logTelemetry...) e.telemetryMu.Unlock() @@ -345,38 +355,15 @@ func sliceContains(items []string, want string) bool { return false } -// removeContextRefsForEvictedKeys drops engine contextRefs whose extractor -// namespace and context key match an eviction from extractor GC, and frees -// the corresponding storage series. Without the storage cleanup, evicted -// patterns leak their tags + columnar arrays indefinitely (the contextRefs -// map is just metadata; the heavy data lives in storage.series). -func (e *engine) removeContextRefsForEvictedKeys(namespace string, evictedKeys []string) { - // No garbage collection done - if len(evictedKeys) == 0 { - return - } - want := make(map[string]struct{}, len(evictedKeys)) - for _, k := range evictedKeys { - if k != "" { - want[k] = struct{}{} - } - } - if len(want) == 0 { - return - } - var storageKeys []string - for seriesID, ref := range e.contextRefs { - if ref.namespace != namespace { +// removeEvictedMetricSeries removes all storage series for the given metric +// names in namespace. Called when an extractor GC/LRU evicts a pattern cluster. +func (e *engine) removeEvictedMetricSeries(namespace string, evictedNames []string) { + for _, name := range evictedNames { + if name == "" { continue } - if _, ok := want[ref.contextKey]; ok { - delete(e.contextRefs, seriesID) - storageKeys = append(storageKeys, seriesID) - } - } - if len(storageKeys) > 0 { - freedRefs := e.storage.RemoveSeriesByKeys(storageKeys) - e.fanOutSeriesRemoval(freedRefs) + freed := e.storage.RemoveSeriesByMetricName(namespace, name) + e.fanOutSeriesRemoval(freed) } } @@ -484,6 +471,10 @@ func (e *engine) advanceWithReason(upToSec int64, reason advanceReason) advanceR }) } + if freed := e.storage.EvictDefault(); len(freed) > 0 { + e.fanOutSeriesRemoval(freed) + } + result := e.runDetectorsAndCorrelatorsSnapshot(upToSec, detectors, correlators) e.emit(engineEvent{ @@ -526,8 +517,12 @@ func (e *engine) runDetectorsAndCorrelatorsSnapshot(upTo int64, detectors []obse processingStartTime := time.Now() result := detector.Detect(storageForDetect, upTo) - processingTime := time.Since(processingStartTime) - allTelemetry = append(allTelemetry, newTelemetryGauge([]string{"detector:" + detector.Name()}, telemetryDetectorProcessingTimeNs, float64(processingTime.Nanoseconds()), upTo)) + nanos := float64(time.Since(processingStartTime).Nanoseconds()) + if e.onProcessingTime != nil { + e.onProcessingTime(e.detectorTag(detector.Name()), nanos) + } else { + allTelemetry = append(allTelemetry, newTelemetryGauge([]string{e.detectorTag(detector.Name())}, telemetryDetectorProcessingTimeNs, nanos, upTo)) + } // Emit detect digest (captures raw result BEFORE dedup). if e.onDetectDigest != nil { @@ -576,7 +571,12 @@ func (e *engine) runDetectorsAndCorrelatorsSnapshot(upTo int64, detectors []obse e.accumulateCorrelations(correlator.ActiveCorrelations()) advanceStart := time.Now() correlator.Advance(upTo) - allTelemetry = append(allTelemetry, newTelemetryGauge([]string{"detector:" + correlator.Name()}, telemetryDetectorProcessingTimeNs, float64(time.Since(advanceStart).Nanoseconds()), upTo)) + corrNanos := float64(time.Since(advanceStart).Nanoseconds()) + if e.onProcessingTime != nil { + e.onProcessingTime(e.detectorTag(correlator.Name()), corrNanos) + } else { + allTelemetry = append(allTelemetry, newTelemetryGauge([]string{e.detectorTag(correlator.Name())}, telemetryDetectorProcessingTimeNs, corrNanos, upTo)) + } e.emit(engineEvent{ kind: eventCorrelationUpdated, timestamp: upTo, @@ -586,8 +586,8 @@ func (e *engine) runDetectorsAndCorrelatorsSnapshot(upTo int64, detectors []obse }) } - // Accumulate telemetry so StateView can expose it. - if len(allTelemetry) > 0 { + // Accumulate telemetry so StateView can expose it (testbench only). + if e.enableAccumulatedTelemetry && len(allTelemetry) > 0 { e.telemetryMu.Lock() e.accumulatedTelemetry = append(e.accumulatedTelemetry, allTelemetry...) e.telemetryMu.Unlock() @@ -599,27 +599,15 @@ func (e *engine) runDetectorsAndCorrelatorsSnapshot(upTo int64, detectors []obse } } -// enrichAnomaly decorates an anomaly with context from the originating -// extractor, if available. This runs automatically on every anomaly so -// detectors don't need to be aware of context providers. -// Lookup builds the storage key from Source fields (namespace, name, tags) -// and maps that to a provider namespace and context key. +// enrichAnomaly decorates an anomaly with context stored on its series. +// The context is set at ingest time (storage.SetContext) and retrieved here +// via a single O(1) ref lookup — no string keys or provider indirection. func (e *engine) enrichAnomaly(a *observerdef.Anomaly) { - if a.Source.Name == "" { + if a.SourceRef == nil { return } - fullKey := seriesKey(a.Source.Namespace, a.Source.Name, a.Source.Tags) - - ref, ok := e.contextRefs[fullKey] - if !ok { - return - } - provider, ok := e.contextProviders[ref.namespace] - if !ok { - return - } - ctx, ok := provider.GetContextByKey(ref.contextKey) - if !ok { + ctx := e.storage.GetContext(a.SourceRef.Ref) + if ctx == nil { return } a.Context = &observerdef.MetricContext{ @@ -636,8 +624,12 @@ func (e *engine) processAnomaly(anomaly observerdef.Anomaly) []observerdef.Obser for _, correlator := range e.correlators { processingStartTime := time.Now() correlator.ProcessAnomaly(anomaly) - processingTime := time.Since(processingStartTime) - allTelemetry = append(allTelemetry, newTelemetryGauge([]string{"detector:" + correlator.Name()}, telemetryDetectorProcessingTimeNs, float64(processingTime.Nanoseconds()), anomaly.Timestamp)) + nanos := float64(time.Since(processingStartTime).Nanoseconds()) + if e.onProcessingTime != nil { + e.onProcessingTime(e.detectorTag(correlator.Name()), nanos) + } else { + allTelemetry = append(allTelemetry, newTelemetryGauge([]string{e.detectorTag(correlator.Name())}, telemetryDetectorProcessingTimeNs, nanos, anomaly.Timestamp)) + } } return allTelemetry @@ -785,6 +777,34 @@ func (e *engine) AccumulatedCorrelations() []observerdef.ActiveCorrelation { return result } +// rebuildDetectorTags rebuilds the cached "detector:" tag map from the +// current extractors, detectors, logObservers, and correlators. +func (e *engine) rebuildDetectorTags() { + tags := make(map[string]string) + for _, ext := range e.extractors { + tags[ext.Name()] = "detector:" + ext.Name() + } + for _, d := range e.detectors { + tags[d.Name()] = "detector:" + d.Name() + } + for _, lo := range e.logObservers { + tags[lo.Name()] = "detector:" + lo.Name() + } + for _, c := range e.correlators { + tags[c.Name()] = "detector:" + c.Name() + } + e.detectorTags = tags +} + +// detectorTag returns the cached "detector:" tag for the given component +// name. Falls back to concatenation if not cached (shouldn't happen in practice). +func (e *engine) detectorTag(name string) string { + if tag, ok := e.detectorTags[name]; ok { + return tag + } + return "detector:" + name +} + // Storage returns the engine's storage. func (e *engine) Storage() *timeSeriesStorage { return e.storage @@ -803,6 +823,7 @@ func (e *engine) SetDetectors(detectors []observerdef.Detector) { e.logObservers = append(e.logObservers, lo) } } + e.rebuildDetectorTags() } // SetCorrelators replaces the engine's correlators. @@ -811,6 +832,7 @@ func (e *engine) SetCorrelators(correlators []observerdef.Correlator) { defer e.mu.Unlock() e.correlators = correlators + e.rebuildDetectorTags() } // SetExtractors replaces the engine's log-metrics extractors. Used when @@ -822,8 +844,7 @@ func (e *engine) SetExtractors(extractors []observerdef.LogMetricsExtractor) { validateUniqueExtractorNames(extractors) e.extractors = extractors - e.contextProviders = collectContextProviders(extractors) - e.contextRefs = make(map[string]seriesContextRef) + e.rebuildDetectorTags() } // Reset clears analysis state so detectors will re-analyze from scratch. @@ -851,7 +872,6 @@ func (e *engine) Reset() { } } - e.contextRefs = make(map[string]seriesContextRef) } // resetRawAnomalies clears the raw anomaly tracking state. @@ -890,13 +910,10 @@ func (e *engine) resetFull() { } // resetAnalysisState resets detector and correlator state, anomaly tracking, -// telemetry, and correlations — but does NOT reset extractors and does NOT -// clear contextRefs. Used before batch replay so that: -// - enrichAnomaly can still call provider.GetContextByKey (extractor context intact) -// - contextRefs still maps series storage keys to their context keys -// -// Detectors and correlators ARE reset so they start from a clean slate and -// produce correct anomaly/correlation results during the replay. +// telemetry, and correlations — but does NOT reset extractors. Used before +// batch replay so that enrichAnomaly can still attach context (stored on +// seriesStats) during replay. Detectors and correlators ARE reset so they +// start from a clean slate and produce correct results. func (e *engine) resetAnalysisState() { e.mu.Lock() e.lastAnalyzedDataTime = 0 @@ -911,8 +928,8 @@ func (e *engine) resetAnalysisState() { for _, correlator := range e.correlators { correlator.Reset() } - // Extractors and contextRefs are intentionally NOT reset: their state was - // built during log ingestion and is needed by enrichAnomaly during replay. + // Extractors are intentionally NOT reset: their state was built during + // log ingestion and is needed by enrichAnomaly during replay. e.resetRawAnomalies() e.resetTelemetry() diff --git a/comp/anomalydetection/observer/impl/events_test.go b/comp/anomalydetection/observer/impl/events_test.go index 26bd2233b7fd..06d6a2ef97a7 100644 --- a/comp/anomalydetection/observer/impl/events_test.go +++ b/comp/anomalydetection/observer/impl/events_test.go @@ -115,21 +115,9 @@ func (d *anomalyDetector) Detect(_ observerdef.StorageReader, _ int64) observerd } } -type stubContextProvider struct { - context observerdef.MetricContext - ok bool - requestedKey string -} - -func (p *stubContextProvider) GetContextByKey(key string) (observerdef.MetricContext, bool) { - p.requestedKey = key - return p.context, p.ok -} - type stubExtractor struct { - name string - contextByKey map[string]observerdef.MetricContext - resetCount int + name string + resetCount int } func (e *stubExtractor) Name() string { return e.name } @@ -138,11 +126,6 @@ func (e *stubExtractor) ProcessLog(_ observerdef.LogView) observerdef.LogMetrics return observerdef.LogMetricsExtractorOutput{} } -func (e *stubExtractor) GetContextByKey(key string) (observerdef.MetricContext, bool) { - ctx, ok := e.contextByKey[key] - return ctx, ok -} - func (e *stubExtractor) Reset() { e.resetCount++ } @@ -234,14 +217,14 @@ func TestAdvanceEmitsAnomalyCreatedEvents(t *testing.T) { } func TestAdvanceEnrichesAnomalyContextWithoutOverwritingDescription(t *testing.T) { - provider := &stubContextProvider{ - context: observerdef.MetricContext{ - Pattern: "error <*> timeout", - Example: "very long example line that should still be attached as context without replacing the detector description", - Source: "log_metrics_extractor", - }, - ok: true, - } + storage := newTimeSeriesStorage() + res := storage.Add("log_metrics_extractor", "log.pattern.abc.count", 1.0, 1, []string{"observer_source:source-a", "service:api"}) + storage.SetContext(res.Ref, &observerdef.MetricContext{ + Pattern: "error <*> timeout", + Example: "very long example line that should still be attached as context without replacing the detector description", + Source: "log_metrics_extractor", + }) + anomalies := []observerdef.Anomaly{{ Source: observerdef.SeriesDescriptor{ Namespace: "log_metrics_extractor", @@ -249,31 +232,21 @@ func TestAdvanceEnrichesAnomalyContextWithoutOverwritingDescription(t *testing.T Tags: []string{"observer_source:source-a", "service:api"}, Aggregate: observerdef.AggregateCount, }, + SourceRef: &observerdef.QueryHandle{Ref: res.Ref, Aggregate: observerdef.AggregateCount}, DetectorName: "test", Description: "detector-authored description", Timestamp: 99, }} - storage := newTimeSeriesStorage() - // Add a series so that the storage key matches Source fields. - storage.Add("log_metrics_extractor", "log.pattern.abc.count", 1.0, 1, []string{"observer_source:source-a", "service:api"}) e := newEngine(engineConfig{ storage: storage, detectors: []observerdef.Detector{ &anomalyDetector{name: "test", anomalies: anomalies}, }, - contextProviders: map[string]observerdef.ContextProvider{ - "log_metrics_extractor": provider, - }, }) - e.contextRefs["log_metrics_extractor|log.pattern.abc.count|observer_source:source-a,service:api"] = seriesContextRef{ - namespace: "log_metrics_extractor", - contextKey: "ctx-1", - } sink := &collectingSink{} e.Subscribe(sink) - e.Advance(100) anomalyEvents := sink.eventsOfKind(eventAnomalyCreated) @@ -284,32 +257,25 @@ func TestAdvanceEnrichesAnomalyContextWithoutOverwritingDescription(t *testing.T assert.Equal(t, "error <*> timeout", got.Context.Pattern) assert.Equal(t, "log_metrics_extractor", got.Context.Source) assert.Contains(t, got.Context.Example, "very long example line") - assert.Equal(t, "ctx-1", provider.requestedKey) } -func TestSetExtractorsRefreshesContextProviders(t *testing.T) { - first := &stubExtractor{name: "first", contextByKey: map[string]observerdef.MetricContext{}} - second := &stubExtractor{name: "second", contextByKey: map[string]observerdef.MetricContext{ - "ctx-2": {Pattern: "p2", Example: "e2", Source: "second"}, - }} +func TestSetExtractorsContextSurvivesInStorage(t *testing.T) { storage := newTimeSeriesStorage() - // Add a series so that the storage key matches Source fields. - storage.Add("second", "metric", 1.0, 1, []string{"service:api"}) + res := storage.Add("second", "metric", 1.0, 1, []string{"service:api"}) + storage.SetContext(res.Ref, &observerdef.MetricContext{Pattern: "p2", Example: "e2", Source: "second"}) + e := newEngine(engineConfig{ - storage: storage, - extractors: []observerdef.LogMetricsExtractor{first}, - contextProviders: collectContextProviders([]observerdef.LogMetricsExtractor{first}), + storage: storage, + extractors: []observerdef.LogMetricsExtractor{&stubExtractor{name: "first"}}, detectors: []observerdef.Detector{&anomalyDetector{name: "test", anomalies: []observerdef.Anomaly{{ Source: observerdef.SeriesDescriptor{Namespace: "second", Name: "metric", Tags: []string{"service:api"}}, + SourceRef: &observerdef.QueryHandle{Ref: res.Ref}, Timestamp: 1, }}}}, }) - e.SetExtractors([]observerdef.LogMetricsExtractor{second}) - e.contextRefs["second|metric|service:api"] = seriesContextRef{ - namespace: "second", - contextKey: "ctx-2", - } + // Context lives in storage, not in engine maps — SetExtractors doesn't clear it. + e.SetExtractors([]observerdef.LogMetricsExtractor{&stubExtractor{name: "second"}}) result := e.Advance(2) require.Len(t, result.anomalies, 1) require.NotNil(t, result.anomalies[0].Context) @@ -320,26 +286,25 @@ func TestEnrichAnomalyWithRealLogPatternExtractorUsesStoredSeriesTags(t *testing extractor := NewLogPatternExtractor(DefaultLogPatternExtractorConfig()) extractor.config.MinClusterSizeBeforeEmit = 1 e := newEngine(engineConfig{ - storage: newTimeSeriesStorage(), - extractors: []observerdef.LogMetricsExtractor{extractor}, - contextProviders: collectContextProviders([]observerdef.LogMetricsExtractor{extractor}), + storage: newTimeSeriesStorage(), + extractors: []observerdef.LogMetricsExtractor{extractor}, }) _, _ = e.IngestLog("source-a", &logObs{ - content: []byte("GET /users/123 returned 500"), + content: "GET /users/123 returned 500", status: "warn", tags: []string{"service:api"}, timestampMs: 1_000, }) - // Second log in the same sub-clusterer so patterns merge and produce a wildcard. + // Second log merges the pattern and updates the stored example to the latest. _, _ = e.IngestLog("source-a", &logObs{ - content: []byte("GET /users/456 returned 500"), + content: "GET /users/456 returned 500", status: "warn", tags: []string{"service:api"}, timestampMs: 1_500, }) _, _ = e.IngestLog("source-b", &logObs{ - content: []byte("GET /users/456 returned 500"), + content: "GET /users/789 returned 500", status: "warn", tags: []string{"service:worker"}, timestampMs: 2_000, @@ -354,6 +319,7 @@ func TestEnrichAnomalyWithRealLogPatternExtractorUsesStoredSeriesTags(t *testing Name: meta.Name, Tags: meta.Tags, }, + SourceRef: &observerdef.QueryHandle{Ref: meta.Ref}, } break } @@ -363,23 +329,22 @@ func TestEnrichAnomalyWithRealLogPatternExtractorUsesStoredSeriesTags(t *testing e.enrichAnomaly(&anomaly) require.NotNil(t, anomaly.Context) assert.Equal(t, "log_pattern_extractor", anomaly.Context.Source) - assert.Equal(t, "GET /users/123 returned 500", anomaly.Context.Example) + // Example is updated to the latest matching log. + assert.Equal(t, "GET /users/456 returned 500", anomaly.Context.Example) assert.Contains(t, anomaly.Context.Pattern, "*") - assert.NotContains(t, anomaly.Context.Example, "456") } func TestAdvance_LogMetricAnomalyIsEnrichedViaMatchingSeriesIdentity(t *testing.T) { extractor := NewLogMetricsExtractor(LogMetricsExtractorConfig{}) detector := newSeriesDetectorAdapter(&emitOnSeriesDetector{name: "test_series_detector"}, []observerdef.Aggregate{observerdef.AggregateCount}) e := newEngine(engineConfig{ - storage: newTimeSeriesStorage(), - extractors: []observerdef.LogMetricsExtractor{extractor}, - contextProviders: collectContextProviders([]observerdef.LogMetricsExtractor{extractor}), - detectors: []observerdef.Detector{detector}, + storage: newTimeSeriesStorage(), + extractors: []observerdef.LogMetricsExtractor{extractor}, + detectors: []observerdef.Detector{detector}, }) e.IngestLog("source-a", &logObs{ - content: []byte("GET /users/123 returned 500"), + content: "GET /users/123 returned 500", tags: []string{"service:api"}, timestampMs: 1_000, }) @@ -399,7 +364,7 @@ func TestAdvance_LogMetricAnomalyIsEnrichedViaMatchingSeriesIdentity(t *testing. require.NotNil(t, anomaly.Context) assert.Equal(t, "log_metrics_extractor", anomaly.Context.Source) assert.Equal(t, "GET /users/123 returned 500", anomaly.Context.Example) - assert.Equal(t, logSignature([]byte("GET /users/123 returned 500"), extractor.config.MaxEvalBytes), anomaly.Context.Pattern) + assert.Equal(t, logSignature("GET /users/123 returned 500", extractor.config.MaxEvalBytes), anomaly.Context.Pattern) } func containsTag(tags []string, want string) bool { @@ -412,8 +377,8 @@ func containsTag(tags []string, want string) bool { } func TestNewEnginePanicsOnDuplicateExtractorNames(t *testing.T) { - first := &stubExtractor{name: "dup", contextByKey: map[string]observerdef.MetricContext{}} - second := &stubExtractor{name: "dup", contextByKey: map[string]observerdef.MetricContext{}} + first := &stubExtractor{name: "dup"} + second := &stubExtractor{name: "dup"} assert.PanicsWithValue(t, `duplicate log extractor name: "dup"`, func() { _ = newEngine(engineConfig{ @@ -425,8 +390,8 @@ func TestNewEnginePanicsOnDuplicateExtractorNames(t *testing.T) { func TestSetExtractorsPanicsOnDuplicateExtractorNames(t *testing.T) { e := newEngine(engineConfig{storage: newTimeSeriesStorage()}) - first := &stubExtractor{name: "dup", contextByKey: map[string]observerdef.MetricContext{}} - second := &stubExtractor{name: "dup", contextByKey: map[string]observerdef.MetricContext{}} + first := &stubExtractor{name: "dup"} + second := &stubExtractor{name: "dup"} assert.PanicsWithValue(t, `duplicate log extractor name: "dup"`, func() { e.SetExtractors([]observerdef.LogMetricsExtractor{first, second}) @@ -590,7 +555,7 @@ func TestReplayWithLiveScheduleNoMatchingTimestamps(t *testing.T) { func TestEngineResetResetsDetectorsAndCorrelators(t *testing.T) { detector := &resettableDetector{name: "detector"} correlator := &resettableCorrelator{name: "correlator"} - extractor := &stubExtractor{name: "extractor", contextByKey: map[string]observerdef.MetricContext{}} + extractor := &stubExtractor{name: "extractor"} e := newEngine(engineConfig{ storage: newTimeSeriesStorage(), detectors: []observerdef.Detector{detector}, @@ -942,7 +907,7 @@ func TestFindingM12_LogOnlyTimestampsSkippedInReplay(t *testing.T) { // Ingest log at 103 (no virtual metrics produced) logRequests, _ := e.IngestLog("ns", &logObs{ - content: []byte("error happened"), + content: "error happened", status: "error", timestampMs: 103000, // 103 seconds in millis }) @@ -1008,7 +973,7 @@ func TestIngestLogCopiesMetricTagsBeforeInjectingObserverSource(t *testing.T) { }) _, _ = e.IngestLog("source-a", &logObs{ - content: []byte("hello"), + content: "hello", tags: []string{"env:test"}, timestampMs: 1000, }) diff --git a/comp/anomalydetection/observer/impl/ingest_metrics_disabled_test.go b/comp/anomalydetection/observer/impl/ingest_metrics_disabled_test.go index c86a1bedaf06..37854f96a14d 100644 --- a/comp/anomalydetection/observer/impl/ingest_metrics_disabled_test.go +++ b/comp/anomalydetection/observer/impl/ingest_metrics_disabled_test.go @@ -73,7 +73,7 @@ func TestObserverDropsMetricsWhenIngestMetricsDisabled(t *testing.T) { }) drop.ObserveLog(&logObs{ - content: []byte("Request completed in 45ms"), + content: "Request completed in 45ms", status: "info", tags: []string{"service:web"}, timestampMs: 1_000_000, @@ -104,7 +104,7 @@ func TestMetricDropHandle(t *testing.T) { assert.True(t, wrap.ObserveMetricAndReportDrop(&sampleNoSource{name: "any.metric"}), "ObserveMetricAndReportDrop reports true (config drop) so recordingHandle writes Dropped=true") - wrap.ObserveLog(&logObs{content: []byte("hi"), timestampMs: 1}) + wrap.ObserveLog(&logObs{content: "hi", timestampMs: 1}) assert.Equal(t, 1, inner.logReceived, "metricDropHandle: inner.logReceived = %d, want 1 (ObserveLog must forward to inner)", inner.logReceived) } diff --git a/comp/anomalydetection/observer/impl/log_detector_connection_errors.go b/comp/anomalydetection/observer/impl/log_detector_connection_errors.go index 128515a5da5c..bf361fcad6e0 100644 --- a/comp/anomalydetection/observer/impl/log_detector_connection_errors.go +++ b/comp/anomalydetection/observer/impl/log_detector_connection_errors.go @@ -31,63 +31,31 @@ func DefaultConnectionErrorExtractorConfig() ConnectionErrorExtractorConfig { // ConnectionErrorExtractor is a log detector that detects connection errors // and converts them into a connection.errors metric. -// It implements observer.ContextProvider so that anomalies detected on the -// emitted metric can be enriched with the matched pattern and an example log line. -type ConnectionErrorExtractor struct { - // patternContext tracks the matched pattern and an example log line for - // each (metric, tags) combination so enrichAnomaly can format a - // human-readable description instead of a raw tag dump. - patternContext map[string]observer.MetricContext -} +type ConnectionErrorExtractor struct{} // Name returns the detector name. func (c *ConnectionErrorExtractor) Name() string { return "connection_error_extractor" } -// Reset clears the context map; called when the engine resets for replay. -func (c *ConnectionErrorExtractor) Reset() { - c.patternContext = nil -} - -// GetContextByKey implements observer.ContextProvider. -func (c *ConnectionErrorExtractor) GetContextByKey(key string) (observer.MetricContext, bool) { - if c.patternContext == nil { - return observer.MetricContext{}, false - } - ctx, ok := c.patternContext[key] - return ctx, ok -} - // ProcessLog checks if a log contains connection error patterns and returns a metric if so. // Anomaly detection is handled by metrics detection on the count aggregation of the emitted metric. func (c *ConnectionErrorExtractor) ProcessLog(log observer.LogView) observer.LogMetricsExtractorOutput { - content := strings.ToLower(string(log.GetContent())) + content := strings.ToLower(log.GetContent()) tags := log.GetTags() for _, pattern := range connectionErrorPatterns { if strings.Contains(content, pattern) { - contextKey := metricContextKey("connection.errors", tags) - - if c.patternContext == nil { - c.patternContext = make(map[string]observer.MetricContext) - } - // Store the matched pattern + a truncated example log line so - // enrichAnomaly can produce "Log pattern change rate detected: - // pattern: connection refused - // example: " instead of a raw tag dump. - c.patternContext[contextKey] = observer.MetricContext{ - Pattern: pattern, - Example: truncate(string(log.GetContent()), 160), - Source: "connection_error_extractor", - } - return observer.LogMetricsExtractorOutput{ Metrics: []observer.MetricOutput{{ - Name: "connection.errors", - Value: 1.0, - Tags: tags, - ContextKey: contextKey, + Name: "connection.errors", + Value: 1.0, + Tags: tags, + Context: &observer.MetricContext{ + Pattern: pattern, + Example: truncate(log.GetContent(), 160), + Source: "connection_error_extractor", + }, }}, } } diff --git a/comp/anomalydetection/observer/impl/log_detector_connection_errors_test.go b/comp/anomalydetection/observer/impl/log_detector_connection_errors_test.go index 130dc430e267..45118f8010ec 100644 --- a/comp/anomalydetection/observer/impl/log_detector_connection_errors_test.go +++ b/comp/anomalydetection/observer/impl/log_detector_connection_errors_test.go @@ -19,7 +19,7 @@ func TestConnectionErrorExtractor_Name(t *testing.T) { func TestConnectionErrorExtractor_Process_ConnectionRefused(t *testing.T) { e := &ConnectionErrorExtractor{} log := &mockLogView{ - content: []byte("Failed to connect: connection refused"), + content: "Failed to connect: connection refused", tags: []string{"env:prod", "service:api"}, } @@ -34,7 +34,7 @@ func TestConnectionErrorExtractor_Process_ConnectionRefused(t *testing.T) { func TestConnectionErrorExtractor_Process_ECONNRESET(t *testing.T) { e := &ConnectionErrorExtractor{} log := &mockLogView{ - content: []byte("Socket error: ECONNRESET"), + content: "Socket error: ECONNRESET", tags: []string{"env:staging"}, } @@ -49,7 +49,7 @@ func TestConnectionErrorExtractor_Process_ECONNRESET(t *testing.T) { func TestConnectionErrorExtractor_Process_NoMatch(t *testing.T) { e := &ConnectionErrorExtractor{} log := &mockLogView{ - content: []byte("Request completed successfully"), + content: "Request completed successfully", tags: []string{"env:test"}, } @@ -61,7 +61,7 @@ func TestConnectionErrorExtractor_Process_NoMatch(t *testing.T) { func TestConnectionErrorExtractor_Process_CaseInsensitive(t *testing.T) { e := &ConnectionErrorExtractor{} log := &mockLogView{ - content: []byte("Error: Connection Refused by server"), + content: "Error: Connection Refused by server", tags: []string{"env:prod"}, } @@ -77,7 +77,7 @@ func TestConnectionErrorExtractor_Process_TagsCopied(t *testing.T) { e := &ConnectionErrorExtractor{} inputTags := []string{"env:prod", "service:api", "host:web-1"} log := &mockLogView{ - content: []byte("connection timed out after 30s"), + content: "connection timed out after 30s", tags: inputTags, } @@ -105,7 +105,7 @@ func TestConnectionErrorExtractor_Process_AllPatterns(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { log := &mockLogView{ - content: []byte(tc.content), + content: tc.content, tags: []string{"test:pattern"}, } diff --git a/comp/anomalydetection/observer/impl/log_metrics_extractor.go b/comp/anomalydetection/observer/impl/log_metrics_extractor.go index 4ec63e25e138..4f54d71e2b21 100644 --- a/comp/anomalydetection/observer/impl/log_metrics_extractor.go +++ b/comp/anomalydetection/observer/impl/log_metrics_extractor.go @@ -6,10 +6,8 @@ package observerimpl import ( - "bytes" "encoding/json" - "fmt" - "hash/fnv" + "strconv" "strings" "unicode" @@ -46,17 +44,10 @@ const LogMetricsExtractorName = "log_metrics_extractor" // - Unstructured logs: pattern frequency -> Sum aggregation // // This is intentionally minimal; cardinality controls live in the observer storage (Step 5). -// -// LogMetricsExtractor also implements observer.ContextProvider, tracking the -// pattern signature and a recent example log line for each pattern metric it -// emits. Detectors can query this via StorageReader.GetContext to produce -// richer anomaly descriptions. +// Each MetricOutput carries an inline Context so the engine can attach it to the series +// for anomaly enrichment without a separate string-keyed lookup. type LogMetricsExtractor struct { config LogMetricsExtractorConfig - - // patternContext tracks the signature and a recent example for each context - // key emitted by this extractor. - patternContext map[string]observer.MetricContext } // NewLogMetricsExtractor creates a LogMetricsExtractor with the given config. @@ -66,22 +57,6 @@ func NewLogMetricsExtractor(config LogMetricsExtractorConfig) *LogMetricsExtract func (a *LogMetricsExtractor) Name() string { return LogMetricsExtractorName } -// Reset clears cached per-series context so replay/reanalysis starts from the -// currently observed data instead of reusing stale examples. -func (a *LogMetricsExtractor) Reset() { - a.patternContext = nil -} - -// GetContextByKey implements observer.ContextProvider. It returns the pattern -// signature and a recent example log line for a previously emitted context key. -func (a *LogMetricsExtractor) GetContextByKey(key string) (observer.MetricContext, bool) { - if a.patternContext == nil { - return observer.MetricContext{}, false - } - ctx, ok := a.patternContext[key] - return ctx, ok -} - func (a *LogMetricsExtractor) ProcessLog(log observer.LogView) observer.LogMetricsExtractorOutput { content := log.GetContent() tags := log.GetTags() @@ -93,43 +68,34 @@ func (a *LogMetricsExtractor) ProcessLog(log observer.LogView) observer.LogMetri } metricName := patternCountMetricName(patternSig) - contextKey := metricContextKey(metricName, tags) - - // Track context for this pattern metric so detectors can enrich anomalies. - if a.patternContext == nil { - a.patternContext = make(map[string]observer.MetricContext) - } - a.patternContext[contextKey] = observer.MetricContext{ - Pattern: patternSig, - Example: string(content), - Source: "log_metrics_extractor", - } metrics := []observer.MetricOutput{{ - Name: metricName, - Value: 1, - Tags: tags, - ContextKey: contextKey, + Name: metricName, + Value: 1, + Tags: tags, + Context: &observer.MetricContext{ + Pattern: patternSig, + Example: content, + Source: "log_metrics_extractor", + }, }} // For JSON logs, also extract numeric field metrics if isJSONObject(content) { - metrics = append(metrics, a.extractJSONFieldMetrics(content, tags, string(content))...) + metrics = append(metrics, a.extractJSONFieldMetrics(content, tags)...) } return observer.LogMetricsExtractorOutput{Metrics: metrics} } -func isJSONObject(b []byte) bool { - trimmed := bytes.TrimSpace(b) - return len(trimmed) > 1 && trimmed[0] == '{' && json.Valid(trimmed) +func isJSONObject(s string) bool { + trimmed := strings.TrimSpace(s) + return len(trimmed) > 1 && trimmed[0] == '{' && json.Valid([]byte(trimmed)) } // extractJSONFieldMetrics extracts numeric field metrics from JSON content. -// example is the raw log line stored in context so enrichAnomaly can show a -// representative log rather than a raw tag dump when an anomaly is detected. -func (a *LogMetricsExtractor) extractJSONFieldMetrics(content []byte, tags []string, example string) []observer.MetricOutput { - dec := json.NewDecoder(bytes.NewReader(content)) +func (a *LogMetricsExtractor) extractJSONFieldMetrics(content string, tags []string) []observer.MetricOutput { + dec := json.NewDecoder(strings.NewReader(content)) dec.UseNumber() var obj map[string]any @@ -137,10 +103,6 @@ func (a *LogMetricsExtractor) extractJSONFieldMetrics(content []byte, tags []str return nil } - if a.patternContext == nil { - a.patternContext = make(map[string]observer.MetricContext) - } - var out []observer.MetricOutput for k, v := range obj { if a.config.ExcludeFields != nil { @@ -160,18 +122,15 @@ func (a *LogMetricsExtractor) extractJSONFieldMetrics(content []byte, tags []str } metricName := "log.field." + sanitizeMetricFragment(k) - contextKey := metricContextKey(metricName, tags) - a.patternContext[contextKey] = observer.MetricContext{ - Pattern: k, - Example: truncate(example, 160), - Source: "log_metrics_extractor", - } - out = append(out, observer.MetricOutput{ - Name: metricName, - Value: f, - Tags: tags, - ContextKey: contextKey, + Name: metricName, + Value: f, + Tags: tags, + Context: &observer.MetricContext{ + Pattern: k, + Example: truncate(content, 160), + Source: "log_metrics_extractor", + }, }) } @@ -205,14 +164,8 @@ func coerceNumber(v any) (float64, bool) { } } -func metricContextKey(metricName string, tags []string) string { - return seriesKey("", metricName, tags) -} - func patternCountMetricName(signature string) string { - h := fnv.New64a() - _, _ = h.Write([]byte(signature)) - return fmt.Sprintf("log.pattern.%x.count", h.Sum64()) + return "log.pattern." + strconv.FormatUint(fnv64aString(signature), 16) + ".count" } func sanitizeMetricFragment(s string) string { @@ -310,8 +263,8 @@ const ( tokEnd ) -// logSignature returns a deterministic signature for the input bytes, capped to maxEvalBytes if > 0. -func logSignature(input []byte, maxEvalBytes int) string { +// logSignature returns a deterministic signature for the input string, capped to maxEvalBytes if > 0. +func logSignature(input string, maxEvalBytes int) string { if len(input) == 0 { return "" } @@ -373,7 +326,8 @@ func logSignature(input []byte, maxEvalBytes int) string { out.WriteString(sigTokenToString(lastToken)) } - for _, char := range input[1:] { + for i := 1; i < len(input); i++ { + char := input[i] currentToken := getTokenType(char) if currentToken != lastToken { insertToken() diff --git a/comp/anomalydetection/observer/impl/log_metrics_extractor_test.go b/comp/anomalydetection/observer/impl/log_metrics_extractor_test.go index 0dbfef96e9f8..af5e7cdd1b93 100644 --- a/comp/anomalydetection/observer/impl/log_metrics_extractor_test.go +++ b/comp/anomalydetection/observer/impl/log_metrics_extractor_test.go @@ -6,8 +6,6 @@ package observerimpl import ( - "fmt" - "hash/fnv" "testing" "github.com/stretchr/testify/assert" @@ -25,7 +23,7 @@ func TestLogMetricsExtractor_JSONNumericExtraction(t *testing.T) { }) log := &mockLogView{ - content: []byte(`{"duration_ms":45,"status":200,"foo":"bar","pid":1234}`), + content: `{"duration_ms":45,"status":200,"foo":"bar","pid":1234}`, tags: []string{"service:api"}, } @@ -39,10 +37,8 @@ func TestLogMetricsExtractor_JSONNumericExtraction(t *testing.T) { } // Pattern count is based on the full JSON payload. - sig := logSignature([]byte(`{"duration_ms":45,"status":200,"foo":"bar","pid":1234}`), 0) - h := fnv.New64a() - _, _ = h.Write([]byte(sig)) - expectedCountName := fmt.Sprintf("log.pattern.%x.count", h.Sum64()) + sig := logSignature(`{"duration_ms":45,"status":200,"foo":"bar","pid":1234}`, 0) + expectedCountName := patternCountMetricName(sig) if m, ok := got[expectedCountName]; assert.True(t, ok) { assert.Equal(t, float64(1), m.Value) assert.Equal(t, []string{"service:api"}, m.Tags) @@ -62,7 +58,7 @@ func TestLogMetricsExtractor_UnstructuredPatternCount(t *testing.T) { a := NewLogMetricsExtractor(LogMetricsExtractorConfig{}) log := &mockLogView{ - content: []byte("Request completed in 45ms"), + content: "Request completed in 45ms", tags: []string{"service:web"}, } @@ -72,10 +68,8 @@ func TestLogMetricsExtractor_UnstructuredPatternCount(t *testing.T) { assert.Equal(t, []string{"service:web"}, res.Metrics[0].Tags) // Compute expected metric name (hash of signature). - sig := logSignature([]byte("Request completed in 45ms"), 0) - h := fnv.New64a() - _, _ = h.Write([]byte(sig)) - assert.Equal(t, fmt.Sprintf("log.pattern.%x.count", h.Sum64()), res.Metrics[0].Name) + sig := logSignature("Request completed in 45ms", 0) + assert.Equal(t, patternCountMetricName(sig), res.Metrics[0].Name) } func TestLogMetricsExtractor_JSONIncludeFields(t *testing.T) { @@ -86,7 +80,7 @@ func TestLogMetricsExtractor_JSONIncludeFields(t *testing.T) { }) log := &mockLogView{ - content: []byte(`{"duration_ms":45,"status":200}`), + content: `{"duration_ms":45,"status":200}`, tags: []string{"service:api"}, } @@ -102,10 +96,8 @@ func TestLogMetricsExtractor_JSONIncludeFields(t *testing.T) { assert.Equal(t, float64(45), m.Value) } - sig := logSignature([]byte(`{"duration_ms":45,"status":200}`), 0) - h := fnv.New64a() - _, _ = h.Write([]byte(sig)) - expectedCountName := fmt.Sprintf("log.pattern.%x.count", h.Sum64()) + sig := logSignature(`{"duration_ms":45,"status":200}`, 0) + expectedCountName := patternCountMetricName(sig) if m, ok := got[expectedCountName]; assert.True(t, ok) { assert.Equal(t, float64(1), m.Value) } @@ -115,43 +107,38 @@ func TestLogMetricsExtractor_InvalidJSONFallsBackToUnstructured(t *testing.T) { a := NewLogMetricsExtractor(LogMetricsExtractorConfig{}) // Looks like JSON but is invalid -> treated as unstructured (pattern frequency). - input := []byte(`{"duration_ms":45,`) + input := `{"duration_ms":45,` log := &mockLogView{content: input, tags: []string{"service:api"}} res := a.ProcessLog(log) require.Len(t, res.Metrics, 1) sig := logSignature(input, 0) - h := fnv.New64a() - _, _ = h.Write([]byte(sig)) - assert.Equal(t, fmt.Sprintf("log.pattern.%x.count", h.Sum64()), res.Metrics[0].Name) + assert.Equal(t, patternCountMetricName(sig), res.Metrics[0].Name) } -func TestLogMetricsExtractor_GetContextByKeyUsesOutputContextKey(t *testing.T) { +func TestLogMetricsExtractor_ProcessLogEmitsInlineContext(t *testing.T) { a := &LogMetricsExtractor{} log := &mockLogView{ - content: []byte("Request completed in 45ms"), + content: "Request completed in 45ms", tags: []string{"service:web", "env:prod"}, } res := a.ProcessLog(log) require.Len(t, res.Metrics, 1) - require.NotEmpty(t, res.Metrics[0].ContextKey) - - ctx, ok := a.GetContextByKey(res.Metrics[0].ContextKey) - require.True(t, ok) - assert.Equal(t, "log_metrics_extractor", ctx.Source) - assert.Equal(t, "Request completed in 45ms", ctx.Example) + require.NotNil(t, res.Metrics[0].Context) + assert.Equal(t, "log_metrics_extractor", res.Metrics[0].Context.Source) + assert.Equal(t, "Request completed in 45ms", res.Metrics[0].Context.Example) } -func TestLogMetricsExtractor_ContextKeySeparatesSameMetricByTags(t *testing.T) { +func TestLogMetricsExtractor_SamePatternDifferentTagsHaveDistinctContexts(t *testing.T) { a := &LogMetricsExtractor{} logA := &mockLogView{ - content: []byte("Request completed in 45ms"), + content: "Request completed in 45ms", tags: []string{"service:api"}, } logB := &mockLogView{ - content: []byte("Request completed in 45ms"), + content: "Request completed in 45ms", tags: []string{"service:worker"}, } @@ -160,14 +147,10 @@ func TestLogMetricsExtractor_ContextKeySeparatesSameMetricByTags(t *testing.T) { require.Len(t, resA.Metrics, 1) require.Len(t, resB.Metrics, 1) require.Equal(t, resA.Metrics[0].Name, resB.Metrics[0].Name) - require.NotEqual(t, resA.Metrics[0].ContextKey, resB.Metrics[0].ContextKey) - - ctxA, ok := a.GetContextByKey(resA.Metrics[0].ContextKey) - require.True(t, ok) - ctxB, ok := a.GetContextByKey(resB.Metrics[0].ContextKey) - require.True(t, ok) - assert.Equal(t, "Request completed in 45ms", ctxA.Example) - assert.Equal(t, "Request completed in 45ms", ctxB.Example) - assert.Equal(t, ctxA.Pattern, ctxB.Pattern) + require.NotNil(t, resA.Metrics[0].Context) + require.NotNil(t, resB.Metrics[0].Context) + assert.Equal(t, "Request completed in 45ms", resA.Metrics[0].Context.Example) + assert.Equal(t, "Request completed in 45ms", resB.Metrics[0].Context.Example) + assert.Equal(t, resA.Metrics[0].Context.Pattern, resB.Metrics[0].Context.Pattern) } diff --git a/comp/anomalydetection/observer/impl/log_pattern_extractor.go b/comp/anomalydetection/observer/impl/log_pattern_extractor.go index dc431953230c..22e8244c150d 100644 --- a/comp/anomalydetection/observer/impl/log_pattern_extractor.go +++ b/comp/anomalydetection/observer/impl/log_pattern_extractor.go @@ -100,71 +100,17 @@ func tokenizerFromConfig(cfg LogPatternExtractorConfig) *patterns.Tokenizer { return t } -// PatternKeyInfo contains what can identify a pattern. -type PatternKeyInfo struct { - ClusterID int64 - GroupHash uint64 -} - // LogPatternExtractor is a LogMetricsExtractor that clusters log messages into // patterns and emits a count metric per pattern. type LogPatternExtractor struct { taggedClusterer *TaggedPatternClusterer registry *TagGroupByKeyRegistry - ctx logPatternExtractorContext NextGarbageCollectionTime int64 // config is the resolved hyperparameters (MinClusterSizeBeforeEmit is never zero after init). config LogPatternExtractorConfig } var _ observerdef.LogMetricsExtractor = (*LogPatternExtractor)(nil) -var _ observerdef.ContextProvider = (*LogPatternExtractor)(nil) - -type patternMetricContext struct { - keyInfo PatternKeyInfo - example string -} - -// logPatternExtractorContext holds per-metric context for GetContextByKey and -// indexes keys by tagged cluster (globalClusterHash) for O(cluster) deletion on GC. -// The tagged key encodes both groupHash and clusterID so that different sub-clusterers -// with coincidentally equal cluster IDs don't collide. -type logPatternExtractorContext struct { - byKey map[string]patternMetricContext - keysByTaggedCluster map[string][]string // key: globalClusterHash(groupHash, clusterID) -} - -func (c *logPatternExtractorContext) get(key string) (patternMetricContext, bool) { - if c.byKey == nil { - return patternMetricContext{}, false - } - v, ok := c.byKey[key] - return v, ok -} - -func (c *logPatternExtractorContext) put(groupHash uint64, clusterID int64, contextKey string, entry patternMetricContext) { - taggedKey := globalClusterHash(groupHash, clusterID) - if c.byKey == nil { - c.byKey = make(map[string]patternMetricContext) - } - if _, exists := c.byKey[contextKey]; !exists { - if c.keysByTaggedCluster == nil { - c.keysByTaggedCluster = make(map[string][]string) - } - c.keysByTaggedCluster[taggedKey] = append(c.keysByTaggedCluster[taggedKey], contextKey) - c.byKey[contextKey] = entry - } -} - -func (c *logPatternExtractorContext) removeTaggedCluster(taggedKey string) { - if c.byKey == nil { - return - } - for _, k := range c.keysByTaggedCluster[taggedKey] { - delete(c.byKey, k) - } - delete(c.keysByTaggedCluster, taggedKey) -} // NewLogPatternExtractor creates a new LogPatternExtractor. // A zero-value cfg is accepted; zero fields fall back to DefaultLogPatternExtractorConfig values. @@ -216,38 +162,14 @@ func (e *LogPatternExtractor) Name() string { return "log_pattern_extractor" } -// Reset clears clustering and cached per-series context so reanalysis starts -// from the currently observed logs. The registry is kept so that previously -// registered hashes remain resolvable. +// Reset clears clustering state so reanalysis starts from the currently +// observed logs. The registry is kept so that previously registered hashes +// remain resolvable. func (e *LogPatternExtractor) Reset() { e.taggedClusterer.Reset() - e.ctx = logPatternExtractorContext{} e.NextGarbageCollectionTime = 0 } -// GetContextByKey implements observerdef.ContextProvider for pattern metrics -// emitted by this extractor. -func (e *LogPatternExtractor) GetContextByKey(key string) (observerdef.MetricContext, bool) { - entry, ok := e.ctx.get(key) - if !ok { - return observerdef.MetricContext{}, false - } - - pattern := "" - cluster, err := e.taggedClusterer.GetCluster(entry.keyInfo.GroupHash, entry.keyInfo.ClusterID) - if err == nil && cluster != nil { - pattern = cluster.PatternString() - } - - group, _ := e.registry.Lookup(entry.keyInfo.GroupHash) - return observerdef.MetricContext{ - Pattern: pattern, - Example: entry.example, - Source: e.Name(), - SplitTags: group.AsMap(), - }, true -} - // logSeverityIsWarnPlus returns true when the log should be clustered: warning func logSeverityIsWarnPlus(log observerdef.LogView) bool { status := strings.ToLower(strings.TrimSpace(log.GetStatus())) @@ -268,7 +190,7 @@ func (e *LogPatternExtractor) ProcessLog(log observerdef.LogView) observerdef.Lo gc := e.maybeGarbageCollect(logUnixSec) telemetry := []observerdef.ObserverTelemetry{} result := observerdef.LogMetricsExtractorOutput{ - EvictedContextKeys: gc.contextKeys, + EvictedMetricNames: gc.metricNames, } if gc.clustersEvicted > 0 { // We count active patterns so we remove them @@ -277,7 +199,7 @@ func (e *LogPatternExtractor) ProcessLog(log observerdef.LogView) observerdef.Lo if !logSeverityIsWarnPlus(log) { return result } - message := string(log.GetContent()) + message := log.GetContent() groupTags := tagsForPatternGrouping(log.GetTags(), log.GetHostname()) groupHash, cluster, ok := e.taggedClusterer.Process(groupTags, message, logUnixSec) // Drain LRU evictions — from per-group MaxClusters or whole-group MaxTagGroups @@ -287,15 +209,9 @@ func (e *LogPatternExtractor) ProcessLog(log observerdef.LogView) observerdef.Lo // !ok (defensive; current Process only returns !ok for empty messages, after // which no eviction can have occurred, but keep this path honest). if evicted := e.taggedClusterer.DrainLRUEvictions(); len(evicted) > 0 { - var lruKeys []string for _, ev := range evicted { - taggedKey := globalClusterHash(ev.GroupHash, ev.ClusterID) - if e.ctx.keysByTaggedCluster != nil { - lruKeys = append(lruKeys, e.ctx.keysByTaggedCluster[taggedKey]...) - } - e.ctx.removeTaggedCluster(taggedKey) + result.EvictedMetricNames = append(result.EvictedMetricNames, clusterMetricName(e.Name(), ev.GroupHash, ev.ClusterID)) } - result.EvictedContextKeys = append(result.EvictedContextKeys, lruKeys...) telemetry = append(telemetry, newTelemetryCounter([]string{"detector:" + e.Name()}, telemetryLogPatternExtractorPatternCount, -float64(len(evicted)), logUnixSec)) } if !ok { @@ -311,19 +227,18 @@ func (e *LogPatternExtractor) ProcessLog(log observerdef.LogView) observerdef.Lo return result } - metricName := "log." + e.Name() + "." + globalClusterHash(groupHash, cluster.ID) + ".count" - contextKey := metricContextKey(metricName, log.GetTags()) - - e.ctx.put(groupHash, cluster.ID, contextKey, patternMetricContext{ - keyInfo: PatternKeyInfo{ClusterID: cluster.ID, GroupHash: groupHash}, - example: message, - }) - + metricName := clusterMetricName(e.Name(), groupHash, cluster.ID) + group, _ := e.registry.Lookup(groupHash) result.Metrics = []observerdef.MetricOutput{{ - Name: metricName, - Value: 1, - Tags: log.GetTags(), - ContextKey: contextKey, + Name: metricName, + Value: 1, + Tags: log.GetTags(), + Context: &observerdef.MetricContext{ + Pattern: cluster.PatternString(), + Example: message, + Source: e.Name(), + SplitTags: group.AsMap(), + }, }} result.Telemetry = telemetry return result @@ -331,12 +246,12 @@ func (e *LogPatternExtractor) ProcessLog(log observerdef.LogView) observerdef.Lo // gcResult holds what was evicted during a garbage-collection pass. type gcResult struct { - contextKeys []string + metricNames []string clustersEvicted int } // maybeGarbageCollect removes stale clusters from all sub-clusterers and -// returns the context keys evicted so the engine can drop matching contextRefs. +// returns the metric names evicted so the engine can remove matching series. func (e *LogPatternExtractor) maybeGarbageCollect(currentTime int64) gcResult { if e.config.ClusterTimeToLiveSec == 0 || currentTime < e.NextGarbageCollectionTime { return gcResult{} @@ -350,12 +265,13 @@ func (e *LogPatternExtractor) maybeGarbageCollect(currentTime int64) gcResult { } var result gcResult for _, ev := range evicted { - taggedKey := globalClusterHash(ev.GroupHash, ev.ClusterID) - if e.ctx.keysByTaggedCluster != nil { - result.contextKeys = append(result.contextKeys, e.ctx.keysByTaggedCluster[taggedKey]...) - } - e.ctx.removeTaggedCluster(taggedKey) + result.metricNames = append(result.metricNames, clusterMetricName(e.Name(), ev.GroupHash, ev.ClusterID)) } result.clustersEvicted = len(evicted) return result } + +// clusterMetricName returns the deterministic metric name for a pattern cluster. +func clusterMetricName(extractorName string, groupHash uint64, clusterID int64) string { + return "log." + extractorName + "." + globalClusterHash(groupHash, clusterID) + ".count" +} diff --git a/comp/anomalydetection/observer/impl/log_pattern_extractor_test.go b/comp/anomalydetection/observer/impl/log_pattern_extractor_test.go index 3c68c3f3c2bc..6b5737cda345 100644 --- a/comp/anomalydetection/observer/impl/log_pattern_extractor_test.go +++ b/comp/anomalydetection/observer/impl/log_pattern_extractor_test.go @@ -15,22 +15,20 @@ import ( observerdef "github.com/DataDog/datadog-agent/comp/anomalydetection/observer/def" ) -func TestLogPatternExtractor_GetContextByKeyUsesOutputContextKey(t *testing.T) { +func TestLogPatternExtractor_ProcessLogEmitsInlineContext(t *testing.T) { e := NewLogPatternExtractor(DefaultLogPatternExtractorConfig()) e.config.MinClusterSizeBeforeEmit = 1 log := &mockLogView{ - content: []byte("GET /users/123 returned 500"), + content: "GET /users/123 returned 500", status: "warn", tags: []string{"service:web", "env:prod"}, } res := e.ProcessLog(log) require.Len(t, res.Metrics, 1) - require.NotEmpty(t, res.Metrics[0].ContextKey) - - ctx, ok := e.GetContextByKey(res.Metrics[0].ContextKey) - require.True(t, ok) + require.NotNil(t, res.Metrics[0].Context) + ctx := res.Metrics[0].Context assert.Equal(t, "log_pattern_extractor", ctx.Source) assert.Equal(t, "GET /users/123 returned 500", ctx.Example) assert.NotEmpty(t, ctx.Pattern) @@ -43,22 +41,22 @@ func TestLogPatternExtractor_DifferentTagGroupsProduceDifferentMetricNames(t *te // 1 pattern per service (same pattern strings but different IDs) logA := &mockLogView{ - content: []byte("GET /users/123 returned 500"), + content: "GET /users/123 returned 500", status: "warn", tags: []string{"service:api"}, } logB := &mockLogView{ - content: []byte("GET /users/456 returned 500"), + content: "GET /users/456 returned 500", status: "warn", tags: []string{"service:worker"}, } logC := &mockLogView{ - content: []byte("GET /users/124 returned 500"), + content: "GET /users/124 returned 500", status: "warn", tags: []string{"service:api"}, } logD := &mockLogView{ - content: []byte("GET /users/457 returned 500"), + content: "GET /users/457 returned 500", status: "warn", tags: []string{"service:worker"}, } @@ -71,16 +69,13 @@ func TestLogPatternExtractor_DifferentTagGroupsProduceDifferentMetricNames(t *te require.Len(t, resB.Metrics, 1) // Different tag groups → different sub-clusterers → different globalClusterHash → different names. require.NotEqual(t, resA.Metrics[0].Name, resB.Metrics[0].Name) - require.NotEqual(t, resA.Metrics[0].ContextKey, resB.Metrics[0].ContextKey) - - ctxA, ok := e.GetContextByKey(resA.Metrics[0].ContextKey) - require.True(t, ok) - ctxB, ok := e.GetContextByKey(resB.Metrics[0].ContextKey) - require.True(t, ok) + require.NotNil(t, resA.Metrics[0].Context) + require.NotNil(t, resB.Metrics[0].Context) + ctxA := resA.Metrics[0].Context + ctxB := resB.Metrics[0].Context assert.Equal(t, "GET /users/123 returned 500", ctxA.Example) assert.Equal(t, "GET /users/456 returned 500", ctxB.Example) - assert.Equal(t, ctxA.Pattern, ctxB.Pattern) assert.Equal(t, map[string]string{"service": "api"}, ctxA.SplitTags) assert.Equal(t, map[string]string{"service": "worker"}, ctxB.SplitTags) } @@ -89,7 +84,7 @@ func TestLogPatternExtractor_DifferentHostnamesProduceDifferentMetricNamesWhenNo e := NewLogPatternExtractor(DefaultLogPatternExtractorConfig()) e.config.MinClusterSizeBeforeEmit = 1 - msg := []byte("GET /users/123 returned 500") + msg := "GET /users/123 returned 500" tags := []string{"service:api", "env:prod"} logA := &mockLogView{ @@ -110,43 +105,41 @@ func TestLogPatternExtractor_DifferentHostnamesProduceDifferentMetricNamesWhenNo require.Len(t, resA.Metrics, 1) require.Len(t, resB.Metrics, 1) require.NotEqual(t, resA.Metrics[0].Name, resB.Metrics[0].Name) - require.NotEqual(t, resA.Metrics[0].ContextKey, resB.Metrics[0].ContextKey) - - ctxA, ok := e.GetContextByKey(resA.Metrics[0].ContextKey) - require.True(t, ok) - ctxB, ok := e.GetContextByKey(resB.Metrics[0].ContextKey) - require.True(t, ok) - assert.Equal(t, map[string]string{"service": "api", "env": "prod", "host": "host-a"}, ctxA.SplitTags) - assert.Equal(t, map[string]string{"service": "api", "env": "prod", "host": "host-b"}, ctxB.SplitTags) + + require.NotNil(t, resA.Metrics[0].Context) + require.NotNil(t, resB.Metrics[0].Context) + assert.Equal(t, map[string]string{"service": "api", "env": "prod", "host": "host-a"}, resA.Metrics[0].Context.SplitTags) + assert.Equal(t, map[string]string{"service": "api", "env": "prod", "host": "host-b"}, resB.Metrics[0].Context.SplitTags) } -func TestLogPatternExtractor_ResetClearsContext(t *testing.T) { +func TestLogPatternExtractor_ResetClearsClusters(t *testing.T) { e := NewLogPatternExtractor(DefaultLogPatternExtractorConfig()) e.config.MinClusterSizeBeforeEmit = 1 log := &mockLogView{ - content: []byte("GET /users/123 returned 500"), + content: "GET /users/123 returned 500", status: "warn", tags: []string{"service:web"}, } res := e.ProcessLog(log) require.Len(t, res.Metrics, 1) - - _, ok := e.GetContextByKey(res.Metrics[0].ContextKey) - require.True(t, ok) + require.NotNil(t, res.Metrics[0].Context) e.Reset() - _, ok = e.GetContextByKey(res.Metrics[0].ContextKey) - assert.False(t, ok) + // After Reset, the clusterer starts fresh — the same log re-emits a metric + // with fresh context (new cluster, same pattern). + res2 := e.ProcessLog(log) + require.Len(t, res2.Metrics, 1) + require.NotNil(t, res2.Metrics[0].Context) } func TestLogPatternExtractor_SkipsBelowWarnSeverity(t *testing.T) { e := NewLogPatternExtractor(DefaultLogPatternExtractorConfig()) out := e.ProcessLog(&mockLogView{ - content: []byte("INFO: routine request completed"), + content: "INFO: routine request completed", status: "info", tags: []string{"service:api"}, }) @@ -161,7 +154,7 @@ func TestLogPatternExtractor_DeferredEmitUntilMinPatterns(t *testing.T) { for i := range 4 { out := e.ProcessLog(&mockLogView{ - content: []byte(fmt.Sprintf("WARN distinct pattern seed %d not mergeable xyz", i)), + content: fmt.Sprintf("WARN distinct pattern seed %d not mergeable xyz", i), status: status, tags: tags, }) @@ -169,7 +162,7 @@ func TestLogPatternExtractor_DeferredEmitUntilMinPatterns(t *testing.T) { } out := e.ProcessLog(&mockLogView{ - content: []byte("WARN distinct pattern seed 4 not mergeable xyz"), + content: "WARN distinct pattern seed 4 not mergeable xyz", status: status, tags: tags, }) @@ -193,8 +186,8 @@ func TestLogPatternExtractor_GarbageCollectRemovesStaleClusterAndContext(t *test tags := []string{"service:api"} // Distinct messages so the second log does not refresh the first cluster's LastSeenUnix. - msg1 := []byte("WARN distinct pattern seed 700 not mergeable xyz") - msg2 := []byte("WARN distinct pattern seed 701 not mergeable xyz") + msg1 := "WARN distinct pattern seed 700 not mergeable xyz" + msg2 := "WARN distinct pattern seed 701 not mergeable xyz" // t=1000: create cluster A, emit metric and pattern context. const tsMs1 = 1_000_000 // unix sec = 1000 @@ -205,10 +198,9 @@ func TestLogPatternExtractor_GarbageCollectRemovesStaleClusterAndContext(t *test timestampMs: tsMs1, }) require.Len(t, res1.Metrics, 1) - require.Empty(t, res1.EvictedContextKeys, "no GC on first log") - ctxKey1 := res1.Metrics[0].ContextKey - _, ok := e.GetContextByKey(ctxKey1) - require.True(t, ok, "pattern context should exist before GC") + require.Empty(t, res1.EvictedMetricNames, "no GC on first log") + metricName1 := res1.Metrics[0].Name + require.NotNil(t, res1.Metrics[0].Context, "context populated inline") // t=1015: GC runs first (cutoff 1015-10=1005); cluster A last seen 1000 is stale. // Then a new log creates cluster B. @@ -220,15 +212,9 @@ func TestLogPatternExtractor_GarbageCollectRemovesStaleClusterAndContext(t *test timestampMs: tsMs2, }) require.Len(t, res2.Metrics, 1) - require.Equal(t, []string{ctxKey1}, res2.EvictedContextKeys, "GC should report evicted context keys for the engine") - ctxKey2 := res2.Metrics[0].ContextKey - - _, ok = e.GetContextByKey(ctxKey1) - assert.False(t, ok, "stale cluster pattern context should be removed by GC") - _, ok = e.GetContextByKey(ctxKey2) - require.True(t, ok, "active cluster pattern context should remain") + require.Equal(t, []string{metricName1}, res2.EvictedMetricNames, "GC should report evicted metric names for the engine") - require.NotEqual(t, ctxKey1, ctxKey2) + require.NotEqual(t, metricName1, res2.Metrics[0].Name) // Only cluster B should remain in the tagged clusterer. remaining := e.taggedClusterer.GetAllClusters() @@ -245,8 +231,8 @@ func TestLogPatternExtractor_DisableOptimizationsSkipsGarbageCollection(t *testi tags := []string{"service:api"} // Structurally different lines so the pattern clusterer keeps two clusters (unlike // two strings that differ only by a numeric token, which often merge into one template). - msg1 := []byte(`10.143.180.25 - - [27/Aug/2020:00:27:02 +0000] "POST /api/v1/series HTTP/1.1" 202 16`) - msg2 := []byte(`2020-08-27 02:32:42 ERROR (connector.go:34) - Failed to connected to redis`) + msg1 := `10.143.180.25 - - [27/Aug/2020:00:27:02 +0000] "POST /api/v1/series HTTP/1.1" 202 16` + msg2 := `2020-08-27 02:32:42 ERROR (connector.go:34) - Failed to connected to redis` const tsMs1 = 1_000_000 // unix sec = 1000 res1 := e.ProcessLog(&mockLogView{ @@ -256,9 +242,8 @@ func TestLogPatternExtractor_DisableOptimizationsSkipsGarbageCollection(t *testi timestampMs: tsMs1, }) require.Len(t, res1.Metrics, 1) - ctxKey1 := res1.Metrics[0].ContextKey - _, ok := e.GetContextByKey(ctxKey1) - require.True(t, ok) + require.NotNil(t, res1.Metrics[0].Context) + metricName1 := res1.Metrics[0].Name // Same timeline as TestLogPatternExtractor_GarbageCollectRemovesStaleClusterAndContext, where GC // would evict cluster A — but with DisableOptimizations, TTL is off so A stays. @@ -270,22 +255,17 @@ func TestLogPatternExtractor_DisableOptimizationsSkipsGarbageCollection(t *testi timestampMs: tsMs2, }) require.Len(t, res2.Metrics, 1) - require.Empty(t, res2.EvictedContextKeys, "GC must not run when optimizations are disabled") - ctxKey2 := res2.Metrics[0].ContextKey - - _, ok = e.GetContextByKey(ctxKey1) - require.True(t, ok, "first cluster context must remain without GC") - _, ok = e.GetContextByKey(ctxKey2) - require.True(t, ok) + require.Empty(t, res2.EvictedMetricNames, "GC must not run when optimizations are disabled") + require.NotEqual(t, metricName1, res2.Metrics[0].Name, "two distinct clusters") remaining := e.taggedClusterer.GetAllClusters() require.Len(t, remaining, 2, "both clusters should still exist when GC is disabled") } -func TestLogPatternExtractor_GCEvictedContextKeysTwoTagSetsOneCluster(t *testing.T) { - // Two different tag-sets that share the same split dimensions (same sub-clusterer) - // produce two context keys for the same cluster. GC should evict both when the cluster - // becomes stale. Non-split tags (e.g. "version:") differ so the context keys differ. +func TestLogPatternExtractor_GCEvictedMetricNamesTwoTagSetsOneCluster(t *testing.T) { + // Two different tag-sets sharing the same split dimensions produce two series for + // the same cluster. GC evicts one cluster → one metric name is returned. The engine + // then removes both tag-variant series via RemoveSeriesByMetricName. e := NewLogPatternExtractor(DefaultLogPatternExtractorConfig()) e.config.MinClusterSizeBeforeEmit = 1 e.config.ClusterTimeToLiveSec = 10 @@ -295,7 +275,7 @@ func TestLogPatternExtractor_GCEvictedContextKeysTwoTagSetsOneCluster(t *testing // on every ProcessLog call, evicting the cluster before both tags are ingested. e.NextGarbageCollectionTime = 1 << 62 - msg := []byte("WARN disk usage above threshold") + msg := "WARN disk usage above threshold" const tsMs = int64(1_000_000) // unix sec = 1000 // Same pattern, same split-dimension tag group (service:api) → same sub-clusterer. @@ -312,13 +292,13 @@ func TestLogPatternExtractor_GCEvictedContextKeysTwoTagSetsOneCluster(t *testing // Trigger GC: cutoff = 1015-10 = 1005, cluster last seen at 1000 is stale. res := e.ProcessLog(&mockLogView{ - content: []byte("WARN distinct pattern seed 999 not mergeable xyz"), + content: "WARN distinct pattern seed 999 not mergeable xyz", status: "warn", tags: []string{"service:api"}, timestampMs: 1_015_000, }) - assert.Len(t, res.EvictedContextKeys, 2, "two tag variants → two context keys evicted") + assert.Len(t, res.EvictedMetricNames, 1, "one cluster evicted → one metric name regardless of tag variant count") } func TestLogPatternExtractor_NoGCBeforeInterval(t *testing.T) { @@ -327,13 +307,13 @@ func TestLogPatternExtractor_NoGCBeforeInterval(t *testing.T) { e.config.ClusterTimeToLiveSec = 10 e.config.GarbageCollectionIntervalSec = 3600 // far in the future - msg := []byte("WARN connection refused to db host *") + msg := "WARN connection refused to db host *" res1 := e.ProcessLog(&mockLogView{content: msg, status: "warn", tags: nil, timestampMs: 1_000_000}) require.Len(t, res1.Metrics, 1) // GC interval not elapsed: no evictions even though cluster would be stale. res2 := e.ProcessLog(&mockLogView{content: msg, status: "warn", tags: nil, timestampMs: 2_000_000}) - assert.Empty(t, res2.EvictedContextKeys) + assert.Empty(t, res2.EvictedMetricNames) } func TestLogPatternExtractor_LRUCapEvictsAndDropsContext(t *testing.T) { @@ -357,28 +337,21 @@ func TestLogPatternExtractor_LRUCapEvictsAndDropsContext(t *testing.T) { var ctxKeys []string for i, m := range msgs { res := e.ProcessLog(&mockLogView{ - content: []byte(m), + content: m, status: "warn", tags: tags, timestampMs: int64(1_000_000 + i*1_000), // 1s apart so LastSeenUnix differs }) require.Len(t, res.Metrics, 1, "each distinct shape should emit a metric (i=%d)", i) - ctxKeys = append(ctxKeys, res.Metrics[0].ContextKey) + ctxKeys = append(ctxKeys, res.Metrics[0].Name) switch i { case 0, 1: - require.Empty(t, res.EvictedContextKeys, "no eviction below or at cap (i=%d)", i) + require.Empty(t, res.EvictedMetricNames, "no eviction below or at cap (i=%d)", i) case 2: - // Third shape pushes over the cap of 2; the oldest (ctxKeys[0]) is evicted. - require.Equal(t, []string{ctxKeys[0]}, res.EvictedContextKeys, - "oldest cluster's context key surfaced for the engine to drop") - // Confirm the engine-side context entry was actually removed. - _, ok := e.GetContextByKey(ctxKeys[0]) - require.False(t, ok, "evicted cluster's pattern context should be gone") - _, ok = e.GetContextByKey(ctxKeys[1]) - require.True(t, ok, "surviving cluster's context still resolvable") - _, ok = e.GetContextByKey(ctxKeys[2]) - require.True(t, ok, "newly-inserted cluster's context resolvable") + // Third shape pushes over the cap of 2; the oldest cluster's metric is evicted. + require.Equal(t, []string{ctxKeys[0]}, res.EvictedMetricNames, + "oldest cluster's metric name surfaced for the engine to drop") // Pattern_count telemetry must include a -1 decrement for the eviction. var found bool for _, tel := range res.Telemetry { @@ -406,16 +379,16 @@ func TestLogPatternExtractor_TagGroupCapEvictsLRUGroup(t *testing.T) { processAt := func(service string, msg string, tsMs int64) string { res := e.ProcessLog(&mockLogView{ - content: []byte(msg), + content: msg, status: "warn", tags: []string{"service:" + service}, timestampMs: tsMs, }) require.Len(t, res.Metrics, 1) - return res.Metrics[0].ContextKey + return res.Metrics[0].Name } - // Two groups, two contexts. + // Two groups, two metric names. kA := processAt("a", "WARN alpha", 1_000_000) kB := processAt("b", "WARN beta", 1_001_000) @@ -424,17 +397,14 @@ func TestLogPatternExtractor_TagGroupCapEvictsLRUGroup(t *testing.T) { // Adding a third group must evict B's lone cluster. res := e.ProcessLog(&mockLogView{ - content: []byte("WARN gamma"), + content: "WARN gamma", status: "warn", tags: []string{"service:c"}, timestampMs: 1_003_000, }) require.Len(t, res.Metrics, 1) - require.Contains(t, res.EvictedContextKeys, kB, "LRU group's context key surfaced") - _, ok := e.GetContextByKey(kB) - require.False(t, ok, "evicted group's context entry removed") - _, ok = e.GetContextByKey(kA) - require.True(t, ok, "surviving group's context preserved") + require.Contains(t, res.EvictedMetricNames, kB, "LRU group's metric name surfaced for eviction") + assert.NotContains(t, res.EvictedMetricNames, kA, "surviving group's metric name not evicted") } // TestEngine_LogPatternLRUEvictionFreesStorage is the end-to-end proof that @@ -465,7 +435,7 @@ func TestEngine_LogPatternLRUEvictionFreesStorage(t *testing.T) { for i, m := range msgs { e.IngestLog("src", &logObs{ - content: []byte(m), + content: m, status: "warn", tags: tags, timestampMs: int64(1_000_000 + i*1_000), @@ -479,16 +449,12 @@ func TestEngine_LogPatternLRUEvictionFreesStorage(t *testing.T) { require.Equal(t, 2, storage.TotalSeriesCount(""), "LRU eviction must shrink storage; before the fix storage grew unboundedly") - // All surviving contextRefs must resolve to live storage series — i.e. - // nothing dangling on the engine side either. - for key := range e.contextRefs { - ref, ok := storage.seriesIDs[key] - require.True(t, ok, "engine contextRef without storage series for key %q", key) - require.NotNil(t, storage.GetSeriesMeta(ref), - "engine contextRef points at a retired storage ref for key %q", key) + // Verify surviving series all have live context attached (context lives in + // storage now, not in a separate engine map). + for _, meta := range storage.ListSeries(observerdef.SeriesFilter{}) { + ctx := storage.GetContext(meta.Ref) + require.NotNil(t, ctx, "surviving series %q should have context", meta.Name) } - require.Len(t, e.contextRefs, 2, - "engine should keep one contextRef per surviving extractor cluster") } // TestEngine_LogPatternLRUEvictionFreesDetectorState extends @@ -537,7 +503,7 @@ func TestEngine_LogPatternLRUEvictionFreesDetectorState(t *testing.T) { } for i, m := range msgs { e.IngestLog("src", &logObs{ - content: []byte(m), + content: m, status: "warn", tags: tags, timestampMs: int64(1_000_000 + i*1_000), @@ -561,7 +527,7 @@ func TestEngine_LogPatternLRUEvictionFreesDetectorState(t *testing.T) { // oldest cluster out. After the fix, the engine fans the freed refs // out to every detector and the per-series maps shrink accordingly. e.IngestLog("src", &logObs{ - content: []byte("WARN x y z w"), + content: "WARN x y z w", status: "warn", tags: tags, timestampMs: 1_002_000, diff --git a/comp/anomalydetection/observer/impl/log_tagged_pattern_clusterer.go b/comp/anomalydetection/observer/impl/log_tagged_pattern_clusterer.go index a35e788d571a..163268ce35b2 100644 --- a/comp/anomalydetection/observer/impl/log_tagged_pattern_clusterer.go +++ b/comp/anomalydetection/observer/impl/log_tagged_pattern_clusterer.go @@ -7,9 +7,7 @@ package observerimpl import ( "container/heap" - "encoding/binary" "fmt" - "hash/fnv" "strconv" "strings" @@ -49,15 +47,11 @@ func (c TagGroupByKey) AsMap() map[string]string { // tagGroupByKeyHash computes a stable fnv64a hash for a TagGroupByKey. func tagGroupByKeyHash(c TagGroupByKey) uint64 { - h := fnv.New64a() - h.Write([]byte(c.Source)) - h.Write([]byte{'|'}) - h.Write([]byte(c.Service)) - h.Write([]byte{'|'}) - h.Write([]byte(c.Env)) - h.Write([]byte{'|'}) - h.Write([]byte(c.Host)) - return h.Sum64() + h := fnv64aString(c.Source) + h = fnv64aMix(h, c.Service) + h = fnv64aMix(h, c.Env) + h = fnv64aMix(h, c.Host) + return h } // TagGroupByKeyRegistry is a bidirectional, append-only store between a uint64 hash @@ -133,10 +127,9 @@ func tagsForPatternGrouping(tags []string, hostname string) []string { // clusterID) pair. It is used as the variable segment of the metric name so // that each (tag-group × pattern) combination gets a unique, stable name. func globalClusterHash(groupHash uint64, clusterID int64) string { - h := fnv.New64a() - _ = binary.Write(h, binary.LittleEndian, groupHash) - _ = binary.Write(h, binary.LittleEndian, clusterID) - return strconv.FormatUint(h.Sum64(), 16) + h := fnv64aMixUint64(fnvOffsetBasis64, groupHash) + h = fnv64aMixInt64(h, clusterID) + return strconv.FormatUint(h, 16) } // TaggedPatternClusterer wraps one *patterns.PatternClusterer per tag-group diff --git a/comp/anomalydetection/observer/impl/metrics_detector_bocpd.go b/comp/anomalydetection/observer/impl/metrics_detector_bocpd.go index 74782d315fb7..63f236945ae5 100644 --- a/comp/anomalydetection/observer/impl/metrics_detector_bocpd.go +++ b/comp/anomalydetection/observer/impl/metrics_detector_bocpd.go @@ -20,11 +20,8 @@ type bocpdStateKey struct { // bocpdSeriesState holds per-series streaming BOCPD state. type bocpdSeriesState struct { - - // Cursor: how many points we've processed so far (count-based for safety). - lastProcessedTime int64 - lastProcessedCount int - lastWriteGen int64 // storage writeGeneration at last Detect + lastProcessedTime int64 + lastWriteGen int64 // storage writeGeneration at last Detect // Warmup: Welford online mean/variance accumulation. initialized bool @@ -128,10 +125,10 @@ type BOCPDDetector struct { // per-(series, aggregation) state. series map[bocpdStateKey]*bocpdSeriesState - // Cache the discovered series list across Detect calls. Refresh when storage - // reports that new series were added. - cachedSeries []observer.SeriesMeta - cachedGen uint64 + // Cache the discovered series refs across Detect calls. Refresh when storage + // reports that new series were added or removed. + cachedRefs []observer.SeriesRef + cachedGen uint64 } // NewBOCPDDetector creates a streaming BOCPD detector with the given config. @@ -183,21 +180,24 @@ func (b *BOCPDDetector) Name() string { // Detect implements Detector. It discovers series, reads only newly visible // points, and updates per-series BOCPD posterior state incrementally. // -// Correctness takes priority over positional cursoring: storage may insert -// points into existing history, so this detector gates incremental work on -// visible point counts rather than raw slice positions. +// Uses writeGeneration as the sole skip guard: if nothing was written since +// last Detect, skip. Otherwise process all points strictly after lastProcessedTime. func (b *BOCPDDetector) Detect(storage observer.StorageReader, dataTime int64) observer.DetectionResult { gen := storage.SeriesGeneration() - if b.cachedSeries == nil || gen != b.cachedGen { - b.cachedSeries = storage.ListSeries(observer.WorkloadSeriesFilter()) + if b.cachedRefs == nil || gen != b.cachedGen { + metas := storage.ListSeries(observer.WorkloadSeriesFilter()) + b.cachedRefs = make([]observer.SeriesRef, len(metas)) + for i, m := range metas { + b.cachedRefs[i] = m.Ref + } b.cachedGen = gen } var allAnomalies []observer.Anomaly - for _, meta := range b.cachedSeries { + for _, ref := range b.cachedRefs { for _, agg := range b.config.Aggregations { - sk := bocpdStateKey{ref: meta.Ref, agg: agg} + sk := bocpdStateKey{ref: ref, agg: agg} state, exists := b.series[sk] if !exists { @@ -205,25 +205,13 @@ func (b *BOCPDDetector) Detect(storage observer.StorageReader, dataTime int64) o b.series[sk] = state } - visibleCount := storage.PointCountUpTo(meta.Ref, dataTime) - currentGen := storage.WriteGeneration(meta.Ref) - mergeOccurred := visibleCount == state.lastProcessedCount && currentGen != state.lastWriteGen - if visibleCount <= state.lastProcessedCount && !mergeOccurred { + currentGen := storage.WriteGeneration(ref) + if currentGen == state.lastWriteGen { continue } - startTime := state.lastProcessedTime - if mergeOccurred { - startTime = state.lastProcessedTime - 1 - if startTime < 0 { - startTime = 0 - } - } - - pointsSeen := false prevLen := len(allAnomalies) - storage.ForEachPoint(meta.Ref, startTime, dataTime, agg, func(series *observer.Series, p observer.Point) { - pointsSeen = true + storage.ForEachPoint(ref, state.lastProcessedTime, dataTime, agg, func(series *observer.Series, p observer.Point) { anomaly := b.processPoint(state, p, series, agg) if anomaly != nil { allAnomalies = append(allAnomalies, *anomaly) @@ -232,17 +220,9 @@ func (b *BOCPDDetector) Detect(storage observer.StorageReader, dataTime int64) o }) // Set SourceRef on any anomalies produced in this iteration. for k := prevLen; k < len(allAnomalies); k++ { - allAnomalies[k].SourceRef = &observer.QueryHandle{Ref: meta.Ref, Aggregate: agg} - } - - if !pointsSeen && mergeOccurred { - state.lastWriteGen = currentGen - continue - } - if pointsSeen { - state.lastProcessedCount = visibleCount - state.lastWriteGen = currentGen + allAnomalies[k].SourceRef = &observer.QueryHandle{Ref: ref, Aggregate: agg} } + state.lastWriteGen = currentGen } } @@ -252,7 +232,7 @@ func (b *BOCPDDetector) Detect(storage observer.StorageReader, dataTime int64) o // Reset clears all per-series state for replay/reanalysis. func (b *BOCPDDetector) Reset() { b.series = make(map[bocpdStateKey]*bocpdSeriesState) - b.cachedSeries = nil + b.cachedRefs = nil b.cachedGen = 0 } @@ -273,7 +253,7 @@ func (b *BOCPDDetector) RemoveSeries(refs []observer.SeriesRef) { } // Drop the cached series snapshot so the next Detect re-lists from // storage and we don't iterate over removed refs. - b.cachedSeries = nil + b.cachedRefs = nil b.cachedGen = 0 } diff --git a/comp/anomalydetection/observer/impl/metrics_detector_bocpd_test.go b/comp/anomalydetection/observer/impl/metrics_detector_bocpd_test.go index 7d840e61c985..5da21b47833a 100644 --- a/comp/anomalydetection/observer/impl/metrics_detector_bocpd_test.go +++ b/comp/anomalydetection/observer/impl/metrics_detector_bocpd_test.go @@ -339,14 +339,11 @@ func TestFindingH3_CPProbUsesOnlyPriorPredictiveNotSumOverRunLengths(t *testing. func TestFindingM6_BOCPDSkipsSameBucketValueMerges(t *testing.T) { // When two values arrive at the same timestamp, storage merges them into - // one bucket. But PointCountUpTo doesn't change (still 1 bucket), so - // BOCPD's cache check skips the series on the second Detect call. - // - // Steps: - // 1. Add first value at timestamp T, call Detect - // 2. Add second value at same timestamp T (storage merges), call Detect again - // 3. Assert the detector processed the updated merged value - + // one bucket. PointCountUpTo doesn't change (still 1 bucket), but + // writeGeneration does. The detector must notice the write and advance + // lastWriteGen — it will not re-process the merged value (to avoid + // double-counting the posterior), but gen must advance so future writes + // are not missed. config := DefaultBOCPDConfig() config.WarmupPoints = 5 config.Aggregations = []observer.Aggregate{observer.AggregateAverage} @@ -374,16 +371,6 @@ func TestFindingM6_BOCPDSkipsSameBucketValueMerges(t *testing.T) { assert.Equal(t, 150.0, series.Points[0].Value, "storage should have merged the two values at timestamp 5") - // Now Detect again. The detector should process the updated merged value. - // But the bug is: PointCountUpTo still returns 5 (same as before), so the - // detector's lastProcessedCount check causes it to skip this series. - - // To detect whether the detector re-processed, we check its internal state. - // After processing x=100 at t=5, the posterior was updated with x=100. - // After re-processing x=150 (merged average), it should update with x=150. - // But if skipped, the posterior still reflects x=100. - - // Snapshot the writeGeneration the detector saw after first Detect. var stateBefore *bocpdSeriesState for _, s := range d.series { stateBefore = s @@ -401,15 +388,11 @@ func TestFindingM6_BOCPDSkipsSameBucketValueMerges(t *testing.T) { } genAfter := stateAfter.lastWriteGen - pointCount := storage.PointCountUpTo(observer.SeriesRef(0), 5) - t.Logf("genBefore=%d, genAfter=%d, PointCountUpTo=%d, writeGen=%d", - genBefore, genAfter, pointCount, storage.WriteGeneration(observer.SeriesRef(0))) - - // The detector should notice the merge via writeGeneration even though - // PointCountUpTo didn't change. If it re-processed, genAfter > genBefore. + // The detector must advance lastWriteGen when a same-bucket merge changes + // writeGeneration, even though no new points are processed (using the + // lastProcessedTime exclusive cursor avoids double-counting the posterior). assert.Greater(t, genAfter, genBefore, - "detector should re-process when a same-bucket merge changes the value; "+ - "lastWriteGen should advance but didn't (%d == %d)", genBefore, genAfter) + "detector should advance lastWriteGen on same-bucket merge; got %d == %d", genBefore, genAfter) } func TestFindingM7_WarmupPointsOneCausesNaN(t *testing.T) { diff --git a/comp/anomalydetection/observer/impl/metrics_detector_scanmw.go b/comp/anomalydetection/observer/impl/metrics_detector_scanmw.go index d45d9a1c9897..19c85f4821e2 100644 --- a/comp/anomalydetection/observer/impl/metrics_detector_scanmw.go +++ b/comp/anomalydetection/observer/impl/metrics_detector_scanmw.go @@ -24,9 +24,7 @@ type scanmwStateKey struct { // (metrics_detector_bocpd.go). If more scan-based detectors are added, // consider extracting a shared scanSeriesState base. type scanmwSeriesState struct { - // Cursor (same pattern as BOCPD: metrics_detector_bocpd.go:16-22) - lastProcessedCount int - lastWriteGen int64 + lastWriteGen int64 // Segment tracking: only scan [segmentStartTime, dataTime]. // 0 initially (scan full history), advances to changepoint timestamp on fire. @@ -74,9 +72,9 @@ type ScanMWDetector struct { // per-series state keyed by ref+agg series map[scanmwStateKey]*scanmwSeriesState - // Cache the discovered series list across Detect calls. - cachedSeries []observer.SeriesMeta - cachedGen uint64 + // Cache the discovered series refs across Detect calls. + cachedRefs []observer.SeriesRef + cachedGen uint64 } // NewScanMWDetector creates a ScanMW detector with default settings. @@ -103,7 +101,7 @@ func (d *ScanMWDetector) Name() string { // Reset clears all per-series state for replay/reanalysis. func (d *ScanMWDetector) Reset() { d.series = make(map[scanmwStateKey]*scanmwSeriesState) - d.cachedSeries = nil + d.cachedRefs = nil d.cachedGen = 0 } @@ -122,7 +120,7 @@ func (d *ScanMWDetector) RemoveSeries(refs []observer.SeriesRef) { delete(d.series, scanmwStateKey{ref: ref, agg: agg}) } } - d.cachedSeries = nil + d.cachedRefs = nil d.cachedGen = 0 } @@ -136,25 +134,25 @@ func (d *ScanMWDetector) Detect(storage observer.StorageReader, dataTime int64) d.ensureDefaults() gen := storage.SeriesGeneration() - if d.cachedSeries == nil || gen != d.cachedGen { - d.cachedSeries = storage.ListSeries(observer.WorkloadSeriesFilter()) + if d.cachedRefs == nil || gen != d.cachedGen { + metas := storage.ListSeries(observer.WorkloadSeriesFilter()) + d.cachedRefs = make([]observer.SeriesRef, len(metas)) + for i, m := range metas { + d.cachedRefs[i] = m.Ref + } d.cachedGen = gen } // Bulk-fetch point counts and write generations in a single lock acquisition. - refs := make([]observer.SeriesRef, len(d.cachedSeries)) - for i, meta := range d.cachedSeries { - refs[i] = meta.Ref - } - bulkStatus := bulkSeriesStatus(storage, refs, dataTime) + bulkStatus := bulkSeriesStatus(storage, d.cachedRefs, dataTime) var allAnomalies []observer.Anomaly - for i, meta := range d.cachedSeries { + for i, ref := range d.cachedRefs { status := bulkStatus[i] for _, agg := range d.Aggregations { - sk := scanmwStateKey{ref: meta.Ref, agg: agg} + sk := scanmwStateKey{ref: ref, agg: agg} state, exists := d.series[sk] if !exists { @@ -162,20 +160,15 @@ func (d *ScanMWDetector) Detect(storage observer.StorageReader, dataTime int64) d.series[sk] = state } - // Replay optimization: skip unless MinSegment new points are visible. - // The scan needs MinSegment points per side to evaluate a split, so - // fewer new points can't create a new valid split boundary. This cuts - // per-series scans from O(timestamps) to O(timestamps/MinSegment). - // During live ingestion, writeGen changes on every write so this - // condition falls through to the gen check and behaves as before. - if status.pointCount < state.lastProcessedCount+d.MinSegment && status.writeGeneration == state.lastWriteGen { + // Skip if nothing has been written since last Detect. + if status.writeGeneration == state.lastWriteGen { continue } // Collect points into reusable buffer to avoid per-call allocation. state.buf = state.buf[:0] var seriesMeta *observer.Series - storage.ForEachPoint(meta.Ref, state.segmentStartTime, dataTime, agg, func(s *observer.Series, p observer.Point) { + storage.ForEachPoint(ref, state.segmentStartTime, dataTime, agg, func(s *observer.Series, p observer.Point) { if seriesMeta == nil { sCopy := *s seriesMeta = &sCopy @@ -184,19 +177,17 @@ func (d *ScanMWDetector) Detect(storage observer.StorageReader, dataTime int64) }) if seriesMeta == nil || len(state.buf) < d.MinPoints { - state.lastProcessedCount = status.pointCount state.lastWriteGen = status.writeGeneration continue } anomaly, changeIdx, found := d.scanMW(state.buf, seriesMeta, agg) if found { - anomaly.SourceRef = &observer.QueryHandle{Ref: meta.Ref, Aggregate: agg} + anomaly.SourceRef = &observer.QueryHandle{Ref: ref, Aggregate: agg} allAnomalies = append(allAnomalies, anomaly) state.segmentStartTime = state.buf[changeIdx].Timestamp - 1 } - state.lastProcessedCount = status.pointCount state.lastWriteGen = status.writeGeneration } } diff --git a/comp/anomalydetection/observer/impl/metrics_detector_scanmw_test.go b/comp/anomalydetection/observer/impl/metrics_detector_scanmw_test.go index 8e703bba3f2d..a6b1ef3471c9 100644 --- a/comp/anomalydetection/observer/impl/metrics_detector_scanmw_test.go +++ b/comp/anomalydetection/observer/impl/metrics_detector_scanmw_test.go @@ -158,5 +158,5 @@ func TestScanMW_Reset(t *testing.T) { d.Reset() assert.Empty(t, d.series, "reset should clear all state") - assert.Nil(t, d.cachedSeries, "reset should clear cached series") + assert.Nil(t, d.cachedRefs, "reset should clear cached refs") } diff --git a/comp/anomalydetection/observer/impl/metrics_detector_scanwelch.go b/comp/anomalydetection/observer/impl/metrics_detector_scanwelch.go index 58e6631460e0..e1b2d7246759 100644 --- a/comp/anomalydetection/observer/impl/metrics_detector_scanwelch.go +++ b/comp/anomalydetection/observer/impl/metrics_detector_scanwelch.go @@ -21,8 +21,7 @@ type scanwelchStateKey struct { // scanwelchSeriesState holds per-series streaming state for the ScanWelch detector. // Same pattern as scanmwSeriesState and BOCPD (metrics_detector_bocpd.go). type scanwelchSeriesState struct { - lastProcessedCount int - lastWriteGen int64 + lastWriteGen int64 // Segment tracking: only scan [segmentStartTime, dataTime]. segmentStartTime int64 @@ -66,9 +65,9 @@ type ScanWelchDetector struct { // per-series state keyed by ref+agg series map[scanwelchStateKey]*scanwelchSeriesState - // Cache the discovered series list across Detect calls. - cachedSeries []observer.SeriesMeta - cachedGen uint64 + // Cache the discovered series refs across Detect calls. + cachedRefs []observer.SeriesRef + cachedGen uint64 } // NewScanWelchDetector creates a ScanWelch detector with default settings. @@ -96,7 +95,7 @@ func (d *ScanWelchDetector) Name() string { // Reset clears all per-series state for replay/reanalysis. func (d *ScanWelchDetector) Reset() { d.series = make(map[scanwelchStateKey]*scanwelchSeriesState) - d.cachedSeries = nil + d.cachedRefs = nil d.cachedGen = 0 } @@ -115,7 +114,7 @@ func (d *ScanWelchDetector) RemoveSeries(refs []observer.SeriesRef) { delete(d.series, scanwelchStateKey{ref: ref, agg: agg}) } } - d.cachedSeries = nil + d.cachedRefs = nil d.cachedGen = 0 } @@ -125,26 +124,25 @@ func (d *ScanWelchDetector) Detect(storage observer.StorageReader, dataTime int6 d.ensureDefaults() gen := storage.SeriesGeneration() - if d.cachedSeries == nil || gen != d.cachedGen { - d.cachedSeries = storage.ListSeries(observer.WorkloadSeriesFilter()) + if d.cachedRefs == nil || gen != d.cachedGen { + metas := storage.ListSeries(observer.WorkloadSeriesFilter()) + d.cachedRefs = make([]observer.SeriesRef, len(metas)) + for i, m := range metas { + d.cachedRefs[i] = m.Ref + } d.cachedGen = gen } // Bulk-fetch point counts and write generations in a single lock acquisition. - // This avoids 2×len(series) individual RLock/RUnlock calls per Detect() call. - refs := make([]observer.SeriesRef, len(d.cachedSeries)) - for i, meta := range d.cachedSeries { - refs[i] = meta.Ref - } - bulkStatus := bulkSeriesStatus(storage, refs, dataTime) + bulkStatus := bulkSeriesStatus(storage, d.cachedRefs, dataTime) var allAnomalies []observer.Anomaly - for i, meta := range d.cachedSeries { + for i, ref := range d.cachedRefs { status := bulkStatus[i] for _, agg := range d.Aggregations { - sk := scanwelchStateKey{ref: meta.Ref, agg: agg} + sk := scanwelchStateKey{ref: ref, agg: agg} state, exists := d.series[sk] if !exists { @@ -152,13 +150,12 @@ func (d *ScanWelchDetector) Detect(storage observer.StorageReader, dataTime int6 d.series[sk] = state } - // Replay optimization: skip unless MinSegment new points are visible. - // The scan needs MinSegment points per side to evaluate a split, so - // fewer new points can't create a new valid split boundary. This cuts - // per-series scans from O(timestamps) to O(timestamps/MinSegment). - // During live ingestion, writeGen changes on every write so this - // condition falls through to the gen check and behaves as before. - if status.pointCount < state.lastProcessedCount+d.MinSegment && status.writeGeneration == state.lastWriteGen { + // Skip if nothing has been written since last Detect. + // writeGeneration increments on every Add (including same-bucket merges), + // so this is the sole correct guard. A count-based skip is broken under + // time-based point retention because trim+add keeps pointCount stable + // while still changing writeGeneration. + if status.writeGeneration == state.lastWriteGen { continue } @@ -167,7 +164,7 @@ func (d *ScanWelchDetector) Detect(storage observer.StorageReader, dataTime int6 // state.buf which grows once and is reused across scans. state.buf = state.buf[:0] var seriesMeta *observer.Series - storage.ForEachPoint(meta.Ref, state.segmentStartTime, dataTime, agg, func(s *observer.Series, p observer.Point) { + storage.ForEachPoint(ref, state.segmentStartTime, dataTime, agg, func(s *observer.Series, p observer.Point) { if seriesMeta == nil { // Capture series metadata on first point (valid during callback). sCopy := *s @@ -177,19 +174,17 @@ func (d *ScanWelchDetector) Detect(storage observer.StorageReader, dataTime int6 }) if seriesMeta == nil || len(state.buf) < d.MinPoints { - state.lastProcessedCount = status.pointCount state.lastWriteGen = status.writeGeneration continue } anomaly, changeIdx, found := d.scanWelch(state.buf, seriesMeta, agg) if found { - anomaly.SourceRef = &observer.QueryHandle{Ref: meta.Ref, Aggregate: agg} + anomaly.SourceRef = &observer.QueryHandle{Ref: ref, Aggregate: agg} allAnomalies = append(allAnomalies, anomaly) state.segmentStartTime = state.buf[changeIdx].Timestamp - 1 } - state.lastProcessedCount = status.pointCount state.lastWriteGen = status.writeGeneration } } diff --git a/comp/anomalydetection/observer/impl/metrics_detector_scanwelch_test.go b/comp/anomalydetection/observer/impl/metrics_detector_scanwelch_test.go index 57e098252263..a65fefe77c04 100644 --- a/comp/anomalydetection/observer/impl/metrics_detector_scanwelch_test.go +++ b/comp/anomalydetection/observer/impl/metrics_detector_scanwelch_test.go @@ -148,5 +148,5 @@ func TestScanWelch_Reset(t *testing.T) { d.Reset() assert.Empty(t, d.series, "reset should clear all state") - assert.Nil(t, d.cachedSeries, "reset should clear cached series") + assert.Nil(t, d.cachedRefs, "reset should clear cached refs") } diff --git a/comp/anomalydetection/observer/impl/observer.go b/comp/anomalydetection/observer/impl/observer.go index 30eae3bb8766..2cc2a8cb845a 100644 --- a/comp/anomalydetection/observer/impl/observer.go +++ b/comp/anomalydetection/observer/impl/observer.go @@ -103,7 +103,7 @@ func (m *metricObs) GetSampleRate() float64 { // logObs contains copied log data and implements observerdef.LogView. type logObs struct { - content []byte + content string status string tags []string hostname string @@ -113,7 +113,7 @@ type logObs struct { // Ensure logObs implements observerdef.LogView var _ observerdef.LogView = (*logObs)(nil) -func (l *logObs) GetContent() []byte { +func (l *logObs) GetContent() string { return l.content } @@ -172,13 +172,17 @@ func NewComponent(deps Requires) Provides { settings := settingsFromAgentConfig(catalog, cfg) detectors, correlators, extractors, _ := catalog.Instantiate(settings) + storageCfg := storageConfig{ + MaxSeries: cfg.GetInt("observer.storage.max_series"), + EvictionFloorRatio: cfg.GetFloat64("observer.storage.eviction_floor_ratio"), + PointRetentionSecs: int64(cfg.GetInt("observer.storage.point_retention_secs")), + } eng := newEngine(engineConfig{ - storage: newTimeSeriesStorage(), - extractors: extractors, - detectors: detectors, - correlators: correlators, - contextProviders: collectContextProviders(extractors), - scheduler: ¤tBehaviorPolicy{}, + storage: newTimeSeriesStorageWith(storageCfg), + extractors: extractors, + detectors: detectors, + correlators: correlators, + scheduler: ¤tBehaviorPolicy{}, }) // Wire each injected reporter into its own reporterEventSink subscription. @@ -203,6 +207,16 @@ func NewComponent(deps Requires) Provides { hfFilterSources := make(map[metrics.MetricSource]struct{}) + // Wire per-component processing time directly to the telemetry gauge, + // bypassing ObserverTelemetry object construction on the hot path. + processingTimeGauge, ok := th.telemetryGauges[telemetryDetectorProcessingTimeNs] + if !ok { + panic("observer: telemetry gauge not registered: " + telemetryDetectorProcessingTimeNs) + } + eng.onProcessingTime = func(detectorTag string, nanos float64) { + processingTimeGauge.Set(nanos, detectorTag) + } + obs := &observerImpl{ engine: eng, catalog: catalog, @@ -342,7 +356,7 @@ func NewComponent(deps Requires) Provides { "msg": message, }) handle.ObserveLog(&agentLogView{ - content: payload, + content: string(payload), status: strings.ToLower(level.String()), tags: tags, hostname: "", @@ -462,12 +476,11 @@ type seriesDetectorAdapter struct { // bounding per-call cost to O(windowSec) instead of O(totalPoints). windowSec int64 - // cachedSeries / cachedGen mirror the pattern used by BOCPDDetector, - // ScanWelchDetector, and ScanMWDetector: storage.SeriesGeneration() only - // advances when a brand-new series key is created, so we can avoid the - // per-Detect full-map ListSeries scan on steady-state cardinality. - cachedSeries []observerdef.SeriesMeta - cachedGen uint64 + // cachedRefs / cachedGen: only holds SeriesRef values — avoids holding + // the full SeriesMeta (Name, Tags, Namespace) alive in the hot path. + // Refreshed only when SeriesGeneration changes. + cachedRefs []observerdef.SeriesRef + cachedGen uint64 // lastVisibleCount is keyed by the storage's compact SeriesRef so we // avoid rebuilding a string key per series per Detect call. SeriesRefs @@ -492,7 +505,7 @@ func (a *seriesDetectorAdapter) Name() string { // Reset clears adapter-local caches and resets the wrapped detector when supported. func (a *seriesDetectorAdapter) Reset() { a.lastVisibleCount = make(map[observerdef.SeriesRef]int) - a.cachedSeries = nil + a.cachedRefs = nil a.cachedGen = 0 if resetter, ok := a.detector.(interface{ Reset() }); ok { resetter.Reset() @@ -507,7 +520,7 @@ func (a *seriesDetectorAdapter) Reset() { // Concurrency invariant: this method runs on the single observerImpl.run() // goroutine that drives every other adapter callback (Detect, Reset). The // engine's fanOutSeriesRemoval is the only caller. Mutating lastVisibleCount -// and cachedSeries without a lock is safe under that invariant only. +// and cachedRefs without a lock is safe under that invariant only. func (a *seriesDetectorAdapter) RemoveSeries(refs []observerdef.SeriesRef) { if len(refs) == 0 { return @@ -517,7 +530,7 @@ func (a *seriesDetectorAdapter) RemoveSeries(refs []observerdef.SeriesRef) { delete(a.lastVisibleCount, ref) } } - a.cachedSeries = nil + a.cachedRefs = nil a.cachedGen = 0 if remover, ok := a.detector.(observerdef.SeriesRemover); ok { remover.RemoveSeries(refs) @@ -526,27 +539,31 @@ func (a *seriesDetectorAdapter) RemoveSeries(refs []observerdef.SeriesRef) { func (a *seriesDetectorAdapter) Detect(storage observerdef.StorageReader, dataTime int64) observerdef.DetectionResult { gen := storage.SeriesGeneration() - if a.cachedSeries == nil || gen != a.cachedGen { - a.cachedSeries = storage.ListSeries(observerdef.WorkloadSeriesFilter()) + if a.cachedRefs == nil || gen != a.cachedGen { + metas := storage.ListSeries(observerdef.WorkloadSeriesFilter()) + a.cachedRefs = make([]observerdef.SeriesRef, len(metas)) + for i, m := range metas { + a.cachedRefs[i] = m.Ref + } a.cachedGen = gen } var allAnomalies []observerdef.Anomaly var allTelemetry []observerdef.ObserverTelemetry - for _, meta := range a.cachedSeries { - visibleCount := storage.PointCountUpTo(meta.Ref, dataTime) - if prev, ok := a.lastVisibleCount[meta.Ref]; ok && prev == visibleCount { + for _, ref := range a.cachedRefs { + visibleCount := storage.PointCountUpTo(ref, dataTime) + if prev, ok := a.lastVisibleCount[ref]; ok && prev == visibleCount { continue } - a.lastVisibleCount[meta.Ref] = visibleCount + a.lastVisibleCount[ref] = visibleCount for _, agg := range a.aggregations { start := int64(0) if a.windowSec > 0 { start = dataTime - a.windowSec } - series := storage.GetSeriesRange(meta.Ref, start, dataTime, agg) + series := storage.GetSeriesRange(ref, start, dataTime, agg) if series == nil || len(series.Points) == 0 { continue } @@ -565,7 +582,7 @@ func (a *seriesDetectorAdapter) Detect(storage observerdef.StorageReader, dataTi Aggregate: agg, } result.Anomalies[j].SourceRef = &observerdef.QueryHandle{ - Ref: meta.Ref, + Ref: ref, Aggregate: agg, } } @@ -767,8 +784,8 @@ func (o *observerImpl) AddTelemetry(name string, value float64, timestamp int64, // Implements DebugView. func (o *observerImpl) ReplayStoredData() { // resetAnalysisState resets detectors/correlators and tracking state but - // preserves extractor state (contextRefs + provider pattern registry) so - // enrichAnomaly can still attach log pattern context during replay. + // preserves extractor state so enrichAnomaly can still attach log pattern + // context (stored on seriesStats) during replay. o.engine.resetAnalysisState() o.engine.ReplayStoredData() } @@ -787,7 +804,7 @@ func (o *observerImpl) StorageReader() observerdef.StorageReader { func (o *observerImpl) IngestLogSync(source string, msg observerdef.LogView) { timestampMs := msg.GetTimestampUnixMilli() lo := &logObs{ - content: copyBytes(msg.GetContent()), + content: msg.GetContent(), status: msg.GetStatus(), tags: copyTags(msg.GetTags()), hostname: msg.GetHostname(), @@ -892,7 +909,7 @@ func (h *handle) ObserveLog(msg observerdef.LogView) { obs := observation{ source: h.source, log: &logObs{ - content: copyBytes(msg.GetContent()), + content: msg.GetContent(), status: msg.GetStatus(), tags: copyTags(msg.GetTags()), hostname: msg.GetHostname(), @@ -916,7 +933,7 @@ type logView struct { obs *logObs } -func (v *logView) GetContent() []byte { return v.obs.content } +func (v *logView) GetContent() string { return v.obs.content } func (v *logView) GetStatus() string { return v.obs.status } func (v *logView) GetTags() []string { return v.obs.tags } func (v *logView) GetHostname() string { return v.obs.hostname } @@ -925,25 +942,15 @@ func (v *logView) GetTimestampUnixMilli() int64 { return v.obs.timestampMs } // agentLogView is a minimal LogView implementation for agent-internal logs. // It is immediately copied by the observer handle, so it must not be retained. type agentLogView struct { - content []byte + content string status string tags []string hostname string timestampMs int64 } -func (v *agentLogView) GetContent() []byte { return v.content } +func (v *agentLogView) GetContent() string { return v.content } func (v *agentLogView) GetStatus() string { return v.status } func (v *agentLogView) GetTags() []string { return v.tags } func (v *agentLogView) GetHostname() string { return v.hostname } func (v *agentLogView) GetTimestampUnixMilli() int64 { return v.timestampMs } - -// copyBytes creates a copy of a byte slice. -func copyBytes(b []byte) []byte { - if b == nil { - return nil - } - result := make([]byte, len(b)) - copy(result, b) - return result -} diff --git a/comp/anomalydetection/observer/impl/storage.go b/comp/anomalydetection/observer/impl/storage.go index f0266cad16ba..11139a11c738 100644 --- a/comp/anomalydetection/observer/impl/storage.go +++ b/comp/anomalydetection/observer/impl/storage.go @@ -19,24 +19,57 @@ import ( observer "github.com/DataDog/datadog-agent/comp/anomalydetection/observer/def" ) +// storageConfig holds the tunable parameters for timeSeriesStorage. +// All three values have defaults that match the package-level constants. +type storageConfig struct { + // MaxSeries is the cap on live series; when exceeded on an Advance call, + // series are evicted until the count drops to the eviction target. + // 0 disables eviction. + MaxSeries int + + // EvictionFloorRatio controls how far below MaxSeries eviction drains. + // The eviction target is MaxSeries*(1-EvictionFloorRatio). + // e.g. 0.1 → drain to 90% of the cap, creating a 10% hysteresis band. + EvictionFloorRatio float64 + + // PointRetentionSecs is how long data points are kept per series. + // Points older than (latest data timestamp - PointRetentionSecs) are + // trimmed on each Add. 0 disables trimming. + PointRetentionSecs int64 +} + +// defaultStorageConfig returns a storageConfig with the hard-coded defaults. +func defaultStorageConfig() storageConfig { + return storageConfig{ + MaxSeries: storageMaxSeries, + EvictionFloorRatio: storageEvictionBandRatio, + PointRetentionSecs: storagePointRetentionSecs, + } +} + // timeSeriesStorage is an internal storage for time series data. type timeSeriesStorage struct { + cfg storageConfig mu sync.RWMutex - series map[string]*seriesStats + series map[uint64]*seriesStats // keyed by seriesKeyHash; no string retained per entry // observationTimestamps tracks all timestamps where observations occurred, // even if no metric series was written for that timestamp. observationTimestamps map[int64]struct{} // Compact numeric IDs for O(1) lookups and API responses. - seriesIDs map[string]observer.SeriesRef // internal key → numeric ref - seriesIDKeys []string // numeric ID → internal key (index = ID) - seriesIDStats []*seriesStats // numeric ID → *seriesStats (index = ID) + seriesIDStats []*seriesStats // numeric ID → *seriesStats (index = ID) // Global generation for the series catalog; increments only when a new // series key is created, not on every write to an existing series. seriesGen uint64 + // tagIntern maps a fnv64a hash of a series' sorted tag set to the canonical + // []string slice shared by all series with that tag combination, plus a + // reference count. When the count drops to zero on eviction the entry is + // deleted. Protected by s.mu (write lock). + tagIntern map[uint64]*tagInternEntry + // Drop accounting for invalid/unsafe input values. droppedNonFinite int64 droppedExtreme int64 @@ -47,11 +80,21 @@ type timeSeriesStorage struct { // seriesStats contains accumulated statistics for a time series (internal). // Data is stored in columnar layout: parallel arrays indexed by point position. // Timestamps are stored in sorted order, enabling binary search for range queries. +// tagInternEntry is the value stored in timeSeriesStorage.tagIntern. +// tags is the canonical []string shared by all series with the same tag set. +// count is the number of live series currently referencing it. +type tagInternEntry struct { + tags []string + count int +} + type seriesStats struct { - Namespace string - Name string - Tags []string - internalKey string // cached map key to avoid recomputation + Namespace string + Name string + Tags []string + tagsHash uint64 // fnv64a hash of Tags; 0 means not interned + ref observer.SeriesRef // compact numeric ID assigned on creation + context *observer.MetricContext // optional; set by extractors for anomaly enrichment // writeGeneration is per-series and increments on every Add, including // same-bucket merges into an existing point. @@ -170,12 +213,18 @@ func searchAfter(timestamps []int64, value int64) int { }) } -// newTimeSeriesStorage creates a new time series storage. +// newTimeSeriesStorage creates a new time series storage with default config. func newTimeSeriesStorage() *timeSeriesStorage { + return newTimeSeriesStorageWith(defaultStorageConfig()) +} + +// newTimeSeriesStorageWith creates a new time series storage with explicit config. +func newTimeSeriesStorageWith(cfg storageConfig) *timeSeriesStorage { return &timeSeriesStorage{ - series: make(map[string]*seriesStats), + cfg: cfg, + series: make(map[uint64]*seriesStats), observationTimestamps: make(map[int64]struct{}), - seriesIDs: make(map[string]observer.SeriesRef), + tagIntern: make(map[uint64]*tagInternEntry), droppedByMetric: make(map[string]int64), sampledDrops: make(map[string]int), } @@ -190,12 +239,9 @@ func newTimeSeriesStorage() *timeSeriesStorage { type AddResult struct { // IsNew is true if this Add created a brand-new series (cardinality +1). IsNew bool - // StorageKey is the canonical seriesKey for this point. Callers that - // need to index further state by the same key (e.g. engine.contextRefs) - // can reuse this value instead of recomputing seriesKey(...) themselves. - // Empty string is returned when the point is dropped pre-key-compute - // (non-finite or sentinel values). - StorageKey string + // Ref is the SeriesRef assigned to this point's series. -1 when the point + // is dropped (non-finite or sentinel values); callers should guard with Ref >= 0. + Ref observer.SeriesRef } // Add inserts a (namespace, name, value, timestamp, tags) point into storage. @@ -208,33 +254,52 @@ func (s *timeSeriesStorage) Add(namespace, name string, value float64, timestamp if math.IsInf(value, 0) || math.IsNaN(value) { s.recordDroppedValue("non_finite", namespace, name, value, timestamp, tags) - return AddResult{} + return AddResult{Ref: -1} } // Guard against known finite sentinel values (MaxFloat64 used as "unlimited") // that overflow downstream aggregation math when summed. if value == math.MaxFloat64 || value == -math.MaxFloat64 { s.recordDroppedValue("extreme", namespace, name, value, timestamp, tags) - return AddResult{} + return AddResult{Ref: -1} + } + h := seriesKeyHash(namespace, name, tags) + canonTags := canonicalizeTags(tags) + + stats, exists := s.series[h] + // Collision guard: verify full identity (namespace + name + tags). + if exists && (stats.Namespace != namespace || stats.Name != name || !tagsEqual(stats.Tags, canonTags)) { + // Hash collision — extremely rare with FNV-64a (~10^-14 at 1000 series). + log.Printf("[observer] WARN: seriesKeyHash collision h=%d: incumbent={%s,%s} new={%s,%s}", h, stats.Namespace, stats.Name, namespace, name) + exists = false + for _, st := range s.seriesIDStats { + if st != nil && st.Namespace == namespace && st.Name == name && tagsEqual(st.Tags, canonTags) { + stats = st + exists = true + break + } + } } - key := seriesKey(namespace, name, tags) - - stats, exists := s.series[key] if !exists { + // Only intern on new series creation so the ref count tracks exactly + // the number of live series holding the canonical slice. + canonical, th := s.internTags(tags) + id := observer.SeriesRef(len(s.seriesIDStats)) stats = &seriesStats{ - Namespace: namespace, - Name: name, - Tags: canonicalizeTags(tags), - internalKey: key, - } - s.series[key] = stats - // Assign a compact numeric ID. - id := observer.SeriesRef(len(s.seriesIDKeys)) - s.seriesIDs[key] = id - s.seriesIDKeys = append(s.seriesIDKeys, key) + Namespace: namespace, + Name: name, + Tags: canonical, + tagsHash: th, + ref: id, + } + // Only claim the hash slot if empty — avoids displacing an existing + // series on the vanishingly rare hash collision. + if _, occupied := s.series[h]; !occupied { + s.series[h] = stats + } s.seriesIDStats = append(s.seriesIDStats, stats) s.seriesGen++ } - res := AddResult{IsNew: !exists, StorageKey: key} + res := AddResult{IsNew: !exists, Ref: stats.ref} stats.writeGeneration++ // Bucket by second. @@ -263,9 +328,30 @@ func (s *timeSeriesStorage) Add(namespace, name string, value float64, timestamp stats.counts = insertInt64(stats.counts, idx, 1) stats.mins = insertFloat64(stats.mins, idx, value) stats.maxes = insertFloat64(stats.maxes, idx, value) + + if s.cfg.PointRetentionSecs > 0 { + // searchAfter returns first index where ts > value; subtracting 1 makes + // the window inclusive: keep points where ts >= bucket-PointRetentionSecs. + if trim := searchAfter(stats.timestamps, bucket-s.cfg.PointRetentionSecs-1); trim > 0 { + stats.timestamps = trimFront(stats.timestamps, trim) + stats.sums = trimFront(stats.sums, trim) + stats.counts = trimFront(stats.counts, trim) + stats.mins = trimFront(stats.mins, trim) + stats.maxes = trimFront(stats.maxes, trim) + } + } + return res } +// trimFront removes the first n elements from s in-place, preserving the +// backing array to avoid allocation. Used to enforce the point retention window. +func trimFront[T any](s []T, n int) []T { + keep := len(s) - n + copy(s, s[n:]) + return s[:keep] +} + // insertInt64 inserts v at position idx in s, maintaining order. func insertInt64(s []int64, idx int, v int64) []int64 { s = append(s, 0) @@ -318,9 +404,7 @@ func (s *timeSeriesStorage) GetSeries(namespace, name string, tags []string, agg defer s.mu.RUnlock() if tags != nil { - // Exact match with tags - key := seriesKey(namespace, name, tags) - stats := s.series[key] + stats := s.lookupByHash(namespace, name, tags) if stats == nil { return nil } @@ -329,9 +413,8 @@ func (s *timeSeriesStorage) GetSeries(namespace, name string, tags []string, agg } // tags is nil: find first matching series by namespace and name - prefix := namespace + "|" + name + "|" - for key, stats := range s.series { - if len(key) >= len(prefix) && key[:len(prefix)] == prefix { + for _, stats := range s.seriesIDStats { + if stats != nil && stats.Namespace == namespace && stats.Name == name { series := stats.toSeries(agg) return &series } @@ -345,8 +428,7 @@ func (s *timeSeriesStorage) GetSeriesSince(namespace, name string, tags []string s.mu.RLock() defer s.mu.RUnlock() - key := seriesKey(namespace, name, tags) - stats := s.series[key] + stats := s.lookupByHash(namespace, name, tags) if stats == nil { return nil } @@ -383,7 +465,10 @@ func (s *timeSeriesStorage) Namespaces() []string { defer s.mu.RUnlock() seen := make(map[string]struct{}) - for _, stats := range s.series { + for _, stats := range s.seriesIDStats { + if stats == nil { + continue + } seen[stats.Namespace] = struct{}{} } result := make([]string, 0, len(seen)) @@ -400,7 +485,10 @@ func (s *timeSeriesStorage) AllSeries(namespace string, agg Aggregate) []observe defer s.mu.RUnlock() var result []observer.Series - for _, stats := range s.series { + for _, stats := range s.seriesIDStats { + if stats == nil { + continue + } if stats.Namespace == namespace { result = append(result, stats.toSeries(agg)) } @@ -417,7 +505,10 @@ func (s *timeSeriesStorage) TimeBounds() (minTs int64, maxTs int64, ok bool) { var max int64 found := false - for _, stats := range s.series { + for _, stats := range s.seriesIDStats { + if stats == nil { + continue + } n := stats.pointCount() if n == 0 { continue @@ -453,7 +544,10 @@ func (s *timeSeriesStorage) MaxTimestamp() int64 { defer s.mu.RUnlock() var max int64 - for _, stats := range s.series { + for _, stats := range s.seriesIDStats { + if stats == nil { + continue + } if n := stats.pointCount(); n > 0 { if t := stats.timestamps[n-1]; t > max { max = t @@ -539,6 +633,222 @@ func tagsSorted(tags []string) bool { return true } +func joinTags(tags []string) string { + switch len(tags) { + case 0: + return "" + case 1: + return tags[0] + default: + return strings.Join(tags, ",") + } +} + +// fnv64a constants (same as hash/fnv stdlib). +const ( + fnvOffsetBasis64 = uint64(14695981039346656037) + fnvPrime64 = uint64(1099511628211) +) + +// fnv64aString computes FNV-1a over a string without allocating a hasher or +// converting to []byte. Produces identical output to hash/fnv.New64a(). +func fnv64aString(s string) uint64 { + h := fnvOffsetBasis64 + for i := 0; i < len(s); i++ { + h ^= uint64(s[i]) + h *= fnvPrime64 + } + return h +} + +// fnv64aMix folds an additional string into an existing FNV-1a hash, separated +// by '|'. Useful for hashing multiple fields without concatenating them first. +func fnv64aMix(h uint64, s string) uint64 { + h ^= uint64('|') + h *= fnvPrime64 + for i := 0; i < len(s); i++ { + h ^= uint64(s[i]) + h *= fnvPrime64 + } + return h +} + +// fnv64aMixUint64 folds a uint64 value into an existing FNV-1a hash +// (little-endian byte order, matching encoding/binary.LittleEndian). +func fnv64aMixUint64(h, v uint64) uint64 { + for i := 0; i < 8; i++ { + h ^= v & 0xFF + h *= fnvPrime64 + v >>= 8 + } + return h +} + +// fnv64aMixInt64 folds an int64 value into an existing FNV-1a hash. +func fnv64aMixInt64(h uint64, v int64) uint64 { + return fnv64aMixUint64(h, uint64(v)) +} + +// seriesKeyHash computes a FNV-1a hash over namespace, name, and sorted tags +// without allocating a string. The separator layout matches seriesKey so that +// seriesKeyHash(ns, n, tags) == fnv64aString(seriesKey(ns, n, tags)). +func seriesKeyHash(namespace, name string, tags []string) uint64 { + if len(tags) > 1 && !tagsSorted(tags) { + tags = canonicalizeTags(tags) + } + h := fnv64aString(namespace) + h = fnv64aMix(h, name) + h ^= uint64('|') + h *= fnvPrime64 + for i, t := range tags { + if i > 0 { + h ^= uint64(',') + h *= fnvPrime64 + } + for j := 0; j < len(t); j++ { + h ^= uint64(t[j]) + h *= fnvPrime64 + } + } + return h +} + +func tagsEqual(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +// storageMaxSeries is the default cap on the number of live series in storage. +// When exceeded on an Advance call, the oldest series (by last write timestamp) +// are evicted until the count drops to storageEvictionTarget (see below). +// Set to 0 to disable. +const storageMaxSeries = 50_000 + +// storageEvictionBandRatio controls how far below the cap eviction drains. +// When the cap is hit, series are removed until count ≤ storageEvictionTarget. +// A ratio of 0.1 means evict down to 90% of the cap — creating a 10% band +// before the next eviction pass is needed. Larger = rarer but bigger passes. +const storageEvictionBandRatio = 0.1 + +// storageEvictionTarget is the default count eviction drains to when the cap fires. +// Retained for reference; runtime code uses storageConfig.EvictionFloorRatio instead. +const storageEvictionTarget = storageMaxSeries - int(storageMaxSeries*storageEvictionBandRatio) + +// storagePointRetentionSecs is how long data points are retained per series. +// Points older than (latest data timestamp - retention) are trimmed on each Add. +// At 10k logs/sec total ingest, 120s ≈ 60 MB of point data regardless of +// series cardinality. ScanMW needs at least 30 points per segment; 120s +// satisfies that at any realistic per-series rate. Set to 0 to disable. +const storagePointRetentionSecs = 120 + +// tagInternMaxSize caps the number of unique tag-set entries in the intern +// pool. New combinations beyond the cap are used as-is (no sharing, no pool +// growth); hits on already-interned combinations still return the canonical +// slice. Matches the default for dogstatsd_string_interner_size. +const tagInternMaxSize = 4096 + +// hashTags computes a fnv64a hash over sorted tags without constructing the +// joined string. Distinct from seriesKeyHash (which includes namespace+name). +// Returns 0 only for empty input; remaps the rare zero hash to 1 as sentinel. +func hashTags(tags []string) uint64 { + if len(tags) == 0 { + return 0 + } + h := fnvOffsetBasis64 + for i, t := range tags { + if i > 0 { + h ^= uint64(',') + h *= fnvPrime64 + } + for j := 0; j < len(t); j++ { + h ^= uint64(t[j]) + h *= fnvPrime64 + } + } + if h == 0 { + h = 1 + } + return h +} + +// internTags sorts tags (if needed), hashes, and either returns the canonical +// []string from the pool (incrementing its ref count) or inserts a new entry. +// Returns the canonical slice and its hash. Hash 0 means not interned (cap or +// collision). Must be called with s.mu write-locked. +func (s *timeSeriesStorage) internTags(tags []string) ([]string, uint64) { + if len(tags) == 0 { + return nil, 0 + } + sorted := make([]string, len(tags)) + copy(sorted, tags) + if len(sorted) > 1 && !tagsSorted(sorted) { + sort.Strings(sorted) + } + th := hashTags(sorted) + if entry, ok := s.tagIntern[th]; ok { + if tagsEqual(entry.tags, sorted) { + entry.count++ + return entry.tags, th + } + // Hash collision — skip interning. + return sorted, 0 + } + if len(s.tagIntern) >= tagInternMaxSize { + return sorted, 0 + } + entry := &tagInternEntry{tags: sorted, count: 1} + s.tagIntern[th] = entry + return sorted, th +} + +// releaseTagIntern decrements the ref count for the intern entry at th and +// deletes it when count reaches zero. No-op when th is 0. Must be called with +// s.mu write-locked. +func (s *timeSeriesStorage) releaseTagIntern(th uint64) { + if th == 0 { + return + } + if entry, ok := s.tagIntern[th]; ok { + entry.count-- + if entry.count == 0 { + delete(s.tagIntern, th) + } + } +} + +// TagInternedCount returns the number of unique tag-set entries in the intern +// pool. Useful for telemetry and tests. +func (s *timeSeriesStorage) TagInternedCount() int { + s.mu.RLock() + defer s.mu.RUnlock() + return len(s.tagIntern) +} + +// lookupByHash finds a series by hash with identity verification. +// Returns nil if not found. Caller must hold s.mu (read or write). +func (s *timeSeriesStorage) lookupByHash(namespace, name string, tags []string) *seriesStats { + canonTags := canonicalizeTags(tags) + h := seriesKeyHash(namespace, name, tags) + stats := s.series[h] + if stats != nil && stats.Namespace == namespace && stats.Name == name && tagsEqual(stats.Tags, canonTags) { + return stats + } + // Hash miss or collision: linear scan fallback. + for _, st := range s.seriesIDStats { + if st != nil && st.Namespace == namespace && st.Name == name && tagsEqual(st.Tags, canonTags) { + return st + } + } + return nil +} + // resolveByID returns the seriesStats for a numeric series ID. // Returns nil for out-of-range IDs. Caller must hold s.mu (read or write). func (s *timeSeriesStorage) resolveByID(ref observer.SeriesRef) *seriesStats { @@ -565,6 +875,136 @@ func (s *timeSeriesStorage) GetSeriesMeta(ref observer.SeriesRef) *observer.Seri } } +// EvictToCapacity evicts the oldest series (by last write timestamp) when the +// live series count exceeds cap, draining down to target. The band between the +// two thresholds prevents a fan-out on every Advance when the count hovers +// near the cap. Returns the freed SeriesRefs for detector state cleanup. +// The engine calls EvictDefault() in production; this method is exposed for tests +// that need to exercise eviction with explicit limits. +func (s *timeSeriesStorage) EvictToCapacity(seriesLimit, target int) []observer.SeriesRef { + if seriesLimit <= 0 { + return nil + } + s.mu.Lock() + defer s.mu.Unlock() + + // Count first — common case is under limit, skip the allocation entirely. + count := 0 + for _, st := range s.seriesIDStats { + if st != nil { + count++ + } + } + if count <= seriesLimit { + return nil + } + + type entry struct { + ref observer.SeriesRef + lastTs int64 + } + candidates := make([]entry, 0, count) + for _, st := range s.seriesIDStats { + if st == nil { + continue + } + lastTs := int64(0) + if n := len(st.timestamps); n > 0 { + lastTs = st.timestamps[n-1] + } + candidates = append(candidates, entry{ref: st.ref, lastTs: lastTs}) + } + + excess := count - target + if excess <= 0 { + return nil + } + + sort.Slice(candidates, func(i, j int) bool { + return candidates[i].lastTs < candidates[j].lastTs + }) + + var freed []observer.SeriesRef + for i := 0; i < excess; i++ { + st := s.resolveByID(candidates[i].ref) + if st == nil { + continue + } + s.releaseTagIntern(st.tagsHash) + h := seriesKeyHash(st.Namespace, st.Name, st.Tags) + if s.series[h] == st { + delete(s.series, h) + } + s.seriesIDStats[candidates[i].ref] = nil + freed = append(freed, candidates[i].ref) + } + if len(freed) > 0 { + s.seriesGen++ + } + return freed +} + +// EvictDefault evicts to capacity using the storage's own config. +// The eviction target is cfg.MaxSeries*(1-cfg.EvictionFloorRatio). +func (s *timeSeriesStorage) EvictDefault() []observer.SeriesRef { + if s.cfg.MaxSeries <= 0 { + return nil + } + target := s.cfg.MaxSeries - int(float64(s.cfg.MaxSeries)*s.cfg.EvictionFloorRatio) + return s.EvictToCapacity(s.cfg.MaxSeries, target) +} + +// SetContext attaches enrichment context to a series. Called by the engine +// after storage.Add whenever a MetricOutput carries a non-nil Context. +func (s *timeSeriesStorage) SetContext(ref observer.SeriesRef, ctx *observer.MetricContext) { + s.mu.Lock() + defer s.mu.Unlock() + if ss := s.resolveByID(ref); ss != nil { + ss.context = ctx + } +} + +// GetContext returns the enrichment context for a series, or nil if none was set. +func (s *timeSeriesStorage) GetContext(ref observer.SeriesRef) *observer.MetricContext { + s.mu.RLock() + defer s.mu.RUnlock() + if ss := s.resolveByID(ref); ss != nil { + return ss.context + } + return nil +} + +// RemoveSeriesByMetricName removes all series in the given namespace with the +// given metric name (all tag variants). Used by the engine when an extractor +// signals that a pattern has been evicted. Returns the freed SeriesRefs. +func (s *timeSeriesStorage) RemoveSeriesByMetricName(namespace, name string) []observer.SeriesRef { + s.mu.Lock() + defer s.mu.Unlock() + var refs []observer.SeriesRef + for _, st := range s.seriesIDStats { + if st != nil && st.Namespace == namespace && st.Name == name { + refs = append(refs, st.ref) + } + } + if len(refs) == 0 { + return nil + } + for _, ref := range refs { + st := s.resolveByID(ref) + if st == nil { + continue + } + s.releaseTagIntern(st.tagsHash) + h := seriesKeyHash(st.Namespace, st.Name, st.Tags) + if s.series[h] == st { + delete(s.series, h) + } + s.seriesIDStats[ref] = nil + } + s.seriesGen++ + return refs +} + // seriesMeta is lightweight series metadata including point count, // used for API listing without materializing point data. type seriesMeta struct { @@ -582,10 +1022,13 @@ func (s *timeSeriesStorage) ListSeriesMetadata(namespace string) []seriesMeta { defer s.mu.RUnlock() var result []seriesMeta - for key, stats := range s.series { + for _, stats := range s.seriesIDStats { + if stats == nil { + continue + } if stats.Namespace == namespace { result = append(result, seriesMeta{ - Ref: s.seriesIDs[key], + Ref: stats.ref, Namespace: stats.Namespace, Name: stats.Name, Tags: copyTags(stats.Tags), @@ -611,11 +1054,7 @@ func (s *timeSeriesStorage) GetSeriesByNumericID(ref observer.SeriesRef, agg Agg s.mu.RLock() defer s.mu.RUnlock() - if ref < 0 || int(ref) >= len(s.seriesIDKeys) { - return nil - } - key := s.seriesIDKeys[ref] - stats := s.series[key] + stats := s.resolveByID(ref) if stats == nil { return nil } @@ -628,8 +1067,11 @@ func (s *timeSeriesStorage) ListAllSeriesCompact() []seriesCompact { s.mu.RLock() defer s.mu.RUnlock() - result := make([]seriesCompact, 0, len(s.series)) - for _, st := range s.series { + result := make([]seriesCompact, 0, len(s.seriesIDStats)) + for _, st := range s.seriesIDStats { + if st == nil { + continue + } result = append(result, seriesCompact{ Namespace: st.Namespace, Name: st.Name, @@ -659,7 +1101,10 @@ func (s *timeSeriesStorage) DumpToFile(path string) error { } var out []dumpSeries - for _, st := range s.series { + for _, st := range s.seriesIDStats { + if st == nil { + continue + } ds := dumpSeries{ Namespace: st.Namespace, Name: st.Name, @@ -691,7 +1136,10 @@ func (s *timeSeriesStorage) DataTimestamps() []int64 { defer s.mu.RUnlock() seen := make(map[int64]struct{}) - for _, stats := range s.series { + for _, stats := range s.seriesIDStats { + if stats == nil { + continue + } for _, ts := range stats.timestamps { seen[ts] = struct{}{} } @@ -711,7 +1159,7 @@ func (s *timeSeriesStorage) DataTimestamps() []int64 { // SeriesGeneration returns a counter that increments whenever the series // catalog changes — either when a new series key is created or when an -// existing key is removed via RemoveSeriesByKeys. Callers can use this to +// existing key is removed via RemoveSeriesByRefs. Callers can use this to // safely cache ListSeries results. func (s *timeSeriesStorage) SeriesGeneration() uint64 { s.mu.RLock() @@ -719,53 +1167,37 @@ func (s *timeSeriesStorage) SeriesGeneration() uint64 { return s.seriesGen } -// RemoveSeriesByKeys deletes the listed internal series keys (as produced by -// seriesKey). The compact numeric SeriesRef IDs assigned to each removed -// series are retired but NEVER reused: the slot in seriesIDStats is set to -// nil so any stale SeriesRef resolves to nil via resolveByID, and the slot -// in seriesIDKeys is set to "" — the slice length is preserved so subsequent -// index lookups remain bounds-safe, but the original key string is no longer -// referenced and can be garbage-collected. GetSeriesByNumericID's nil-stats -// guard handles the empty-string lookup safely (s.series[""] is always nil). -// Returns the SeriesRefs that were actually freed (one per successful removal, -// in input order; unknown keys are silently skipped). seriesGen is bumped iff -// at least one series was removed so cached ListSeries results are invalidated. +// RemoveSeriesByRefs deletes series by their compact numeric SeriesRef IDs. +// The slot in seriesIDStats is set to nil so any stale SeriesRef resolves to +// nil via resolveByID. The series is also removed from the hash map unless it +// was collision-displaced (another series owns the hash slot). +// Returns the SeriesRefs that were actually freed (unknown refs are skipped). +// seriesGen is bumped iff at least one series was removed. // // Callers use the returned refs to fan out per-series teardown to detector // state that's keyed by SeriesRef (BOCPD, ScanMW, ScanWelch posterior maps, -// seriesDetectorAdapter.lastVisibleCount, etc.). Without that fan-out, those -// maps grow with the cumulative number of series ever observed even though -// storage shrinks — defeating the LRU caps put on the upstream extractors. -// -// This is the storage-side counterpart to engine.removeContextRefsForEvictedKeys: -// the engine's contextRefs index keeps track of which storage key was created -// for which extractor context key, so when an extractor evicts a context the -// engine can pass the corresponding storage keys here to free their tags + -// columnar arrays. Without this path, evicted patterns leak indefinitely. -func (s *timeSeriesStorage) RemoveSeriesByKeys(keys []string) []observer.SeriesRef { - if len(keys) == 0 { +// seriesDetectorAdapter.lastVisibleCount, etc.). +func (s *timeSeriesStorage) RemoveSeriesByRefs(refs []observer.SeriesRef) []observer.SeriesRef { + if len(refs) == 0 { return nil } s.mu.Lock() defer s.mu.Unlock() var removed []observer.SeriesRef - for _, key := range keys { - if _, exists := s.series[key]; !exists { + for _, ref := range refs { + stats := s.resolveByID(ref) + if stats == nil { continue } - delete(s.series, key) - if id, ok := s.seriesIDs[key]; ok { - if int(id) < len(s.seriesIDStats) { - s.seriesIDStats[id] = nil - } - if int(id) < len(s.seriesIDKeys) { - // Free the key string so it can be GC'd; keep the slot so - // seriesIDKeys remains addressable for stale-ref reads. - s.seriesIDKeys[id] = "" - } - delete(s.seriesIDs, key) - removed = append(removed, id) + // Release tag intern entry before dropping the series. + s.releaseTagIntern(stats.tagsHash) + // Remove from hash map only if this series owns the slot. + h := seriesKeyHash(stats.Namespace, stats.Name, stats.Tags) + if s.series[h] == stats { + delete(s.series, h) } + s.seriesIDStats[ref] = nil + removed = append(removed, ref) } if len(removed) > 0 { s.seriesGen++ @@ -795,12 +1227,12 @@ func (s *timeSeriesStorage) CompactSeriesID(fullKey string) string { aggStr = nameWithAgg[idx+1:] } - // Look up the storage key (without agg suffix). - storageKey := seriesKey(namespace, baseName, tags) - numID, found := s.seriesIDs[storageKey] - if !found { + // Look up by hash (without agg suffix). + stats := s.lookupByHash(namespace, baseName, tags) + if stats == nil { return fullKey } + numID := stats.ref if aggStr != "" { return fmt.Sprintf("%d:%s", numID, aggStr) @@ -815,14 +1247,17 @@ func (s *timeSeriesStorage) ListSeries(filter observer.SeriesFilter) []observer. s.mu.RLock() defer s.mu.RUnlock() - // Preallocate to len(s.series): an upper bound under the lock that lets + // Preallocate to len(s.seriesIDStats): an upper bound under the lock that lets // us avoid repeated growslice in the common case where the filter matches // most series. Detectors and the adapter call this on every advance, so // even after the cache-by-gen optimisations the worst-case cost matters // when seriesGen does churn (e.g. cardinality blow-ups in extractors). - result := make([]observer.SeriesMeta, 0, len(s.series)) + result := make([]observer.SeriesMeta, 0, len(s.seriesIDStats)) listSeriesLoop: - for key, stats := range s.series { + for _, stats := range s.seriesIDStats { + if stats == nil { + continue + } if filter.Namespace != "" { if stats.Namespace != filter.Namespace { continue @@ -841,7 +1276,7 @@ listSeriesLoop: continue } result = append(result, observer.SeriesMeta{ - Ref: s.seriesIDs[key], + Ref: stats.ref, Namespace: stats.Namespace, Name: stats.Name, Tags: stats.Tags, @@ -868,7 +1303,10 @@ func (s *timeSeriesStorage) TotalSampleCount(excludeNamespace string) int64 { s.mu.RLock() defer s.mu.RUnlock() total := int64(0) - for _, stats := range s.series { + for _, stats := range s.seriesIDStats { + if stats == nil { + continue + } if excludeNamespace != "" && stats.Namespace == excludeNamespace { continue } @@ -883,7 +1321,10 @@ func (s *timeSeriesStorage) TotalSeriesCount(excludeNamespace string) int { s.mu.RLock() defer s.mu.RUnlock() total := 0 - for _, stats := range s.series { + for _, stats := range s.seriesIDStats { + if stats == nil { + continue + } if excludeNamespace != "" && stats.Namespace == excludeNamespace { continue } diff --git a/comp/anomalydetection/observer/impl/storage_test.go b/comp/anomalydetection/observer/impl/storage_test.go index 6f79b6b9f27a..dff77970dd89 100644 --- a/comp/anomalydetection/observer/impl/storage_test.go +++ b/comp/anomalydetection/observer/impl/storage_test.go @@ -6,9 +6,12 @@ package observerimpl import ( + "fmt" + "hash/fnv" "math" "sync" "testing" + "unsafe" observer "github.com/DataDog/datadog-agent/comp/anomalydetection/observer/def" "github.com/stretchr/testify/assert" @@ -561,6 +564,57 @@ func TestTimeBoundsSkipsNonPositivePrefixOnly(t *testing.T) { assert.Equal(t, int64(20), maxTs) } +func TestEvictToCapacity_EviictsOldestToTarget(t *testing.T) { + const cap, target = 20, 15 + s := newTimeSeriesStorage() + n := cap + 5 + // Add n series with distinct last timestamps: series i gets timestamp i+1. + for i := 0; i < n; i++ { + s.Add("ns", fmt.Sprintf("metric.%d", i), 1.0, int64(i+1), nil) + } + require.Equal(t, n, s.TotalSeriesCount("")) + + freed := s.EvictToCapacity(cap, target) + + // Should drain to target, not just cap. + want := n - target + assert.Len(t, freed, want) + assert.Equal(t, target, s.TotalSeriesCount("")) + + // The freed series must be the oldest (lowest timestamps). + for i := 0; i < want; i++ { + assert.Nil(t, s.GetSeries("ns", fmt.Sprintf("metric.%d", i), nil, AggregateAverage), + "series %d (oldest) should have been evicted", i) + } + for i := want; i < n; i++ { + assert.NotNil(t, s.GetSeries("ns", fmt.Sprintf("metric.%d", i), nil, AggregateAverage), + "series %d (newest) should survive", i) + } +} + +func TestPointRetentionTruncatesOldAndPreservesOrder(t *testing.T) { + s := newTimeSeriesStorage() + // Add points spanning twice the retention window so trimming definitely fires. + total := storagePointRetentionSecs * 2 + for i := 0; i < total; i++ { + s.Add("ns", "metric", float64(i), int64(i+1), nil) + } + + series := s.GetSeries("ns", "metric", nil, AggregateSum) + require.NotNil(t, series) + + // Only points within the retention window should be kept. + // Window is [latest-retention, latest] inclusive, so at most retention+1 points. + assert.LessOrEqual(t, len(series.Points), storagePointRetentionSecs+1, + "points older than retention window should be dropped") + + // Timestamps must remain sorted (binary search invariant). + for i := 1; i < len(series.Points); i++ { + assert.Greater(t, series.Points[i].Timestamp, series.Points[i-1].Timestamp, + "timestamps must be strictly increasing after trimming") + } +} + func TestTimeSeriesStorage_ListSeries_ExcludeNamespaces(t *testing.T) { s := newTimeSeriesStorage() s.Add(observer.TelemetryNamespace, "internal.gauge", 1, 1000, nil) @@ -578,63 +632,46 @@ func TestTimeSeriesStorage_ListSeries_ExcludeNamespaces(t *testing.T) { assert.Equal(t, observer.TelemetryNamespace, onlyTel[0].Namespace) } -func TestTimeSeriesStorage_RemoveSeriesByKeys(t *testing.T) { +func TestTimeSeriesStorage_RemoveSeriesByRefs(t *testing.T) { s := newTimeSeriesStorage() - s.Add("ns", "a", 1.0, 1000, []string{"k:1"}) - s.Add("ns", "b", 2.0, 1000, []string{"k:2"}) - s.Add("ns", "c", 3.0, 1000, []string{"k:3"}) + resA := s.Add("ns", "a", 1.0, 1000, []string{"k:1"}) + resB := s.Add("ns", "b", 2.0, 1000, []string{"k:2"}) + resC := s.Add("ns", "c", 3.0, 1000, []string{"k:3"}) require.Equal(t, 3, s.TotalSeriesCount("")) genBefore := s.SeriesGeneration() - keyB := seriesKey("ns", "b", []string{"k:2"}) - keyC := seriesKey("ns", "c", []string{"k:3"}) - refB := s.seriesIDs[keyB] - refC := s.seriesIDs[keyC] + refA, refB, refC := resA.Ref, resB.Ref, resC.Ref - removed := s.RemoveSeriesByKeys([]string{keyB, keyC, "nonexistent"}) - require.Len(t, removed, 2, "unknown keys are silently ignored") + removed := s.RemoveSeriesByRefs([]observer.SeriesRef{refB, refC, observer.SeriesRef(999)}) + require.Len(t, removed, 2, "unknown refs are silently ignored") require.ElementsMatch(t, []observer.SeriesRef{refB, refC}, removed, "freed refs are returned for fan-out to detectors") require.Equal(t, 1, s.TotalSeriesCount(""), "only series 'a' should remain") require.Greater(t, s.SeriesGeneration(), genBefore, "seriesGen bumps on removal") require.Nil(t, s.GetSeriesMeta(refB), "removed ref resolves to nil") require.Nil(t, s.GetSeriesMeta(refC), "removed ref resolves to nil") - - refA := s.seriesIDs[seriesKey("ns", "a", []string{"k:1"})] require.NotNil(t, s.GetSeriesMeta(refA), "surviving series still resolvable") - // Evicted slots in seriesIDKeys are cleared to "" so the original key - // string can be GC'd. Slot is kept in place (slice length unchanged) so - // subsequent stale-ref index lookups stay bounds-safe. - require.Equal(t, "", s.seriesIDKeys[refB], "evicted seriesIDKeys slot must be empty so the key string can be GC'd") - require.Equal(t, "", s.seriesIDKeys[refC], "evicted seriesIDKeys slot must be empty so the key string can be GC'd") - require.NotEqual(t, "", s.seriesIDKeys[refA], "surviving seriesIDKeys slot must be intact") - - // A subsequent Add for the same key creates a fresh series with a new ref. + // A subsequent Add for the same series creates a fresh ref. s.Add("ns", "b", 99.0, 1100, []string{"k:2"}) require.Equal(t, 2, s.TotalSeriesCount(""), "re-add re-creates the series") - newRefB := s.seriesIDs[keyB] + newRefB := s.Add("ns", "b", 1.0, 1200, []string{"k:2"}).Ref require.NotEqual(t, refB, newRefB, "new ref minted; old ref id is retired") require.Nil(t, s.GetSeriesMeta(refB), "old ref still resolves to nil after re-add") } -func TestTimeSeriesStorage_RemoveSeriesByKeysEmptyOrUnknown(t *testing.T) { +func TestTimeSeriesStorage_RemoveSeriesByRefsEmptyOrUnknown(t *testing.T) { s := newTimeSeriesStorage() s.Add("ns", "a", 1.0, 1000, nil) genBefore := s.SeriesGeneration() - require.Empty(t, s.RemoveSeriesByKeys(nil)) - require.Empty(t, s.RemoveSeriesByKeys([]string{})) - require.Empty(t, s.RemoveSeriesByKeys([]string{"unknown1", "unknown2"})) + require.Empty(t, s.RemoveSeriesByRefs(nil)) + require.Empty(t, s.RemoveSeriesByRefs([]observer.SeriesRef{})) + require.Empty(t, s.RemoveSeriesByRefs([]observer.SeriesRef{999, 1000})) require.Equal(t, genBefore, s.SeriesGeneration(), "no removal → no gen bump") } -func TestTimeSeriesStorage_AddReturnsCanonicalKey(t *testing.T) { - // Add returns the same string seriesKey would compute from the same - // inputs, including under tag canonicalization. Callers (e.g. the engine - // populating contextRefs) rely on this so they can skip a second - // seriesKey call. If this contract drifts, the optimisation silently - // produces wrong-keyed entries. +func TestTimeSeriesStorage_AddReturnsRef(t *testing.T) { s := newTimeSeriesStorage() cases := []struct { @@ -654,32 +691,185 @@ func TestTimeSeriesStorage_AddReturnsCanonicalKey(t *testing.T) { t.Run(tc.name, func(t *testing.T) { res := s.Add(tc.namespace, tc.metric, 1.0, 1000, tc.tags) assert.True(t, res.IsNew, "first write should report IsNew=true") - wantKey := seriesKey(tc.namespace, tc.metric, tc.tags) - assert.Equal(t, wantKey, res.StorageKey, "Add must return seriesKey-equivalent storage key") + assert.GreaterOrEqual(t, int(res.Ref), 0, "Add must return a valid SeriesRef") - // Second write of the same series returns the same key and IsNew=false. + // Second write of the same series returns the same ref and IsNew=false. res2 := s.Add(tc.namespace, tc.metric, 2.0, 1001, tc.tags) assert.False(t, res2.IsNew, "second write should report IsNew=false") - assert.Equal(t, wantKey, res2.StorageKey, "Add must return the same key on subsequent writes") + assert.Equal(t, res.Ref, res2.Ref, "Add must return the same ref on subsequent writes") }) } } -func TestTimeSeriesStorage_AddDroppedReturnsEmptyKey(t *testing.T) { - // Pre-key-compute drops (non-finite, sentinel values) return empty key. - // Callers must check StorageKey != "" before reusing it for downstream - // state (e.g. contextRefs). +func TestTimeSeriesStorage_AddDroppedReturnsInvalidRef(t *testing.T) { + // Dropped points (non-finite, sentinel values) return Ref == -1. + // Callers must check Ref >= 0 before using it for downstream state. s := newTimeSeriesStorage() res := s.Add("ns", "m", math.NaN(), 1000, nil) assert.False(t, res.IsNew) - assert.Empty(t, res.StorageKey, "NaN drop must return empty key") + assert.Less(t, int(res.Ref), 0, "NaN drop must return invalid ref") res = s.Add("ns", "m", math.Inf(1), 1000, nil) assert.False(t, res.IsNew) - assert.Empty(t, res.StorageKey, "+Inf drop must return empty key") + assert.Less(t, int(res.Ref), 0, "+Inf drop must return invalid ref") res = s.Add("ns", "m", math.MaxFloat64, 1000, nil) assert.False(t, res.IsNew) - assert.Empty(t, res.StorageKey, "MaxFloat64 sentinel drop must return empty key") + assert.Less(t, int(res.Ref), 0, "MaxFloat64 sentinel drop must return invalid ref") +} + +// TestFnv64aString_MatchesStdlib verifies the inline fnv64a helpers produce +// identical output to the stdlib hash/fnv implementation. +func TestFnv64aString_MatchesStdlib(t *testing.T) { + inputs := []string{ + "", + "hello", + "GET /api/v1/users completed in 45ms", + `{"duration_ms":45,"status":200,"foo":"bar"}`, + "source|service|env|host", + } + for _, s := range inputs { + h := fnv.New64a() + h.Write([]byte(s)) + assert.Equal(t, h.Sum64(), fnv64aString(s), "mismatch for input: %q", s) + } +} + +func TestFnv64aMix_MatchesStdlib(t *testing.T) { + // Verify that fnv64aString(a) + fnv64aMix(h, b) matches hashing "a|b" as + // a single contiguous stream through the stdlib hasher. + a, b := "source", "service" + h := fnv.New64a() + h.Write([]byte(a)) + h.Write([]byte{'|'}) + h.Write([]byte(b)) + + got := fnv64aMix(fnv64aString(a), b) + assert.Equal(t, h.Sum64(), got) +} + +func TestFnv64aMixUint64_MatchesStdlib(t *testing.T) { + // Verify fnv64aMixUint64 matches binary.Write(h, LittleEndian, v). + h := fnv.New64a() + var buf [8]byte + v := uint64(0xDEADBEEFCAFEBABE) + buf[0] = byte(v) + buf[1] = byte(v >> 8) + buf[2] = byte(v >> 16) + buf[3] = byte(v >> 24) + buf[4] = byte(v >> 32) + buf[5] = byte(v >> 40) + buf[6] = byte(v >> 48) + buf[7] = byte(v >> 56) + h.Write(buf[:]) + + got := fnv64aMixUint64(fnvOffsetBasis64, v) + assert.Equal(t, h.Sum64(), got) +} + +func TestTimeSeriesStorage_TagIntern_PoolGrows(t *testing.T) { + s := newTimeSeriesStorage() + assert.Equal(t, 0, s.TagInternedCount()) + + s.Add("ns", "m1", 1.0, 1000, []string{"env:prod", "host:a"}) + assert.Equal(t, 1, s.TagInternedCount(), "one combination interned") + + s.Add("ns", "m2", 1.0, 1000, []string{"env:prod", "host:b"}) + assert.Equal(t, 2, s.TagInternedCount(), "second distinct combination interned") + + // Same combination as m1 — pool must not grow. + s.Add("ns2", "m1", 1.0, 1000, []string{"env:prod", "host:a"}) + assert.Equal(t, 2, s.TagInternedCount(), "repeated combination must not grow pool") +} + +func TestTimeSeriesStorage_TagIntern_SharedSlice(t *testing.T) { + // Verify that two series with the same tag combination share the same + // []string backing array. Inspects seriesStats.Tags directly via resolveByID. + s := newTimeSeriesStorage() + + // Build tags at runtime to defeat Go's compile-time string interning. + tags := []string{"env:" + string([]byte{'p', 'r', 'o', 'd'}), "host:a"} + + res1 := s.Add("ns", "m1", 1.0, 1000, tags) + res2 := s.Add("ns", "m2", 1.0, 1000, tags) + + s.mu.RLock() + stats1 := s.resolveByID(res1.Ref) + stats2 := s.resolveByID(res2.Ref) + s.mu.RUnlock() + + require.NotNil(t, stats1) + require.NotNil(t, stats2) + + ptr1 := uintptr(unsafe.Pointer(unsafe.SliceData(stats1.Tags))) + ptr2 := uintptr(unsafe.Pointer(unsafe.SliceData(stats2.Tags))) + assert.Equal(t, ptr1, ptr2, "series with identical tag sets must share the same []string backing array") + assert.Equal(t, 1, s.TagInternedCount()) +} + +func TestTimeSeriesStorage_TagIntern_Eviction(t *testing.T) { + s := newTimeSeriesStorage() + + tags := []string{"env:prod", "host:a"} + res1 := s.Add("ns", "m1", 1.0, 1000, tags) + res2 := s.Add("ns", "m2", 1.0, 1000, tags) + assert.Equal(t, 1, s.TagInternedCount(), "one pool entry for the shared combination") + + s.RemoveSeriesByRefs([]observer.SeriesRef{res1.Ref}) + assert.Equal(t, 1, s.TagInternedCount(), "pool entry must survive while m2 still references it") + + s.RemoveSeriesByRefs([]observer.SeriesRef{res2.Ref}) + assert.Equal(t, 0, s.TagInternedCount(), "pool entry must be freed when last referencing series is evicted") +} + +func TestTimeSeriesStorage_TagIntern_NilAndEmptyTags(t *testing.T) { + // Nil and empty tag slices must not produce an intern pool entry, + // and removing such a series must not touch the pool. + s := newTimeSeriesStorage() + + res1 := s.Add("ns", "nil_tags", 1.0, 1000, nil) + assert.Equal(t, 0, s.TagInternedCount(), "nil tags must not create an intern entry") + + res2 := s.Add("ns", "empty_tags", 1.0, 1000, []string{}) + assert.Equal(t, 0, s.TagInternedCount(), "empty tags must not create an intern entry") + + s.RemoveSeriesByRefs([]observer.SeriesRef{res1.Ref, res2.Ref}) + assert.Equal(t, 0, s.TagInternedCount(), "removal of uninterned series must leave pool empty") +} + +func TestTimeSeriesStorage_TagIntern_UnsortedTagsShareEntry(t *testing.T) { + // Two series with the same tags in different order must share one pool entry. + s := newTimeSeriesStorage() + + tags1 := []string{"host:a", "env:prod"} // unsorted + tags2 := []string{"env:prod", "host:a"} // sorted + + res1 := s.Add("ns", "m1", 1.0, 1000, tags1) + res2 := s.Add("ns", "m2", 1.0, 1000, tags2) + assert.Equal(t, 1, s.TagInternedCount(), "same tags in different order must share one pool entry") + + s.mu.RLock() + stats1 := s.resolveByID(res1.Ref) + stats2 := s.resolveByID(res2.Ref) + s.mu.RUnlock() + + ptr1 := uintptr(unsafe.Pointer(unsafe.SliceData(stats1.Tags))) + ptr2 := uintptr(unsafe.Pointer(unsafe.SliceData(stats2.Tags))) + assert.Equal(t, ptr1, ptr2, "unsorted and sorted variants must share the same backing array") +} + +func TestTimeSeriesStorage_TagIntern_Cap(t *testing.T) { + s := newTimeSeriesStorage() + + for i := 0; i < tagInternMaxSize; i++ { + s.Add("ns", fmt.Sprintf("m%d", i), 1.0, 1000, []string{fmt.Sprintf("unique:tag%d", i)}) + } + assert.Equal(t, tagInternMaxSize, s.TagInternedCount(), "pool should be at cap") + + s.Add("ns", "overflow", 1.0, 1000, []string{"unique:overflow"}) + assert.Equal(t, tagInternMaxSize, s.TagInternedCount(), "pool must not exceed cap") + + s.Add("ns2", "m0", 1.0, 1000, []string{"unique:tag0"}) + assert.Equal(t, tagInternMaxSize, s.TagInternedCount(), "hit on existing entry must not grow pool") } diff --git a/comp/anomalydetection/observer/impl/test_helpers_test.go b/comp/anomalydetection/observer/impl/test_helpers_test.go index 42ebb47ce597..1a8b0580af71 100644 --- a/comp/anomalydetection/observer/impl/test_helpers_test.go +++ b/comp/anomalydetection/observer/impl/test_helpers_test.go @@ -20,14 +20,14 @@ func (r *noopTestReporter) Report(_ reporterdef.ReportOutput) {} // mockLogView implements observer.LogView for testing. type mockLogView struct { - content []byte + content string status string tags []string hostname string timestampMs int64 } -func (m *mockLogView) GetContent() []byte { return m.content } +func (m *mockLogView) GetContent() string { return m.content } func (m *mockLogView) GetStatus() string { return m.status } func (m *mockLogView) GetTags() []string { return m.tags } func (m *mockLogView) GetHostname() string { return m.hostname } diff --git a/comp/anomalydetection/recorder/impl/recorder.go b/comp/anomalydetection/recorder/impl/recorder.go index 941be5d95e4a..d5734d74dbbb 100644 --- a/comp/anomalydetection/recorder/impl/recorder.go +++ b/comp/anomalydetection/recorder/impl/recorder.go @@ -283,13 +283,9 @@ func (h *recordingHandle) ObserveMetric(sample observer.MetricView) { func (h *recordingHandle) ObserveLog(msg observer.LogView) { h.inner.ObserveLog(msg) - content := msg.GetContent() - contentCopy := make([]byte, len(content)) - copy(contentCopy, content) - h.recorder.logParquetWriter.WriteLog( h.name, - contentCopy, + []byte(msg.GetContent()), msg.GetStatus(), msg.GetHostname(), msg.GetTags(), diff --git a/internal/qbranch/anomalydetection-testbench/bench/api.go b/internal/qbranch/anomalydetection-testbench/bench/api.go index a2a419420344..5b6a5b9291a2 100644 --- a/internal/qbranch/anomalydetection-testbench/bench/api.go +++ b/internal/qbranch/anomalydetection-testbench/bench/api.go @@ -914,7 +914,7 @@ func (api *BenchAPI) handleLogs(w http.ResponseWriter, r *http.Request) { result = append(result, logEntryResponse{ TimestampMs: ts, Status: l.GetStatus(), - Content: string(l.GetContent()), + Content: l.GetContent(), Hostname: l.GetHostname(), Tags: tags, }) diff --git a/internal/qbranch/anomalydetection-testbench/bench/bench.go b/internal/qbranch/anomalydetection-testbench/bench/bench.go index d24e8712b9dc..93ebf84db1f9 100644 --- a/internal/qbranch/anomalydetection-testbench/bench/bench.go +++ b/internal/qbranch/anomalydetection-testbench/bench/bench.go @@ -37,7 +37,7 @@ type logDataView struct { // Ensure logDataView implements observerdef.LogView. var _ observerdef.LogView = (*logDataView)(nil) -func (v *logDataView) GetContent() []byte { return v.data.Content } +func (v *logDataView) GetContent() string { return string(v.data.Content) } func (v *logDataView) GetStatus() string { return v.data.Status } func (v *logDataView) GetHostname() string { return v.data.Hostname } func (v *logDataView) GetTags() []string { return v.data.Tags } diff --git a/pkg/config/setup/config.go b/pkg/config/setup/config.go index 8d757852c1e6..f5fe766949fe 100644 --- a/pkg/config/setup/config.go +++ b/pkg/config/setup/config.go @@ -366,6 +366,17 @@ func InitConfig(config pkgconfigmodel.Setup) { // the engine by LogMetricsExtractors continue to flow into storage, // so log-only anomaly detection keeps working. config.BindEnvAndSetDefault("observer.ingest_metrics.enabled", true) + // Storage tuning: cap on number of live time series before eviction fires. + // 0 disables eviction. See storageConfig in storage.go. + config.BindEnvAndSetDefault("observer.storage.max_series", 50000) + // Storage tuning: hysteresis band for eviction. When the cap is hit, series + // are removed until count ≤ max_series*(1-eviction_floor_ratio). + // 0.1 = drain to 90% of the cap before the next eviction pass is needed. + config.BindEnvAndSetDefault("observer.storage.eviction_floor_ratio", 0.1) + // Storage tuning: how long (in seconds) data points are retained per series. + // Points older than (latest data timestamp - point_retention_secs) are trimmed on each Add. + // 0 disables trimming. + config.BindEnvAndSetDefault("observer.storage.point_retention_secs", 120) config.BindEnvAndSetDefault("observer.traces.fetch_interval", 5*time.Second) config.BindEnvAndSetDefault("observer.traces.max_fetch_batch", 100) config.BindEnvAndSetDefault("observer.profiles.fetch_interval", 10*time.Second)