From 50c773d67f3b53948e3d8cf139c3a9271c1fb5d8 Mon Sep 17 00:00:00 2001 From: Aayush Seth Date: Mon, 7 Jul 2025 10:59:48 -0700 Subject: [PATCH 1/9] initial functions for filtering --- src/sentry/seer/math.py | 159 +++++++++++++++++++++++++++ src/sentry/seer/workflows/compare.py | 98 ++++++++++++++++- 2 files changed, 256 insertions(+), 1 deletion(-) diff --git a/src/sentry/seer/math.py b/src/sentry/seer/math.py index 85b21b00c38e9c..4a1c10a2fae588 100644 --- a/src/sentry/seer/math.py +++ b/src/sentry/seer/math.py @@ -86,3 +86,162 @@ def _rrf(kl_rank: int, entropy_rank: int) -> float: def rank_min(xs: list[float], ascending: bool = False): ranks = {x: rank for rank, x in enumerate(sorted(set(xs), reverse=not ascending), 1)} return [ranks[x] for x in xs] + + +def boxcox_transform( + values: list[float], lambda_param: float | None = None +) -> tuple[list[float], float]: + """ + Apply BoxCox transformation to a list of values. + + Parameters: + values: List of positive values to transform + lambda_param: BoxCox lambda parameter. If None, finds optimal lambda. + + Returns: + Tuple of (transformed values, lambda parameter used) + """ + + if lambda_param is not None: + if lambda_param == 0.0: + transformed = [math.log(max(v, 1e-10)) for v in values] + else: + transformed = [(pow(max(v, 1e-10), lambda_param) - 1) / lambda_param for v in values] + return transformed, lambda_param + + # Find optimal lambda using MLE + optimal_lambda = boxcox_normmax(values) + + if optimal_lambda == 0.0: + transformed = [math.log(max(v, 1e-10)) for v in values] + else: + transformed = [(pow(max(v, 1e-10), optimal_lambda) - 1) / optimal_lambda for v in values] + + return transformed, optimal_lambda + + +def boxcox_llf(lambda_param: float, values: list[float]) -> float: + """ + Compute the Box-Cox log-likelihood function. + + Parameters: + lambda_param: BoxCox lambda parameter + values: List of positive values + + Returns: + Log-likelihood value + """ + n = len(values) + if n == 0: + return 0.0 + + # Transform the data + if lambda_param == 0.0: + y = [math.log(max(v, 1e-10)) for v in values] + else: + y = [(pow(max(v, 1e-10), lambda_param) - 1) / lambda_param for v in values] + + # Calculate mean and sum of squares + y_mean = sum(y) / n + sum_sq = sum((yi - y_mean) ** 2 for yi in y) + + # Log-likelihood calculation + # llf = (lambda - 1) * sum(log(x)) - n/2 * log(sum_sq) + log_sum = sum(math.log(max(v, 1e-10)) for v in values) + llf = (lambda_param - 1) * log_sum - (n / 2) * math.log(max(sum_sq, 1e-10)) + + return llf + + +def boxcox_normmax(values: list[float]) -> float: + """ + Calculate the approximate optimal lambda parameter for BoxCox transformation that maximizes the log-likelihood. + + Uses MLE method with ternary search rather than Brent's methodfor efficient optimization. + + Parameters: + values: List of positive values + + Returns: + Approximate optimal lambda parameter + """ + if not values: + return 0.0 + + if any(v <= 0 for v in values): + raise ValueError("All values must be positive for BoxCox transformation") + + left = -2.0 + right = 2.0 + tolerance = 1e-6 + max_iters = 50 + iters = 0 + + while right - left > tolerance and iters < max_iters: + m1 = left + (right - left) / 3 + m2 = right - (right - left) / 3 + + llf_m1 = boxcox_llf(m1, values) + llf_m2 = boxcox_llf(m2, values) + + if llf_m1 > llf_m2: + right = m2 + else: + left = m1 + + iters += 1 + + return (left + right) / 2 + + +def calculate_z_scores(values: list[float]) -> list[float]: + """ + Calculate z-scores for a list of values. 
+ + Parameters: + values: List of numerical values + + Returns: + List of z-scores corresponding to input values + """ + if not values: + return [] + + mean_val = sum(values) / len(values) + variance = sum((x - mean_val) ** 2 for x in values) / len(values) + std_dev = math.sqrt(variance) + + if std_dev == 0: + return [0.0] * len(values) + + return [(x - mean_val) / std_dev for x in values] + + +def filter_by_z_score_threshold( + values: list[float], z_threshold: float = 1.5, lambda_param: float = 0.0 +) -> list[int]: + """ + Get indices of values that pass BoxCox + z-score filtering. + + This function applies BoxCox normalization to the values, + calculates z-scores, and returns indices where z-scores >= threshold. + + Parameters: + values: List of numerical values to filter + z_threshold: Minimum z-score threshold for inclusion + lambda_param: BoxCox lambda parameter (0 for log transformation) + + Returns: + List of indices that pass the filtering criteria + """ + if not values: + return [] + + # Apply BoxCox transformation - unpack the tuple to get just the transformed values + transformed_values, _ = boxcox_transform(values, lambda_param) + + # Calculate z-scores on transformed data + z_scores = calculate_z_scores(transformed_values) + + # Return indices that meet the threshold + return [i for i, z_score in enumerate(z_scores) if z_score >= z_threshold] diff --git a/src/sentry/seer/workflows/compare.py b/src/sentry/seer/workflows/compare.py index d90bf7f9d9564e..153305663f32f1 100644 --- a/src/sentry/seer/workflows/compare.py +++ b/src/sentry/seer/workflows/compare.py @@ -2,7 +2,13 @@ from collections.abc import Callable, Generator, Mapping, Sequence from typing import TypeVar -from sentry.seer.math import entropy, kl_divergence, laplace_smooth, rrf_score +from sentry.seer.math import ( + entropy, + filter_by_z_score_threshold, + kl_divergence, + laplace_smooth, + rrf_score, +) T = TypeVar("T") @@ -13,6 +19,36 @@ Score = tuple[str, float] +def filter_by_z_score( + data: Sequence[KeyedValueCount], z_threshold: float = 1.5, lambda_param: float = 0.0 +) -> list[KeyedValueCount]: + """ + Filter data by applying BoxCox transformation and z-score filtering. + + This function applies BoxCox normalization to the count values in the data, + calculates z-scores, and filters to keep only items with z-scores >= threshold. 
+ + Parameters: + data: Sequence of (key, value, count) tuples + z_threshold: Minimum z-score threshold for inclusion + lambda_param: BoxCox lambda parameter (0 for log transformation) + + Returns: + Filtered list of (key, value, count) tuples + """ + if not data: + return [] + + # Extract counts (the third element of each tuple) + counts = [count for _, _, count in data] + + # Get indices that pass the filtering criteria + passing_indices = filter_by_z_score_threshold(counts, z_threshold, lambda_param) + + # Filter data based on passing indices + return [data[i] for i in passing_indices] + + def keyed_kl_score( baseline: Sequence[KeyedValueCount], outliers: Sequence[KeyedValueCount], @@ -186,3 +222,63 @@ def _ensure_symmetry(a: Distribution, b: Distribution) -> tuple[Distribution, Di def _smooth_distribution(dist: Distribution) -> Distribution: return dict(zip(dist.keys(), laplace_smooth(list(dist.values())))) + + +def keyed_rrf_score_with_filtering( + baseline: Sequence[KeyedValueCount], + outliers: Sequence[KeyedValueCount], + total_baseline: int, + total_outliers: int, + entropy_alpha: float = 0.2, + kl_alpha: float = 0.8, + offset: int = 60, + apply_filtering: bool = True, + z_threshold: float = 1.5, + lambda_param: float = 0.0, + filter_baseline: bool = False, + filter_outliers: bool = True, +) -> tuple[list[tuple[str, float]], list[KeyedValueCount], list[KeyedValueCount]]: + """ + RRF score a multi-dimensional distribution with optional BoxCox + z-score filtering. + + This function demonstrates how to apply filtering as an independent step before RRF scoring. + + Parameters: + baseline: Baseline distribution data + outliers: Outliers distribution data + total_baseline: Total count for baseline + total_outliers: Total count for outliers + entropy_alpha: Weight for entropy in RRF + kl_alpha: Weight for KL divergence in RRF + offset: RRF offset parameter + apply_filtering: Whether to apply BoxCox + z-score filtering + z_threshold: Z-score threshold for filtering + lambda_param: BoxCox lambda parameter + filter_baseline: Whether to filter baseline data + filter_outliers: Whether to filter outliers data + + Returns: + Tuple of (scores, filtered_baseline, filtered_outliers) + This allows you to inspect the intermediary filtering results + """ + filtered_baseline = list(baseline) + filtered_outliers = list(outliers) + + if apply_filtering: + if filter_baseline: + filtered_baseline = filter_by_z_score(baseline, z_threshold, lambda_param) + if filter_outliers: + filtered_outliers = filter_by_z_score(outliers, z_threshold, lambda_param) + + # Apply RRF scoring to the filtered data + scores = keyed_rrf_score( + filtered_baseline, + filtered_outliers, + total_baseline, + total_outliers, + entropy_alpha, + kl_alpha, + offset, + ) + + return scores, filtered_baseline, filtered_outliers From 07ad2f424fda28099343195086cd1405cb14be9d Mon Sep 17 00:00:00 2001 From: Aayush Seth Date: Mon, 7 Jul 2025 11:45:42 -0700 Subject: [PATCH 2/9] make more consistent with scipy implementation --- src/sentry/seer/math.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/sentry/seer/math.py b/src/sentry/seer/math.py index 4a1c10a2fae588..932152b4103527 100644 --- a/src/sentry/seer/math.py +++ b/src/sentry/seer/math.py @@ -124,6 +124,8 @@ def boxcox_llf(lambda_param: float, values: list[float]) -> float: """ Compute the Box-Cox log-likelihood function. + Uses numerically stable log-space arithmetic following scipy's implementation. 
+ Parameters: lambda_param: BoxCox lambda parameter values: List of positive values @@ -135,25 +137,28 @@ def boxcox_llf(lambda_param: float, values: list[float]) -> float: if n == 0: return 0.0 - # Transform the data + log_values = [math.log(max(v, 1e-10)) for v in values] + log_sum = sum(log_values) + if lambda_param == 0.0: - y = [math.log(max(v, 1e-10)) for v in values] + log_mean = log_sum / n + log_var = sum((lv - log_mean) ** 2 for lv in log_values) / n + logvar = math.log(max(log_var, 1e-10)) else: - y = [(pow(max(v, 1e-10), lambda_param) - 1) / lambda_param for v in values] - - # Calculate mean and sum of squares - y_mean = sum(y) / n - sum_sq = sum((yi - y_mean) ** 2 for yi in y) - - # Log-likelihood calculation - # llf = (lambda - 1) * sum(log(x)) - n/2 * log(sum_sq) - log_sum = sum(math.log(max(v, 1e-10)) for v in values) - llf = (lambda_param - 1) * log_sum - (n / 2) * math.log(max(sum_sq, 1e-10)) + # For λ≠0: Use log-space arithmetic for numerical stability + # This avoids computing (x^λ - 1)/λ directly which can overflow + # Uses identity: var((x^λ - 1)/λ) = var(x^λ)/λ² + logx = [lambda_param * lv for lv in log_values] # log(x^λ) = λ*log(x) + logx_mean = sum(logx) / n + logx_var = sum((lx - logx_mean) ** 2 for lx in logx) / n + # log(var(y)) = log(var(x^λ)) - 2*log(|λ|) + logvar = math.log(max(logx_var, 1e-10)) - 2 * math.log(abs(lambda_param)) - return llf + # Box-Cox log-likelihood: (λ-1)*Σlog(x) - n/2*log(var(y)) + return (lambda_param - 1) * log_sum - (n / 2) * logvar -def boxcox_normmax(values: list[float]) -> float: +def boxcox_normmax(values: list[float], max_iters: int = 100) -> float: """ Calculate the approximate optimal lambda parameter for BoxCox transformation that maximizes the log-likelihood. @@ -161,6 +166,7 @@ def boxcox_normmax(values: list[float]) -> float: Parameters: values: List of positive values + max_iters: Maximum number of iterations to run for ternary search Returns: Approximate optimal lambda parameter @@ -174,7 +180,6 @@ def boxcox_normmax(values: list[float]) -> float: left = -2.0 right = 2.0 tolerance = 1e-6 - max_iters = 50 iters = 0 while right - left > tolerance and iters < max_iters: From d26e4b373091b1e38ec1b6bbd085b1bc1116106c Mon Sep 17 00:00:00 2001 From: Aayush Seth Date: Mon, 7 Jul 2025 15:18:43 -0700 Subject: [PATCH 3/9] update return --- .../organization_group_suspect_flags.py | 2 + src/sentry/seer/math.py | 8 +- src/sentry/seer/workflows/compare.py | 118 +++++++++++------- 3 files changed, 74 insertions(+), 54 deletions(-) diff --git a/src/sentry/issues/endpoints/organization_group_suspect_flags.py b/src/sentry/issues/endpoints/organization_group_suspect_flags.py index f1745934dbd67e..a193c88cdff43f 100644 --- a/src/sentry/issues/endpoints/organization_group_suspect_flags.py +++ b/src/sentry/issues/endpoints/organization_group_suspect_flags.py @@ -21,6 +21,7 @@ class ResponseDataItem(TypedDict): score: float baseline_percent: float distribution: Distribution + is_filtered: bool class ResponseData(TypedDict): @@ -78,6 +79,7 @@ def get(self, request: Request, group: Group) -> Response: "flag": item["flag"], "score": item["score"], "issue_id": group.id, + "is_filtered": item["is_filtered"], }, ) diff --git a/src/sentry/seer/math.py b/src/sentry/seer/math.py index 932152b4103527..b514d7814bbeee 100644 --- a/src/sentry/seer/math.py +++ b/src/sentry/seer/math.py @@ -223,7 +223,7 @@ def calculate_z_scores(values: list[float]) -> list[float]: def filter_by_z_score_threshold( - values: list[float], z_threshold: float = 1.5, lambda_param: 
float = 0.0 + values: list[float], z_threshold: float = 1.5, lambda_param: float | None = None ) -> list[int]: """ Get indices of values that pass BoxCox + z-score filtering. @@ -234,7 +234,7 @@ def filter_by_z_score_threshold( Parameters: values: List of numerical values to filter z_threshold: Minimum z-score threshold for inclusion - lambda_param: BoxCox lambda parameter (0 for log transformation) + lambda_param: BoxCox lambda parameter (None for automatic selection) Returns: List of indices that pass the filtering criteria @@ -242,11 +242,7 @@ def filter_by_z_score_threshold( if not values: return [] - # Apply BoxCox transformation - unpack the tuple to get just the transformed values transformed_values, _ = boxcox_transform(values, lambda_param) - - # Calculate z-scores on transformed data z_scores = calculate_z_scores(transformed_values) - # Return indices that meet the threshold return [i for i, z_score in enumerate(z_scores) if z_score >= z_threshold] diff --git a/src/sentry/seer/workflows/compare.py b/src/sentry/seer/workflows/compare.py index 153305663f32f1..9f7e774397f50d 100644 --- a/src/sentry/seer/workflows/compare.py +++ b/src/sentry/seer/workflows/compare.py @@ -3,6 +3,7 @@ from typing import TypeVar from sentry.seer.math import ( + boxcox_transform, entropy, filter_by_z_score_threshold, kl_divergence, @@ -20,7 +21,7 @@ def filter_by_z_score( - data: Sequence[KeyedValueCount], z_threshold: float = 1.5, lambda_param: float = 0.0 + data: Sequence[KeyedValueCount], z_threshold: float = 1.5, lambda_param: float | None = None ) -> list[KeyedValueCount]: """ Filter data by applying BoxCox transformation and z-score filtering. @@ -31,7 +32,7 @@ def filter_by_z_score( Parameters: data: Sequence of (key, value, count) tuples z_threshold: Minimum z-score threshold for inclusion - lambda_param: BoxCox lambda parameter (0 for log transformation) + lambda_param: BoxCox lambda parameter (None for automatic selection) Returns: Filtered list of (key, value, count) tuples @@ -39,13 +40,9 @@ def filter_by_z_score( if not data: return [] - # Extract counts (the third element of each tuple) counts = [count for _, _, count in data] - - # Get indices that pass the filtering criteria passing_indices = filter_by_z_score_threshold(counts, z_threshold, lambda_param) - # Filter data based on passing indices return [data[i] for i in passing_indices] @@ -83,6 +80,8 @@ def keyed_rrf_score( entropy_alpha: float = 0.2, kl_alpha: float = 0.8, offset: int = 60, + filter_rrf: bool = False, + z_threshold: float = 1.5, ) -> list[tuple[str, float]]: """ RRF score a multi-dimensional distribution of values. Returns a list of key, score pairs. 
@@ -112,6 +111,26 @@ def _scoring_fn(baseline: list[float], outliers: list[float]): entropy_scores.append(entropy_score) kl_scores.append(kl_score) + if filter_rrf: + normalized_entropy_scores, _ = boxcox_transform(entropy_scores) + normalized_kl_scores, _ = boxcox_transform(kl_scores) + + filtered_keys = [] + filtered_entropy_scores = [] + filtered_kl_scores = [] + + for i, (key, normalized_entropy_score, normalized_kl_score) in enumerate( + zip(keys, normalized_entropy_scores, normalized_kl_scores) + ): + if normalized_entropy_score > z_threshold or normalized_kl_score > z_threshold: + filtered_keys.append(key) + filtered_entropy_scores.append(entropy_scores[i]) + filtered_kl_scores.append(kl_scores[i]) + + keys = filtered_keys + entropy_scores = filtered_entropy_scores + kl_scores = filtered_kl_scores + return sorted( zip(keys, rrf_score(entropy_scores, kl_scores, entropy_alpha, kl_alpha, offset)), key=lambda k: k[1], @@ -224,7 +243,7 @@ def _smooth_distribution(dist: Distribution) -> Distribution: return dict(zip(dist.keys(), laplace_smooth(list(dist.values())))) -def keyed_rrf_score_with_filtering( +def keyed_rrf_score_with_filter( baseline: Sequence[KeyedValueCount], outliers: Sequence[KeyedValueCount], total_baseline: int, @@ -232,53 +251,56 @@ def keyed_rrf_score_with_filtering( entropy_alpha: float = 0.2, kl_alpha: float = 0.8, offset: int = 60, - apply_filtering: bool = True, z_threshold: float = 1.5, - lambda_param: float = 0.0, - filter_baseline: bool = False, - filter_outliers: bool = True, -) -> tuple[list[tuple[str, float]], list[KeyedValueCount], list[KeyedValueCount]]: +) -> list[tuple[str, float, bool]]: """ - RRF score a multi-dimensional distribution with optional BoxCox + z-score filtering. - - This function demonstrates how to apply filtering as an independent step before RRF scoring. + RRF score a multi-dimensional distribution of values. Returns a list of key, score pairs, and a mapping of if the key was filtered. + Duplicates are not tolerated. 
- Parameters: - baseline: Baseline distribution data - outliers: Outliers distribution data - total_baseline: Total count for baseline - total_outliers: Total count for outliers - entropy_alpha: Weight for entropy in RRF - kl_alpha: Weight for KL divergence in RRF - offset: RRF offset parameter - apply_filtering: Whether to apply BoxCox + z-score filtering - z_threshold: Z-score threshold for filtering - lambda_param: BoxCox lambda parameter - filter_baseline: Whether to filter baseline data - filter_outliers: Whether to filter outliers data + Sample distribution: + [("key", "true", 93), ("key", "false", 219), ("other", "true", 1)] - Returns: - Tuple of (scores, filtered_baseline, filtered_outliers) - This allows you to inspect the intermediary filtering results + Sample output: + [("key", 0.5, True), ("key", 0.3, False), ("other", 0.1, False)] """ - filtered_baseline = list(baseline) - filtered_outliers = list(outliers) - - if apply_filtering: - if filter_baseline: - filtered_baseline = filter_by_z_score(baseline, z_threshold, lambda_param) - if filter_outliers: - filtered_outliers = filter_by_z_score(outliers, z_threshold, lambda_param) - - # Apply RRF scoring to the filtered data - scores = keyed_rrf_score( - filtered_baseline, - filtered_outliers, + + def _scoring_fn(baseline: list[float], outliers: list[float]): + return (entropy(outliers), kl_divergence(baseline, outliers)) + + scored_keys = _score_each_key( + baseline, + outliers, total_baseline, total_outliers, - entropy_alpha, - kl_alpha, - offset, + scoring_fn=_scoring_fn, ) - return scores, filtered_baseline, filtered_outliers + keys = [] + entropy_scores = [] + kl_scores = [] + + for key, (entropy_score, kl_score) in scored_keys: + keys.append(key) + entropy_scores.append(entropy_score) + kl_scores.append(kl_score) + + normalized_entropy_scores, _ = boxcox_transform(entropy_scores) + normalized_kl_scores, _ = boxcox_transform(kl_scores) + + filtered_keys = [False] * len(keys) + + for i, (key, normalized_entropy_score, normalized_kl_score) in enumerate( + zip(keys, normalized_entropy_scores, normalized_kl_scores) + ): + if normalized_entropy_score > z_threshold or normalized_kl_score > z_threshold: + filtered_keys[i] = True + + return sorted( + zip( + keys, + rrf_score(entropy_scores, kl_scores, entropy_alpha, kl_alpha, offset), + filtered_keys, + ), + key=lambda k: k[1], + reverse=True, + ) From 74fc10b98c5cc5e73a64293420ab6d5995780c3e Mon Sep 17 00:00:00 2001 From: Aayush Seth Date: Mon, 7 Jul 2025 16:00:56 -0700 Subject: [PATCH 4/9] types and tests --- src/sentry/issues/suspect_flags.py | 8 +- src/sentry/seer/math.py | 37 +------ src/sentry/seer/workflows/compare.py | 58 +---------- tests/sentry/seer/test_math.py | 63 ++++++++++++ tests/sentry/seer/workflows/test_compare.py | 108 +++++++++++++++++++- 5 files changed, 182 insertions(+), 92 deletions(-) diff --git a/src/sentry/issues/suspect_flags.py b/src/sentry/issues/suspect_flags.py index 0a456b073e593c..76c3267c05bbe7 100644 --- a/src/sentry/issues/suspect_flags.py +++ b/src/sentry/issues/suspect_flags.py @@ -5,7 +5,7 @@ import sentry_sdk from snuba_sdk import Column, Condition, Entity, Function, Limit, Op, Query, Request -from sentry.seer.workflows.compare import KeyedValueCount, keyed_rrf_score +from sentry.seer.workflows.compare import KeyedValueCount, keyed_rrf_score_with_filter from sentry.utils.snuba import raw_snql_query @@ -19,6 +19,7 @@ class Score(TypedDict): score: float baseline_percent: float distribution: Distribution + is_filtered: bool @sentry_sdk.trace @@ 
-42,7 +43,7 @@ def get_suspect_flag_scores( outliers_count = query_error_counts(org_id, project_id, start, end, envs, group_id=group_id) baseline_count = query_error_counts(org_id, project_id, start, end, envs, group_id=None) - keyed_scores = keyed_rrf_score( + keyed_scores = keyed_rrf_score_with_filter( baseline, outliers, total_baseline=baseline_count, @@ -67,8 +68,9 @@ def get_suspect_flag_scores( "score": score, "baseline_percent": baseline_percent_dict[key], "distribution": distributions[key], + "is_filtered": is_filtered, } - for key, score in keyed_scores + for key, score, is_filtered in keyed_scores ] diff --git a/src/sentry/seer/math.py b/src/sentry/seer/math.py index b514d7814bbeee..794cf04f61ca96 100644 --- a/src/sentry/seer/math.py +++ b/src/sentry/seer/math.py @@ -109,8 +109,7 @@ def boxcox_transform( transformed = [(pow(max(v, 1e-10), lambda_param) - 1) / lambda_param for v in values] return transformed, lambda_param - # Find optimal lambda using MLE - optimal_lambda = boxcox_normmax(values) + optimal_lambda = _boxcox_normmax(values) if optimal_lambda == 0.0: transformed = [math.log(max(v, 1e-10)) for v in values] @@ -120,7 +119,7 @@ def boxcox_transform( return transformed, optimal_lambda -def boxcox_llf(lambda_param: float, values: list[float]) -> float: +def _boxcox_llf(lambda_param: float, values: list[float]) -> float: """ Compute the Box-Cox log-likelihood function. @@ -158,7 +157,7 @@ def boxcox_llf(lambda_param: float, values: list[float]) -> float: return (lambda_param - 1) * log_sum - (n / 2) * logvar -def boxcox_normmax(values: list[float], max_iters: int = 100) -> float: +def _boxcox_normmax(values: list[float], max_iters: int = 100) -> float: """ Calculate the approximate optimal lambda parameter for BoxCox transformation that maximizes the log-likelihood. @@ -186,8 +185,8 @@ def boxcox_normmax(values: list[float], max_iters: int = 100) -> float: m1 = left + (right - left) / 3 m2 = right - (right - left) / 3 - llf_m1 = boxcox_llf(m1, values) - llf_m2 = boxcox_llf(m2, values) + llf_m1 = _boxcox_llf(m1, values) + llf_m2 = _boxcox_llf(m2, values) if llf_m1 > llf_m2: right = m2 @@ -220,29 +219,3 @@ def calculate_z_scores(values: list[float]) -> list[float]: return [0.0] * len(values) return [(x - mean_val) / std_dev for x in values] - - -def filter_by_z_score_threshold( - values: list[float], z_threshold: float = 1.5, lambda_param: float | None = None -) -> list[int]: - """ - Get indices of values that pass BoxCox + z-score filtering. - - This function applies BoxCox normalization to the values, - calculates z-scores, and returns indices where z-scores >= threshold. 
- - Parameters: - values: List of numerical values to filter - z_threshold: Minimum z-score threshold for inclusion - lambda_param: BoxCox lambda parameter (None for automatic selection) - - Returns: - List of indices that pass the filtering criteria - """ - if not values: - return [] - - transformed_values, _ = boxcox_transform(values, lambda_param) - z_scores = calculate_z_scores(transformed_values) - - return [i for i, z_score in enumerate(z_scores) if z_score >= z_threshold] diff --git a/src/sentry/seer/workflows/compare.py b/src/sentry/seer/workflows/compare.py index 9f7e774397f50d..211ac0ab1ea37f 100644 --- a/src/sentry/seer/workflows/compare.py +++ b/src/sentry/seer/workflows/compare.py @@ -2,14 +2,7 @@ from collections.abc import Callable, Generator, Mapping, Sequence from typing import TypeVar -from sentry.seer.math import ( - boxcox_transform, - entropy, - filter_by_z_score_threshold, - kl_divergence, - laplace_smooth, - rrf_score, -) +from sentry.seer.math import boxcox_transform, entropy, kl_divergence, laplace_smooth, rrf_score T = TypeVar("T") @@ -20,32 +13,6 @@ Score = tuple[str, float] -def filter_by_z_score( - data: Sequence[KeyedValueCount], z_threshold: float = 1.5, lambda_param: float | None = None -) -> list[KeyedValueCount]: - """ - Filter data by applying BoxCox transformation and z-score filtering. - - This function applies BoxCox normalization to the count values in the data, - calculates z-scores, and filters to keep only items with z-scores >= threshold. - - Parameters: - data: Sequence of (key, value, count) tuples - z_threshold: Minimum z-score threshold for inclusion - lambda_param: BoxCox lambda parameter (None for automatic selection) - - Returns: - Filtered list of (key, value, count) tuples - """ - if not data: - return [] - - counts = [count for _, _, count in data] - passing_indices = filter_by_z_score_threshold(counts, z_threshold, lambda_param) - - return [data[i] for i in passing_indices] - - def keyed_kl_score( baseline: Sequence[KeyedValueCount], outliers: Sequence[KeyedValueCount], @@ -80,8 +47,6 @@ def keyed_rrf_score( entropy_alpha: float = 0.2, kl_alpha: float = 0.8, offset: int = 60, - filter_rrf: bool = False, - z_threshold: float = 1.5, ) -> list[tuple[str, float]]: """ RRF score a multi-dimensional distribution of values. Returns a list of key, score pairs. @@ -111,26 +76,6 @@ def _scoring_fn(baseline: list[float], outliers: list[float]): entropy_scores.append(entropy_score) kl_scores.append(kl_score) - if filter_rrf: - normalized_entropy_scores, _ = boxcox_transform(entropy_scores) - normalized_kl_scores, _ = boxcox_transform(kl_scores) - - filtered_keys = [] - filtered_entropy_scores = [] - filtered_kl_scores = [] - - for i, (key, normalized_entropy_score, normalized_kl_score) in enumerate( - zip(keys, normalized_entropy_scores, normalized_kl_scores) - ): - if normalized_entropy_score > z_threshold or normalized_kl_score > z_threshold: - filtered_keys.append(key) - filtered_entropy_scores.append(entropy_scores[i]) - filtered_kl_scores.append(kl_scores[i]) - - keys = filtered_keys - entropy_scores = filtered_entropy_scores - kl_scores = filtered_kl_scores - return sorted( zip(keys, rrf_score(entropy_scores, kl_scores, entropy_alpha, kl_alpha, offset)), key=lambda k: k[1], @@ -255,6 +200,7 @@ def keyed_rrf_score_with_filter( ) -> list[tuple[str, float, bool]]: """ RRF score a multi-dimensional distribution of values. Returns a list of key, score pairs, and a mapping of if the key was filtered. 
+ The filtered keys are those that have a normalized entropy or kl score greater than the z_threshold. Duplicates are not tolerated. Sample distribution: diff --git a/tests/sentry/seer/test_math.py b/tests/sentry/seer/test_math.py index 66e3f03c1c5e21..af6e85b3ae0923 100644 --- a/tests/sentry/seer/test_math.py +++ b/tests/sentry/seer/test_math.py @@ -1,6 +1,8 @@ import math from sentry.seer.math import ( + boxcox_transform, + calculate_z_scores, entropy, kl_divergence, laplace_smooth, @@ -90,3 +92,64 @@ def test_rrf_score(): def test_rank_min(): assert rank_min(xs=[1, 2, 2, 2, 3], ascending=False) == [3, 2, 2, 2, 1] assert rank_min(xs=[1, 2, 2, 2, 3], ascending=True) == [1, 2, 2, 2, 3] + + +def test_boxcox_transform(): + # Test with lambda = 0 (log transformation) + values = [1.0, 2.0, 4.0, 8.0] + transformed, lambda_used = boxcox_transform(values, lambda_param=0.0) + expected = [math.log(v) for v in values] + assert lambda_used == 0.0 + for t, e in zip(transformed, expected): + assert math.isclose(t, e, rel_tol=1e-9) + + # Test with lambda = 1 (no transformation, just (x-1)/1 = x-1) + transformed, lambda_used = boxcox_transform(values, lambda_param=1.0) + expected = [v - 1.0 for v in values] + assert lambda_used == 1.0 + for t, e in zip(transformed, expected): + assert math.isclose(t, e, rel_tol=1e-9) + + # Test with lambda = 0.5 (square root transformation) + transformed, lambda_used = boxcox_transform(values, lambda_param=0.5) + expected = [(math.sqrt(v) - 1.0) / 0.5 for v in values] + assert lambda_used == 0.5 + for t, e in zip(transformed, expected): + assert math.isclose(t, e, rel_tol=1e-9) + + # Test auto lambda detection + transformed, lambda_used = boxcox_transform(values, lambda_param=None) + assert isinstance(lambda_used, float) + assert len(transformed) == len(values) + + # Test empty input + transformed, lambda_used = boxcox_transform([], lambda_param=0.0) + assert transformed == [] + assert lambda_used == 0.0 + + +def test_calculate_z_scores(): + values = [1.0, 2.0, 3.0, 4.0, 5.0] + z_scores = calculate_z_scores(values) + + expected_mean = 3.0 + expected_std = math.sqrt(2.0) + expected = [(v - expected_mean) / expected_std for v in values] + + assert len(z_scores) == len(values) + for z, e in zip(z_scores, expected): + assert math.isclose(z, e, rel_tol=1e-9) + + same_values = [5.0, 5.0, 5.0, 5.0] + z_scores = calculate_z_scores(same_values) + assert all(z == 0.0 for z in z_scores) + + assert calculate_z_scores([]) == [] + + single_z = calculate_z_scores([42.0]) + assert single_z == [0.0] + + simple_values = [0.0, 10.0] + z_scores = calculate_z_scores(simple_values) + assert math.isclose(z_scores[0], -1.0, rel_tol=1e-9) + assert math.isclose(z_scores[1], 1.0, rel_tol=1e-9) diff --git a/tests/sentry/seer/workflows/test_compare.py b/tests/sentry/seer/workflows/test_compare.py index 446e6a5168c7ad..f712089c73bf95 100644 --- a/tests/sentry/seer/workflows/test_compare.py +++ b/tests/sentry/seer/workflows/test_compare.py @@ -1,6 +1,10 @@ import math -from sentry.seer.workflows.compare import keyed_kl_score, keyed_rrf_score +from sentry.seer.workflows.compare import ( + keyed_kl_score, + keyed_rrf_score, + keyed_rrf_score_with_filter, +) def test_keyed_kl_score(): @@ -211,3 +215,105 @@ def test_small_support(): ) attributes = [s[0] for s in scores] assert attributes == ["country", "browser", "device"] + + +def test_keyed_rrf_score_with_filter_basic(): + """ + Test basic functionality of keyed_rrf_score_with_filter + """ + baseline = [ + ("key", "true", 10), + ("key", "false", 200), + 
("other", "true", 1000), + ("other", "false", 5000), + ] + outliers = [("key", "true", 10), ("other", "true", 100), ("other", "false", 500)] + + scores = keyed_rrf_score_with_filter( + baseline, + outliers, + total_baseline=sum(i[2] for i in baseline), + total_outliers=sum(i[2] for i in outliers), + z_threshold=1.5, + ) + + # Should return tuples of (key, score, filtered_boolean) + assert len(scores) == 2 + for key, score, filtered in scores: + assert isinstance(key, str) + assert isinstance(score, float) + assert isinstance(filtered, bool) + assert score >= 0 + + +def test_keyed_rrf_score_with_filter_threshold_behavior(): + """ + Test filtering behavior with different z_threshold values + """ + baseline = [ + ("key", "true", 10), + ("key", "false", 200), + ("other", "true", 1000), + ("other", "false", 5000), + ] + outliers = [("key", "true", 10), ("other", "true", 100), ("other", "false", 500)] + + # With high threshold, no keys should be filtered + high_threshold_scores = keyed_rrf_score_with_filter( + baseline, + outliers, + total_baseline=sum(i[2] for i in baseline), + total_outliers=sum(i[2] for i in outliers), + z_threshold=10.0, + ) + + for key, score, filtered in high_threshold_scores: + assert not filtered, f"Key {key} should not be filtered with high threshold" + + +def test_keyed_rrf_score_with_filter_empty_inputs(): + """ + Test with empty inputs + """ + scores = keyed_rrf_score_with_filter( + [], [], total_baseline=0, total_outliers=0, z_threshold=1.5 + ) + assert scores == [] + + +def test_keyed_rrf_score_with_filter_consistency_with_regular_rrf(): + """ + Test that the scores are consistent with keyed_rrf_score + """ + baseline = [ + ("key", "true", 10), + ("key", "false", 200), + ("other", "true", 1000), + ("other", "false", 5000), + ] + outliers = [("key", "true", 10), ("other", "true", 100), ("other", "false", 500)] + + # Get scores from both functions + filtered_scores = keyed_rrf_score_with_filter( + baseline, + outliers, + total_baseline=sum(i[2] for i in baseline), + total_outliers=sum(i[2] for i in outliers), + z_threshold=1.5, + ) + + regular_scores = keyed_rrf_score( + baseline, + outliers, + total_baseline=sum(i[2] for i in baseline), + total_outliers=sum(i[2] for i in outliers), + ) + + # Extract just the key-score pairs and sort them for comparison + filtered_key_scores = sorted([(key, score) for key, score, _ in filtered_scores]) + regular_key_scores = sorted(regular_scores) + + # The scores should be identical + for (key1, score1), (key2, score2) in zip(filtered_key_scores, regular_key_scores): + assert key1 == key2 + assert math.isclose(score1, score2, rel_tol=1e-9) From 1fd3aff72c9388d43a5cf85560c0e486e8ecb0c3 Mon Sep 17 00:00:00 2001 From: Aayush Seth Date: Mon, 7 Jul 2025 16:05:28 -0700 Subject: [PATCH 5/9] typo --- src/sentry/seer/math.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentry/seer/math.py b/src/sentry/seer/math.py index 794cf04f61ca96..9e792e5b37ce5b 100644 --- a/src/sentry/seer/math.py +++ b/src/sentry/seer/math.py @@ -161,7 +161,7 @@ def _boxcox_normmax(values: list[float], max_iters: int = 100) -> float: """ Calculate the approximate optimal lambda parameter for BoxCox transformation that maximizes the log-likelihood. - Uses MLE method with ternary search rather than Brent's methodfor efficient optimization. + Uses MLE method with ternary search rather than Brent's method for efficient optimization. 
Parameters: values: List of positive values From 18f9df8f609a5edf78bad8f102806d777d5056be Mon Sep 17 00:00:00 2001 From: Aayush Seth Date: Mon, 7 Jul 2025 16:50:20 -0700 Subject: [PATCH 6/9] Ensure 0 is handled and update tests --- src/sentry/seer/math.py | 5 +++-- .../endpoints/test_organization_group_suspect_flags.py | 2 ++ tests/sentry/issues/test_suspect_flags.py | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/sentry/seer/math.py b/src/sentry/seer/math.py index 9e792e5b37ce5b..7287260c1d71a1 100644 --- a/src/sentry/seer/math.py +++ b/src/sentry/seer/math.py @@ -173,8 +173,9 @@ def _boxcox_normmax(values: list[float], max_iters: int = 100) -> float: if not values: return 0.0 - if any(v <= 0 for v in values): - raise ValueError("All values must be positive for BoxCox transformation") + min_value = min(values) + if min_value <= 0: + values = [v - min_value + 1 for v in values] left = -2.0 right = 2.0 diff --git a/tests/sentry/issues/endpoints/test_organization_group_suspect_flags.py b/tests/sentry/issues/endpoints/test_organization_group_suspect_flags.py index 5f033923f93372..2ec27ab743ca81 100644 --- a/tests/sentry/issues/endpoints/test_organization_group_suspect_flags.py +++ b/tests/sentry/issues/endpoints/test_organization_group_suspect_flags.py @@ -69,6 +69,7 @@ def test_get(self) -> None: "true": 1, }, }, + "is_filtered": True, }, { "flag": "other", @@ -82,6 +83,7 @@ def test_get(self) -> None: "false": 1, }, }, + "is_filtered": False, }, ] } diff --git a/tests/sentry/issues/test_suspect_flags.py b/tests/sentry/issues/test_suspect_flags.py index 23ee063ddfdabb..d01dff6299c3ef 100644 --- a/tests/sentry/issues/test_suspect_flags.py +++ b/tests/sentry/issues/test_suspect_flags.py @@ -132,11 +132,13 @@ def test_get_suspect_flag_scores(self) -> None: "baseline": {"false": 1, "true": 1}, "outliers": {"true": 1}, }, + "is_filtered": True, }, { "flag": "other", "score": 0.016181914331041776, "baseline_percent": 0, "distribution": {"baseline": {"false": 2}, "outliers": {"false": 1}}, + "is_filtered": False, }, ] From 271bb90f6cda14b4c53efe0accea28a303032ac7 Mon Sep 17 00:00:00 2001 From: Aayush Seth Date: Mon, 7 Jul 2025 17:31:36 -0700 Subject: [PATCH 7/9] use the correct values for z score calculation --- src/sentry/seer/workflows/compare.py | 22 ++++++++++++------- .../test_organization_group_suspect_flags.py | 2 +- tests/sentry/issues/test_suspect_flags.py | 2 +- tests/sentry/seer/workflows/test_compare.py | 4 ++-- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/sentry/seer/workflows/compare.py b/src/sentry/seer/workflows/compare.py index 211ac0ab1ea37f..fc6f4b3ef43404 100644 --- a/src/sentry/seer/workflows/compare.py +++ b/src/sentry/seer/workflows/compare.py @@ -2,7 +2,14 @@ from collections.abc import Callable, Generator, Mapping, Sequence from typing import TypeVar -from sentry.seer.math import boxcox_transform, entropy, kl_divergence, laplace_smooth, rrf_score +from sentry.seer.math import ( + boxcox_transform, + calculate_z_scores, + entropy, + kl_divergence, + laplace_smooth, + rrf_score, +) T = TypeVar("T") @@ -232,14 +239,13 @@ def _scoring_fn(baseline: list[float], outliers: list[float]): normalized_entropy_scores, _ = boxcox_transform(entropy_scores) normalized_kl_scores, _ = boxcox_transform(kl_scores) + entropy_z_scores = calculate_z_scores(normalized_entropy_scores) + kl_z_scores = calculate_z_scores(normalized_kl_scores) - filtered_keys = [False] * len(keys) - - for i, (key, normalized_entropy_score, normalized_kl_score) in 
enumerate(
-        zip(keys, normalized_entropy_scores, normalized_kl_scores)
-    ):
-        if normalized_entropy_score > z_threshold or normalized_kl_score > z_threshold:
-            filtered_keys[i] = True
+    filtered_keys = [
+        entropy_z_score <= z_threshold or kl_z_score <= z_threshold
+        for entropy_z_score, kl_z_score in zip(entropy_z_scores, kl_z_scores)
+    ]
 
     return sorted(
         zip(
diff --git a/tests/sentry/issues/endpoints/test_organization_group_suspect_flags.py b/tests/sentry/issues/endpoints/test_organization_group_suspect_flags.py
index 2ec27ab743ca81..8e662ecf7c43be 100644
--- a/tests/sentry/issues/endpoints/test_organization_group_suspect_flags.py
+++ b/tests/sentry/issues/endpoints/test_organization_group_suspect_flags.py
@@ -83,7 +83,7 @@ def test_get(self) -> None:
                         "false": 1,
                     },
                 },
-                "is_filtered": False,
+                "is_filtered": True,
             },
         ]
     }
diff --git a/tests/sentry/issues/test_suspect_flags.py b/tests/sentry/issues/test_suspect_flags.py
index d01dff6299c3ef..da50859f57c336 100644
--- a/tests/sentry/issues/test_suspect_flags.py
+++ b/tests/sentry/issues/test_suspect_flags.py
@@ -139,6 +139,6 @@ def test_get_suspect_flag_scores(self) -> None:
             "score": 0.016181914331041776,
             "baseline_percent": 0,
             "distribution": {"baseline": {"false": 2}, "outliers": {"false": 1}},
-            "is_filtered": False,
+            "is_filtered": True,
         },
     ]
diff --git a/tests/sentry/seer/workflows/test_compare.py b/tests/sentry/seer/workflows/test_compare.py
index f712089c73bf95..404e168c49de06 100644
--- a/tests/sentry/seer/workflows/test_compare.py
+++ b/tests/sentry/seer/workflows/test_compare.py
@@ -258,13 +258,13 @@ def test_keyed_rrf_score_with_filter_threshold_behavior():
     ]
     outliers = [("key", "true", 10), ("other", "true", 100), ("other", "false", 500)]
 
-    # With high threshold, no keys should be filtered
+    # With low threshold, no keys should be filtered
     high_threshold_scores = keyed_rrf_score_with_filter(
         baseline,
         outliers,
         total_baseline=sum(i[2] for i in baseline),
         total_outliers=sum(i[2] for i in outliers),
-        z_threshold=10.0,
+        z_threshold=-10.0,
     )
 
     for key, score, filtered in high_threshold_scores:
From aa9645542b6cdb284b7488992c45ad1f3373628b Mon Sep 17 00:00:00 2001
From: Aayush Seth
Date: Mon, 7 Jul 2025 21:31:37 -0700
Subject: [PATCH 8/9] bugs

---
 src/sentry/seer/math.py              | 18 ++++++++++++++----
 src/sentry/seer/workflows/compare.py |  4 ++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/sentry/seer/math.py b/src/sentry/seer/math.py
index 7287260c1d71a1..578ef17d9138bc 100644
--- a/src/sentry/seer/math.py
+++ b/src/sentry/seer/math.py
@@ -101,20 +101,30 @@ def boxcox_transform(
     Returns:
         Tuple of (transformed values, lambda parameter used)
     """
+    min_value = min(values) if values else 0
+    if min_value <= 0:
+        shift_amount = -min_value + 1
+        shifted_values = [v + shift_amount for v in values]
+    else:
+        shifted_values = values
 
     if lambda_param is not None:
         if lambda_param == 0.0:
-            transformed = [math.log(max(v, 1e-10)) for v in values]
+            transformed = [math.log(max(v, 1e-10)) for v in shifted_values]
         else:
-            transformed = [(pow(max(v, 1e-10), lambda_param) - 1) / lambda_param for v in values]
+            transformed = [
+                (pow(max(v, 1e-10), lambda_param) - 1) / lambda_param for v in shifted_values
+            ]
         return transformed, lambda_param
 
     optimal_lambda = _boxcox_normmax(values)
 
     if optimal_lambda == 0.0:
-        transformed = [math.log(max(v, 1e-10)) for v in values]
+        transformed = [math.log(max(v, 1e-10)) for v in shifted_values]
     else:
-        transformed = [(pow(max(v, 1e-10), optimal_lambda) - 1) / optimal_lambda for v in values]
+        transformed = [
+            (pow(max(v, 1e-10), optimal_lambda) - 1) / optimal_lambda for v in shifted_values
+        ]
 
     return transformed, optimal_lambda
 
diff --git a/src/sentry/seer/workflows/compare.py b/src/sentry/seer/workflows/compare.py
index fc6f4b3ef43404..4765d7226ce179 100644
--- a/src/sentry/seer/workflows/compare.py
+++ b/src/sentry/seer/workflows/compare.py
@@ -207,7 +207,7 @@ def keyed_rrf_score_with_filter(
 ) -> list[tuple[str, float, bool]]:
     """
     RRF score a multi-dimensional distribution of values. Returns a list of key, score pairs, and a mapping of if the key was filtered.
-    The filtered keys are those that have a normalized entropy or kl score greater than the z_threshold.
+    The filtered keys are those that have a normalized entropy and kl score less than the z_threshold.
     Duplicates are not tolerated.
 
     Sample distribution:
@@ -243,7 +243,7 @@ def _scoring_fn(baseline: list[float], outliers: list[float]):
     kl_z_scores = calculate_z_scores(normalized_kl_scores)
 
     filtered_keys = [
-        entropy_z_score <= z_threshold or kl_z_score <= z_threshold
+        entropy_z_score <= z_threshold and kl_z_score <= z_threshold
         for entropy_z_score, kl_z_score in zip(entropy_z_scores, kl_z_scores)
     ]
 
From 93d2f054a45ecb71e88eb66788574a50227d2ce2 Mon Sep 17 00:00:00 2001
From: Aayush Seth
Date: Tue, 8 Jul 2025 11:35:45 -0700
Subject: [PATCH 9/9] update using shifted values and clean up boxcox function

---
 src/sentry/seer/math.py | 25 +++++++-------------------
 1 file changed, 7 insertions(+), 18 deletions(-)

diff --git a/src/sentry/seer/math.py b/src/sentry/seer/math.py
index 578ef17d9138bc..e0f433f7e74f5e 100644
--- a/src/sentry/seer/math.py
+++ b/src/sentry/seer/math.py
@@ -103,30 +103,23 @@ def boxcox_transform(
     """
     min_value = min(values) if values else 0
     if min_value <= 0:
-        shift_amount = -min_value + 1
+        shift_amount = -min_value + 1e-10
         shifted_values = [v + shift_amount for v in values]
     else:
         shifted_values = values
 
-    if lambda_param is not None:
-        if lambda_param == 0.0:
-            transformed = [math.log(max(v, 1e-10)) for v in shifted_values]
-        else:
-            transformed = [
-                (pow(max(v, 1e-10), lambda_param) - 1) / lambda_param for v in shifted_values
-            ]
-        return transformed, lambda_param
+    # Get lambda parameter: use provided one or find optimal
+    lambda_param = _boxcox_normmax(shifted_values) if lambda_param is None else lambda_param
 
-    optimal_lambda = _boxcox_normmax(values)
-
-    if optimal_lambda == 0.0:
+    # Apply transformation
+    if lambda_param == 0.0:
         transformed = [math.log(max(v, 1e-10)) for v in shifted_values]
     else:
         transformed = [
-            (pow(max(v, 1e-10), optimal_lambda) - 1) / optimal_lambda for v in shifted_values
+            (pow(max(v, 1e-10), lambda_param) - 1) / lambda_param for v in shifted_values
         ]
 
-    return transformed, optimal_lambda
+    return transformed, lambda_param
 
 
 def _boxcox_llf(lambda_param: float, values: list[float]) -> float:
@@ -183,10 +176,6 @@ def _boxcox_normmax(values: list[float], max_iters: int = 100) -> float:
     if not values:
         return 0.0
 
-    min_value = min(values)
-    if min_value <= 0:
-        values = [v - min_value + 1 for v in values]
-
     left = -2.0
     right = 2.0
     tolerance = 1e-6