Skip to content
80 changes: 39 additions & 41 deletions tools/tdgpt/taosanalytics/algo/tool/profile_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,11 +388,8 @@ def _validate_params(parsed_input):
has_threshold, top_n = _validate_result_constraints(result_obj, algo_type)
source_arr, source_ts_window = _parse_source_data(parsed_input["source_data"])

exclude_contained = _validate_bool_field(result_obj, "exclude_contained")
if algo_type != "dtw" and "exclude_contained" in result_obj:
raise ValueError('"result.exclude_contained" can only be set for dtw algorithm')

exclude_source = _validate_bool_field(result_obj, "exclude_source")
Comment thread
hjxilinx marked this conversation as resolved.
exclude_overlap = _validate_bool_field(result_obj, "exclude_overlap")

Comment on lines 388 to 393
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The refactor removes handling for the legacy result.exclude_contained field, but _validate_params also doesn’t reject it. Requests that still send exclude_contained will be silently accepted and the exclusion behavior won’t be applied. Consider either treating exclude_contained as an alias for exclude_overlap (backward compatibility) or raising a clear ValueError telling clients to migrate.

Copilot uses AI. Check for mistakes.
min_window, max_window = _validate_min_max_window(
algo_params.get("min_window", None),
Expand Down Expand Up @@ -434,8 +431,8 @@ def _validate_params(parsed_input):
"max_window": max_window,
"window_size_step": window_size_step,
"window_sliding_step": window_sliding_step,
"exclude_contained": exclude_contained,
"exclude_source": exclude_source,
"exclude_overlap": exclude_overlap,
"is_profile_list": is_profile_list
}

Expand All @@ -461,42 +458,44 @@ def _validate_possible_candidates(source_arr, data_list_size, min_window, max_wi
)


def _is_interval_contained(inner_window, outer_window):
"""Return whether ``outer_window`` strictly contains ``inner_window``.
def _is_interval_overlapping(window_a, window_b):
"""Return whether ``window_a`` and ``window_b`` overlap.

"Strict" means ``outer_window`` fully covers ``inner_window`` and the two
windows are not identical. At least one outer bound must extend beyond the
corresponding inner bound, so equal start/end bounds do not count as
containment.
Endpoint-touching multi-point windows (e.g. [1,5] and [5,8]) are treated as
adjacent and not overlapping. Two single-point windows [t,t] are considered
overlapping when they share the same timestamp.
"""
return (outer_window[0] <= inner_window[0]
and outer_window[1] >= inner_window[1]
and (outer_window[0] < inner_window[0] or outer_window[1] > inner_window[1]))
# Single-point windows with identical timestamps are the same point and overlap.
if window_a[0] == window_a[1] and window_b[0] == window_b[1]:
return window_a[0] == window_b[0]
return window_a[0] < window_b[1] and window_b[0] < window_a[1]
Comment thread
hjxilinx marked this conversation as resolved.


def _filter_exclude_overlap(matches, limit=None):
"""Greedily keep matches whose ts_window does not overlap with any already-kept match.

def _filter_exclude_contained(matches, limit=None):
matches must be sorted best-first. For each candidate, it is discarded if its
ts_window overlaps with an already-kept match's ts_window (adjacent windows
sharing only an endpoint are not considered overlapping).
"""
Comment thread
hjxilinx marked this conversation as resolved.
if len(matches) <= 1:
return matches

# matches are expected to be sorted by criteria ascending (best first for DTW).
# We greedily keep each match unless it is in a strict containment relationship
# (either direction) with an already-kept match. Because we process best-first,
# every already-kept match has a better (or equal) criteria value, so the current
# match is always the worse one in any containment pair and should be discarded.
kept = [] # list of (ts_window, original_index)

for idx, match in enumerate(matches):
ts_window = match.get("ts_window")
if not isinstance(ts_window, (list, tuple)) or len(ts_window) != 2:
raise ValueError(f'matches[{idx}].ts_window must be a [start_ts, end_ts] pair')
Comment thread
hjxilinx marked this conversation as resolved.
if ts_window[0] > ts_window[1]:
raise ValueError(f'matches[{idx}].ts_window must satisfy start_ts <= end_ts')

in_containment = any(
_is_interval_contained(ts_window, k_window)
or _is_interval_contained(k_window, ts_window)
has_overlap = any(
_is_interval_overlapping(ts_window, k_window)
for k_window, _ in kept
)

if not in_containment:
if not has_overlap:
kept.append((ts_window, idx))

if limit is not None and len(kept) >= limit:
Expand All @@ -506,9 +505,9 @@ def _filter_exclude_contained(matches, limit=None):
return [m for i, m in enumerate(matches) if i in kept_indices]


# When exclude_contained is active, the heap is oversampled by this factor so
# that containment filtering still yields target_rows results in most cases.
_CONTAINMENT_OVERSAMPLE = 8
# When exclusion filters are active, the heap is oversampled by this factor so
# that filtering still yields target_rows results in most cases.
Comment thread
hjxilinx marked this conversation as resolved.
_EXCLUSION_OVERSAMPLE = 8

def _heap_key(algo_type, criteria_val, seq_idx):
# Higher heap key means a better candidate after normalization of the metric:
Expand Down Expand Up @@ -536,14 +535,14 @@ def do_profile_search_impl(req_json):
max_window = parsed["max_window"]
window_size_step = parsed["window_size_step"]
window_sliding_step = parsed["window_sliding_step"]
exclude_contained = parsed["exclude_contained"]
exclude_source = parsed["exclude_source"]
exclude_overlap = parsed["exclude_overlap"]

source_norm = _normalize_series(source_arr, norm_type)
metric_type = "dtw_distance" if algo_type == "dtw" else "cosine_similarity"
threshold = float(result_obj["threshold"]) if has_threshold else None
target_rows = ProfileSearchLimits.MAX_PROFILE_SEARCH_RESULTS if top_n is None else top_n
need_exclusion_filter = (algo_type == "dtw" and exclude_contained)
need_exclusion_filter = exclude_overlap

def _build_candidates():
if parsed["is_profile_list"]:
Expand All @@ -558,10 +557,10 @@ def _build_candidates():
)

# Score all candidates once.
# - Without exclude_contained: stream results directly into a fixed-size heap,
# - Without exclude_overlap: stream results directly into a fixed-size heap,
# discarding weaker candidates on the fly. No retry is needed so there is no
# reason to accumulate a separate all_passed list.
# - With exclude_contained: every passing result is saved in all_passed so that
# - With exclude_overlap: every passing result is saved in all_passed so that
# the retry loop can rebuild the heap with a larger limit without recomputing
# any distances.
all_passed = [] if need_exclusion_filter else None
Expand Down Expand Up @@ -611,9 +610,9 @@ def _build_candidates():
top_heap.sort(key=lambda x: (-x[2]["criteria"], x[1]))
matches = [x[2] for x in top_heap]
else:
# exclude_contained is active: rebuild the heap from all_passed with a
# progressively larger heap_limit until containment filtering yields enough results.
oversample = _CONTAINMENT_OVERSAMPLE
# Exclusion filters are active: rebuild the heap from all_passed with a
# progressively larger heap_limit until filtering yields enough results.
oversample = _EXCLUSION_OVERSAMPLE
matches = []
total_passed = len(all_passed)

Expand All @@ -628,21 +627,20 @@ def _build_candidates():
elif key > top_heap[0][0]:
heapq.heapreplace(top_heap, heap_item)

if algo_type != "dtw":
raise RuntimeError('exclude_contained logic requires algo_type to be "dtw"')

top_heap.sort(key=lambda x: (x[2]["criteria"], x[1]))
if algo_type == "dtw":
top_heap.sort(key=lambda x: (x[2]["criteria"], x[1]))
else:
top_heap.sort(key=lambda x: (-x[2]["criteria"], x[1]))

matches = [x[2] for x in top_heap]

matches = _filter_exclude_contained(matches, limit=target_rows)
matches = matches[:target_rows]
matches = _filter_exclude_overlap(matches, limit=target_rows)
Comment thread
hjxilinx marked this conversation as resolved.

# Got enough results, or all passing candidates already fit in the heap.
if len(matches) >= target_rows or total_passed <= heap_limit:
break

# The heap was saturated and containment filtering removed too many entries.
# The heap was saturated and filtering removed too many entries.
# Double the oversample factor and rebuild from the cached scored list.
oversample *= 2

Expand Down
6 changes: 3 additions & 3 deletions tools/tdgpt/taosanalytics/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,8 @@ def do_profile_search(request, api_version):
- Or return all profiles with distance below the threshold when using dtw.
- Or return all profiles with similarity above the threshold when using cosine similarity.
- "num" and "threshold" cannot be set at the same time.
- "exclude_contained" is only applicable for dtw and means whether to exclude the worse matched profile in a strict-containment pair, keeping the better one (the match with the smaller distance). For example, if there are two matched profiles with ts window [1, 5] and [2, 4], and one strictly contains the other, the worse match will be excluded if "exclude_contained" is set to true.
- "exclude_source" is applicable for all algorithms and means whether to exclude the matched profile that contains the source profile. For example, if the source profile has ts window [2, 4], the matched profile with ts window [2, 4] will be excluded if "exclude_source" is set to true.
- "exclude_overlap" is applicable for all algorithms and controls whether to exclude any matched profile whose ts window overlaps with that of a better-ranked result. For example, given two matched profiles with ts windows [1, 5] and [4, 6], the profile [4, 6] will be excluded if "exclude_overlap" is set to true. Endpoint-touching windows are treated as adjacent (non-overlapping), so windows such as [1, 5] and [5, 9] are not excluded by "exclude_overlap".
- Threshold-based results are capped at 500 matches.
target_data.ts may be either:
- a unix timestamp list, such as [1, 2, 3, 4, 5, 6]
Expand All @@ -330,8 +330,8 @@ def do_profile_search(request, api_version):
},
"result": {
"num": 3,
"exclude_contained": true,
"exclude_source": true
"exclude_source": true,
"exclude_overlap": true
},
"source_data": {
"ts": [1000, 2000, 3000, 4000, 5000],
Expand Down
Loading
Loading