Skip to content

Commit d0fdc7e

Browse files
authored
Merge pull request #20776 from cockroachdb/fix-version-search-discrepancy
Fix version number search discrepancy between formats
2 parents 848c47a + ecaa62c commit d0fdc7e

File tree

1 file changed

+43
-1
lines changed

1 file changed

+43
-1
lines changed

src/current/algolia_index_intelligent_bloat_removal.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,41 @@ def is_duplicate_content(self, content: str) -> bool:
137137
SEEN_CONTENT_HASHES.add(content_hash)
138138
return False
139139

140+
def _is_version_spam(self, content: str, context: Dict[str, str]) -> bool:
    """Return True when *content* is a bare version string that should be filtered.

    Only strings consisting entirely of a version token are considered:
    ``v<major>.<minor>[.<patch>][-beta.<n>]`` or ``beta-<n>``
    (case-insensitive, trailing whitespace allowed). Anything else is
    never reported as version spam.

    Args:
        content: The text to classify.
        context: Optional mapping; only the ``'url'`` key is read. ``None``
            (or a missing/``None`` url) is treated as "no URL known".

    Returns:
        False if *content* is not a bare version string, if the page URL
        contains ``/releases/``, ``release-notes`` or ``changelog``, or if
        the v-version is long (more than 8 characters) or carries a
        ``-beta`` suffix. True otherwise.
    """
    v_pattern = re.compile(r'^v\d+\.\d+(\.\d+)?(-beta\.\d+)?\s*$', re.IGNORECASE)
    beta_pattern = re.compile(r'^beta-\d+\s*$', re.IGNORECASE)

    is_v_version = v_pattern.match(content) is not None
    is_beta_version = beta_pattern.match(content) is not None
    if not (is_v_version or is_beta_version):
        return False

    # Callers may pass context=None (or a None url); treat both as an
    # empty URL instead of raising AttributeError/TypeError.
    page_url = (context or {}).get('url') or ''

    # Always preserve version numbers on release pages and changelogs.
    if any(area in page_url for area in ('/releases/', 'release-notes', 'changelog')):
        return False

    # Outside a release context, bare "beta-N" tokens are filtered.
    if is_beta_version:
        return True

    # Judge the version token itself: the pattern tolerates trailing
    # whitespace, which must not make a short version look "long".
    version = content.rstrip()

    # Longer, more specific versions (e.g. "v26.1.0-beta.1") are preserved;
    # short ones such as "v1.2" are filtered as navigation spam.
    is_specific = len(version) > 8 or '-beta' in version.lower()
    return not is_specific
174+
140175
def is_bloat_content(self, content: str, context: Dict[str, str] = None) -> bool:
141176
"""Intelligently determine if content is bloat while preserving valuable content."""
142177
if not content or len(content.strip()) < MIN_CONTENT_LENGTH:
@@ -150,8 +185,15 @@ def is_bloat_content(self, content: str, context: Dict[str, str] = None) -> bool
150185
if pattern.search(content_clean):
151186
return False
152187

153-
# 2. Check for exact bloat patterns
188+
# 2. Context-aware version filtering before exact bloat patterns
189+
if self._is_version_spam(content_clean, context):
190+
return True
191+
192+
# 3. Check for exact bloat patterns (excluding version patterns handled above)
154193
for pattern in self.exact_bloat_patterns:
194+
# Skip version patterns since they're handled contextually above
195+
if pattern.pattern.startswith(('^v\\d+', '^beta-')):
196+
continue
155197
if pattern.match(content_clean):
156198
return True
157199

0 commit comments

Comments
 (0)