@@ -137,6 +137,41 @@ def is_duplicate_content(self, content: str) -> bool:
137137 SEEN_CONTENT_HASHES .add (content_hash )
138138 return False
139139
140+ def _is_version_spam (self , content : str , context : Dict [str , str ]) -> bool :
141+ """Context-aware version filtering to distinguish spam from legitimate version references."""
142+ # Version patterns to check
143+ v_pattern = re .compile (r'^v\d+\.\d+(\.\d+)?(-beta\.\d+)?\s*$' , re .IGNORECASE )
144+ beta_pattern = re .compile (r'^beta-\d+\s*$' , re .IGNORECASE )
145+
146+ # Check if content matches version patterns
147+ is_v_version = v_pattern .match (content )
148+ is_beta_version = beta_pattern .match (content )
149+
150+ if not (is_v_version or is_beta_version ):
151+ return False
152+
153+ # Context clues that indicate this is legitimate version content, not spam
154+ page_url = context .get ('url' , '' )
155+
156+ # ALWAYS preserve version numbers in release pages and version-specific content
157+ if any (area in page_url for area in ['/releases/' , 'release-notes' , 'changelog' ]):
158+ return False
159+
160+ # Handle beta versions - generally filter as spam unless in release context
161+ if is_beta_version :
162+ return True
163+
164+ # Handle v-versions based on length and complexity
165+ if is_v_version :
166+ # Preserve longer, more complex version strings
167+ if len (content ) > 8 or '-beta' in content : # e.g., "v26.1.0-beta.1"
168+ return False
169+
170+ # Filter short version numbers outside release context (navigation spam)
171+ return True
172+
173+ return False
174+
140175 def is_bloat_content (self , content : str , context : Dict [str , str ] = None ) -> bool :
141176 """Intelligently determine if content is bloat while preserving valuable content."""
142177 if not content or len (content .strip ()) < MIN_CONTENT_LENGTH :
@@ -150,8 +185,15 @@ def is_bloat_content(self, content: str, context: Dict[str, str] = None) -> bool
150185 if pattern .search (content_clean ):
151186 return False
152187
153- # 2. Check for exact bloat patterns
188+ # 2. Context-aware version filtering before exact bloat patterns
189+ if self ._is_version_spam (content_clean , context ):
190+ return True
191+
192+ # 3. Check for exact bloat patterns (excluding version patterns handled above)
154193 for pattern in self .exact_bloat_patterns :
194+ # Skip version patterns since they're handled contextually above
195+ if pattern .pattern .startswith (('^v\\ d+' , '^beta-' )):
196+ continue
155197 if pattern .match (content_clean ):
156198 return True
157199
0 commit comments