Skip to content

Commit 0e4f30f

Browse files
committed
fix: reduce false positives in large-response comparison fallback
1 parent e659543 commit 0e4f30f

1 file changed

Lines changed: 50 additions & 3 deletions

File tree

lib/request/comparison.py

Lines changed: 50 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,55 @@
3434
from lib.core.threads import getCurrentThreadData
3535
from thirdparty import six
3636

37+
def _toBytes(value):
    """
    Returns a binary (bytes) form of the given value

    None is turned into an empty byte string and binary values are handed
    back untouched. Anything else is coerced to text (unless it already is
    text) and passed to getBytes() together with the current page encoding
    (kb.pageEncoding, or DEFAULT_PAGE_ENCODING when that is not set) and
    "ignore" as the third argument (presumably the error-handling mode;
    confirm against getBytes())
    """

    if value is None:
        return b""

    if isinstance(value, six.binary_type):
        return value

    # Non-text values are stringified first so that a single encoding call
    # covers both remaining cases
    text = value if isinstance(value, six.text_type) else six.text_type(value)

    return getBytes(text, kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore")
46+
47+
def _sampledSimilarity(first, second):
    """
    Lightweight fallback similarity for very large responses.

    It avoids expensive full-sequence matching while still comparing actual
    content (not only response length), reducing false positives.

    Both values are normalized through _toBytes() before being compared.
    Identical values score 1.0 and a pair where only one side is empty
    scores 0.0. Otherwise, five equally sized windows (at most 4096 bytes
    each) are taken at the relative offsets 0%, 25%, 50%, 75% and 100% of
    each value and compared byte by byte at matching window positions. The
    averaged match rate is blended with the length ratio of the two values
    (weights 0.7 and 0.3), hence the result is a float in range [0.0, 1.0]
    """

    first, second = _toBytes(first), _toBytes(second)

    if first == second:
        return 1.0

    # Values are known to differ here, so an empty side can not match anything
    if not first or not second:
        return 0.0

    firstLength, secondLength = len(first), len(second)
    ratio = 1.0 * min(firstLength, secondLength) / max(firstLength, secondLength)

    # Both values are non-empty at this point, thus the window is never smaller
    # than 1 and never larger than the shorter value
    window = min(4096, firstLength, secondLength)

    # Highest valid window start offsets (loop invariant, never negative)
    firstSpan, secondSpan = firstLength - window, secondLength - window

    similarity = 0.0
    positions = (0.0, 0.25, 0.5, 0.75, 1.0)

    for position in positions:
        firstStart = int(firstSpan * position)
        secondStart = int(secondSpan * position)

        firstChunk = first[firstStart:firstStart + window]
        secondChunk = second[secondStart:secondStart + window]

        # Note: both chunks are exactly 'window' bytes long
        matches = sum(left == right for left, right in zip(firstChunk, secondChunk))
        similarity += 1.0 * matches / window

    similarity /= len(positions)

    # Favor actual content match while still accounting for size drift.
    return 0.7 * similarity + 0.3 * ratio
85+
3786
def comparison(page, headers, code=None, getRatioValue=False, pageLength=None):
3887
if not isinstance(page, (six.text_type, six.binary_type, type(None))):
3988
logger.critical("got page of type %s; repr(page)[:200]=%s" % (type(page), repr(page)[:200]))
@@ -142,9 +191,7 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
142191
if not page or not seqMatcher.a:
143192
return float(seqMatcher.a == page)
144193
else:
145-
ratio = 1. * len(seqMatcher.a) / len(page)
146-
if ratio > 1:
147-
ratio = 1. / ratio
194+
ratio = _sampledSimilarity(seqMatcher.a, page)
148195
else:
149196
seq1, seq2 = None, None
150197

0 commit comments

Comments
 (0)