|
34 | 34 | from lib.core.threads import getCurrentThreadData |
35 | 35 | from thirdparty import six |
36 | 36 |
|
def _toBytes(value):
    """
    Returns the bytes form of the given value

    None becomes an empty byte string and binary values are passed through
    untouched. Anything else is turned into text (unless it is text already)
    and handed to getBytes() together with kb.pageEncoding (falling back to
    DEFAULT_PAGE_ENCODING when that is not set) and "ignore" as its third
    argument
    """

    if value is None:
        return b""

    if isinstance(value, six.binary_type):
        return value

    # Text is encoded as is, any other object goes through its text form first
    text = value if isinstance(value, six.text_type) else six.text_type(value)

    return getBytes(text, kb.pageEncoding or DEFAULT_PAGE_ENCODING, "ignore")
| 46 | + |
def _sampledSimilarity(first, second):
    """
    Lightweight fallback similarity for very large responses.

    It avoids expensive full-sequence matching while still comparing actual
    content (not only response length), reducing false positives.

    Both arguments are normalized with _toBytes() before being compared. The
    returned value is 1.0 for identical content, 0.0 when exactly one side is
    empty, and otherwise 0.7 * (sampled position-wise byte agreement) +
    0.3 * (shorter length / longer length).
    """

    first, second = _toBytes(first), _toBytes(second)

    if first == second:
        return 1.0

    # Equal inputs (including two empty ones) have already returned above, so
    # an empty side at this point means that the other side has content
    if not first or not second:
        return 0.0

    firstLength, secondLength = len(first), len(second)
    ratio = 1.0 * min(firstLength, secondLength) / max(firstLength, secondLength)

    # Both inputs are non-empty here, hence 1 <= window <= min(lengths)
    window = min(4096, firstLength, secondLength)

    similarity = 0.0
    positions = (0.0, 0.25, 0.5, 0.75, 1.0)

    for position in positions:
        # Sample windows are spread proportionally from the very start
        # (0.0) to the very end (1.0) of each input; as window never exceeds
        # either length, both slices are exactly window bytes long
        firstStart = int((firstLength - window) * position)
        secondStart = int((secondLength - window) * position)

        firstChunk = first[firstStart:firstStart + window]
        secondChunk = second[secondStart:secondStart + window]

        # Fraction of positions inside the window holding the same byte
        matches = sum(left == right for left, right in zip(firstChunk, secondChunk))
        similarity += 1.0 * matches / window

    similarity /= len(positions)

    # Favor actual content match while still accounting for size drift.
    return 0.7 * similarity + 0.3 * ratio
| 85 | + |
37 | 86 | def comparison(page, headers, code=None, getRatioValue=False, pageLength=None): |
38 | 87 | if not isinstance(page, (six.text_type, six.binary_type, type(None))): |
39 | 88 | logger.critical("got page of type %s; repr(page)[:200]=%s" % (type(page), repr(page)[:200])) |
@@ -142,9 +191,7 @@ def _comparison(page, headers, code, getRatioValue, pageLength): |
142 | 191 | if not page or not seqMatcher.a: |
143 | 192 | return float(seqMatcher.a == page) |
144 | 193 | else: |
145 | | - ratio = 1. * len(seqMatcher.a) / len(page) |
146 | | - if ratio > 1: |
147 | | - ratio = 1. / ratio |
| 194 | + ratio = _sampledSimilarity(seqMatcher.a, page) |
148 | 195 | else: |
149 | 196 | seq1, seq2 = None, None |
150 | 197 |
|
|
0 commit comments