diff --git a/semhash/datamodels.py b/semhash/datamodels.py index 2539561..59aa5e3 100644 --- a/semhash/datamodels.py +++ b/semhash/datamodels.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import warnings from collections import defaultdict from dataclasses import dataclass, field @@ -156,7 +157,15 @@ def _to_hashable(record: Record) -> frozendict[str, str] | str: # Get the list of duplicates for the selected record raw_list = buckets.get(_to_hashable(selected), []) # Ensure we don't have duplicates in the list - deduped = {_to_hashable(rec): (rec, score) for rec, score in raw_list} + # Use full-record canonical JSON for dicts so that unhashable values are handled correctly + deduped = { + ( + json.dumps(rec, sort_keys=True, separators=(",", ":"), ensure_ascii=False) + if isinstance(rec, dict) + else rec + ): (rec, score) + for rec, score in raw_list + } result.append(SelectedWithDuplicates(record=selected, duplicates=list(deduped.values()))) return result diff --git a/semhash/version.py b/semhash/version.py index 0c3f13b..9bfefb0 100644 --- a/semhash/version.py +++ b/semhash/version.py @@ -1,2 +1,2 @@ -__version_triple__ = (0, 3, 2) +__version_triple__ = (0, 3, 3) __version__ = ".".join(map(str, __version_triple__))