From ae360203f6dc46f346f0faf6d469d473495f7a3c Mon Sep 17 00:00:00 2001 From: Shaad Alaka Date: Wed, 24 Jan 2024 16:07:24 +0100 Subject: [PATCH 01/11] Add sublime project/workspace files to gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f041eeb..0a288cf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,6 @@ __pycache__/ dist valentine.egg-info build -.vscode/ \ No newline at end of file +.vscode/ +valentine.sublime-workspace +valentine.sublime-project From d742ca6074b54b3929006728bb15894965f49b61 Mon Sep 17 00:00:00 2001 From: Shaad Alaka Date: Wed, 24 Jan 2024 16:09:23 +0100 Subject: [PATCH 02/11] Add new MatcherResults object and overhaul metrics system --- valentine/__init__.py | 17 +- valentine/algorithms/matcher_results.py | 159 +++++++++++++ valentine/metrics/__init__.py | 31 +-- valentine/metrics/base_metric.py | 78 ++++++ valentine/metrics/metric_helpers.py | 79 ++++++ valentine/metrics/metrics.py | 303 +++++++----------------- 6 files changed, 419 insertions(+), 248 deletions(-) create mode 100644 valentine/algorithms/matcher_results.py create mode 100644 valentine/metrics/base_metric.py create mode 100644 valentine/metrics/metric_helpers.py diff --git a/valentine/__init__.py b/valentine/__init__.py index efb266a..79c8cc0 100644 --- a/valentine/__init__.py +++ b/valentine/__init__.py @@ -1,11 +1,11 @@ -from typing import Iterable, List, Union - import pandas as pd -import valentine.metrics as valentine_metrics import valentine.algorithms import valentine.data_sources +from typing import Iterable, List, Union +from valentine.algorithms.matcher_results import MatcherResults + class NotAValentineMatcher(Exception): pass @@ -13,7 +13,7 @@ class NotAValentineMatcher(Exception): def validate_matcher(matcher): if not isinstance(matcher, valentine.algorithms.BaseMatcher): - raise NotAValentineMatcher('The method that you selected is not supported by Valentine') + raise NotAValentineMatcher('Please provide a valid matcher') def valentine_match(df1: pd.DataFrame, @@ -26,10 +26,9 @@ def valentine_match(df1: pd.DataFrame, table_1 = valentine.data_sources.DataframeTable(df1, name=df1_name) table_2 = valentine.data_sources.DataframeTable(df2, name=df2_name) - matches = dict(sorted(matcher.get_matches(table_1, table_2).items(), - key=lambda item: item[1], reverse=True)) + matches = matcher.get_matches(table_1, table_2) - return matches + return MatcherResults(matches) def valentine_match_batch(df_iter_1: Iterable[pd.DataFrame], @@ -50,6 +49,4 @@ def valentine_match_batch(df_iter_1: Iterable[pd.DataFrame], table_2 = valentine.data_sources.DataframeTable(df2, name=table_2_name) matches.update(matcher.get_matches(table_1, table_2)) - matches = dict(sorted(matches.items(), key=lambda item: item[1], reverse=True)) - - return matches + return MatcherResults(matches) diff --git a/valentine/algorithms/matcher_results.py b/valentine/algorithms/matcher_results.py new file mode 100644 index 0000000..37aa5d0 --- /dev/null +++ b/valentine/algorithms/matcher_results.py @@ -0,0 +1,159 @@ +from __future__ import annotations +import math +from ..metrics import METRICS_CORE +from ..metrics.base_metric import Metric + +from typing import Dict, Tuple, List, Any, Set, Self + + +class MatcherResults(dict): + """This is a dictionary with additional valentine-specific functionality. + This class is the result of a matcher's `get_matches` method. 
+ + Certain transformations such as "one_to_one" get cached, since they do not + differ from call to call and are required by many metrics. + + The assumption is that the results are sorted from high similarity to low + similarity. This is also enforced upon creation through sorting, as + dictionaries preserve their insertion order as of Python 3.6. + + Aside from transformations, one can also obtain metric scores based on the + results, which can be imported from the metrics module. The metrics come in + handy predefined sets as well, e.g. METRICS_CORE, which is the default. + """ + + def __init__(self: Self, res: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], *args, **kwargs): + self._cached_one_to_one = None + sorted_res = {k:res[k] for k in sorted(res, key=res.get, reverse=True)} + dict.__init__(self, sorted_res, *args, **kwargs) + + def one_to_one(self: Self) -> MatcherResults: + """A filter that takes a dict of column matches and returns a dict of 1 + to 1 matches. The filter works in the following way: At first it + gets the median similarity of the set of the values and removes all + matches that have a similarity lower than that. Then from what + remained it matches columns for me highest similarity to the lowest + till the columns have at most one match. + + Once calculated, the one-to-one matches are cached, to avoid redundant + calculations for metrics. + + Returns + ------- + MatcherResults + MatcherResults with one-to-one matches. + """ + if self._cached_one_to_one is not None: + return MatcherResults(self._cached_one_to_one.copy()) + + matches_dict = self.get_copy() + + set_match_values = set(matches_dict.values()) + + if len(set_match_values) < 2: + self._cached_one_to_one = matches_dict + return MatcherResults(matches_dict) + + matched = dict() + + for key in matches_dict.keys(): + matched[key[0]] = False + matched[key[1]] = False + + median = sorted(set_match_values, reverse=True)[ + math.ceil(len(set_match_values)/2)] + + matches1to1_dict = dict() + + for key in matches_dict.keys(): + if (not matched[key[0]]) and (not matched[key[1]]): + similarity = matches_dict.get(key) + if similarity is not None and similarity >= median: + matches1to1_dict[key] = similarity + matched[key[0]] = True + matched[key[1]] = True + else: + break + + self._cached_one_to_one = matches1to1_dict + return MatcherResults(matches1to1_dict) + + def take_top_percent(self: Self, percent: int) -> MatcherResults: + """Summary + Takes the top 'percent' of matches and returns a new MatcherResults + containing only these matches. + + Parameters + ---------- + percent : int + Percentage of matches to keep. + + Returns + ------- + MatcherResults + Matcher results containing only the + top 'percent' of matches. + """ + matches = self.get_copy() + number_to_keep = int( + math.ceil((percent / 100) * len(matches.keys()))) + matches = dict(sorted(matches.items(), + key=lambda x: x[1], + reverse=True)[:number_to_keep]) + return MatcherResults(matches) + + def take_top_n(self: Self, n: int) -> MatcherResults: + """Summary + Takes the top 'n' matches and returns a new MatcherResults + containing only these matches. + + Parameters + ---------- + n : int + Number of matches to keep. + + Returns + ------- + MatcherResults + Matcher results containing only the + top 'n' matches. 
+ """ + matches = self.get_copy() + matches = dict(sorted(matches.items(), + key=lambda x: x[1], reverse=True)[:n]) + return MatcherResults(matches) + + def get_metrics(self: Self, ground_truth: List[Tuple[str, str]], metrics: Set[Metric] = METRICS_CORE) -> Dict[str, Any]: + """Summary + Given ground truth column matches and a set of metric instances, this + method will calculate scores for these metrics. Metrics can be imported + from the 'metrics' module, which also contains predefined sets of + metrics. + + Parameters + ---------- + ground_truth : List[Tuple[str, str]] + The ground truth column matches as a list of column name tuples. + metrics : Set[Metric], optional + The set of metric instances. + + Returns + ------- + Dict[str, Any] + A dictionary with metric scores. + """ + res = {} + for metric in metrics: + res.update(metric.apply(self, ground_truth)) + return res + + def get_copy(self: Self) -> MatcherResults: + """Summary + Returns a copy of this instance. + + Returns + ------- + MatcherResults + A copy of this MatcherResults instance. + """ + return MatcherResults(self.copy()) diff --git a/valentine/metrics/__init__.py b/valentine/metrics/__init__.py index 6b3a088..b05135d 100644 --- a/valentine/metrics/__init__.py +++ b/valentine/metrics/__init__.py @@ -1,23 +1,8 @@ -from valentine.metrics import metrics as metrics_module -from typing import List, Dict, Tuple - -metrics = {"names": ["precision", "recall", "f1_score", "precision_at_n_percent", "recall_at_sizeof_ground_truth"], - "args": { - "n": [10, 30, 50, 70, 90] - }} - - -def all_metrics(matches: List[Dict[Tuple[Tuple[str, str], Tuple[str, str]], float]], - golden_standard): - # load and print the specified metrics - metric_fns = [getattr(metrics_module, met) for met in metrics['names']] - - final_metrics = dict() - - for metric in metric_fns: - if metric.__name__ != "precision_at_n_percent": - final_metrics[metric.__name__] = metric(matches, golden_standard) - else: - for n in metrics['args']['n']: - final_metrics[metric.__name__.replace('_n_', '_' + str(n) + '_')] = metric(matches, golden_standard, n) - return final_metrics +from valentine.metrics.base_metric import Metric +from .metrics import * + +# Some predefined sets of metrics +METRICS_ALL = {metric() for metric in Metric.__subclasses__()} # Note: will also catch newly defined metrics +METRICS_CORE = {Precision(), Recall(), F1Score(), PrecisionTopNPercent(), RecallAtSizeofGroundTruth()} +METRICS_PRECISION_RECALL = {Precision(), Recall()} +METRICS_PRECISION_INCREASING_N = {PrecisionTopNPercent(n=x + 10) for x in range(0, 100, 10)} diff --git a/valentine/metrics/base_metric.py b/valentine/metrics/base_metric.py new file mode 100644 index 0000000..5f1e3fe --- /dev/null +++ b/valentine/metrics/base_metric.py @@ -0,0 +1,78 @@ +"""Provides the base metric class, that can be inherited from to implement +metrics. +""" +from __future__ import annotations +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from ..algorithms.matcher_results import MatcherResults +from abc import ABC, abstractmethod +from typing import Dict, Tuple, List, Any, Self, final + + +class Metric(ABC): + """Base class for a metric. Metrics can be prepared with parameters by + instantiating them, their application is deferred to a later moment this + way, which can be implemented by overriding the `apply` method. + + Metrics are tested for equivalence and hash based on their name. 
Hence, one + can override the `name` method to change under which key the metric appears + in the aggregated metrics obtained from a `MatcherResults`. + + All initialization arguments are expected to have default values, and thus + be keyword arguments. + """ + + @abstractmethod + def apply(self: Self, matches: MatcherResults, ground_truth: List[Tuple[str, str]]) -> Dict[str, Any]: + """Applies the metric to a `MatcherResults` instance, given ground + truth. + + Parameters + ---------- + matches : MatcherResults + The `MatcherResults` instance, obtained from `valentine_match`. + + ground_truth : List[Tuple[str, str]] + The ground truth column match pairs, by column name. + e.g. [("col1_tab_A", "col1_tab_B"), ...etc...] + + Raises + ------ + NotImplementedError + Override this method in concrete implementations. + """ + raise NotImplementedError + + def name(self: Self) -> str: + """The name of the metric, as it appears in the metric results. + + Returns + ------- + str + The name of the metric. + """ + return self.__class__.__name__ + + @final + def return_format(self: Self, value: Any) -> Dict[str, Any]: + """The return format of the `apply` method. + + Parameters + ---------- + value : Any + The metric value or score. + + Returns + ------- + Dict[str, Any] + The formatted metric value or score. + """ + return {self.name(): value} + + def __hash__(self: Self) -> int: + return str.__hash__(self.name()) + + def __eq__(self: Self, other: object) -> bool: + if isinstance(other, Metric): + return self.name() == other.name() + return False diff --git a/valentine/metrics/metric_helpers.py b/valentine/metrics/metric_helpers.py new file mode 100644 index 0000000..54e7b59 --- /dev/null +++ b/valentine/metrics/metric_helpers.py @@ -0,0 +1,79 @@ +from __future__ import annotations +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from ..algorithms.matcher_results import MatcherResults +from typing import Dict, Tuple, List + + +def get_tp_fn(matches: MatcherResults, + ground_truth: List[Tuple[str, str]], + n: int | None = None): + """Counts the amount of true positives and the amount of false + negatives among the matches in the given MatcherResults. + + Parameters + ---------- + matches : MatcherResults + A MatcherResults object that is obtained from a matcher. + ground_truth : list + A list with tuples that correspond to the ground truth matches. + e.g. [("col1_tab_A", "col1_tab_B"), ...etc...] + n : int, optional + The percentage of matches to consider. + e.g. (90) for 90% of the matches + + Returns + ------- + (int, int) + Amount of true positives and amount of false negatives. + """ + tp = 0 + fn = 0 + + matches_dict = matches.get_copy() + all_matches = [(m[0][1], m[1][1]) for m in matches_dict.keys()] + + if n is not None: + all_matches = all_matches[:n] + + for expected_match in ground_truth: + if expected_match in all_matches: + tp = tp + 1 + else: + fn = fn + 1 + return tp, fn + + +def get_fp(matches: MatcherResults, + ground_truth: List[Tuple[str, str]], + n: int | None = None): + """Counts the amount of false positives among the matches in the + given MatcherResults. + + Parameters + ---------- + matches : MatcherResults + A MatcherResults object that is obtained from a matcher. + ground_truth : list + A list with tuples that correspond to the ground truth matches. + e.g. [("col1_tab_A", "col1_tab_B"), ...etc...] + n : int, optional + The percentage of matches to consider. + e.g. (90) for 90% of the matches + + Returns + ------- + int + Amount of false positives. 
+ """ + fp = 0 + matches_dict = matches.get_copy() + all_matches = [(m[0][1], m[1][1]) for m in matches_dict.keys()] + + if n is not None: + all_matches = all_matches[:n] + + for possible_match in all_matches: + if possible_match not in ground_truth: + fp = fp + 1 + return fp diff --git a/valentine/metrics/metrics.py b/valentine/metrics/metrics.py index 5af75bb..72c9e09 100644 --- a/valentine/metrics/metrics.py +++ b/valentine/metrics/metrics.py @@ -1,256 +1,129 @@ -import math -from typing import Dict, Tuple, List +"""Here one can find some common metric implementations. Custom metrics can be +made by subclassing the `Metric` ABC. +""" +from .base_metric import Metric +from .metric_helpers import * -def one_to_one_matches(matches: dict): - """ - A filter that takes a dict of column matches and returns a dict of 1 to 1 matches. The filter works in the following - way: At first it gets the median similarity of the set of the values and removes all matches - that have a similarity lower than that. Then from what remained it matches columns for me highest similarity - to the lowest till the columns have at most one match. - Parameters +class Precision(Metric): + """Metric for calculating precision. + + Attributes ---------- - matches : dict - The ranked list of matches - Returns - ------- - dict - The ranked list of matches after the 1 to 1 filter + one_to_one : bool + Whether to apply the one-to-one filter to the MatcherResults first. """ - set_match_values = set(matches.values()) - - if len(set_match_values) < 2: - return matches - matched = dict() + def __init__(self, one_to_one: bool = True): + self.one_to_one = one_to_one - for key in matches.keys(): - matched[key[0]] = False - matched[key[1]] = False + def apply(self, matches, ground_truth): + if self.one_to_one: + matches = matches.one_to_one() - median = sorted(set_match_values, reverse=True)[math.ceil(len(set_match_values)/2)] + tp, _ = get_tp_fn(matches, ground_truth) + fp = get_fp(matches, ground_truth) + precision = 0 + if tp + fp > 0: + precision = tp / (tp + fp) - matches1to1 = dict() + return self.return_format(precision) - for key in matches.keys(): - if (not matched[key[0]]) and (not matched[key[1]]): - similarity = matches.get(key) - if similarity >= median: - matches1to1[key] = similarity - matched[key[0]] = True - matched[key[1]] = True - else: - break - return matches1to1 +class Recall(Metric): + """Metric for calculating recall. -def get_tp_fn(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - n: int = None): - """ - Calculate the true positive and false negative numbers of the given matches - - Parameters + Attributes ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - n : int, optional - The percentage number that we want to consider from the ranked list (matches) - e.g. (90) for 90% of the matches - - Returns - ------- - (int, int) - True positive and false negative counts + one_to_one : bool + Whether to apply the one-to-one filter to the MatcherResults first. 
""" - tp = 0 - fn = 0 - all_matches = [(m[0][1], m[1][1]) for m in matches.keys()] + def __init__(self, one_to_one: bool = True): + self.one_to_one = one_to_one - if n is not None: - all_matches = all_matches[:n] + def apply(self, matches, ground_truth): + if self.one_to_one: + matches = matches.one_to_one() - for expected_match in golden_standard: - if expected_match in all_matches: - tp = tp + 1 - else: - fn = fn + 1 - return tp, fn + tp, fn = get_tp_fn(matches, ground_truth) + recall = 0 + if tp + fn > 0: + recall = tp / (tp + fn) + return self.return_format(recall) -def get_fp(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - n: int = None): - """ - Calculate the false positive number of the given matches - Parameters +class F1Score(Metric): + """Metric for calculating f1 score. + + Attributes ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - n : int, optional - The percentage number that we want to consider from the ranked list (matches) - e.g. (90) for 90% of the matches - - Returns - ------- - int - False positive + one_to_one : bool + Whether to apply the one-to-one filter to the MatcherResults first. """ - fp = 0 - all_matches = [(m[0][1], m[1][1]) for m in matches.keys()] + def __init__(self, one_to_one: bool = True): + self.one_to_one = one_to_one - if n is not None: - all_matches = all_matches[:n] + def apply(self, matches, ground_truth): + if self.one_to_one: + matches = matches.one_to_one() - for possible_match in all_matches: - if possible_match not in golden_standard: - fp = fp + 1 - return fp + tp, fn = get_tp_fn(matches, ground_truth) + fp = get_fp(matches, ground_truth) + f1 = 0 + if tp > 0: + pr = tp / (tp + fp) + re = tp / (tp + fn) + f1 = 2 * ((pr * re) / (pr + re)) + return self.return_format(f1) -def recall(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - one_to_one=True): - """ - Function that calculates the recall of the matches against the golden standard. If one_to_one is set to true, it - also performs an 1-1 match filer. Meaning that each column will match only with another one. - Parameters +class PrecisionTopNPercent(Metric): + """Metric for calculating precision of the top N percent of matches. + + Attributes ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - one_to_one : bool, optional - If to perform the 1-1 match filter - - Returns - ------- - float - The recall + n : int + The percent of matches to consider. + one_to_one : bool + Whether to apply the one-to-one filter to the MatcherResults first. """ - if one_to_one: - matches = one_to_one_matches(matches) - tp, fn = get_tp_fn(matches, golden_standard) - if tp + fn == 0: - return 0 - return tp / (tp + fn) + def __init__(self, one_to_one: bool = True, n: int = 10): + self.one_to_one = one_to_one + self.n = n -def precision(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - one_to_one=True): - """ - Function that calculates the precision of the matches against the golden standard. If one_to_one is set to true, it - also performs an 1-1 match filer. Meaning that each column will match only with another one. 
+ def name(self): + return super().name().replace('N', str(self.n)) - Parameters - ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - one_to_one : bool, optional - If to perform the 1-1 match filter - - Returns - ------- - float - The precision - """ - if one_to_one: - matches = one_to_one_matches(matches) - tp, _ = get_tp_fn(matches, golden_standard) - fp = get_fp(matches, golden_standard) - if tp + fp == 0: - return 0 - return tp / (tp + fp) - - -def f1_score(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - one_to_one=True): - """ - Function that calculates the F1 score of the matches against the golden standard. If one_to_one is set to true, it - also performs an 1-1 match filer. Meaning that each column will match only with another one. + def apply(self, matches, ground_truth): + if self.one_to_one: + matches = matches.one_to_one() - Parameters - ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - one_to_one : bool, optional - If to perform the 1-1 match filter - - Returns - ------- - float - The f1_score - """ - pr = precision(matches, golden_standard, one_to_one) - re = recall(matches, golden_standard, one_to_one) - if pr + re == 0: - return 0 - return 2 * ((pr * re) / (pr + re)) + n_matches = matches.take_top_percent(self.n) + tp, _ = get_tp_fn(n_matches, ground_truth) + fp = get_fp(n_matches, ground_truth) + precision_top_n_percent = 0 + if tp + fp > 0: + precision_top_n_percent = tp / (tp + fp) -def precision_at_n_percent(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]], - n: int): - """ - Function that calculates the precision at n % - e.g. if n is 10 then only the first 10% of the matches will be considered for the precision calculation + return self.return_format(precision_top_n_percent) - Parameters - ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - n : int - The integer percentage number - Returns - ------- - float - The precision at n % +class RecallAtSizeofGroundTruth(Metric): + """Metric for calculating recall at the size of the ground truth. """ - number_to_keep = int(math.ceil((n / 100) * len(matches.keys()))) - tp, _ = get_tp_fn(matches, golden_standard, number_to_keep) - fp = get_fp(matches, golden_standard, number_to_keep) - if tp + fp == 0: - return 0 - return tp / (tp + fp) + def apply(self, matches, ground_truth): + n_matches = matches.take_top_n(len(ground_truth)) -def recall_at_sizeof_ground_truth(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]],): - """ - Function that calculates the recall at the size of the ground truth. - e.g. 
if the size of ground truth size is 10 then only the first 10 matches will be considered for - the recall calculation + tp, fn = get_tp_fn(n_matches, ground_truth) + recall = 0 + if tp + fn > 0: + recall = tp / (tp + fn) - Parameters - ---------- - matches : dict - Ranked list of matches from the match with higher similarity to lower - golden_standard : list - A list that contains the golden standard - - Returns - ------- - float - The recall at the size of ground truth - """ - tp, fn = get_tp_fn(matches, golden_standard, len(golden_standard)) - if tp + fn == 0: - return 0 - return tp / (tp + fn) + return self.return_format(recall) From 0747d1ecb9a7f96cba78af6d7788af65ba4d8ff7 Mon Sep 17 00:00:00 2001 From: Shaad Alaka Date: Wed, 24 Jan 2024 16:11:56 +0100 Subject: [PATCH 03/11] Update tests and add new --- tests/test_matcher_results.py | 86 +++++++++++++++++++++++++++++ tests/test_metrics.py | 100 +++++++++++++++++++--------------- tests/test_valentine.py | 22 +++----- 3 files changed, 152 insertions(+), 56 deletions(-) create mode 100644 tests/test_matcher_results.py diff --git a/tests/test_matcher_results.py b/tests/test_matcher_results.py new file mode 100644 index 0000000..99a2860 --- /dev/null +++ b/tests/test_matcher_results.py @@ -0,0 +1,86 @@ +import unittest +import math + +from tests import df1, df2 +from valentine.algorithms.matcher_results import MatcherResults +from valentine.algorithms import JaccardDistanceMatcher +from valentine.metrics import Precision +from valentine import valentine_match + + +class TestMatcherResults(unittest.TestCase): + def setUp(self): + self.matches = valentine_match(df1, df2, JaccardDistanceMatcher()) + self.ground_truth = [ + ('Cited by', 'Cited by'), + ('Authors', 'Authors'), + ('EID', 'EID') + ] + + def test_dict(self): + assert isinstance(self.matches, dict) + + def test_get_metrics(self): + metrics = self.matches.get_metrics(self.ground_truth) + assert all([x in metrics for x in {"Precision", "Recall", "F1Score"}]) + + metrics_specific = self.matches.get_metrics(self.ground_truth, metrics={Precision()}) + assert "Precision" in metrics_specific + + def test_one_to_one(self): + m = self.matches + + # Add multiple matches per column + pairs = list(m.keys()) + for (ta, ca), (tb, cb) in pairs: + m[((ta, ca), (tb, cb + 'foo'))] = m[((ta, ca), (tb, cb))] / 2 + + # Verify that len gets corrected from 6 to 3 + m_one_to_one = m.one_to_one() + assert len(m_one_to_one) == 3 and len(m) == 6 + + # Verify that none of the lower similarity "foo" entries made it + for (ta, ca), (tb, cb) in pairs: + assert ((ta, ca), (tb, cb + 'foo')) not in m_one_to_one + + # Verify that the cache resets on a new MatcherResults instance + m_entry = MatcherResults(m) + assert m_entry._cached_one_to_one is None + + # Add one new entry with lower similarity + m_entry[(('table_1', 'BLA'), ('table_2', 'BLA'))] = 0.7214057 + + # Verify that the new one_to_one is different from the old one + m_entry_one_to_one = m_entry.one_to_one() + assert m_one_to_one != m_entry_one_to_one + + # Verify that all remaining values are above the median + median = sorted(list(m_entry.values()), reverse=True)[math.ceil(len(m_entry)/2)] + for k in m_entry_one_to_one: + assert m_entry_one_to_one[k] >= median + + def test_take_top_percent(self): + take_0_percent = self.matches.take_top_percent(0) + assert len(take_0_percent) == 0 + + take_40_percent = self.matches.take_top_percent(40) + assert len(take_40_percent) == 2 + + take_100_percent = self.matches.take_top_percent(100) + assert 
len(take_100_percent) == len(self.matches) + + def test_take_top_n(self): + take_none = self.matches.take_top_n(0) + assert len(take_none) == 0 + + take_some = self.matches.take_top_n(2) + assert len(take_some) == 2 + + take_all = self.matches.take_top_n(len(self.matches)) + assert len(take_all) == len(self.matches) + + take_more_than_all = self.matches.take_top_n(len(self.matches)+1) + assert len(take_more_than_all) == len(self.matches) + + def test_copy(self): + assert self.matches.get_copy() is not self.matches \ No newline at end of file diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 4fd55c6..feb0da6 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,47 +1,61 @@ import unittest - -import math -from valentine.metrics.metrics import one_to_one_matches -from copy import deepcopy - -matches = { - (('table_1', 'Cited by'), ('table_2', 'Cited by')): 0.8374313, - (('table_1', 'Authors'), ('table_2', 'Authors')): 0.83498037, - (('table_1', 'EID'), ('table_2', 'EID')): 0.8214057, -} - -ground_truth = [ - ('Cited by', 'Cited by'), - ('Authors', 'Authors'), - ('EID', 'EID') -] +from valentine.metrics import * +from valentine.algorithms.matcher_results import MatcherResults class TestMetrics(unittest.TestCase): - - def test_one_to_one(self): - m = deepcopy(matches) - - # Add multiple matches per column - pairs = list(m.keys()) - for (ta, ca), (tb, cb) in pairs: - m[((ta, ca), (tb, cb + 'foo'))] = m[((ta, ca), (tb, cb))] / 2 - - # Verify that len gets corrected to 3 - m_one_to_one = one_to_one_matches(m) - assert len(m_one_to_one) == 3 and len(m) == 6 - - # Verify that none of the lower similarity "foo" entries made it - for (ta, ca), (tb, cb) in pairs: - assert ((ta, ca), (tb, cb + 'foo')) not in m_one_to_one - - # Add one new entry with lower similarity - m_entry = deepcopy(matches) - m_entry[(('table_1', 'BLA'), ('table_2', 'BLA'))] = 0.7214057 - - m_entry_one_to_one = one_to_one_matches(m_entry) - - # Verify that all remaining values are above the median - median = sorted(set(m_entry.values()), reverse=True)[math.ceil(len(m_entry)/2)] - for k in m_entry_one_to_one: - assert m_entry_one_to_one[k] >= median + def setUp(self): + self.matches = MatcherResults({ + (('table_1', 'Cited by'), ('table_2', 'Cited by')): 0.8374313, + (('table_1', 'Authors'), ('table_2', 'Authors')): 0.83498037, + (('table_1', 'EID'), ('table_2', 'EID')): 0.8214057, + (('table_1', 'Title'), ('table_2', 'DUMMY1')): 0.8214057, + (('table_1', 'Title'), ('table_2', 'DUMMY2')): 0.8114057, + }) + self.ground_truth = [ + ('Cited by', 'Cited by'), + ('Authors', 'Authors'), + ('EID', 'EID'), + ('Title', 'Title'), + ('DUMMY3', 'DUMMY3') + + ] + + def test_precision(self): + precision = self.matches.get_metrics(self.ground_truth, metrics={Precision()}) + assert 'Precision' in precision and precision['Precision'] == 0.75 + + precision_not_one_to_one = self.matches.get_metrics(self.ground_truth, metrics={Precision(one_to_one=False)}) + assert 'Precision' in precision_not_one_to_one and precision_not_one_to_one['Precision'] == 0.6 + + def test_recall(self): + recall = self.matches.get_metrics(self.ground_truth, metrics={Recall()}) + assert 'Recall' in recall and recall['Recall'] == 0.6 + + recall_not_one_to_one = self.matches.get_metrics(self.ground_truth, metrics={Recall(one_to_one=False)}) + assert 'Recall' in recall_not_one_to_one and recall_not_one_to_one['Recall'] == 0.6 + + def test_f1(self): + f1 = self.matches.get_metrics(self.ground_truth, metrics={F1Score()}) + assert 'F1Score' in f1 and 
round(100*f1['F1Score']) == 67 + + f1_not_one_to_one = self.matches.get_metrics(self.ground_truth, metrics={F1Score(one_to_one=False)}) + assert 'F1Score' in f1_not_one_to_one and f1_not_one_to_one['F1Score'] == 0.6 + + def test_precision_top_n_percent(self): + precision_0 = self.matches.get_metrics(self.ground_truth, metrics={PrecisionTopNPercent(n=0)}) + assert 'PrecisionTop0Percent' in precision_0 and precision_0['PrecisionTop0Percent'] == 0 + + precision_50 = self.matches.get_metrics(self.ground_truth, metrics={PrecisionTopNPercent(n=50)}) + assert 'PrecisionTop50Percent' in precision_50 and precision_50['PrecisionTop50Percent'] == 1.0 + + precision = self.matches.get_metrics(self.ground_truth, metrics={Precision()}) + precision_100 = self.matches.get_metrics(self.ground_truth, metrics={PrecisionTopNPercent(n=100)}) + assert 'PrecisionTop100Percent' in precision_100 and precision_100['PrecisionTop100Percent'] == precision['Precision'] + + precision_70_not_one_to_one = self.matches.get_metrics(self.ground_truth, metrics={PrecisionTopNPercent(n=70, one_to_one=False)}) + assert 'PrecisionTop70Percent' in precision_70_not_one_to_one and precision_70_not_one_to_one['PrecisionTop70Percent'] == 0.75 + + def test_recall_at_size_of_ground_truth(self): + recall = self.matches.get_metrics(self.ground_truth, metrics={RecallAtSizeofGroundTruth()}) + assert 'RecallAtSizeofGroundTruth' in recall and recall['RecallAtSizeofGroundTruth'] == 0.6 \ No newline at end of file diff --git a/tests/test_valentine.py b/tests/test_valentine.py index 3614ee6..f4a7e07 100644 --- a/tests/test_valentine.py +++ b/tests/test_valentine.py @@ -2,9 +2,9 @@ from valentine.data_sources import DataframeTable -from valentine import valentine_match, valentine_match_batch, valentine_metrics, NotAValentineMatcher +from valentine import valentine_match, valentine_match_batch, NotAValentineMatcher from tests import df1, df2 -from valentine.algorithms import Coma, DistributionBased +from valentine.algorithms import JaccardDistanceMatcher class TestValentine(unittest.TestCase): @@ -12,7 +12,7 @@ class TestValentine(unittest.TestCase): def test_match(self): assert not DataframeTable(df1, name='df1_name').is_empty assert not DataframeTable(df2, name='df2_name').is_empty - matches = valentine_match(df1, df2, Coma(use_instances=True)) + matches = valentine_match(df1, df2, JaccardDistanceMatcher()) assert len(matches) > 0 try: valentine_match(df1, df2, None) @@ -21,14 +21,6 @@ def test_match(self): else: assert False - def test_metrics(self): - matches = valentine_match(df1, df2, Coma(use_instances=True)) - golden_standard = [('Cited by', 'Cited by'), - ('Authors', 'Authors'), - ('EID', 'EID')] - metrics = valentine_metrics.all_metrics(matches, golden_standard) - assert metrics['recall_at_sizeof_ground_truth'] == 1.0 - def test_batch_generator(self): n = 3 @@ -40,9 +32,13 @@ def generate_df2(): for _ in range(n): yield df2 - matches = valentine_match_batch(generate_df1(), generate_df2(), DistributionBased()) + matches = valentine_match_batch(generate_df1(), generate_df2(), JaccardDistanceMatcher()) assert len(matches) > 0 def test_batch_list(self): - matches = valentine_match_batch([df1, df1, df1], [df2, df2, df2], DistributionBased()) + matches = valentine_match_batch([df1, df1, df1], [df2, df2, df2], JaccardDistanceMatcher()) assert len(matches) > 0 + + def test_batch_names(self): + matches = valentine_match_batch([df1, df1], [df2, df2], JaccardDistanceMatcher(), ['ta1', 'tb1'], ['ta2', 'tb2']) + assert len(matches) > 0 \ No newline at 
end of file From d3913d85f9e18dcd8328884c0f4cb1614d43826d Mon Sep 17 00:00:00 2001 From: Shaad Alaka Date: Wed, 24 Jan 2024 16:14:11 +0100 Subject: [PATCH 04/11] Refactor Match class to a dataclass --- valentine/algorithms/match.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/valentine/algorithms/match.py b/valentine/algorithms/match.py index 53733d8..f22b6a7 100644 --- a/valentine/algorithms/match.py +++ b/valentine/algorithms/match.py @@ -1,20 +1,24 @@ -class Match(object): +from dataclasses import dataclass +from typing import Dict, Tuple, Self + + +@dataclass +class Match: """ - Class representing a match of two columns target is the one we want to find the matches of, source an other - that exists in the database and the similarity between the two. + Class representing a match of two columns. target is the one we want to + find the matches of, source an other that exists in the database and the + similarity between the two. - NOTE: Use the to_dict method when you want to append a match to a list of matches + NOTE: Use the to_dict method when you want to append a match to a list of + matches """ - def __init__(self, target_table_name: str, target_column_name: str, - source_table_name: str, source_column_name: str, - similarity: float): - self.target_table_name = target_table_name - self.target_column_name = target_column_name - self.source_table_name = source_table_name - self.source_column_name = source_column_name - self.similarity = similarity + target_table_name: str + target_column_name: str + source_table_name: str + source_column_name: str + similarity: float @property - def to_dict(self) -> dict: + def to_dict(self: Self) -> Dict[Tuple[Tuple[str, str], Tuple[str, str]], float]: return {((self.source_table_name, self.source_column_name), (self.target_table_name, self.target_column_name)): self.similarity} From 89266946311cc71b0fc1e3108adefce33ba4bdd5 Mon Sep 17 00:00:00 2001 From: Shaad Alaka Date: Wed, 24 Jan 2024 16:14:33 +0100 Subject: [PATCH 05/11] Update example --- examples/valentine_example.py | 36 ++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/examples/valentine_example.py b/examples/valentine_example.py index df3fd84..cb090db 100644 --- a/examples/valentine_example.py +++ b/examples/valentine_example.py @@ -1,8 +1,10 @@ import os import pandas as pd -from valentine import valentine_match, valentine_metrics -from valentine.algorithms import Coma +from valentine.metrics import F1Score, PrecisionTopNPercent +from valentine import valentine_match +from valentine.algorithms import JaccardDistanceMatcher import pprint +pp = pprint.PrettyPrinter(indent=4, sort_dicts=False) def main(): @@ -13,28 +15,40 @@ def main(): df2 = pd.read_csv(d2_path) # Instantiate matcher and run - # Coma requires java to be installed on your machine - # If java is not an option, all the other algorithms are in Python (e.g., Cupid) - matcher = Coma(use_instances=False) + matcher = JaccardDistanceMatcher() matches = valentine_match(df1, df2, matcher) + # MatcherResults is a wrapper object that has several useful + # utility/transformation functions + print("Found the following matches:") + pp.pprint(matches) + + print("\nGetting the one-to-one matches:") + pp.pprint(matches.one_to_one()) + # If ground truth available valentine could calculate the metrics ground_truth = [('Cited by', 'Cited by'), ('Authors', 'Authors'), ('EID', 'EID')] - metrics = valentine_metrics.all_metrics(matches, ground_truth) 
- - pp = pprint.PrettyPrinter(indent=4) - print("Found the following matches:") - pp.pprint(matches) + metrics = matches.get_metrics(ground_truth) print("\nAccording to the ground truth:") pp.pprint(ground_truth) - print("\nThese are the scores of the matcher:") + print("\nThese are the scores of the default metrics for the matcher:") pp.pprint(metrics) + print("\nYou can also get specific metric scores:") + pp.pprint(matches.get_metrics(ground_truth, metrics={ + PrecisionTopNPercent(n=80), + F1Score() + })) + + print("\nThe MatcherResults object is a dict and can be treated such:") + for match in matches: + print(f"{str(match): <60} {matches[match]}") + if __name__ == '__main__': main() From 84a19b6b57aee3babe1e54748646598304a50085 Mon Sep 17 00:00:00 2001 From: Shaad Alaka Date: Wed, 24 Jan 2024 16:35:58 +0100 Subject: [PATCH 06/11] Update readme to reflect new API --- README.md | 64 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d90a1b1..4e369fa 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ After selecting one of the 5 matching methods, the user can initiate the pairwis matches = valentine_match(df1, df2, matcher, df1_name, df2_name) ``` -where df1 and df2 are the two pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input a name for each DataFrame (defaults are "table\_1" and "table\_2"). Function ```valentine_match``` returns a dictionary storing as keys column pairs from the two DataFrames and as values the corresponding similarity scores. +where df1 and df2 are the two pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input a name for each DataFrame (defaults are "table\_1" and "table\_2"). Function ```valentine_match``` returns a MatcherResults object, which is a dictionary with additional convenience methods, such as `one_to_one`, `take_top_percent`, `get_metrics` and more. It stores as keys column pairs from the two DataFrames and as values the corresponding similarity scores. ### Matching DataFrame Batch @@ -86,23 +86,48 @@ After selecting one of the 5 matching methods, the user can initiate the batch m matches = valentine_match_batch(df_iter_1, df_iter_2, matcher, df_iter_1_names, df_iter_2_names) ``` -where df_iter_1 and df_iter_2 are the two iterable structures containing pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input an iterable with names for each DataFrame. Function ```valentine_match_batch``` returns a dictionary storing as keys column pairs from the DataFrames and as values the corresponding similarity scores. +where df_iter_1 and df_iter_2 are the two iterable structures containing pandas DataFrames for which we want to find matches and matcher is one of Coma, Cupid, DistributionBased, JaccardLevenMatcher or SimilarityFlooding. The user can also input an iterable with names for each DataFrame. Function ```valentine_match_batch``` returns a MatcherResults object, which is a dictionary with additional convenience methods, such as `one_to_one`, `take_top_percent`, `get_metrics` and more. It stores as keys column pairs from the two DataFrames and as values the corresponding similarity scores. 
-### Measuring effectiveness -Based on the matches retrieved by calling `valentine_match` the user can use +### MatcherResults instance +The `MatcherResults` instance has some convenience methods that the user can use to either obtain a subset of the data or to transform the data. This instance is a dictionary and is sorted upon instantiation, from high similarity to low similarity. +```python +top_n_matches = matches.take_top_n(5) + +top_n_percent_matches = matches.take_top_percent(25) + +one_to_one_matches = matches.one_to_one() +``` + + +### Measuring effectiveness +The MatcherResults instance that is returned by `valentine_match` or `valentine_match_batch` also has a `get_metrics` method that the user can use ```python -metrics = valentine_metrics.all_metrics(matches, ground_truth) +metrics = matches.get_metrics(ground_truth) ``` -in order to get all effectiveness metrics, such as Precision, Recall, F1-score and others as described in the original Valentine paper. In order to do so, the user needs to also input the ground truth of matches based on which the metrics will be calculated. The ground truth can be given as a list of tuples representing column matches that should hold. +in order to get all effectiveness metrics, such as Precision, Recall, F1-score and others as described in the original Valentine paper. In order to do so, the user needs to also input the ground truth of matches based on which the metrics will be calculated. The ground truth can be given as a list of tuples representing column matches that should hold (see example below). + +By default, all the core metrics will be used for this with default parameters, but the user can also customize which metrics to run with what parameters, and implement own custom metrics by extending from the `Metric` base class. Some sets of metrics are available as well. 
+ +```python +from valentine.metrics import F1Score, PrecisionTopNPercent, METRICS_PRECISION_INCREASING_N +metrics_custom = matches.get_metrics(ground_truth, metrics={F1Score(one_to_one=False), PrecisionTopNPercent(n=70)}) +metrics_prefefined_set = matches.get_metrics(ground_truth, metrics=METRICS_PRECISION_INCREASING_N) + +``` ### Example -The following block of code shows: 1) how to run a matcher from Valentine on two DataFrames storing information about authors and their publications, and then 2) how to assess its effectiveness based on a given ground truth (as found in [`valentine_example.py`](https://github.com/delftdata/valentine/blob/master/examples/valentine_example.py)): +The following block of code shows: 1) how to run a matcher from Valentine on two DataFrames storing information about authors and their publications, and then 2) how to assess its effectiveness based on a given ground truth (a more extensive example is shown in [`valentine_example.py`](https://github.com/delftdata/valentine/blob/master/examples/valentine_example.py)): ```python +import os +import pandas as pd +from valentine import valentine_match +from valentine.algorithms import Coma + # Load data using pandas d1_path = os.path.join('data', 'authors1.csv') d2_path = os.path.join('data', 'authors2.csv') @@ -120,7 +145,7 @@ ground_truth = [('Cited by', 'Cited by'), ('Authors', 'Authors'), ('EID', 'EID')] -metrics = valentine_metrics.all_metrics(matches, ground_truth) +metrics = matches.get_metrics(ground_truth) print(metrics) ``` @@ -128,17 +153,18 @@ print(metrics) The output of the above code block is: ``` -{(('table_1', 'Cited by'), ('table_2', 'Cited by')): 0.8374313, -(('table_1', 'Authors'), ('table_2', 'Authors')): 0.83498037, -(('table_1', 'EID'), ('table_2', 'EID')): 0.8214057} -{'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0, -'precision_at_10_percent': 1.0, -'precision_at_30_percent': 1.0, -'precision_at_50_percent': 1.0, -'precision_at_70_percent': 1.0, -'precision_at_90_percent': 1.0, -'recall_at_sizeof_ground_truth': 1.0} - +{ + (('table_1', 'Cited by'), ('table_2', 'Cited by')): 0.86994505, + (('table_1', 'Authors'), ('table_2', 'Authors')): 0.8679843, + (('table_1', 'EID'), ('table_2', 'EID')): 0.8571245 +} +{ + 'Recall': 1.0, + 'F1Score': 1.0, + 'RecallAtSizeofGroundTruth': 1.0, + 'Precision': 1.0, + 'PrecisionTop10Percent': 1.0 +} ``` ## Cite Valentine From 63ae60c036e60fbaf59eaa8bceebf71409d3f544 Mon Sep 17 00:00:00 2001 From: Shaad Alaka Date: Thu, 25 Jan 2024 14:10:46 +0100 Subject: [PATCH 07/11] Remove 'Self' typehint as this is only supported since Python 3.10 --- valentine/algorithms/match.py | 5 +++-- valentine/algorithms/matcher_results.py | 14 +++++++------- valentine/metrics/base_metric.py | 12 ++++++------ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/valentine/algorithms/match.py b/valentine/algorithms/match.py index f22b6a7..edc251d 100644 --- a/valentine/algorithms/match.py +++ b/valentine/algorithms/match.py @@ -1,5 +1,6 @@ +from __future__ import annotations from dataclasses import dataclass -from typing import Dict, Tuple, Self +from typing import Dict, Tuple @dataclass @@ -19,6 +20,6 @@ class Match: similarity: float @property - def to_dict(self: Self) -> Dict[Tuple[Tuple[str, str], Tuple[str, str]], float]: + def to_dict(self: Match) -> Dict[Tuple[Tuple[str, str], Tuple[str, str]], float]: return {((self.source_table_name, self.source_column_name), (self.target_table_name, self.target_column_name)): self.similarity} diff --git 
a/valentine/algorithms/matcher_results.py b/valentine/algorithms/matcher_results.py index 37aa5d0..f37d58c 100644 --- a/valentine/algorithms/matcher_results.py +++ b/valentine/algorithms/matcher_results.py @@ -3,7 +3,7 @@ from ..metrics import METRICS_CORE from ..metrics.base_metric import Metric -from typing import Dict, Tuple, List, Any, Set, Self +from typing import Dict, Tuple, List, Any, Set class MatcherResults(dict): @@ -22,12 +22,12 @@ class MatcherResults(dict): handy predefined sets as well, e.g. METRICS_CORE, which is the default. """ - def __init__(self: Self, res: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], *args, **kwargs): + def __init__(self: MatcherResults, res: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], *args, **kwargs): self._cached_one_to_one = None sorted_res = {k:res[k] for k in sorted(res, key=res.get, reverse=True)} dict.__init__(self, sorted_res, *args, **kwargs) - def one_to_one(self: Self) -> MatcherResults: + def one_to_one(self: MatcherResults) -> MatcherResults: """A filter that takes a dict of column matches and returns a dict of 1 to 1 matches. The filter works in the following way: At first it gets the median similarity of the set of the values and removes all @@ -78,7 +78,7 @@ def one_to_one(self: Self) -> MatcherResults: self._cached_one_to_one = matches1to1_dict return MatcherResults(matches1to1_dict) - def take_top_percent(self: Self, percent: int) -> MatcherResults: + def take_top_percent(self: MatcherResults, percent: int) -> MatcherResults: """Summary Takes the top 'percent' of matches and returns a new MatcherResults containing only these matches. @@ -102,7 +102,7 @@ def take_top_percent(self: Self, percent: int) -> MatcherResults: reverse=True)[:number_to_keep]) return MatcherResults(matches) - def take_top_n(self: Self, n: int) -> MatcherResults: + def take_top_n(self: MatcherResults, n: int) -> MatcherResults: """Summary Takes the top 'n' matches and returns a new MatcherResults containing only these matches. @@ -123,7 +123,7 @@ def take_top_n(self: Self, n: int) -> MatcherResults: key=lambda x: x[1], reverse=True)[:n]) return MatcherResults(matches) - def get_metrics(self: Self, ground_truth: List[Tuple[str, str]], metrics: Set[Metric] = METRICS_CORE) -> Dict[str, Any]: + def get_metrics(self: MatcherResults, ground_truth: List[Tuple[str, str]], metrics: Set[Metric] = METRICS_CORE) -> Dict[str, Any]: """Summary Given ground truth column matches and a set of metric instances, this method will calculate scores for these metrics. Metrics can be imported @@ -147,7 +147,7 @@ def get_metrics(self: Self, ground_truth: List[Tuple[str, str]], metrics: Set[Me res.update(metric.apply(self, ground_truth)) return res - def get_copy(self: Self) -> MatcherResults: + def get_copy(self: MatcherResults) -> MatcherResults: """Summary Returns a copy of this instance. 
diff --git a/valentine/metrics/base_metric.py b/valentine/metrics/base_metric.py index 5f1e3fe..fdac1fe 100644 --- a/valentine/metrics/base_metric.py +++ b/valentine/metrics/base_metric.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from ..algorithms.matcher_results import MatcherResults from abc import ABC, abstractmethod -from typing import Dict, Tuple, List, Any, Self, final +from typing import Dict, Tuple, List, Any, final class Metric(ABC): @@ -23,7 +23,7 @@ class Metric(ABC): """ @abstractmethod - def apply(self: Self, matches: MatcherResults, ground_truth: List[Tuple[str, str]]) -> Dict[str, Any]: + def apply(self: Metric, matches: MatcherResults, ground_truth: List[Tuple[str, str]]) -> Dict[str, Any]: """Applies the metric to a `MatcherResults` instance, given ground truth. @@ -43,7 +43,7 @@ def apply(self: Self, matches: MatcherResults, ground_truth: List[Tuple[str, str """ raise NotImplementedError - def name(self: Self) -> str: + def name(self: Metric) -> str: """The name of the metric, as it appears in the metric results. Returns @@ -54,7 +54,7 @@ def name(self: Self) -> str: return self.__class__.__name__ @final - def return_format(self: Self, value: Any) -> Dict[str, Any]: + def return_format(self: Metric, value: Any) -> Dict[str, Any]: """The return format of the `apply` method. Parameters @@ -69,10 +69,10 @@ def return_format(self: Self, value: Any) -> Dict[str, Any]: """ return {self.name(): value} - def __hash__(self: Self) -> int: + def __hash__(self: Metric) -> int: return str.__hash__(self.name()) - def __eq__(self: Self, other: object) -> bool: + def __eq__(self: Metric, other: object) -> bool: if isinstance(other, Metric): return self.name() == other.name() return False From 982af93b8be77edca88481edb9a4c698fe74f447 Mon Sep 17 00:00:00 2001 From: Shaad Alaka Date: Thu, 25 Jan 2024 14:34:17 +0100 Subject: [PATCH 08/11] Utilize dataclass for the metrics --- tests/test_metrics.py | 5 ++++- valentine/metrics/base_metric.py | 21 +++++-------------- valentine/metrics/metric_helpers.py | 2 +- valentine/metrics/metrics.py | 31 ++++++++++++++--------------- 4 files changed, 25 insertions(+), 34 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index feb0da6..2ae14ea 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -58,4 +58,7 @@ def test_precision_top_n_percent(self): def test_recall_at_size_of_ground_truth(self): recall = self.matches.get_metrics(self.ground_truth, metrics={RecallAtSizeofGroundTruth()}) - assert 'RecallAtSizeofGroundTruth' in recall and recall['RecallAtSizeofGroundTruth'] == 0.6 \ No newline at end of file + assert 'RecallAtSizeofGroundTruth' in recall and recall['RecallAtSizeofGroundTruth'] == 0.6 + + def test_base_metric(self): + bla = Metric({}) \ No newline at end of file diff --git a/valentine/metrics/base_metric.py b/valentine/metrics/base_metric.py index fdac1fe..ba3ecc3 100644 --- a/valentine/metrics/base_metric.py +++ b/valentine/metrics/base_metric.py @@ -3,23 +3,20 @@ """ from __future__ import annotations from typing import TYPE_CHECKING + +from pandas.io.pytables import DataCol if TYPE_CHECKING: from ..algorithms.matcher_results import MatcherResults from abc import ABC, abstractmethod from typing import Dict, Tuple, List, Any, final +from dataclasses import dataclass +@dataclass(eq=True, frozen=True) class Metric(ABC): """Base class for a metric. 
Metrics can be prepared with parameters by instantiating them, their application is deferred to a later moment this way, which can be implemented by overriding the `apply` method. - - Metrics are tested for equivalence and hash based on their name. Hence, one - can override the `name` method to change under which key the metric appears - in the aggregated metrics obtained from a `MatcherResults`. - - All initialization arguments are expected to have default values, and thus - be keyword arguments. """ @abstractmethod @@ -41,7 +38,7 @@ def apply(self: Metric, matches: MatcherResults, ground_truth: List[Tuple[str, s NotImplementedError Override this method in concrete implementations. """ - raise NotImplementedError + pass def name(self: Metric) -> str: """The name of the metric, as it appears in the metric results. @@ -68,11 +65,3 @@ def return_format(self: Metric, value: Any) -> Dict[str, Any]: The formatted metric value or score. """ return {self.name(): value} - - def __hash__(self: Metric) -> int: - return str.__hash__(self.name()) - - def __eq__(self: Metric, other: object) -> bool: - if isinstance(other, Metric): - return self.name() == other.name() - return False diff --git a/valentine/metrics/metric_helpers.py b/valentine/metrics/metric_helpers.py index 54e7b59..cbf3b30 100644 --- a/valentine/metrics/metric_helpers.py +++ b/valentine/metrics/metric_helpers.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: from ..algorithms.matcher_results import MatcherResults -from typing import Dict, Tuple, List +from typing import Tuple, List def get_tp_fn(matches: MatcherResults, diff --git a/valentine/metrics/metrics.py b/valentine/metrics/metrics.py index 72c9e09..b184842 100644 --- a/valentine/metrics/metrics.py +++ b/valentine/metrics/metrics.py @@ -1,10 +1,13 @@ """Here one can find some common metric implementations. Custom metrics can be -made by subclassing the `Metric` ABC. +made by subclassing the `Metric` ABC. Marking them with the dataclass decorator +allows for proper hashing/equals without the boilerplate. """ from .base_metric import Metric from .metric_helpers import * +from dataclasses import dataclass +@dataclass(eq=True, frozen=True) class Precision(Metric): """Metric for calculating precision. @@ -13,9 +16,7 @@ class Precision(Metric): one_to_one : bool Whether to apply the one-to-one filter to the MatcherResults first. """ - - def __init__(self, one_to_one: bool = True): - self.one_to_one = one_to_one + one_to_one: bool = True def apply(self, matches, ground_truth): if self.one_to_one: @@ -30,6 +31,7 @@ def apply(self, matches, ground_truth): return self.return_format(precision) +@dataclass(eq=True, frozen=True) class Recall(Metric): """Metric for calculating recall. @@ -38,9 +40,7 @@ class Recall(Metric): one_to_one : bool Whether to apply the one-to-one filter to the MatcherResults first. """ - - def __init__(self, one_to_one: bool = True): - self.one_to_one = one_to_one + one_to_one: bool = True def apply(self, matches, ground_truth): if self.one_to_one: @@ -54,6 +54,7 @@ def apply(self, matches, ground_truth): return self.return_format(recall) +@dataclass(eq=True, frozen=True) class F1Score(Metric): """Metric for calculating f1 score. @@ -62,9 +63,7 @@ class F1Score(Metric): one_to_one : bool Whether to apply the one-to-one filter to the MatcherResults first. 
""" - - def __init__(self, one_to_one: bool = True): - self.one_to_one = one_to_one + one_to_one: bool = True def apply(self, matches, ground_truth): if self.one_to_one: @@ -81,20 +80,19 @@ def apply(self, matches, ground_truth): return self.return_format(f1) +@dataclass(eq=True, frozen=True) class PrecisionTopNPercent(Metric): """Metric for calculating precision of the top N percent of matches. Attributes ---------- - n : int - The percent of matches to consider. one_to_one : bool Whether to apply the one-to-one filter to the MatcherResults first. + n : int + The percent of matches to consider. """ - - def __init__(self, one_to_one: bool = True, n: int = 10): - self.one_to_one = one_to_one - self.n = n + one_to_one: bool = True + n: int = 10 def name(self): return super().name().replace('N', str(self.n)) @@ -114,6 +112,7 @@ def apply(self, matches, ground_truth): return self.return_format(precision_top_n_percent) +@dataclass(eq=True, frozen=True) class RecallAtSizeofGroundTruth(Metric): """Metric for calculating recall at the size of the ground truth. """ From 7fbe536abfb9811022acffbf60fa49aa522a54a0 Mon Sep 17 00:00:00 2001 From: Shaad Alaka Date: Thu, 25 Jan 2024 14:55:42 +0100 Subject: [PATCH 09/11] Also test metric helpers and fix equals test --- tests/test_metrics.py | 19 ++++++++++++++++--- valentine/metrics/metric_helpers.py | 8 +++++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 2ae14ea..d87dca6 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,7 +1,7 @@ import unittest from valentine.metrics import * from valentine.algorithms.matcher_results import MatcherResults - +from valentine.metrics.metric_helpers import get_fp, get_tp_fn class TestMetrics(unittest.TestCase): def setUp(self): @@ -60,5 +60,18 @@ def test_recall_at_size_of_ground_truth(self): recall = self.matches.get_metrics(self.ground_truth, metrics={RecallAtSizeofGroundTruth()}) assert 'RecallAtSizeofGroundTruth' in recall and recall['RecallAtSizeofGroundTruth'] == 0.6 - def test_base_metric(self): - bla = Metric({}) \ No newline at end of file + def test_metric_helpers(self): + limit = 2 + tp, fn = get_tp_fn(self.matches, self.ground_truth, n=limit) + assert tp <= len(self.ground_truth) and fn <= len(self.ground_truth) + + fp = get_fp(self.matches, self.ground_truth, n=limit) + assert fp <= limit + print(tp, fn, fp) + assert tp == 2 and fn == 3 # Since we limit to 2 of the matches + assert fp == 0 + + def test_metric_equals(self): + assert PrecisionTopNPercent(n=10, one_to_one=False) == PrecisionTopNPercent(n=10, one_to_one=False) + assert PrecisionTopNPercent(n=10, one_to_one=False) != PrecisionTopNPercent(n=10, one_to_one=True) + assert PrecisionTopNPercent(n=10, one_to_one=False) != Precision() diff --git a/valentine/metrics/metric_helpers.py b/valentine/metrics/metric_helpers.py index cbf3b30..036b91d 100644 --- a/valentine/metrics/metric_helpers.py +++ b/valentine/metrics/metric_helpers.py @@ -38,9 +38,10 @@ def get_tp_fn(matches: MatcherResults, for expected_match in ground_truth: if expected_match in all_matches: - tp = tp + 1 + tp += 1 else: - fn = fn + 1 + fn += 1 + return tp, fn @@ -75,5 +76,6 @@ def get_fp(matches: MatcherResults, for possible_match in all_matches: if possible_match not in ground_truth: - fp = fp + 1 + fp += 1 + return fp From fbbcc01c912372c9405691ff89a26bd9272b1523 Mon Sep 17 00:00:00 2001 From: Shaad Alaka Date: Thu, 25 Jan 2024 15:11:50 +0100 Subject: [PATCH 10/11] Remove print --- 
tests/test_metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index d87dca6..3099b2a 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -67,7 +67,6 @@ def test_metric_helpers(self): fp = get_fp(self.matches, self.ground_truth, n=limit) assert fp <= limit - print(tp, fn, fp) assert tp == 2 and fn == 3 # Since we limit to 2 of the matches assert fp == 0 From 5dc95dfc9c6e71fe321e1138afaacbe06185da30 Mon Sep 17 00:00:00 2001 From: Shaad Alaka Date: Thu, 25 Jan 2024 15:11:56 +0100 Subject: [PATCH 11/11] Remove unused import --- valentine/metrics/base_metric.py | 1 - 1 file changed, 1 deletion(-) diff --git a/valentine/metrics/base_metric.py b/valentine/metrics/base_metric.py index ba3ecc3..c3b33b3 100644 --- a/valentine/metrics/base_metric.py +++ b/valentine/metrics/base_metric.py @@ -4,7 +4,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from pandas.io.pytables import DataCol if TYPE_CHECKING: from ..algorithms.matcher_results import MatcherResults from abc import ABC, abstractmethod
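
The patches above replace the old function-based metrics with the `MatcherResults` wrapper and the dataclass-based `Metric` ABC. As a quick illustration of the resulting extension point, here is a minimal sketch of a user-defined metric built on the new API; the class name `FalsePositiveCount`, the toy DataFrames, and the ground truth below are illustrative assumptions, not code from the patch series.

```python
from dataclasses import dataclass

import pandas as pd

from valentine import valentine_match
from valentine.algorithms import JaccardDistanceMatcher
from valentine.metrics import Metric
from valentine.metrics.metric_helpers import get_fp


@dataclass(eq=True, frozen=True)
class FalsePositiveCount(Metric):
    """Counts matched column pairs that do not appear in the ground truth.

    Hypothetical example metric, not part of the library.
    """
    one_to_one: bool = True

    def apply(self, matches, ground_truth):
        if self.one_to_one:
            matches = matches.one_to_one()
        # get_fp counts matches missing from the ground truth list.
        return self.return_format(get_fp(matches, ground_truth))


# Toy data purely for demonstration purposes.
df1 = pd.DataFrame({'Authors': ['A. Smith', 'B. Jones'], 'EID': [1, 2]})
df2 = pd.DataFrame({'Authors': ['A. Smith', 'C. Brown'], 'EID': [1, 3]})

matches = valentine_match(df1, df2, JaccardDistanceMatcher())
ground_truth = [('Authors', 'Authors'), ('EID', 'EID')]

# The custom metric plugs into get_metrics just like the built-in ones,
# e.g. {'FalsePositiveCount': 0}.
print(matches.get_metrics(ground_truth, metrics={FalsePositiveCount()}))
```

Because `METRICS_ALL` is built from `Metric.__subclasses__()` (see the note in `valentine/metrics/__init__.py`), a subclass defined this way is also picked up automatically when that predefined set is used.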