diff --git a/examples/valentine_top_columns_example.py b/examples/valentine_top_columns_example.py
new file mode 100644
index 0000000..e7f608f
--- /dev/null
+++ b/examples/valentine_top_columns_example.py
@@ -0,0 +1,44 @@
+import os
+import pandas as pd
+from valentine import valentine_match
+from valentine.algorithms import SimilarityFlooding
+import pprint
+
+from valentine.metrics.metrics import get_top_n_columns
+
+
+def main():
+    # Load data using pandas
+    d1_path = os.path.join('data', 'authors1.csv')
+    d2_path = os.path.join('data', 'authors2.csv')
+    df1 = pd.read_csv(d1_path)
+    df2 = pd.read_csv(d2_path)
+
+    # Instantiate matcher and run
+    matcher = SimilarityFlooding()
+    matches = valentine_match(df1, df2, matcher)
+
+    # Find the top-n columns for every column of both dataframes
+    all_top_2_columns = get_top_n_columns(matches, 2)
+
+    # Find the top-n columns for the column 'Authors' in dataframe1
+    authors_top_2_columns = get_top_n_columns(matches, 2, [('table_1', 'Authors')])
+
+    authors_year_top_2_columns = get_top_n_columns(matches, 2, [('table_1', 'Authors'), ('table_1', 'Year')])
+
+    pp = pprint.PrettyPrinter(indent=4)
+    print("Found the following matches:")
+    pp.pprint(matches)
+
+    print("Top 2 columns for each column (with their corresponding score):")
+    pp.pprint(all_top_2_columns)
+
+    print("Top 2 columns for the 'Authors' column in table 1:")
+    pp.pprint(authors_top_2_columns)
+
+    print("Top 2 columns for multiple keys, namely the 'Authors' and 'Year' columns in table 1:")
+    pp.pprint(authors_year_top_2_columns)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/valentine/metrics/metrics.py b/valentine/metrics/metrics.py
index 007d414..1f19238 100644
--- a/valentine/metrics/metrics.py
+++ b/valentine/metrics/metrics.py
@@ -1,4 +1,5 @@
 import math
+from itertools import chain
 from typing import Dict, Tuple, List
 
 
@@ -28,7 +29,7 @@ def one_to_one_matches(matches: dict):
         matched[key[0]] = False
         matched[key[1]] = False
 
-    median = list(set_match_values)[math.ceil(len(set_match_values)/2)]
+    median = list(set_match_values)[math.ceil(len(set_match_values) / 2)]
 
     matches1to1 = dict()
 
@@ -232,7 +233,7 @@ def precision_at_n_percent(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]]
 
 
 def recall_at_sizeof_ground_truth(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float],
-                                  golden_standard: List[Tuple[str, str]],):
+                                  golden_standard: List[Tuple[str, str]], ):
     """
     Function that calculates the recall at the size of the ground truth.
     e.g. if the size of ground truth size is 10 then only the first 10 matches will be considered for
@@ -254,3 +255,60 @@ def recall_at_sizeof_ground_truth(matches: Dict[Tuple[Tuple[str
     if tp + fn == 0:
         return 0
     return tp / (tp + fn)
+
+
+def get_top_n_columns(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float],
+                      n: int,
+                      keys: List[Tuple[str, str]] = None):
+    """
+    Returns the top n most similar columns for each column (in both tables), together with their
+    corresponding similarity scores
+    Example output (n=2): {
+        ('table_1', 'Authors'): [{'Authors': 0.2816471572126128}, {'Access Type': 0.1515703989838858}],
+        ('table_2', 'Authors'): [{'Cited by': 0.2816471572126128}, {'Authors': 0.1515703989838858}]
+        ...
+    }
+
+    Parameters
+    ----------
+    matches : dict
+        Dictionary of matches as returned by valentine_match, ranked from the highest similarity to the lowest
+    n : int
+        The maximum number of columns to return per key
+    keys : List[Tuple[str, str]], optional
+        If specified, the top n columns are returned only for the given (table, column) keys
+        Example : [('table_1', 'Authors'), ('table_1', 'Access Type')]
+
+    Returns
+    -------
+    dict
+        A dictionary whose keys are the requested (table, column) pairs (all columns of both dataframes when
+        keys is None) and whose values are lists of dictionaries with the top n columns and their scores
+
+        output example:
+            key: ('table_1', 'Authors')
+            value: [{'Authors': 0.2816471572126128}, {'Access Type': 0.1515703989838858}]
+    """
+
+    # Determine the keys for which the top columns are returned: use the given ones or collect all unique keys
+    if keys is None:
+        keys = list(set(chain.from_iterable(matches.keys())))
+
+    # Create an empty dictionary where each requested column holds a list of its most similar columns
+    top_columns = {}
+    for key in keys:
+        top_columns[key] = list()
+
+    # Iterate over the matches from the highest to the lowest similarity and add the columns to the dictionary
+    for column_a, column_b in sorted(matches, key=matches.get, reverse=True):
+        score = matches[(column_a, column_b)]
+
+        # Add column_b as a candidate for column_a if column_a was requested and its list is not full yet
+        if (column_a in keys) and (len(top_columns[column_a]) < n):
+            top_columns[column_a].append({column_b[1]: score})
+
+        # Add column_a as a candidate for column_b if column_b was requested and its list is not full yet
+        if (column_b in keys) and (len(top_columns[column_b]) < n):
+            top_columns[column_b].append({column_a[1]: score})
+
+    return top_columns
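As a review aid, here is a minimal pytest-style sketch of an invariant check for the new helper. It is not part of the patch: the test name and the toy matches dictionary are made up for illustration, and the keys simply follow the ((table, column), (table, column)) -> score layout that valentine_match produces and that the docstring above assumes.

from valentine.metrics.metrics import get_top_n_columns


def test_get_top_n_columns_returns_at_most_n_entries_per_key():
    # Toy matches, keyed the same way valentine_match keys its results
    matches = {
        (('table_1', 'Authors'), ('table_2', 'Authors')): 0.9,
        (('table_1', 'Authors'), ('table_2', 'Cited by')): 0.4,
        (('table_1', 'Year'), ('table_2', 'Year')): 0.8,
    }

    top = get_top_n_columns(matches, 1, keys=[('table_1', 'Authors')])

    # Only the requested key is present, it holds at most n candidates,
    # and the best-scoring column comes first
    assert list(top.keys()) == [('table_1', 'Authors')]
    assert top[('table_1', 'Authors')] == [{'Authors': 0.9}]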