44 changes: 44 additions & 0 deletions examples/valentine_top_columns_example.py
@@ -0,0 +1,44 @@
import os
import pandas as pd
from valentine import valentine_match
from valentine.algorithms import SimilarityFlooding
import pprint

from valentine.metrics.metrics import get_top_n_columns


def main():
    # Load data using pandas
    d1_path = os.path.join('data', 'authors1.csv')
    d2_path = os.path.join('data', 'authors2.csv')
    df1 = pd.read_csv(d1_path)
    df2 = pd.read_csv(d2_path)

    # Instantiate the matcher and run it
    matcher = SimilarityFlooding()
    matches = valentine_match(df1, df2, matcher)

    # Find the top-2 columns for every column in both dataframes
    all_top_2_columns = get_top_n_columns(matches, 2)

    # Find the top-2 columns for the 'Authors' column of dataframe1 (table_1)
    authors_top_2_columns = get_top_n_columns(matches, 2, [('table_1', 'Authors')])

    # Find the top-2 columns for multiple columns of dataframe1 at once
    authors_year_top_2_columns = get_top_n_columns(matches, 2, [('table_1', 'Authors'), ('table_1', 'Year')])

    pp = pprint.PrettyPrinter(indent=4)
    print("Found the following matches:")
    pp.pprint(matches)

    print("Top 2 columns for each column (with their corresponding score):")
    pp.pprint(all_top_2_columns)

    print("Top 2 columns for the 'Authors' column in table 1:")
    pp.pprint(authors_top_2_columns)

    print("Top 2 columns for multiple keys, namely the 'Authors' and 'Year' columns in table 1:")
    pp.pprint(authors_year_top_2_columns)


if __name__ == '__main__':
    main()
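For anyone who wants the example's output in a flat, tabular form, here is a minimal sketch that converts the dictionary returned by get_top_n_columns into a pandas DataFrame. The dictionary below is hand-written to match the shape documented in the new function; the column names and scores are illustrative, not real matcher output.

```python
import pandas as pd

# Hand-written stand-in for the output of get_top_n_columns; the column
# names and scores below are illustrative only.
top_columns = {
    ('table_1', 'Authors'): [{'Authors': 0.28}, {'Access Type': 0.15}],
    ('table_1', 'Year'): [{'Year': 0.33}, {'Cited by': 0.12}],
}

# Flatten {key: [{matched_column: score}, ...]} into one row per candidate match
rows = [
    {'table': table, 'column': column, 'matched_column': matched, 'score': score}
    for (table, column), candidates in top_columns.items()
    for candidate in candidates
    for matched, score in candidate.items()
]

print(pd.DataFrame(rows))
```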
62 changes: 60 additions & 2 deletions valentine/metrics/metrics.py
@@ -1,4 +1,5 @@
import math
from itertools import chain
from typing import Dict, Tuple, List


@@ -28,7 +29,7 @@ def one_to_one_matches(matches: dict):
matched[key[0]] = False
matched[key[1]] = False

median = list(set_match_values)[math.ceil(len(set_match_values)/2)]
median = list(set_match_values)[math.ceil(len(set_match_values) / 2)]

matches1to1 = dict()

@@ -232,7 +233,7 @@ def precision_at_n_percent(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]]


def recall_at_sizeof_ground_truth(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float],
golden_standard: List[Tuple[str, str]],):
golden_standard: List[Tuple[str, str]], ):
"""
Function that calculates the recall at the size of the ground truth.
e.g. if the size of ground truth size is 10 then only the first 10 matches will be considered for
@@ -254,3 +255,60 @@ def recall_at_sizeof_ground_truth(matches: Dict[Tuple[Tuple[str, str], Tuple[str
    if tp + fn == 0:
        return 0
    return tp / (tp + fn)


def get_top_n_columns(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float],
                      n: int,
                      keys: List[Tuple[str, str]] = None):
    """
    Returns the top n most similar columns for each column (in both tables) together with their
    corresponding similarity scores.
    Example output (n=2): {
        ('table_1', 'Authors'): [{'Authors': 0.2816471572126128}, {'Access Type': 0.1515703989838858}],
        ('table_2', 'Authors'): [{'Cited by': 0.2816471572126128}, {'Authors': 0.1515703989838858}]
        ...
    }

    Parameters
    ----------
    matches : dict
        The matches produced by the matcher, ranked from higher to lower similarity
    n : int
        The maximum number of columns to return per key
    keys : List[Tuple[str, str]], optional
        If specified, the top n columns are only returned for the given keys
        Example : [('table_1', 'Authors'), ('table_1', 'Access Type')]

    Returns
    -------
    dict
        A dictionary whose keys are (table, column) pairs and whose values are lists of
        single-entry dictionaries mapping a matched column name to its similarity score

    output example:
        key: ('table_1', 'Authors')
        value: [{'Authors': 0.2816471572126128}, {'Access Type': 0.1515703989838858}]
    """

    # Determine which columns to return the top n lists for: either the provided keys or
    # every unique (table, column) pair that appears in the matches
    if keys is None:
        keys = list(set(chain.from_iterable(matches.keys())))

    # Create an empty dictionary where each requested column holds a list of its top matches
    top_columns = {}
    for key in keys:
        top_columns[key] = list()

    # Iterate over the matches from the highest to the lowest similarity and collect columns
    # until n entries have been gathered for a key
    for column_a, column_b in sorted(matches, key=matches.get, reverse=True):
        score = matches[(column_a, column_b)]

        # Check whether column_a is of any interest and whether its top n list is still incomplete
        if (column_a in keys) and (len(top_columns[column_a]) < n):
            top_columns[column_a].append({column_b[1]: score})

        # Check whether column_b is of any interest and whether its top n list is still incomplete
        if (column_b in keys) and (len(top_columns[column_b]) < n):
            top_columns[column_b].append({column_a[1]: score})

    return top_columns
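As a quick illustration of the selection logic above, here is a minimal, self-contained sketch that calls get_top_n_columns on a hand-built matches dictionary; the column names and similarity scores are made up for the example and do not come from any matcher.

```python
from valentine.metrics.metrics import get_top_n_columns

# Hand-built matches: (table_1 column, table_2 column) -> similarity score.
# The scores are purely illustrative.
matches = {
    (('table_1', 'Name'), ('table_2', 'Author')): 0.9,
    (('table_1', 'Name'), ('table_2', 'Author Names')): 0.5,
    (('table_1', 'Name'), ('table_2', 'Year')): 0.1,
}

top_2 = get_top_n_columns(matches, 2, [('table_1', 'Name')])
print(top_2)
# {('table_1', 'Name'): [{'Author': 0.9}, {'Author Names': 0.5}]}
```

With n=2 and a single key, only the two most similar table_2 columns for ('table_1', 'Name') are kept.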