def add_noise_to_df_column(df, column_name, noise_level):
    """
    Adds noise to a specified column in a DataFrame.

    The column is overwritten on the DataFrame that is passed in, and that same
    DataFrame object is returned (no copy is made).

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the column to which noise will be added.
    - column_name (str): The name of the column to which noise will be added.
    - noise_level (float): The level of noise to be added.
      For numerical columns, this is the standard deviation of zero-mean Gaussian
      noise added to every value (the column becomes floating point).
      For string columns, it is the probability, evaluated independently for each
      row, that the characters of that row's string are randomly permuted.

    Returns:
    - pd.DataFrame: The DataFrame with noise added to the specified column.
      Columns that are neither numerical nor string-like are left untouched.
    """
    # Imported locally so the function does not depend on a module-level pandas import.
    from pandas.api.types import is_bool_dtype, is_numeric_dtype, is_string_dtype

    column = df[column_name]
    dtype = column.dtype

    # Booleans count as "numeric" for pandas, but Gaussian noise on flags is meaningless.
    if is_numeric_dtype(dtype) and not is_bool_dtype(dtype):
        noise = np.random.normal(0, noise_level, column.shape[0])
        df[column_name] = column + noise
    elif is_string_dtype(dtype):
        # One independent draw per row: each string is permuted with probability
        # `noise_level`, instead of the whole column being permuted all-or-nothing.
        shuffle_mask = np.random.rand(column.shape[0]) < noise_level
        # Skip the reassignment when nothing was selected so the column (and its
        # dtype) is left exactly as it was, e.g. for noise_level == 0 or no rows.
        if shuffle_mask.any():
            df[column_name] = [
                ''.join(np.random.permutation(list(str(value)))) if shuffle else value
                for value, shuffle in zip(column, shuffle_mask)
            ]
    return df