import json
import os
import re
from typing import Dict, List, Optional

import pandas as pd


class AgentDataGen:
    """Generate pattern suggestions for every task listed in a tasks file.

    Holds a ``PatternDesigner`` (which performs the LLM call) together with
    the parsed content of the tasks JSON file.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        tasks_path: str = "insightbench/utils/domains_tasks.json",
    ):
        """Create the pattern designer and load the tasks file.

        Args:
            api_key: OpenAI API key, forwarded unchanged to ``PatternDesigner``.
            tasks_path: Path to the JSON file holding the task lists.

        Raises:
            ValueError: If the tasks file cannot be read or parsed.
        """
        # Function-scope import: keeps this class importable (e.g. for
        # `_load_tasks` alone) when the `scripts` package is not on the path.
        from scripts.pattern_design import PatternDesigner

        self.pattern_designer = PatternDesigner(api_key)
        self.tasks_path = tasks_path
        self.tasks = self._load_tasks()

    def _load_tasks(self) -> dict:
        """Read and parse the JSON file at ``self.tasks_path``.

        Returns:
            The parsed JSON content.

        Raises:
            ValueError: On any read or parse failure; the original exception
                is attached as ``__cause__``.
        """
        try:
            with open(self.tasks_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception as e:
            # Chain the cause so the real failure (missing file, bad JSON, ...)
            # stays visible in the traceback.
            raise ValueError(
                f"Failed to load tasks from {self.tasks_path}: {str(e)}"
            ) from e

    @staticmethod
    def _task_filename(task: str) -> str:
        """Return the output file name used for ``task``.

        Spaces become underscores and the name is lower-cased, as before.
        Characters that are unsafe inside a file name (path separators,
        ``: * ? " < > |`` and control characters) are additionally replaced
        one-for-one by ``_`` so a task such as ``"A/B Testing"`` cannot escape
        the domain directory.
        """
        slug = task.lower().replace(" ", "_")
        slug = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", slug)
        return slug + "_patterns.json"

    def generate_patterns(self, data: pd.DataFrame, task: str) -> Dict[str, List[Dict]]:
        """Generate patterns for one task by delegating to the pattern designer.

        Args:
            data: Input DataFrame containing the data to analyze.
            task: Description of the analytics task.

        Returns:
            Whatever ``PatternDesigner.design_patterns`` returns for the pair.
        """
        return self.pattern_designer.design_patterns(data, task)

    def generate_all_patterns(
        self, data: pd.DataFrame, output_dir: str = "results/Patterns"
    ) -> None:
        """Generate patterns for every loaded task and save them as JSON files.

        One sub-directory is created per top-level key of the tasks file, and
        one ``<task>_patterns.json`` file is written per task. A failure on a
        single task is reported on stdout and does not stop the run.

        Args:
            data: Input DataFrame containing the data to analyze.
            output_dir: Directory to save the generated patterns.
        """
        os.makedirs(output_dir, exist_ok=True)

        for domain, domain_tasks in self.tasks.items():
            print(f"\nProcessing domain: {domain}")

            domain_dir = os.path.join(output_dir, domain)
            os.makedirs(domain_dir, exist_ok=True)

            for task in domain_tasks:
                print(f"\nGenerating patterns for task: {task}")

                # Deliberately best-effort: one failing task (API error,
                # unparsable reply, ...) must not abort the remaining tasks.
                try:
                    patterns = self.generate_patterns(data, task)

                    output_path = os.path.join(domain_dir, self._task_filename(task))
                    with open(output_path, "w", encoding="utf-8") as f:
                        json.dump(patterns, f, indent=2)

                    print(f"Saved patterns to: {output_path}")
                except Exception as e:
                    print(f"Error generating patterns for task '{task}': {str(e)}")
import json
import os
import re
from typing import Dict, List, Optional

import pandas as pd


class PatternDesigner:
    """Ask an OpenAI chat model to propose patterns to inject into a dataset."""

    def __init__(self, api_key: Optional[str] = None):
        """Initialize the PatternDesigner with an OpenAI API key.

        Args:
            api_key: OpenAI API key. If not provided, the ``OPENAI_API_KEY``
                environment variable is used.

        Raises:
            ValueError: If no key is given and the variable is not set.
        """
        if api_key is None:
            api_key = os.getenv("OPENAI_API_KEY")
        if api_key is None:
            raise ValueError(
                "OpenAI API key not provided and OPENAI_API_KEY environment variable not set"
            )
        # Function-scope import: `analyze_data` and the response parser stay
        # usable without the OpenAI SDK installed.
        from openai import OpenAI

        self.client = OpenAI(api_key=api_key)

    def analyze_data(self, data: pd.DataFrame) -> str:
        """Summarize the structure of ``data`` as a JSON string.

        The summary holds the row and column counts plus, per column: dtype,
        number of missing values, number of unique values and up to three
        sample values. Numeric columns add mean/std/min/max, datetime columns
        add min/max dates, string columns add their three most frequent values.

        Args:
            data: DataFrame to summarize.

        Returns:
            The summary serialized with ``json.dumps(..., indent=2)``.
        """
        column_summaries = {}

        for col in data.columns:
            col_data = data[col]
            col_summary = {
                "dtype": str(col_data.dtype),
                "num_missing": int(col_data.isnull().sum()),
                "num_unique": int(col_data.nunique()),
                "sample_values": col_data.dropna().unique()[:3].tolist(),
            }

            if pd.api.types.is_numeric_dtype(col_data):
                col_summary.update(
                    {
                        "mean": float(col_data.mean()),
                        "std": float(col_data.std()),
                        "min": float(col_data.min()),
                        "max": float(col_data.max()),
                    }
                )
            elif pd.api.types.is_datetime64_any_dtype(col_data):
                col_summary.update(
                    {
                        "min_date": str(col_data.min()),
                        "max_date": str(col_data.max()),
                    }
                )
            elif pd.api.types.is_string_dtype(col_data):
                top_counts = col_data.value_counts().head(3)
                # JSON object keys must be plain strings and counts plain ints.
                col_summary["top_frequent_values"] = {
                    str(value): int(count) for value, count in top_counts.items()
                }

            # json.dumps would stringify int labels anyway; doing it here also
            # covers labels (e.g. tuples) that json cannot use as keys.
            column_summaries[str(col)] = col_summary

        summary = {
            "num_rows": len(data),
            "num_cols": len(data.columns),
            "column_summaries": column_summaries,
        }
        # default=str is deliberate: sample values of datetime (and other
        # non-JSON) columns would otherwise make json.dumps raise TypeError.
        # The summary is only read as prompt text, so a string form suffices.
        return json.dumps(summary, indent=2, default=str)

    @staticmethod
    def _parse_json_response(raw_response: Optional[str]) -> dict:
        """Strip an optional Markdown code fence and parse the reply as JSON.

        Raises:
            ValueError: If the reply is empty or is not valid JSON.
        """
        if not raw_response:
            raise ValueError("Failed to parse LLM response as JSON")
        # Remove a leading ```/```json line and a trailing ``` line; tolerate
        # trailing spaces or '\r' around the fence markers.
        cleaned = re.sub(
            r"^```(?:json)?\s*\n|\n\s*```$", "", raw_response.strip()
        )
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError as err:
            raise ValueError("Failed to parse LLM response as JSON") from err

    def design_patterns(self, data: pd.DataFrame, task: str) -> Dict[str, List[Dict]]:
        """Ask the model for patterns to inject for the given analytics task.

        Args:
            data: DataFrame whose summary is embedded in the prompt.
            task: Description of the analytics task.

        Returns:
            The parsed JSON reply. The prompt requests an object with a
            ``"kpis"`` list and a ``"patterns"`` list; the reply is not
            validated against that shape.

        Raises:
            ValueError: If the reply cannot be parsed as JSON.
        """
        data_summary = self.analyze_data(data)

        prompt = f"""
        You are a data-centric AI expert designing synthetic data benchmarks to evaluate analytics models.

        Given a dataset summary and an analytics task, your job is to inject **2–3 realistic patterns across one or more columns** that:
        - Mimic real-world behaviors or anomalies
        - Interact with the dataset's structure and semantics
        - Meaningfully impact model performance or insight extraction
        - Allow for robust benchmarking of analytical reasoning

        ---

        Please follow these explicit steps in your reasoning (Chain-of-Thought):

        ### Step 1: Infer Key Performance Indicators (KPIs)
        - Based on the dataset and task, identify 2–4 relevant KPIs that would be tracked by an analyst or model.

        ### Step 2: Identify Influential Columns and Relationships
        - Which columns most influence these KPIs?
        - Are there any natural correlations, temporal dynamics, or category-based splits that could affect KPI computation?

        ### Step 3: Design 2–3 Global Patterns
        - Each pattern may involve **1 or more columns**, and should simulate a **plausible real-world event, behavior, or trend**.
        - Avoid trivial noise (e.g., "random fluctuation"). Prefer **interpretable and benchmark-worthy** signals like:
            - delayed effects
            - conditionally induced trends
            - cross-feature dependencies
            - regime shifts
            - temporal or category-driven anomalies

        ### Step 4: Explain for Each Pattern:
        - What exactly is the injected pattern?
        - Why is it useful from a benchmarking or insight perspective?
        - Which KPIs does it affect, and how?
        - What kind of analytical or modeling challenges does it test?

        ---

        ### Output format (JSON):

        {{
            "kpis": ["list of important KPIs"],
            "patterns": [
                {{
                    "pattern": "Description of the injected pattern",
                    "columns_involved": ["list of columns affected"],
                    "reasoning": "Why this pattern is meaningful and realistic",
                    "relevance_to_kpi": "Which KPIs it affects and how",
                    "benchmark_value": "What kind of insight or model evaluation this pattern enables"
                }},
                ...
            ]
        }}

        ---

        ### Data Summary:
        {data_summary}

        ### Analytics Task:
        {task}

        """

        response = self.client.chat.completions.create(
            model="gpt-4o",  # Using GPT-4o (OpenAI Omni)
            messages=[
                {
                    "role": "system",
                    "content": "You are a data pattern design expert. Your task is to suggest meaningful patterns that can be injected into data columns to help accomplish specific analytics tasks. Always respond with valid JSON.",
                },
                {"role": "user", "content": prompt},
            ],
        )
        return self._parse_json_response(response.choices[0].message.content)


def main():
    """Run a small demo: design patterns for a three-row sample frame."""
    # The key is taken from the OPENAI_API_KEY environment variable.
    designer = PatternDesigner()

    # Sample DataFrame
    data = pd.DataFrame(
        {
            "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
            "sales": [100, 150, 200],
            "category": ["A", "B", "A"],
        }
    )

    task = "Anomaly detection"

    try:
        patterns = designer.design_patterns(data, task)

        print("\nKey Performance Indicators (KPIs):")
        for kpi in patterns.get("kpis", []):
            print(f"- {kpi}")

        print("\nSuggested Patterns:")
        for pattern in patterns.get("patterns", []):
            print(f"\nPattern: {pattern['pattern']}")
            print(f"Columns Involved: {', '.join(pattern['columns_involved'])}")
            print(f"Reasoning: {pattern['reasoning']}")
            print(f"Relevance to KPI: {pattern['relevance_to_kpi']}")
            print(f"Benchmark Value: {pattern['benchmark_value']}")
            print("-" * 80)

    except Exception as e:
        # Demo entry point: report the failure instead of a traceback.
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
import json
import os
import re
import shutil
import subprocess
import sys
from typing import Optional, Tuple

import pandas as pd


class PatternInjector:
    """Turn pattern descriptions into Python code via an LLM and run that code."""

    # First ```python fenced block of an LLM reply.
    _FENCED_CODE = re.compile(r"```python(.*?)```", re.DOTALL)
    # A top-level `import pandas as pd` line. Only spaces/tabs are matched
    # around the tokens so the indentation of the following line is never
    # swallowed (a `\s*` here can cross the newline).
    _PANDAS_IMPORT = re.compile(
        r"^import[ \t]+pandas[ \t]+as[ \t]+pd[ \t]*\r?\n?", re.MULTILINE
    )

    def __init__(self, api_key: Optional[str] = None):
        """Initialize the PatternInjector with an OpenAI API key.

        Args:
            api_key: OpenAI API key. If not provided, the ``OPENAI_API_KEY``
                environment variable is used.

        Raises:
            ValueError: If no key is given and the variable is not set.
        """
        if api_key is None:
            api_key = os.getenv("OPENAI_API_KEY")
        if api_key is None:
            raise ValueError(
                "OpenAI API key not provided and OPENAI_API_KEY environment variable not set"
            )
        # Function-scope import: the script-building and injection code stays
        # usable without the OpenAI SDK installed.
        from openai import OpenAI

        self.client = OpenAI(api_key=api_key)

    def get_inject_codes(self, patterns: str) -> dict:
        """Ask the model for one injection function per pattern.

        Args:
            patterns: JSON text of the form::

                {
                    "kpis": [...],
                    "patterns": [
                        {
                            "pattern": "description of the pattern",
                            "columns_involved": ["col1", "col2", ...],
                            "reasoning": "explanation of why this pattern is useful",
                            "relevance_to_kpi": "how this pattern helps with the task",
                            "benchmark_value": "value to test against"
                        },
                        ...
                    ]
                }

        Returns:
            Dict mapping ``"Pattern<k><columns joined by '_'>"`` to the raw
            (stripped) model reply for the k-th pattern, in pattern order.
        """
        print("Started getting inject codes ...")

        patterns_list = json.loads(patterns).get("patterns", [])
        output = {}

        for pattern_number, pattern_info in enumerate(patterns_list, start=1):
            columns = pattern_info.get("columns_involved", [])
            pattern_description = pattern_info.get("pattern", "")
            reasoning = pattern_info.get("reasoning", "")
            relevance = pattern_info.get("relevance_to_kpi", "")

            func_name = f"pattern_{pattern_number}"
            joined_columns = "_".join(columns)
            # Numbered, tab-separated column list for the prompt.
            columns_str = "\t".join(
                f"{position}. {column}"
                for position, column in enumerate(columns, start=1)
            )

            prompt = f"""
            Imagine you are given a pandas DataFrame named `df`.

            You are tasked with writing a Python function that injects a pattern into the given pandas DataFrame named `df`.
            Injecting a pattern means **modifying or adding values** in specific columns of the DataFrame to follow a certain logical structure, transformation rule, or simulated behavior. This may involve altering existing column values, adding new derived columns, or enforcing specific relationships between columns.
            Here is what you need to do:
            - The function **must be named** `{func_name}` (e.g., `pattern_3`)
            - The signature of the function should be:
            ```python
            def {func_name}(df: pd.DataFrame) -> pd.DataFrame:
            ```
            - It should be a **standalone function** (no external dependencies other than standard libraries and `pandas`, `numpy`, `re`)
            - The function should:
                1. Take only one input: the DataFrame `df`
                2. **Modify `df` in-place** (i.e., apply transformations directly on `df`)
                3. Implement the following pattern injection:
                    - **Pattern Description**: {pattern_description}
                    - **Reasoning**: {reasoning}
                    - **Relevance**: {relevance}
                    - **Columns Involved**: {columns_str}
                4. Assume that the DataFrame already contains all columns listed in `Columns Involved`.
                5. Make sure your function is not using columns that are not listed in `Columns Involved`.
            - If new columns need to be added as part of the pattern, ensure they are clearly named and added to `df`
            - Handle errors gracefully (e.g., missing values, type mismatches)
            - Return the modified `df` (even though it is modified in place, this improves flexibility)

            Output requirements:
            - Only include the necessary `import` statements and the function definition.
            - Do not include any explanation or comments.
            - Only generate one function with all logic embedded.
            - The code should be valid Python code and in python environment.
            - Do not include usage examples or extra text.
            - Function should have a return statement that returns the modified DataFrame.

            IMPORTANT NOTES:
            - The function should be valid Python code and should not include any comments or explanations.
            - The function should be self-contained and not rely on any external context or variables.
            - The function should be able to handle the patterns described above and return a modified DataFrame.
            - There should be no additional code except for the function definition and necessary imports. You should not write and include other functions or call this functions (not even writing a main function).
            - The function should not include any print statements or logging.

            Please return only the code (imports + one function) in python environment. Nothing else.
            """

            response = self.client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a data pattern injection coding expert. Your task is to write correct and simple codes to inject the given patterns to help accomplish specific analytics tasks. Always respond with valid Python Code.",
                    },
                    {"role": "user", "content": prompt},
                ],
            )

            output[f"Pattern{pattern_number}{joined_columns}"] = (
                response.choices[0].message.content.strip()
            )
            # The join is computed outside the f-string: reusing the enclosing
            # quote inside a replacement field is a SyntaxError before 3.12.
            print(
                f"Finished getting inject codes for pattern on columns: {joined_columns}"
            )

        print("Finished getting inject codes for all patterns.")
        return output

    @classmethod
    def _build_script(
        cls, raw_code: str, pattern_index: int, filename: str
    ) -> Tuple[str, str]:
        """Build the runnable script text for one generated pattern function.

        Args:
            raw_code: Model reply, optionally wrapped in a ```python fence.
            pattern_index: 1-based position of the pattern; the script calls
                ``pattern_<pattern_index>``.
            filename: CSV file, relative to the script's working directory,
                that the script reads and then overwrites.

        Returns:
            ``(function_name, script_source)``.
        """
        fenced = cls._FENCED_CODE.search(raw_code)
        code = fenced.group(1).strip() if fenced else raw_code.strip()
        # Drop the model's own top-level pandas import; one is prepended below.
        code = cls._PANDAS_IMPORT.sub("", code).strip()

        func_name = f"pattern_{pattern_index}"
        script = (
            "import pandas as pd\n"
            f"{code}\n"
            'if __name__ == "__main__":\n'
            f'    df = pd.read_csv("./{filename}")\n'
            f"    df = {func_name}(df)\n"
            f'    df.to_csv("./{filename}", index=False)\n'
        )
        return func_name, script

    def inject_patterns(
        self,
        base_df: pd.DataFrame,
        pattern_codes: dict,
        hash_id: Optional[str] = None,
    ) -> pd.DataFrame:
        """Apply each generated pattern function to a copy of ``base_df``.

        The frame is written to ``results/<hash_id>/codefiles/temp_data.csv``.
        For the k-th entry of ``pattern_codes`` a script ``pattern_<k>.py`` is
        written next to it and executed; the script rewrites the CSV, which is
        then read back. Column dtypes are therefore whatever ``read_csv``
        infers after each step. The generated scripts are kept on disk.

        Args:
            base_df: The base DataFrame to inject the patterns into.
            pattern_codes: Dict of ``name -> generated code``. Entry k must
                define ``pattern_<k>`` (the names requested by
                ``get_inject_codes``).
            hash_id: Sub-directory of ``results``; ``"default"`` when omitted.

        Returns:
            The DataFrame after all patterns were applied.

        Raises:
            ValueError: If ``base_df`` is not a DataFrame.
            subprocess.CalledProcessError: If a generated script exits with a
                non-zero status.
        """
        print("Started injecting patterns ...")

        # Validate before touching the file system.
        if not isinstance(base_df, pd.DataFrame):
            raise ValueError(
                "base_df should be a pandas DataFrame. Please provide a valid DataFrame."
            )

        # Step 1: Create temp directory inside results/{hash_id}/codefiles/
        if hash_id is None:
            hash_id = "default"
        temp_dir = os.path.join("results", hash_id, "codefiles")
        os.makedirs(temp_dir, exist_ok=True)

        # Step 2: Persist a copy of the input for the scripts to work on
        df = base_df.copy()
        filename = "temp_data.csv"
        temp_csv_path = os.path.join(temp_dir, filename)
        df.to_csv(temp_csv_path, index=False)

        # Step 3: Create and run one script per pattern
        for pattern_index, (pattern_name, raw_code) in enumerate(
            pattern_codes.items(), start=1
        ):
            print(f"Injecting pattern: {pattern_name}")

            func_name, final_code = self._build_script(
                raw_code, pattern_index, filename
            )
            script_name = f"{func_name}.py"
            script_path = os.path.join(temp_dir, script_name)

            with open(script_path, "w", encoding="utf-8") as f:
                f.write(final_code)

            print(f"Created script for pattern: {pattern_name}")

            # Step 4: Run the script
            # SECURITY: this executes model-generated code with the privileges
            # of the current process and without any sandbox.
            # sys.executable (not a bare "python3") guarantees the interpreter
            # that has this process's pandas installation.
            subprocess.run([sys.executable, script_name], check=True, cwd=temp_dir)

            # Update the DataFrame with the modified data
            df = pd.read_csv(temp_csv_path)

            print(f"Injected pattern for pattern: {pattern_name}")

        print("Finished injecting patterns for all patterns.")

        # Step 5: Clean up
        # The directory is intentionally kept so the code files can be inspected.

        return df