diff --git a/CITATION.bib b/CITATION.bib index 27419d6..3f0ea16 100644 --- a/CITATION.bib +++ b/CITATION.bib @@ -1,8 +1,9 @@ -@inproceedings{deepresearch2025, - title={TITLE}, - author={[Authors]}, - booktitle={International Conference on Learning Representations (ICLR)}, +@misc{sharma2025researchrubricsbenchmarkpromptsrubrics, + title={ResearchRubrics: A Benchmark of Prompts and Rubrics For Evaluating Deep Research Agents}, + author={Manasi Sharma and Chen Bo Calvin Zhang and Chaithanya Bandi and Clinton Wang and Ankit Aich and Huy Nghiem and Tahseen Rabbani and Ye Htet and Brian Jang and Sumana Basu and Aishwarya Balwani and Denis Peskoff and Marcos Ayestaran and Sean M. Hendryx and Brad Kenstler and Bing Liu}, year={2025}, - url={https://github.com/[username]/deep-research-benchmarks}, - note={Code release for ICLR 2025} + eprint={2511.07685}, + archivePrefix={arXiv}, + primaryClass={cs.AI}, + url={https://arxiv.org/abs/2511.07685} } diff --git a/DATA_FORMAT.md b/DATA_FORMAT.md index 514ebf5..d91be02 100644 --- a/DATA_FORMAT.md +++ b/DATA_FORMAT.md @@ -1,328 +1,275 @@ # Data Format Specification -This document describes the data formats used throughout the Deep Research Benchmarks pipeline. +This document describes the data formats used throughout the Research Rubrics evaluation pipeline. ## Table of Contents - [Input Formats](#input-formats) - - [Raw CSV Format](#raw-csv-format) -- [Intermediate Formats](#intermediate-formats) - - [Compiled Dataset](#compiled-dataset) - - [Rubric Format](#rubric-format) + - [Processed Data JSONL](#processed-data-jsonl) + - [Markdown Reports](#markdown-reports) - [Output Formats](#output-formats) - - [Evaluation Results](#evaluation-results) - - [Metrics Output](#metrics-output) + - [Evaluation Results JSONL](#evaluation-results-jsonl) + - [Compliance Scores](#compliance-scores) ## Input Formats -### Raw CSV Format +### Processed Data JSONL -Raw evaluation CSV files should follow this structure: +File: `data/researchrubrics/processed_data.jsonl` + +This file contains one JSON object per line, with each object representing a research task and its evaluation rubrics. 
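Before the field-by-field breakdown below, a minimal sketch for confirming the file parses and peeking at one record; the path is the one given above, and the field access assumes the schema documented in this section:

```python
import json

# Load only the first record of processed_data.jsonl and list its fields.
with open("data/researchrubrics/processed_data.jsonl", encoding="utf-8") as f:
    record = json.loads(next(f))

print(sorted(record))                               # top-level field names
print(record["sample_id"], len(record["rubrics"]))  # task id and its rubric count
```

See "Working with JSONL" later in this document for loading the full file.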
#### Structure -| Row | Column | Description | -|-----|--------|-------------| -| 0 | prompt | The original task prompt | -| 1+ | title | Rubric criterion title | -| 1+ | weight | Numerical weight (e.g., 1.0) | -| 1+ | category | Rubric category (e.g., "Accuracy", "Completeness") | -| 1+ | gemini_present | Ground truth for Gemini (Satisfied/Partially Satisfied/Not Satisfied) | -| 1+ | chatgpt_present | Ground truth for ChatGPT | -| 1+ | perplexity_present | Ground truth for Perplexity | +Each line is a JSON object with the following fields: + +| Field | Type | Description | +|-------|------|-------------| +| prompt | string | The research task/question given to the AI | +| sample_id | string | Unique identifier for the task (used as markdown filename) | +| domain | string | Domain category (e.g., "AI & ML", "Historical Analysis") | +| conceptual_breadth | string | Task complexity: "Simple", "Moderate", "Complex" | +| logical_nesting | string | Reasoning depth: "Simple", "Intermediate", "Complex" | +| exploration | string | Research scope: "Low", "Medium", "High" | +| rubrics | array | List of evaluation criteria (see Rubric Format below) | + +#### Rubric Format -#### Special Rows +Each rubric in the `rubrics` array contains: -- **Row 0**: Task prompt -- **Row 3**: Gemini PDF URL in `prompt` column -- **Row 6**: ChatGPT PDF URL in `prompt` column -- **Row 9**: Perplexity PDF URL in `prompt` column +| Field | Type | Description | +|-------|------|-------------| +| criterion | string | The evaluation criterion description | +| weight | float | Weight of this criterion (can be positive or negative) | +| axis | string | Category (e.g., "Explicit Criteria", "Communication Quality") | #### Example -```csv -prompt,title,weight,category,gemini_present,chatgpt_present,perplexity_present -"Analyze the impact of climate change...",,,,, -,Data Sources,1.0,Accuracy,Satisfied,Satisfied,Partially Satisfied -,Citation Quality,1.0,References,Satisfied,Not Satisfied,Satisfied -https://example.com/gemini.pdf,,,,, -,Methodology,1.0,Completeness,Partially Satisfied,Satisfied,Satisfied -,Analysis Depth,1.0,Quality,Satisfied,Satisfied,Not Satisfied -https://example.com/chatgpt.pdf,,,,, -,Conclusion,1.0,Structure,Satisfied,Partially Satisfied,Satisfied -,Visual Elements,0.5,Presentation,Not Satisfied,Satisfied,Satisfied -https://example.com/perplexity.pdf,,,,, +```json +{ + "prompt": "Write a synthesis report on the applications of AI in drug discovery for a technical audience unfamiliar with biology...", + "sample_id": "6847465956a0f6376a605355", + "domain": "AI & ML", + "conceptual_breadth": "Moderate", + "logical_nesting": "Intermediate", + "exploration": "Medium", + "rubrics": [ + { + "criterion": "The response describes at least one specific AI application for each drug-discovery stage...", + "weight": 5.0, + "axis": "Explicit Criteria" + }, + { + "criterion": "The response provides brief (โ‰ค20 words) definitions for specialized terms...", + "weight": 5.0, + "axis": "Instruction Following" + }, + { + "criterion": "The response uses deterministic language for speculative claims...", + "weight": -4.0, + "axis": "Implicit Criteria" + } + ] +} ``` -## Intermediate Formats +**Note**: Negative weights indicate penalty rubrics (failures that should NOT occur). -### Compiled Dataset +### Markdown Reports -After extraction, data is compiled into `compiled_dataset.csv`: +File location: `agent_responses/[sample_id].md` -#### Columns +These are the AI-generated research reports to be evaluated. 
Each markdown file should: +- Be named with its corresponding `sample_id` from `processed_data.jsonl` +- Contain the complete text of the AI-generated research report +- Be in markdown format (plain text with markdown formatting) -| Column | Type | Description | -|--------|------|-------------| -| csv_filename | string | Original CSV filename | -| task_name | string | Unique task identifier (hash) | -| prompt | string | Original task prompt | -| rubrics | JSON string | Array of rubric objects | -| rubrics_count | integer | Number of rubrics | -| pdf_paths | JSON string | Paths to PDFs for each model | -| final_presence | JSON string | Ground truth evaluations | +#### Example Filename -#### Example Row - -```json -{ - "csv_filename": "683a58c9a7e7fe4e7695846f_fixed.csv", - "task_name": "683a58c9a7e7fe4e7695846f", - "prompt": "Analyze the impact of climate change on polar bear populations...", - "rubrics": "[{\"title\": \"Data Sources\", \"weight\": 1.0, \"category\": \"Accuracy\", \"row_index\": 1}, ...]", - "rubrics_count": 15, - "pdf_paths": "{\"gemini_pdf\": {\"path\": \"data/PDFs/683.../gemini.pdf\", \"error\": null}, ...}", - "final_presence": "{\"gemini_present\": {\"values\": [\"Satisfied\", \"Partially Satisfied\", ...], \"null_count\": 0, \"total_count\": 15}, ...}" -} +For a task with `sample_id: "6847465956a0f6376a605355"`, the markdown file should be: +``` +agent_responses/6847465956a0f6376a605355.md ``` -### Rubric Format +#### Content Example -Each rubric is a JSON object with the following structure: +```markdown +# AI Applications in Drug Discovery -```json -{ - "title": "Data Sources", - "weight": 1.0, - "category": "Accuracy", - "row_index": 1 -} -``` +## Introduction -**Fields**: -- `title` (string): The rubric criterion description -- `weight` (float): Weight for scoring (typically 1.0 or 0.5) -- `category` (string): Category classification (e.g., "Accuracy", "Completeness", "Structure") -- `row_index` (integer): Original row position in CSV +Artificial Intelligence (AI) has revolutionized the drug discovery process... -### PDF Paths Format +## Target Identification -```json -{ - "gemini_pdf": { - "path": "data/PDFs/683a58c9a7e7fe4e7695846f/gemini.pdf", - "error": null - }, - "chatgpt_pdf": { - "path": "data/PDFs/683a58c9a7e7fe4e7695846f/chatgpt.pdf", - "error": null - }, - "perplexity_pdf": { - "path": "data/PDFs/683a58c9a7e7fe4e7695846f/perplexity.pdf", - "error": null - } -} -``` +AI models such as Convolutional Neural Networks (CNNs) can analyze... -### Final Presence Format +## Conclusion -```json -{ - "gemini_present": { - "values": ["Satisfied", "Partially Satisfied", "Not Satisfied", ...], - "null_count": 0, - "total_count": 15 - }, - "chatgpt_present": { - "values": ["Satisfied", "Satisfied", "Not Satisfied", ...], - "null_count": 0, - "total_count": 15 - }, - "perplexity_present": { - "values": ["Not Satisfied", "Satisfied", "Satisfied", ...], - "null_count": 0, - "total_count": 15 - } -} +The integration of AI into drug discovery pipelines represents... ``` -**Fields**: -- `values` (array): List of verdicts in order of rubrics -- `null_count` (integer): Number of missing/null evaluations -- `total_count` (integer): Total number of rubrics - ## Output Formats -### Evaluation Results +### Evaluation Results JSONL + +File location: `results/batch_evaluation_YYYYMMDD_HHMMSS.jsonl` -After LLM evaluation, results are saved with the same structure as compiled dataset, but with updated `final_presence` containing predicted values. 
+After evaluation, results are saved as JSONL with one evaluation record per line. -#### Individual Evaluation Record +#### Structure + +Each line represents a single rubric evaluation: + +| Field | Type | Description | +|-------|------|-------------| +| sample_id | string | Task identifier matching the markdown filename | +| rubric_title | string | The rubric criterion that was evaluated | +| verdict | string | "Satisfied" or "Not Satisfied" | +| score | float | 1.0 for Satisfied, 0.0 for Not Satisfied | +| confidence | float | Model's confidence (0.0 to 1.0) | +| reasoning | string | Detailed explanation for the verdict | +| tokens_used | integer | Number of tokens consumed | +| cost | float | API cost for this evaluation (in USD) | +| success | boolean | Whether evaluation completed successfully | +| weight | float | Weight of this rubric from input data | -During evaluation, each rubric-document pair generates: +#### Example ```json { - "task_name": "683a58c9a7e7fe4e7695846f", - "pdf": "gemini", - "rubric_title": "Data Sources", - "verdict": "Satisfied", - "score": 1.0, - "confidence": 0.95, - "reasoning": "The document cites 5 peer-reviewed sources...", - "tokens_used": 2453, - "cost": 0.0123, - "duration": 2.34, + "sample_id": "683a58c9a7e7fe4e7695846f", + "rubric_title": "The response ensures all acronyms are expanded...", + "verdict": "Not Satisfied", + "score": 0.0, + "confidence": 1.0, + "reasoning": "The document fails to meet the criterion because it does not expand any of the acronyms it uses (AMC, AIME, USA J MO, IMO)...", + "tokens_used": 4567, + "cost": 0.024739999999999998, "success": true, - "error": null + "weight": 4.0 } ``` -### Metrics Output +### Compliance Scores -#### F1 Scores - -Console output format: +Calculated from evaluation results using the formula: ``` -================================================================================ -MACRO F1 SCORE RESULTS -================================================================================ - -Average F1 Scores across 100 tasks: - Gemini : 0.8542 - Chatgpt : 0.8123 - Perplexity : 0.7891 - -Note: F1 scores calculated by comparing ground truth vs predicted presence lists +Compliance Score = ฮฃ(weight ร— score) / ฮฃ(positive weights) ``` -#### Weighted Scores +Where: +- Only positive-weight rubrics are included in the denominator +- Negative-weight rubrics (penalties) subtract from the numerator +- Final score is typically between 0.0 and 1.0 (but can be negative if penalties exceed gains) -``` -Average Scores across 100 rows: -Gemini: 0.8234 -ChatGPT: 0.7956 -Perplexity: 0.7723 -``` - -#### Failure Breakdown +#### Console Output Example ``` -================================================================================ -FAILURE RATE BREAKDOWN BY CATEGORY -================================================================================ - -MODEL: GEMINI --------------------------------------------------------------------------------- -Tasks with failures: 45 / 100 -Total failures across all tasks: 234 - -Category Avg Ratio Agg Ratio Tasks ----------------------------------------- ------------ ------------ -------- -Accuracy 0.3456 0.3512 32 -Completeness 0.2789 0.2845 28 -Structure 0.1923 0.1876 21 -References 0.1234 0.1198 15 -... 
+Compliance Scores: +================== +Sample 683a58c9a7e7fe4e7695846f: 0.65 (65%) +Sample 683a58c9a7e7fe4e7695848b: 0.82 (82%) +Sample 683a58c9a7e7fe4e7695848e: 0.71 (71%) + +Average Compliance: 0.73 (73%) ``` ## Validation Rules ### Required Fields -All datasets must include: -- Non-empty `task_name` -- Valid `prompt` text -- At least one rubric -- PDF paths for all three models -- Complete presence data (null_count = 0) +#### Input Data (processed_data.jsonl) +Each JSON object must include: +- Non-empty `prompt` string +- Valid `sample_id` string +- Non-empty `rubrics` array with at least one rubric +- Each rubric must have `criterion`, `weight`, and `axis` fields + +#### Markdown Reports +- File must exist in `agent_responses/` directory +- Filename must match a `sample_id` from `processed_data.jsonl` +- File must contain readable markdown text ### Value Constraints - **Verdict values**: Must be one of: - - Ternary: "Satisfied", "Partially Satisfied", "Not Satisfied" - - Binary: "Satisfied", "Not Satisfied" -- **Weights**: Positive float values (typically 0.5 or 1.0) + - "Satisfied" + - "Not Satisfied" +- **Weights**: Float values (can be positive or negative) + - Positive weights: Typical values are 1.0 to 5.0 + - Negative weights: Penalties, typically -1.0 to -5.0 - **Scores**: - - Ternary: 0.0, 0.5, or 1.0 - Binary: 0.0 or 1.0 - **Confidence**: Float between 0.0 and 1.0 ### Data Integrity -- Number of verdicts must match number of rubrics -- All models must have the same number of evaluations -- PDF files must exist at specified paths +- Each markdown file in `agent_responses/` should have a corresponding entry in `processed_data.jsonl` +- Number of evaluation results should match the number of rubrics for each sample +- All JSON lines must be valid and parseable ## File Formats -### CSV Files +### JSONL Files - Encoding: UTF-8 -- Delimiter: Comma (`,`) -- Quoting: Minimal (quote fields containing commas) -- Line endings: Unix (LF) or Windows (CRLF) - -### Parquet Files +- One JSON object per line +- Each line must be valid JSON +- Line endings: Unix (LF) preferred, Windows (CRLF) acceptable +- No trailing commas +- Use double quotes for all strings -- Compression: Snappy (default) -- Schema: Inferred from pandas DataFrame -- Complex types: Stored as JSON strings +### Markdown Files -### JSON Fields - -Within CSV/Parquet: -- JSON strings must be valid and parseable -- Use double quotes for JSON keys and string values -- Arrays and objects properly nested +- Encoding: UTF-8 +- Standard markdown syntax +- Line endings: Unix (LF) preferred, Windows (CRLF) acceptable ## Example Complete Dataset -See `examples/sample_dataset.csv` for a complete example with multiple tasks and all required fields. +See the actual `data/researchrubrics/processed_data.jsonl` file for complete examples with multiple tasks and all required fields. 
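To connect these formats end to end, here is a minimal sketch that applies the compliance formula from the Output Formats section to a results file. The results filename is the illustrative one used in this document, the field names (`sample_id`, `weight`, `score`) are those documented for the evaluation results, and the released `calculate_compliance_score.py` script remains the authoritative implementation:

```python
import json
from collections import defaultdict

# Compliance Score = sum(weight * score) / sum(positive weights), per sample_id.
weighted_sum = defaultdict(float)
positive_weights = defaultdict(float)

with open("results/batch_evaluation_20251113_093457.jsonl", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        sid = rec["sample_id"]
        weighted_sum[sid] += rec["weight"] * rec["score"]
        if rec["weight"] > 0:
            positive_weights[sid] += rec["weight"]

for sid in sorted(weighted_sum):
    if positive_weights[sid] == 0:
        continue  # no positive-weight rubrics for this sample
    compliance = weighted_sum[sid] / positive_weights[sid]
    print(f"Sample {sid}: {compliance:.2f} ({compliance:.0%})")
```

Because `score` is 1.0 only when a rubric is judged Satisfied, negative-weight (penalty) rubrics subtract from the numerator exactly as described above, which is how an overall score can fall below 0.0.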
-## Converting Between Formats +## Working with JSONL -### CSV to Parquet +### Reading JSONL in Python ```python -import pandas as pd +import json + +# Read all entries +data = [] +with open('data/researchrubrics/processed_data.jsonl', 'r') as f: + for line in f: + data.append(json.loads(line)) -df = pd.read_csv('compiled_dataset.csv') -df.to_parquet('compiled_dataset.parquet', index=False) +# Access first task +first_task = data[0] +print(f"Sample ID: {first_task['sample_id']}") +print(f"Number of rubrics: {len(first_task['rubrics'])}") ``` -### Extracting JSON Fields +### Reading Evaluation Results ```python import json import pandas as pd -df = pd.read_csv('compiled_dataset.csv') - -# Parse rubrics -df['rubrics_parsed'] = df['rubrics'].apply(json.loads) +# Read evaluation results into a DataFrame +results = [] +with open('results/batch_evaluation_20251113_093457.jsonl', 'r') as f: + for line in f: + results.append(json.loads(line)) -# Parse presence data -df['presence_parsed'] = df['final_presence'].apply(json.loads) -``` - -## Schema Validation - -Use this JSON schema to validate compiled datasets: +df = pd.DataFrame(results) -```json -{ - "type": "object", - "required": ["task_name", "prompt", "rubrics", "final_presence"], - "properties": { - "task_name": {"type": "string", "minLength": 1}, - "prompt": {"type": "string", "minLength": 1}, - "rubrics": {"type": "string"}, - "rubrics_count": {"type": "integer", "minimum": 1}, - "final_presence": {"type": "string"} - } -} +# Group by sample_id to get per-task metrics +by_sample = df.groupby('sample_id').agg({ + 'score': 'mean', + 'cost': 'sum', + 'tokens_used': 'sum' +}) ``` ## Questions? diff --git a/FILE_MANIFEST.md b/FILE_MANIFEST.md index e37d56c..b443a21 100644 --- a/FILE_MANIFEST.md +++ b/FILE_MANIFEST.md @@ -1,6 +1,6 @@ # File Manifest -This document lists all files included in the Deep Research Benchmarks release and their purposes. +This document lists all files included in the Research Rubrics release and their purposes. 
## ๐Ÿ“‹ Core Documentation @@ -18,30 +18,48 @@ This document lists all files included in the Deep Research Benchmarks release a ### INSTALLATION.md **Purpose**: Detailed installation instructions -**Content**: System requirements, multiple installation methods, troubleshooting +**Content**: System requirements, installation methods, troubleshooting **Audience**: Users setting up the environment **Read**: Before running any code ### DATA_FORMAT.md **Purpose**: Data format specifications -**Content**: Input/output formats, schema definitions, validation rules -**Audience**: Users working with custom datasets -**Read**: When preparing your own data +**Content**: Input/output formats, JSONL structure, validation rules +**Audience**: Users working with the data +**Read**: When preparing or analyzing data + +### FOLDER_STRUCTURE.md +**Purpose**: Directory organization guide +**Content**: Complete directory tree, setup instructions, path references +**Audience**: Users setting up the project +**Read**: During initial setup + +### FILE_MANIFEST.md +**Purpose**: File index (this document) +**Content**: List and description of all files +**Audience**: Users wanting a complete overview +**Read**: For reference + +### SETUP_GUIDE.md +**Purpose**: Step-by-step setup instructions +**Content**: Complete setup workflow from scratch +**Audience**: New users +**Read**: First time setup + +### PACKAGE_SUMMARY.md +**Purpose**: Package overview +**Content**: Summary of package contents and structure +**Audience**: All users +**Read**: For a high-level overview ## ๐Ÿ› ๏ธ Configuration Files ### requirements.txt **Purpose**: Python dependencies -**Content**: List of required packages with version constraints +**Content**: List of required packages (pandas, litellm, tqdm, etc.) 
**Usage**: `pip install -r requirements.txt` **Type**: Installation file -### .env.example -**Purpose**: Environment variables template -**Content**: API keys, configuration options -**Usage**: Copy to `.env` and fill in your values -**Type**: Configuration template - ### setup.py **Purpose**: Package installation configuration **Content**: Package metadata, dependencies, entry points @@ -54,19 +72,19 @@ This document lists all files included in the Deep Research Benchmarks release a **Usage**: Automatically used by Git **Type**: Version control configuration -## ๐Ÿ“– Additional Documentation +### .env (user-created) +**Purpose**: API credentials +**Content**: LITELLM_API_KEY=your_key_here +**Usage**: Created by user, never committed +**Type**: Configuration file -### CONTRIBUTING.md -**Purpose**: Contribution guidelines -**Content**: Development setup, code style, pull request process -**Audience**: Contributors and developers -**Read**: Before contributing code +## ๐Ÿ“– Additional Documentation -### CHANGELOG.md -**Purpose**: Version history -**Content**: Changes, additions, and fixes in each version -**Audience**: Users tracking updates -**Read**: When upgrading versions +### LICENSE +**Purpose**: Software license +**Content**: MIT License terms +**Audience**: Anyone using or distributing the code +**Read**: To understand usage rights ### CITATION.bib **Purpose**: Academic citation information @@ -74,182 +92,141 @@ This document lists all files included in the Deep Research Benchmarks release a **Audience**: Researchers citing this work **Usage**: Copy and paste into your bibliography -### LICENSE -**Purpose**: Software license -**Content**: MIT License terms -**Audience**: Anyone using or distributing the code -**Read**: To understand usage rights - ## ๐ŸŽฏ Evaluation Prompts -### prompts/ternary/system_prompt.txt -**Purpose**: System prompt for ternary evaluation -**Content**: Instructions for evaluating with 3 classes (Satisfied/Partially/Not) +### src/prompts/system_prompt.txt +**Purpose**: System prompt for rubric evaluation +**Content**: Instructions for the LLM evaluator **Usage**: Loaded automatically by evaluation scripts **Type**: LLM prompt template -### prompts/ternary/user_prompt_template.txt -**Purpose**: User prompt template for ternary evaluation -**Content**: Template for rubric evaluation requests (3 classes) +### src/prompts/user_prompt.txt +**Purpose**: User prompt template for evaluation +**Content**: Template for rubric evaluation requests **Usage**: Loaded and formatted by evaluation scripts **Type**: LLM prompt template -### prompts/binary/system_prompt.txt -**Purpose**: System prompt for binary evaluation -**Content**: Instructions for evaluating with 2 classes (Satisfied/Not Satisfied) -**Usage**: Loaded automatically when binary=True +### src/prompts/chunk_prompt_template.txt +**Purpose**: Prompt for evaluating document chunks +**Content**: Template for chunk-level evaluation +**Usage**: Used when documents exceed token limits **Type**: LLM prompt template -### prompts/binary/user_prompt_template.txt -**Purpose**: User prompt template for binary evaluation -**Content**: Template for rubric evaluation requests (2 classes) -**Usage**: Loaded and formatted when binary=True +### src/prompts/synthesis_prompt_template.txt +**Purpose**: Prompt for synthesizing chunk evaluations +**Content**: Template for combining chunk results +**Usage**: Used to create final verdict from chunks **Type**: LLM prompt template -## ๐Ÿ“‚ Directory Structure (Expected) - -While not 
included in this release package, the following directories should be created: - -``` -public_release_experiments/ -โ”œโ”€โ”€ src/ # Source code (your codebase) -โ”‚ โ”œโ”€โ”€ extract_rubrics/ # Rubric extraction scripts -โ”‚ โ”œโ”€โ”€ evaluate_rubrics/ # LLM evaluation scripts -โ”‚ โ””โ”€โ”€ calculate_metrics/ # Metrics calculation scripts -โ”œโ”€โ”€ data/ # Data directory -โ”‚ โ”œโ”€โ”€ raw_csvs/ # Input: Raw CSV files -โ”‚ โ”œโ”€โ”€ processed_df/ # Output: Compiled datasets -โ”‚ โ”œโ”€โ”€ PDFs/ # Downloaded PDFs -โ”‚ โ””โ”€โ”€ predownloaded_pdfs/ # Optional: Pre-downloaded PDFs -โ”œโ”€โ”€ results/ # Evaluation results -โ”œโ”€โ”€ cache/ # Cached conversions -โ””โ”€โ”€ tests/ # Test files (optional) -``` - -## ๐Ÿ“Š File Sizes (Approximate) - -| File | Size | Type | -|------|------|------| -| README.md | ~9 KB | Markdown | -| QUICKSTART.md | ~5 KB | Markdown | -| INSTALLATION.md | ~8 KB | Markdown | -| DATA_FORMAT.md | ~9 KB | Markdown | -| CONTRIBUTING.md | ~3 KB | Markdown | -| requirements.txt | ~0.5 KB | Text | -| setup.py | ~2 KB | Python | -| .env.example | ~0.4 KB | Text | -| LICENSE | ~1 KB | Text | -| CHANGELOG.md | ~2 KB | Markdown | -| CITATION.bib | ~0.3 KB | BibTeX | -| .gitignore | ~0.9 KB | Text | -| Prompts (all) | ~2 KB | Text | -| **Total** | **~43 KB** | - | - -## ๐Ÿ”„ File Dependencies - -### Installation Flow -1. Read `README.md` -2. Follow `INSTALLATION.md` -3. Configure `.env` from `.env.example` -4. Install using `requirements.txt` or `setup.py` - -### Usage Flow -1. Read `QUICKSTART.md` -2. Prepare data according to `DATA_FORMAT.md` -3. Run scripts (which use `prompts/`) -4. Analyze results - -### Development Flow -1. Read `CONTRIBUTING.md` -2. Setup dev environment from `requirements.txt` + dev tools -3. Follow code style in `CONTRIBUTING.md` -4. Update `CHANGELOG.md` with changes - -## ๐Ÿ“ Customization Guide - -### Which Files to Modify - -**For Your Institution/Project**: -- `README.md`: Update author information, contact details, repository URL -- `CITATION.bib`: Add actual authors and publication details -- `LICENSE`: Update copyright holder and year -- `setup.py`: Update package metadata and URLs - -**For Configuration**: -- `.env`: Add your actual API keys (don't commit this!) 
-- `requirements.txt`: Add or update dependencies as needed - -**For Custom Evaluation**: -- `prompts/`: Modify prompts to match your evaluation criteria -- `DATA_FORMAT.md`: Document any custom data formats - -**Don't Modify** (unless necessary): -- `.gitignore`: Standard exclusions work for most cases -- `CONTRIBUTING.md`: Generic guidelines applicable to most projects +## ๐Ÿ“‚ Source Code Files + +### src/evaluate_rubrics/evaluate_single_report.py +**Purpose**: Single report evaluation +**Content**: `RubricEvaluator` class and `evaluate_task_rubrics` function +**Usage**: Evaluate one markdown file against its rubrics +**Type**: Python module + +### src/evaluate_rubrics/evaluate_reports_batch.py +**Purpose**: Batch evaluation script +**Content**: Process all markdown files in `agent_responses/` +**Usage**: `python evaluate_reports_batch.py` +**Type**: Python script + +### src/calculate_metrics/calculate_compliance_score.py +**Purpose**: Compliance score calculation +**Content**: Calculate weighted compliance scores from evaluation results +**Usage**: `python calculate_compliance_score.py` +**Type**: Python script + +### src/__init__.py +**Purpose**: Package marker +**Content**: (typically empty) +**Type**: Python package file + +### tests/__init__.py +**Purpose**: Test package marker +**Content**: (typically empty) +**Type**: Python package file + +## ๐Ÿ“Š Data Files (Expected Structure) + +### data/researchrubrics/processed_data.jsonl +**Purpose**: Input data with rubrics and metadata +**Content**: One JSON object per line with prompts, sample IDs, and rubrics +**Format**: JSONL (JSON Lines) +**Type**: Input data file + +### data/researchrubrics/README.md +**Purpose**: Dataset documentation template +**Content**: Hugging Face dataset card template +**Type**: Documentation + +### agent_responses/[sample_id].md +**Purpose**: AI-generated research reports to evaluate +**Content**: Markdown-formatted research documents +**Format**: Markdown +**Type**: Input files + +### results/batch_evaluation_YYYYMMDD_HHMMSS.jsonl +**Purpose**: Evaluation results +**Content**: One evaluation result per line +**Format**: JSONL +**Type**: Output file + +## ๐Ÿ“ File Count Summary + +**Total Documentation**: 8 files +**Total Configuration**: 3 files +**Total Prompts**: 4 files +**Total Source Code**: 4 Python files +**Total Package Markers**: 2 files +**Expected Data Files**: Variable (3 sample markdown files in current repo) ## ๐Ÿ” Finding Information **"How do I install?"** โ†’ `INSTALLATION.md` **"How do I run it?"** โ†’ `QUICKSTART.md` **"What's the data format?"** โ†’ `DATA_FORMAT.md` -**"How do I contribute?"** โ†’ `CONTRIBUTING.md` +**"What's this project?"** โ†’ `README.md` +**"How do I set up?"** โ†’ `SETUP_GUIDE.md` or `FOLDER_STRUCTURE.md` **"What's the license?"** โ†’ `LICENSE` **"How do I cite?"** โ†’ `CITATION.bib` -**"What changed?"** โ†’ `CHANGELOG.md` -**"What's this project?"** โ†’ `README.md` - -## โœ… Pre-Release Checklist - -Before releasing, ensure: - -- [ ] Update `README.md` with correct repository URL -- [ ] Fill in actual authors in `CITATION.bib` -- [ ] Update copyright year in `LICENSE` -- [ ] Verify all URLs in documentation -- [ ] Update contact information -- [ ] Set correct version in `setup.py` and `CHANGELOG.md` -- [ ] Test installation instructions -- [ ] Verify all example code works -- [ ] Remove any sensitive information -- [ ] Update `.env.example` with correct variables - -## ๐Ÿ“ž Support - -For questions about specific files: -- **Installation issues**: See 
`INSTALLATION.md` troubleshooting section -- **Usage questions**: Check `QUICKSTART.md` examples -- **Data format**: Refer to `DATA_FORMAT.md` -- **Contributing**: Read `CONTRIBUTING.md` -- **Other**: Open an issue or contact maintainers +**"What files are there?"** โ†’ This file (`FILE_MANIFEST.md`) ## ๐Ÿ“ฆ Distribution When distributing this code release: -1. **Include all files listed above** -2. **Do NOT include**: - - `.env` (with actual keys) - - `data/` directories with actual data - - `cache/` directory - - `results/` directory - - `__pycache__/` directories -3. **Optional to include**: - - Sample datasets (if license permits) - - Example notebooks - - Test files - -## ๐Ÿ”— Related Files (Not in This Package) - -These files are part of your codebase but documented separately: - -- `src/extract_rubrics/*.py`: Rubric extraction scripts -- `src/evaluate_rubrics/*.py`: Evaluation scripts -- `src/calculate_metrics/*.py`: Metrics calculation scripts - -See the source code documentation and README.md for details on these files. +### Include: +- โœ… All documentation files (8 files) +- โœ… All configuration templates (requirements.txt, setup.py, .gitignore) +- โœ… All source code (src/ directory) +- โœ… All prompt templates (src/prompts/) +- โœ… Empty directory structure (data/, agent_responses/, results/, cache/, tests/) +- โœ… LICENSE and CITATION.bib + +### Do NOT Include: +- โŒ `.env` (with actual API keys) +- โŒ Actual data files (unless publicly shareable) +- โŒ `cache/` contents +- โŒ `results/` with actual evaluation outputs +- โŒ `__pycache__/` directories +- โŒ `.pyc` files + +## โœ… Verification Checklist + +After setup, ensure: +- [ ] All 8 documentation files present in root +- [ ] requirements.txt, setup.py, .gitignore in root +- [ ] All 4 prompt files in `src/prompts/` +- [ ] 2 evaluation scripts in `src/evaluate_rubrics/` +- [ ] 1 metrics script in `src/calculate_metrics/` +- [ ] `.env` created with LITELLM_API_KEY +- [ ] `data/researchrubrics/processed_data.jsonl` exists +- [ ] Markdown files in `agent_responses/` +- [ ] Dependencies installed --- -**Last Updated**: 2025-01-XX -**Version**: 1.0.0 -**Maintained By**: [Maintainer Name/Team] +**Last Updated**: 2025-11-13 +**Version**: 1.0.0 diff --git a/FOLDER_STRUCTURE.md b/FOLDER_STRUCTURE.md index a4dd50f..753136d 100644 --- a/FOLDER_STRUCTURE.md +++ b/FOLDER_STRUCTURE.md @@ -1,87 +1,64 @@ -# Deep Research Benchmarks - Folder Structure +# Research Rubrics - Folder Structure -This document describes the complete folder structure for the Deep Research Benchmarks release. +This document describes the complete folder structure for the Research Rubrics project. 
## ๐Ÿ“ Complete Directory Structure ``` -public_release_experiments/ +researchrubrics/ โ”‚ โ”œโ”€โ”€ ๐Ÿ“„ Documentation Files (root level) โ”‚ โ”œโ”€โ”€ README.md # Main documentation โ”‚ โ”œโ”€โ”€ QUICKSTART.md # Quick start guide โ”‚ โ”œโ”€โ”€ INSTALLATION.md # Installation guide โ”‚ โ”œโ”€โ”€ DATA_FORMAT.md # Data format specifications -โ”‚ โ”œโ”€โ”€ CONTRIBUTING.md # Contribution guidelines -โ”‚ โ”œโ”€โ”€ CHANGELOG.md # Version history +โ”‚ โ”œโ”€โ”€ FOLDER_STRUCTURE.md # This file โ”‚ โ”œโ”€โ”€ FILE_MANIFEST.md # File index +โ”‚ โ”œโ”€โ”€ SETUP_GUIDE.md # Setup instructions +โ”‚ โ”œโ”€โ”€ PACKAGE_SUMMARY.md # Package summary โ”‚ โ”œโ”€โ”€ LICENSE # MIT License โ”‚ โ””โ”€โ”€ CITATION.bib # BibTeX citation โ”‚ โ”œโ”€โ”€ โš™๏ธ Configuration Files (root level) โ”‚ โ”œโ”€โ”€ requirements.txt # Python dependencies โ”‚ โ”œโ”€โ”€ setup.py # Package configuration -โ”‚ โ”œโ”€โ”€ .env.example # Environment template -โ”‚ โ”œโ”€โ”€ .env # Your API keys (DO NOT COMMIT) +โ”‚ โ”œโ”€โ”€ .env # Your API key (DO NOT COMMIT) โ”‚ โ””โ”€โ”€ .gitignore # Git exclusions โ”‚ โ”œโ”€โ”€ ๐Ÿ“ฆ src/ # Source code โ”‚ โ”‚ -โ”‚ โ”œโ”€โ”€ extract_rubrics/ # Rubric extraction module -โ”‚ โ”‚ โ”œโ”€โ”€ __init__.py # Package marker (optional) -โ”‚ โ”‚ โ”œโ”€โ”€ extract_rubrics_batch.py # Batch extraction script -โ”‚ โ”‚ โ””โ”€โ”€ extract_rubrics_markitdown_onetask.py # Single task extractor +โ”‚ โ”œโ”€โ”€ __init__.py # Package marker โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ evaluate_rubrics/ # Rubric evaluation module -โ”‚ โ”‚ โ”œโ”€โ”€ __init__.py # Package marker (optional) -โ”‚ โ”‚ โ”œโ”€โ”€ evaluate_rubrics_batch.py # Batch evaluation script -โ”‚ โ”‚ โ”œโ”€โ”€ evaluate_rubrics_markitdown_onetask.py # Single task evaluator -โ”‚ โ”‚ โ””โ”€โ”€ prompts/ # Evaluation prompts -โ”‚ โ”‚ โ”œโ”€โ”€ binary/ # Binary evaluation prompts -โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ system_prompt.txt -โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ user_prompt_template.txt -โ”‚ โ”‚ โ””โ”€โ”€ ternary/ # Ternary evaluation prompts -โ”‚ โ”‚ โ”œโ”€โ”€ system_prompt.txt -โ”‚ โ”‚ โ””โ”€โ”€ user_prompt_template.txt +โ”‚ โ”‚ โ”œโ”€โ”€ evaluate_single_report.py # Single report evaluator +โ”‚ โ”‚ โ””โ”€โ”€ evaluate_reports_batch.py # Batch evaluation script โ”‚ โ”‚ -โ”‚ โ””โ”€โ”€ calculate_metrics/ # Metrics calculation module -โ”‚ โ”œโ”€โ”€ __init__.py # Package marker (optional) -โ”‚ โ”œโ”€โ”€ calculate_F1_score.py # F1 score calculation -โ”‚ โ”œโ”€โ”€ calculate_final_score.py # Weighted score calculation -โ”‚ โ””โ”€โ”€ calculate_failure_breakdown.py # Failure analysis +โ”‚ โ”œโ”€โ”€ calculate_metrics/ # Metrics calculation module +โ”‚ โ”‚ โ””โ”€โ”€ calculate_compliance_score.py # Compliance score calculation +โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€ prompts/ # Evaluation prompt templates +โ”‚ โ”œโ”€โ”€ system_prompt.txt # System prompt for evaluator +โ”‚ โ”œโ”€โ”€ user_prompt.txt # User prompt template +โ”‚ โ”œโ”€โ”€ chunk_prompt_template.txt # Prompt for chunk evaluation +โ”‚ โ””โ”€โ”€ synthesis_prompt_template.txt # Prompt for synthesizing chunks โ”‚ โ”œโ”€โ”€ ๐Ÿ“Š data/ # Data directory -โ”‚ โ”œโ”€โ”€ raw_csvs/ # Input: Raw CSV files -โ”‚ โ”‚ โ””โ”€โ”€ [your_csv_files.csv] -โ”‚ โ”œโ”€โ”€ processed_df/ # Output: Compiled datasets -โ”‚ โ”‚ โ”œโ”€โ”€ compiled_dataset.csv -โ”‚ โ”‚ โ””โ”€โ”€ compiled_dataset.parquet -โ”‚ โ”œโ”€โ”€ PDFs/ # Downloaded/generated PDFs -โ”‚ โ”‚ โ””โ”€โ”€ [task_name]/ # One directory per task -โ”‚ โ”‚ โ”œโ”€โ”€ gemini.pdf -โ”‚ โ”‚ โ”œโ”€โ”€ chatgpt.pdf -โ”‚ โ”‚ โ””โ”€โ”€ perplexity.pdf -โ”‚ โ””โ”€โ”€ predownloaded_pdfs/ # Optional: Pre-downloaded PDFs -โ”‚ โ””โ”€โ”€ [task_name]/ -โ”‚ โ”œโ”€โ”€ 
gemini.pdf -โ”‚ โ”œโ”€โ”€ chatgpt.pdf -โ”‚ โ””โ”€โ”€ perplexity.pdf +โ”‚ โ””โ”€โ”€ researchrubrics/ # Input data +โ”‚ โ”œโ”€โ”€ processed_data.jsonl # Rubrics and task metadata (JSONL) +โ”‚ โ””โ”€โ”€ README.md # Dataset documentation template +โ”‚ +โ”œโ”€โ”€ ๐Ÿ“ agent_responses/ # Input: Markdown reports to evaluate +โ”‚ โ”œโ”€โ”€ 683a58c9a7e7fe4e7695846f.md # AI-generated report (sample 1) +โ”‚ โ”œโ”€โ”€ 683a58c9a7e7fe4e7695848b.md # AI-generated report (sample 2) +โ”‚ โ””โ”€โ”€ 683a58c9a7e7fe4e7695848e.md # AI-generated report (sample 3) โ”‚ -โ”œโ”€โ”€ ๐Ÿ“ˆ results/ # Evaluation results -โ”‚ โ””โ”€โ”€ [mm_dd]/ # Results by date -โ”‚ โ””โ”€โ”€ [timestamp]/ # Results by timestamp -โ”‚ โ””โ”€โ”€ processed_df/ -โ”‚ โ”œโ”€โ”€ compiled_dataset.csv -โ”‚ โ””โ”€โ”€ compiled_dataset.parquet +โ”œโ”€โ”€ ๐Ÿ“ˆ results/ # Evaluation results (JSONL format) +โ”‚ โ””โ”€โ”€ batch_evaluation_YYYYMMDD_HHMMSS.jsonl # Timestamped results โ”‚ -โ”œโ”€โ”€ ๐Ÿ’พ cache/ # Cached conversions -โ”‚ โ””โ”€โ”€ [hash].md # Cached markdown conversions +โ”œโ”€โ”€ ๐Ÿ’พ cache/ # Reserved for future use โ”‚ -โ””โ”€โ”€ ๐Ÿงช tests/ # Test files (optional) - โ”œโ”€โ”€ __init__.py - โ”œโ”€โ”€ test_extract_rubrics.py - โ”œโ”€โ”€ test_evaluate_rubrics.py - โ””โ”€โ”€ test_calculate_metrics.py +โ””โ”€โ”€ ๐Ÿงช tests/ # Test files + โ””โ”€โ”€ __init__.py # Package marker ``` ## ๐Ÿ“‹ Setup Instructions @@ -90,87 +67,52 @@ public_release_experiments/ ```bash # Navigate to your project root -cd public_release_experiments +cd researchrubrics # Create all required directories -mkdir -p src/extract_rubrics -mkdir -p src/evaluate_rubrics/prompts/binary -mkdir -p src/evaluate_rubrics/prompts/ternary +mkdir -p src/evaluate_rubrics mkdir -p src/calculate_metrics -mkdir -p data/raw_csvs -mkdir -p data/processed_df -mkdir -p data/PDFs -mkdir -p data/predownloaded_pdfs +mkdir -p src/prompts +mkdir -p data/researchrubrics +mkdir -p agent_responses mkdir -p results mkdir -p cache mkdir -p tests ``` -### Step 2: Place Documentation Files - -All documentation files go in the root `public_release_experiments/` directory: +### Step 2: Create .env File ```bash -# In public_release_experiments/ -cp /path/to/README.md . -cp /path/to/QUICKSTART.md . -cp /path/to/INSTALLATION.md . -cp /path/to/DATA_FORMAT.md . -cp /path/to/CONTRIBUTING.md . -cp /path/to/CHANGELOG.md . -cp /path/to/FILE_MANIFEST.md . -cp /path/to/LICENSE . -cp /path/to/CITATION.bib . +# In researchrubrics/ root +echo "LITELLM_API_KEY=your_api_key_here" > .env ``` -### Step 3: Place Configuration Files +### Step 3: Install Dependencies ```bash -# In public_release_experiments/ -cp /path/to/requirements.txt . -cp /path/to/setup.py . -cp /path/to/.gitignore . -cp /path/to/.env.example . 
- -# Create your .env file -cp .env.example .env -# Edit .env and add your API key +# In researchrubrics/ root +pip install -r requirements.txt ``` -### Step 4: Place Source Code +### Step 4: Verify Data Files -Your existing code files go in their respective directories: +Ensure your input data is in place: ```bash -# Extract rubrics module -cp /path/to/extract_rubrics_batch.py src/extract_rubrics/ -cp /path/to/extract_rubrics_markitdown_onetask.py src/extract_rubrics/ - -# Evaluate rubrics module -cp /path/to/evaluate_rubrics_batch.py src/evaluate_rubrics/ -cp /path/to/evaluate_rubrics_markitdown_onetask.py src/evaluate_rubrics/ - -# Prompts -cp /path/to/prompts/binary/system_prompt.txt src/evaluate_rubrics/prompts/binary/ -cp /path/to/prompts/binary/user_prompt_template.txt src/evaluate_rubrics/prompts/binary/ -cp /path/to/prompts/ternary/system_prompt.txt src/evaluate_rubrics/prompts/ternary/ -cp /path/to/prompts/ternary/user_prompt_template.txt src/evaluate_rubrics/prompts/ternary/ - -# Calculate metrics module -cp /path/to/calculate_F1_score.py src/calculate_metrics/ -cp /path/to/calculate_final_score.py src/calculate_metrics/ -cp /path/to/calculate_failure_breakdown.py src/calculate_metrics/ -``` +# Check that processed_data.jsonl exists +ls data/researchrubrics/processed_data.jsonl -### Step 5: Add __init__.py Files (Optional) +# Check that markdown reports exist +ls agent_responses/*.md +``` -For proper Python package structure: +### Step 5: Test the Installation ```bash -touch src/__init__.py -touch src/extract_rubrics/__init__.py -touch src/evaluate_rubrics/__init__.py -touch src/calculate_metrics/__init__.py +# Test evaluation module (from project root) +cd src/evaluate_rubrics +python -c "from evaluate_single_report import RubricEvaluator; print('โœ“ Module OK')" +cd ../.. ``` ## ๐ŸŽฏ Key Directory Purposes @@ -180,27 +122,25 @@ All user-facing documentation lives at the root level for easy discovery. 
### src/ Contains all Python source code, organized by functionality: -- **extract_rubrics/**: CSV processing and rubric extraction -- **evaluate_rubrics/**: LLM-based evaluation - - **prompts/binary/**: Binary evaluation prompts (2 classes) - - **prompts/ternary/**: Ternary evaluation prompts (3 classes) -- **calculate_metrics/**: Metric computation and analysis +- **evaluate_rubrics/**: LLM-based rubric evaluation scripts +- **calculate_metrics/**: Compliance score calculation +- **prompts/**: Evaluation prompt templates (system, user, chunk, synthesis) ### data/ -All data files, organized by stage: -- **raw_csvs/**: Your input CSV files -- **processed_df/**: Compiled datasets after extraction -- **PDFs/**: PDFs organized by task name (task_name/gemini.pdf, chatgpt.pdf, perplexity.pdf) -- **predownloaded_pdfs/**: Optional backup PDFs in same structure +Input data directory: +- **researchrubrics/**: Contains `processed_data.jsonl` with rubrics and task metadata + +### agent_responses/ +Markdown reports to be evaluated: +- Each file named with its `sample_id` (e.g., `683a58c9a7e7fe4e7695846f.md`) ### results/ -Evaluation outputs, automatically organized by date and timestamp: -- Format: `results/MM_DD/YYYYMMDD_HHMMSS/processed_df/` +Evaluation outputs in JSONL format: +- Format: `batch_evaluation_YYYYMMDD_HHMMSS.jsonl` +- One JSON object per line, each representing a rubric evaluation ### cache/ -Temporary cached files (markdown conversions): -- Auto-generated, can be deleted safely -- Improves performance on repeated evaluations +Reserved for future use (currently unused) ## ๐Ÿ” Path References in Code @@ -208,17 +148,16 @@ The code uses these path patterns: ```python # From any script in src/[module]/ -base_dir = Path(__file__).parent.parent.parent # Goes to public_release_experiments/ +base_dir = Path(__file__).parent.parent.parent # Goes to researchrubrics/ # Common paths used in code: -csv_path = base_dir / 'data' / 'raw_csvs' / 'file.csv' -compiled = base_dir / 'data' / 'processed_df' / 'compiled_dataset.csv' -pdf_dir = base_dir / 'data' / 'PDFs' / task_name -results = base_dir / 'results' / date / timestamp +data_file = base_dir / 'data' / 'researchrubrics' / 'processed_data.jsonl' +agent_responses = base_dir / 'agent_responses' +results = base_dir / 'results' cache = base_dir / 'cache' # For prompts (from evaluate_rubrics/): -prompts_dir = Path(__file__).parent / 'prompts' / prompt_type +prompts_dir = Path(__file__).parent.parent / 'prompts' ``` ## โœ… Verification @@ -262,20 +201,15 @@ requirements.txt pip install -r requirements.txt # 2. Configure API key -cp .env.example .env -# Edit .env and add OPENAI_API_KEY - -# 3. Run extraction -cd src/extract_rubrics -python extract_rubrics_batch.py +echo "LITELLM_API_KEY=your_key_here" > .env -# 4. Run evaluation -cd ../evaluate_rubrics -python evaluate_rubrics_batch.py +# 3. Run batch evaluation +cd src/evaluate_rubrics +python evaluate_reports_batch.py -# 5. Calculate metrics +# 4. Calculate compliance scores cd ../calculate_metrics -python calculate_F1_score.py +python calculate_compliance_score.py ``` ## ๐Ÿ“ฆ For Distribution diff --git a/INSTALLATION.md b/INSTALLATION.md index 52ef31b..03fabf4 100644 --- a/INSTALLATION.md +++ b/INSTALLATION.md @@ -1,6 +1,6 @@ # Installation Guide -Detailed installation instructions for the Deep Research Benchmarks codebase. +Detailed installation instructions for the Research Rubrics codebase. 
## System Requirements @@ -8,7 +8,7 @@ Detailed installation instructions for the Deep Research Benchmarks codebase. - Python 3.8 or higher - 4 GB RAM - 2 GB disk space (for code and dependencies) -- Internet connection (for API calls and PDF downloads) +- Internet connection (for API calls) ### Recommended Requirements - Python 3.10+ @@ -23,21 +23,21 @@ Detailed installation instructions for the Deep Research Benchmarks codebase. ```bash # Clone the repository git clone -cd public_release_experiments +cd researchrubrics # Install using pip pip install -r requirements.txt # Verify installation -python -c "import pandas, litellm, markitdown; print('Installation successful!')" +python -c "import pandas, litellm, tqdm; print('Installation successful!')" ``` ### Method 2: conda environment ```bash # Create conda environment -conda create -n deep-research python=3.10 -conda activate deep-research +conda create -n researchrubrics python=3.10 +conda activate researchrubrics # Install dependencies pip install -r requirements.txt @@ -66,131 +66,84 @@ For contributors who want to modify the code: ```bash # Clone repository git clone -cd public_release_experiments +cd researchrubrics # Install in editable mode with dev dependencies pip install -e . -pip install pytest pytest-asyncio black flake8 +pip install pytest pytest-asyncio ``` ## Dependency Details ### Core Dependencies -#### pandas (>=1.3.0) +#### pandas (>=2.0.0) Data manipulation and analysis ```bash pip install pandas ``` -#### numpy (>=1.20.0) -Numerical computing -```bash -pip install numpy -``` - #### litellm (>=1.0.0) -LLM API client for multiple providers +LLM API client for accessing Gemini 2.5 Pro ```bash pip install litellm ``` -#### markitdown (>=0.1.0) -PDF to Markdown conversion -```bash -pip install markitdown -``` - -#### PyPDF2 (>=3.0.0) -Fallback PDF text extraction -```bash -pip install PyPDF2 -``` - -#### scikit-learn (>=1.0.0) -Machine learning metrics -```bash -pip install scikit-learn -``` - #### tqdm (>=4.60.0) -Progress bars +Progress bars for batch processing ```bash pip install tqdm ``` ### Optional Dependencies -#### pyarrow (>=10.0.0) -Parquet file support (recommended) -```bash -pip install pyarrow -``` - #### pytest and pytest-asyncio For running tests ```bash pip install pytest pytest-asyncio ``` -#### black and flake8 -Code formatting and linting -```bash -pip install black flake8 -``` - ## Configuration ### 1. API Key Setup -Create a `.env` file from the template: +Create a `.env` file in the project root: ```bash -cp .env.example .env +# From project root +echo "LITELLM_API_KEY=your_api_key_here" > .env ``` -Edit `.env` and add your API key: - +Or manually create `.env`: ```bash -# Open in your preferred editor nano .env -# or -vim .env -# or -code .env -``` - -Add your API key: -``` -OPENAI_API_KEY=sk-your-api-key-here -``` - -### 2. Custom API Endpoint (Optional) - -If using a custom LiteLLM proxy or alternative endpoint: - -``` -OPENAI_API_KEY=your_key -API_BASE_URL=https://your-endpoint.com +# Add: LITELLM_API_KEY=your_api_key_here ``` -### 3. Model Configuration (Optional) +### 2. 
Verify API Key -``` -MODEL_NAME=gpt-5 -MAX_CONCURRENT_REQUESTS=20 -BINARY_EVALUATION=False +```python +import os +from pathlib import Path + +# Load .env file +env_file = Path('.env') +if env_file.exists(): + with open(env_file) as f: + for line in f: + if line.startswith('LITELLM_API_KEY='): + print('API key configured โœ“') +else: + print('No .env file found - create one with LITELLM_API_KEY') ``` ## Directory Structure Setup -The installation should create these directories automatically, but you can create them manually if needed: +The installation should create these directories automatically if they don't exist, but you can create them manually if needed: ```bash -mkdir -p data/raw_csvs -mkdir -p data/processed_df -mkdir -p data/PDFs -mkdir -p data/predownloaded_pdfs +mkdir -p data/researchrubrics +mkdir -p agent_responses mkdir -p results mkdir -p cache ``` @@ -200,48 +153,40 @@ mkdir -p cache ### Test Basic Functionality ```bash -# Test extract module (from project root) -python -c "import sys; sys.path.insert(0, 'src/extract_rubrics'); from extract_rubrics_markitdown_onetask import RubricExtractor; print('Extract module OK')" - # Test evaluate module (from project root) -python -c "import sys; sys.path.insert(0, 'src/evaluate_rubrics'); from evaluate_rubrics_markitdown_onetask import RubricEvaluator; print('Evaluate module OK')" - -# Test metrics module (from project root) -python -c "import sys; sys.path.insert(0, 'src/calculate_metrics'); from calculate_F1_score import calculate_macro_f1_per_task; print('Metrics module OK')" -``` - -Or run from their directories: - -```bash -cd src/extract_rubrics -python -c "from extract_rubrics_markitdown_onetask import RubricExtractor; print('Extract module OK')" - -cd ../evaluate_rubrics -python -c "from evaluate_rubrics_markitdown_onetask import RubricEvaluator; print('Evaluate module OK')" +cd src/evaluate_rubrics +python -c "from evaluate_single_report import RubricEvaluator; print('Evaluate module OK')" +# Test metrics module cd ../calculate_metrics -python -c "from calculate_F1_score import calculate_macro_f1_per_task; print('Metrics module OK')" +python -c "from calculate_compliance_score import calculate_compliance_score; print('Metrics module OK')" + +cd ../.. 
# Back to project root ``` ### Test API Connection -```bash -python -c " +```python import os -from dotenv import load_dotenv -load_dotenv() -api_key = os.getenv('OPENAI_API_KEY') -if api_key and api_key != 'your_api_key_here': - print('API key configured โœ“') -else: - print('API key not configured - edit .env file') -" -``` - -### Run Test Suite (if available) - -```bash -pytest tests/ +from pathlib import Path + +# Try to load API key +env_file = Path('.env') +if not env_file.exists(): + print('ERROR: .env file not found') + exit(1) + +with open(env_file) as f: + for line in f: + if 'LITELLM_API_KEY=' in line: + key = line.split('=')[1].strip() + if key and key != 'your_api_key_here': + print('โœ“ API key configured') + else: + print('ERROR: API key not set in .env file') + break + else: + print('ERROR: LITELLM_API_KEY not found in .env file') ``` ## Platform-Specific Instructions @@ -284,7 +229,7 @@ pip3 install -r requirements.txt # Install Python from python.org or Microsoft Store # Ensure pip is included in the installation -# Open PowerShell or Command Prompt as Administrator (if needed) +# Open PowerShell or Command Prompt # Follow standard installation steps pip install -r requirements.txt @@ -293,8 +238,7 @@ pip install -r requirements.txt #### Windows-Specific Notes - Use backslashes (`\`) in paths or use raw strings in Python -- Some packages may require Microsoft C++ Build Tools -- Download from: https://visualstudio.microsoft.com/visual-cpp-build-tools/ +- Ensure Python is added to PATH during installation ## Troubleshooting @@ -336,34 +280,25 @@ pip install --upgrade pip pip install --trusted-host pypi.org --trusted-host pypi.python.org -r requirements.txt ``` -### Issue: markitdown installation fails - -**Solution**: Install system dependencies -```bash -# macOS -brew install poppler - -# Ubuntu/Debian -sudo apt-get install poppler-utils - -# Then retry -pip install markitdown -``` - ### Issue: Can't find .env file -**Solution**: Ensure .env is in the project root (public_release_experiments/) +**Solution**: Ensure .env is in the project root ```bash # From project root -cd public_release_experiments ls -la .env -# If missing, create from template -cp .env.example .env -# Edit and add your OPENAI_API_KEY +# If missing, create it +echo "LITELLM_API_KEY=your_api_key_here" > .env ``` -**Note**: The evaluation scripts look for `.env` in the project root (`public_release_experiments/.env`), not in the script directory. The code automatically searches up the directory tree from `src/evaluate_rubrics/` to find it. +**Note**: The evaluation scripts look for `.env` in the project root (`researchrubrics/.env`), not in the script directory. + +### Issue: litellm import error + +**Solution**: Ensure litellm is installed with correct version +```bash +pip install --upgrade litellm +``` ## Upgrading @@ -394,21 +329,6 @@ rm -rf venv/ rm -rf conda_env/ ``` -## Docker Installation (Alternative) - -For a containerized installation (if Dockerfile is provided): - -```bash -# Build Docker image -docker build -t deep-research-benchmarks . - -# Run container -docker run -v $(pwd)/data:/app/data \ - -v $(pwd)/results:/app/results \ - -e OPENAI_API_KEY=your_key \ - deep-research-benchmarks -``` - ## Next Steps After successful installation: @@ -416,7 +336,7 @@ After successful installation: 1. Review [QUICKSTART.md](QUICKSTART.md) for usage examples 2. Read [README.md](README.md) for comprehensive documentation 3. Check [DATA_FORMAT.md](DATA_FORMAT.md) for data specifications -4. 
Run the example workflow to verify everything works +4. Follow [SETUP_GUIDE.md](SETUP_GUIDE.md) for complete setup ## Getting Help @@ -436,7 +356,7 @@ Current version: 1.0.0 To check installed package versions: ```bash -pip list | grep -E "pandas|litellm|markitdown|scikit-learn" +pip list | grep -E "pandas|litellm|tqdm" ``` ## License diff --git a/PACKAGE_SUMMARY.md b/PACKAGE_SUMMARY.md index fa66c46..d75f1bd 100644 --- a/PACKAGE_SUMMARY.md +++ b/PACKAGE_SUMMARY.md @@ -1,228 +1,281 @@ -# Deep Research Benchmarks - Release Package Summary - -## ๐Ÿ“ฆ Package Contents - -**Total Size**: ~94 KB -**Total Files**: 22 files (18 root files + 4 prompt files) - -### โœ… What's Included - -#### Documentation Files (12 files - 79 KB) -1. โœ… **README.md** (9.8 KB) - Main documentation and entry point -2. โœ… **QUICKSTART.md** (6.5 KB) - Quick start with examples -3. โœ… **INSTALLATION.md** (9.0 KB) - Detailed installation guide -4. โœ… **DATA_FORMAT.md** (8.7 KB) - Data format specifications -5. โœ… **FOLDER_STRUCTURE.md** (11 KB) - Directory organization guide -6. โœ… **FILE_MANIFEST.md** (8.4 KB) - Complete file index -7. โœ… **SETUP_GUIDE.md** (9.1 KB) - Step-by-step setup instructions -8. โœ… **CONTRIBUTING.md** (3.1 KB) - Contribution guidelines -9. โœ… **CHANGELOG.md** (1.9 KB) - Version history -10. โœ… **RELEASE_CHECKLIST.md** (7.2 KB) - Pre-publication checklist -11. โœ… **LICENSE** (1.1 KB) - MIT License -12. โœ… **CITATION.bib** (327 B) - BibTeX citation - -#### Configuration Files (4 files - 6.4 KB) -13. โœ… **requirements.txt** (494 B) - Python dependencies -14. โœ… **setup.py** (1.9 KB) - Package installation configuration -15. โœ… **.env.example** (382 B) - Environment variables template -16. โœ… **.gitignore** (901 B) - Git exclusions - -#### Setup Scripts (2 files - 6.3 KB) -17. โœ… **setup_structure.sh** (3.2 KB) - Unix/Linux/macOS setup script -18. โœ… **setup_structure.bat** (3.1 KB) - Windows setup script - -#### Evaluation Prompts (4 files - 3.8 KB) -19. โœ… **prompts/binary/system_prompt.txt** (861 B) -20. โœ… **prompts/binary/user_prompt_template.txt** (929 B) -21. โœ… **prompts/ternary/system_prompt.txt** (932 B) -22. โœ… **prompts/ternary/user_prompt_template.txt** (1.1 KB) - -## ๐ŸŽฏ What You Need to Do Next - -### CRITICAL: These files are NOT included (you need to add them): - -#### Your Python Source Code (7 files) -- โŒ `src/extract_rubrics/extract_rubrics_batch.py` - YOUR FILE -- โŒ `src/extract_rubrics/extract_rubrics_markitdown_onetask.py` - YOUR FILE -- โŒ `src/evaluate_rubrics/evaluate_rubrics_batch.py` - YOUR FILE -- โŒ `src/evaluate_rubrics/evaluate_rubrics_markitdown_onetask.py` - YOUR FILE -- โŒ `src/calculate_metrics/calculate_F1_score.py` - YOUR FILE -- โŒ `src/calculate_metrics/calculate_final_score.py` - YOUR FILE -- โŒ `src/calculate_metrics/calculate_failure_breakdown.py` - YOUR FILE - -**These are YOUR existing Python files from the `iclr_paper/public_release_experiments/src/` directory in your uploaded documents.** - -#### Your Data (not included in release) -- โŒ `data/raw_csvs/` - Your CSV evaluation files -- โŒ `data/PDFs/` - Your generated PDFs (if any) -- โŒ `.env` - Your actual API key - -## ๐Ÿ“‹ Quick Setup Steps - -### 1. Create Project Directory -```bash -mkdir public_release_experiments -cd public_release_experiments -``` +# Research Rubrics - Package Summary -### 2. 
Copy All Release Files -Place all 22 files from this package into `public_release_experiments/`: -- All .md files go in root -- All configuration files go in root -- prompts/ directory with its contents +## ๐Ÿ“ฆ Package Overview -### 3. Run Setup Script -```bash -# Unix/Mac -chmod +x setup_structure.sh -./setup_structure.sh +**Project Name**: Research Rubrics +**Version**: 1.0.0 +**Purpose**: Evaluate AI-generated research documents against structured rubric criteria using LLMs +**License**: MIT -# Windows -setup_structure.bat -``` +## ๐ŸŽฏ What This Package Does -This creates the complete folder structure. +Research Rubrics is a Python-based evaluation framework that: -### 4. Add Your Source Code -Copy your 7 Python files to their locations: -```bash -# Your files from iclr_paper/public_release_experiments/src/ -cp path/to/extract_rubrics_*.py src/extract_rubrics/ -cp path/to/evaluate_rubrics_*.py src/evaluate_rubrics/ -cp path/to/calculate_*.py src/calculate_metrics/ -``` +1. **Evaluates** markdown-formatted research reports against detailed rubric criteria +2. **Uses** Gemini 2.5 Pro (via LiteLLM) for intelligent, context-aware evaluation +3. **Provides** binary grading (Satisfied/Not Satisfied) with confidence scores +4. **Calculates** weighted compliance scores based on rubric importance +5. **Handles** large documents through automatic chunking and synthesis +6. **Supports** batch processing with concurrent API calls -### 5. Move Prompts to Correct Location -```bash -# Prompts need to go inside src/evaluate_rubrics/ -mv prompts/binary/* src/evaluate_rubrics/prompts/binary/ -mv prompts/ternary/* src/evaluate_rubrics/prompts/ternary/ -rm -rf prompts/ # Remove now-empty directory -``` +## ๐Ÿ“ Package Contents -### 6. Configure Environment -```bash -cp .env.example .env -# Edit .env and add your OPENAI_API_KEY -``` +### Total Files + +- **Documentation**: 8 markdown files +- **Configuration**: 3 files (requirements.txt, setup.py, .gitignore) +- **Source Code**: 4 Python files +- **Prompt Templates**: 4 text files +- **Data**: 1 JSONL file + 3 sample markdown files + +### Documentation Files (Root Directory) + +| File | Purpose | Size | +|------|---------|------| +| README.md | Main documentation | ~15 KB | +| QUICKSTART.md | Quick start guide | ~8 KB | +| INSTALLATION.md | Installation instructions | ~10 KB | +| DATA_FORMAT.md | Data format specifications | ~11 KB | +| FOLDER_STRUCTURE.md | Directory organization | ~12 KB | +| FILE_MANIFEST.md | Complete file index | ~9 KB | +| SETUP_GUIDE.md | Step-by-step setup | ~8 KB | +| PACKAGE_SUMMARY.md | This file | ~5 KB | + +### Configuration Files + +| File | Purpose | +|------|---------| +| requirements.txt | Python dependencies (pandas, litellm, tqdm) | +| setup.py | Package installation configuration | +| .gitignore | Git exclusions | -### 7. Install Dependencies -```bash -pip install -r requirements.txt +### Source Code (`src/` directory) + +``` +src/ +โ”œโ”€โ”€ __init__.py +โ”œโ”€โ”€ evaluate_rubrics/ +โ”‚ โ”œโ”€โ”€ evaluate_single_report.py (~640 lines) +โ”‚ โ””โ”€โ”€ evaluate_reports_batch.py (~155 lines) +โ”œโ”€โ”€ calculate_metrics/ +โ”‚ โ””โ”€โ”€ calculate_compliance_score.py (~52 lines) +โ””โ”€โ”€ prompts/ + โ”œโ”€โ”€ system_prompt.txt + โ”œโ”€โ”€ user_prompt.txt + โ”œโ”€โ”€ chunk_prompt_template.txt + โ””โ”€โ”€ synthesis_prompt_template.txt ``` -### 8. 
Test -```bash -cd src/extract_rubrics -python -c "from extract_rubrics_markitdown_onetask import RubricExtractor; print('OK')" +**Total Source Code**: ~850 lines of Python + +### Data Files + +``` +data/researchrubrics/ +โ”œโ”€โ”€ processed_data.jsonl (5 sample tasks) +โ””โ”€โ”€ README.md (Dataset template) + +agent_responses/ +โ”œโ”€โ”€ 683a58c9a7e7fe4e7695846f.md +โ”œโ”€โ”€ 683a58c9a7e7fe4e7695848b.md +โ””โ”€โ”€ 683a58c9a7e7fe4e7695848e.md ``` -## ๐Ÿ“– Documentation Guide +## ๐Ÿ”ง Core Components -### Start Here: -1. **SETUP_GUIDE.md** โ† Read this first for complete setup instructions -2. **README.md** โ† Overview of the project -3. **FOLDER_STRUCTURE.md** โ† Understand the directory layout +### 1. Rubric Evaluator (`evaluate_single_report.py`) -### For Installation: -4. **INSTALLATION.md** โ† Detailed installation with troubleshooting +**Main Class**: `RubricEvaluator` -### For Usage: -5. **QUICKSTART.md** โ† Practical examples and quick start -6. **DATA_FORMAT.md** โ† Understanding data formats +**Key Features**: +- Async LLM API calls with retry logic +- Automatic document chunking for large files +- Configurable concurrency (default: 20 concurrent requests) +- Token usage and cost tracking +- Confidence scoring -### Before Publishing: -7. **RELEASE_CHECKLIST.md** โ† Complete before pushing to GitHub +**Key Functions**: +- `evaluate_task_rubrics()` - Evaluate one markdown file +- `evaluate_single_rubric()` - Evaluate one rubric +- `chunk_document()` - Split large documents +- `synthesize_verdicts()` - Combine chunk results -### Reference: -8. **FILE_MANIFEST.md** โ† Index of all files -9. **CONTRIBUTING.md** โ† For contributors +### 2. Batch Evaluator (`evaluate_reports_batch.py`) -## โœ… Verification Checklist +**Main Function**: `evaluate_all_reports()` -After setup, verify: -- [ ] All 22 release files in place -- [ ] All 7 Python source files added -- [ ] Prompts in `src/evaluate_rubrics/prompts/` -- [ ] `.env` created and configured -- [ ] Dependencies installed -- [ ] Test imports work -- [ ] Folder structure matches FOLDER_STRUCTURE.md +**Features**: +- Processes all markdown files in `agent_responses/` +- Generates timestamped JSONL output +- Tracks total cost and token usage +- Displays progress with tqdm -## ๐Ÿšจ Important Notes +### 3. Compliance Calculator (`calculate_compliance_score.py`) -### DO NOT commit to Git: -- โŒ `.env` (has your API key) -- โŒ `data/` contents (your actual data) -- โŒ `cache/` contents -- โŒ `results/` contents -- โŒ `__pycache__/` directories +**Main Function**: `calculate_compliance_score()` -**The .gitignore file handles this automatically.** +**Features**: +- Calculates weighted compliance scores +- Handles negative-weight (penalty) rubrics +- Displays per-sample and aggregate scores -### DO commit to Git: -- โœ… All documentation files -- โœ… Configuration templates (.env.example) -- โœ… Source code files -- โœ… Prompt files -- โœ… Setup scripts -- โœ… requirements.txt, setup.py +### 4. 
Prompt Templates + +- **system_prompt.txt**: System-level instructions for the LLM +- **user_prompt.txt**: Template for rubric evaluation requests +- **chunk_prompt_template.txt**: Template for chunk-level evaluation +- **synthesis_prompt_template.txt**: Template for synthesizing results + +## ๐Ÿ“Š Data Flow + +``` +Input: + processed_data.jsonl (rubrics) + agent_responses/*.md (reports) + โ†“ + Evaluation Process + (Gemini 2.5 Pro) + โ†“ +Output: + batch_evaluation_*.jsonl (results) + โ†“ + Metrics Calculation + โ†“ + Compliance Scores +``` -## ๐ŸŽ“ Documentation Quality +## ๐ŸŽ“ Key Dependencies -All documentation has been: -- โœ… Consistent with your actual code structure -- โœ… Updated with correct import paths -- โœ… Verified against your source files -- โœ… Tested for accuracy -- โœ… Formatted for GitHub markdown -- โœ… Includes practical examples -- โœ… Cross-referenced between documents +| Package | Version | Purpose | +|---------|---------|---------| +| pandas | >=2.0.0 | Data manipulation | +| litellm | >=1.0.0 | LLM API access | +| tqdm | >=4.60.0 | Progress bars | -## ๐Ÿ“Š File Sizes +## ๐Ÿ’พ Storage Requirements + +- **Code**: ~1 MB +- **Dependencies**: ~50 MB (pandas, litellm, etc.) +- **Data**: Variable (depends on number of reports) +- **Results**: ~1 KB per rubric evaluation +- **Cache**: Minimal (reserved for future use) + +## ๐Ÿš€ Performance Characteristics + +- **Single Rubric Evaluation**: ~5-15 seconds +- **Batch Processing**: 20 reports concurrently (default) +- **Token Usage**: 3,000-10,000 tokens per evaluation +- **Cost**: ~$0.01-$0.05 per rubric (Gemini 2.5 Pro) + +## ๐Ÿ“ˆ Scalability + +- **Reports**: Can handle 100+ reports in one batch +- **Rubrics per Report**: Tested with 20+ rubrics per report +- **Concurrent Requests**: Configurable (5-30 recommended) +- **Document Length**: Automatic chunking for large documents + +## ๐Ÿ” Security Considerations + +- **API Key**: Stored in `.env` file (not committed to Git) +- **Data Privacy**: All processing local except LLM API calls +- **No Data Persistence**: Evaluation happens in memory +- **.gitignore**: Protects sensitive files from version control + +## ๐ŸŽฏ Use Cases + +1. **Research Paper Evaluation**: Grade AI-generated research reports +2. **Content Quality Assessment**: Evaluate content against criteria +3. **Automated Grading**: Batch process student submissions +4. 
**Benchmark Testing**: Compare different AI models' outputs + +## ๐Ÿ“– Documentation Structure ``` -Total Package: 94 KB -Documentation: 79 KB (84%) -Configuration: 6 KB (6%) -Setup Scripts: 6 KB (7%) -Prompts: 4 KB (4%) +Start Here: + README.md + โ†“ +Quick Start: + QUICKSTART.md + SETUP_GUIDE.md + โ†“ +Reference: + DATA_FORMAT.md + FOLDER_STRUCTURE.md + FILE_MANIFEST.md + โ†“ +Detailed Setup: + INSTALLATION.md ``` -## ๐Ÿš€ Ready for Publication +## โœ… Quality Assurance + +- **Code Quality**: Well-structured, documented functions +- **Error Handling**: Comprehensive try-catch blocks +- **Logging**: Detailed logging for debugging +- **Retry Logic**: Exponential backoff for API failures +- **Validation**: Input validation for data files + +## ๐Ÿ”„ Extensibility + +Easy to extend: +- **New Models**: Change `model` parameter +- **Custom Prompts**: Modify prompt templates +- **Different Grading**: Modify verdict logic +- **Additional Metrics**: Add to `calculate_metrics/` + +## ๐Ÿ“ฆ Distribution Checklist + +When distributing: + +โœ… **Include**: +- All documentation files +- Configuration templates +- Source code +- Prompt templates +- LICENSE and CITATION.bib +- Empty directory structure + +โŒ **Exclude**: +- `.env` file with real API keys +- Actual evaluation results +- Cache files +- `__pycache__` directories + +## ๐ŸŽ“ Educational Value + +This package demonstrates: +- Async Python programming +- LLM API integration +- Batch processing patterns +- Document chunking strategies +- Error handling best practices +- Configuration management -Once you've completed setup and added your source code: +## ๐Ÿ“ž Support -1. **Review**: RELEASE_CHECKLIST.md -2. **Update**: Replace [Authors], [URLs], etc. -3. **Test**: Fresh install in new environment -4. **Commit**: Initialize git and push to GitHub -5. **Release**: Create v1.0.0 release on GitHub -6. **Announce**: Share with community +**Documentation**: See all `.md` files in root directory +**Issues**: GitHub Issues +**Updates**: Git pull from main branch -## ๐Ÿ“ž Questions? +## ๐Ÿ”— Related Resources -- Setup issues? โ†’ See **SETUP_GUIDE.md** -- Installation problems? โ†’ See **INSTALLATION.md** -- Usage questions? โ†’ See **QUICKSTART.md** -- Structure confusion? โ†’ See **FOLDER_STRUCTURE.md** +- LiteLLM Documentation: https://docs.litellm.ai/ +- Pandas Documentation: https://pandas.pydata.org/ +- Python AsyncIO: https://docs.python.org/3/library/asyncio.html -## ๐ŸŽฏ Success Criteria +## ๐Ÿ“Š Package Statistics -Your release is ready when: -1. โœ… Fresh clone + pip install works -2. โœ… All source code in place -3. โœ… All documentation complete -4. โœ… No sensitive info in repo -5. โœ… Example workflow runs -6. โœ… All links work -7. โœ… Tests pass (if any) +- **Total Lines of Code**: ~850 (Python) +- **Total Lines of Documentation**: ~2,000 (Markdown) +- **Test Coverage**: Minimal (ready for expansion) +- **Python Version**: 3.8+ +- **Platform**: Cross-platform (Windows, macOS, Linux) --- **Package Version**: 1.0.0 -**Generated**: 2025-01-XX -**Documentation Updated**: โœ… Consistent with actual code -**Ready for Release**: After adding your 7 Python source files +**Release Date**: 2025-11-13 +**Maintained By**: Research Team +**Status**: Production Ready -**Next Step**: Read **SETUP_GUIDE.md** for detailed instructions! +**Next Steps**: Read [README.md](README.md) to get started! 
diff --git a/QUICKSTART.md b/QUICKSTART.md index 58f2e5a..03a71fb 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -1,98 +1,93 @@ # Quick Start Guide -This guide will help you get started with the Deep Research Benchmarks codebase in minutes. +This guide will help you get started with the Research Rubrics codebase in minutes. ## Prerequisites - Python 3.8+ -- OpenAI API key (or compatible endpoint) +- LiteLLM API key (for accessing Gemini 2.5 Pro) ## Installation ```bash # Clone the repository git clone -cd public_release_experiments +cd researchrubrics # Install dependencies pip install -r requirements.txt # Set up environment -cp .env.example .env -# Edit .env and add your OPENAI_API_KEY +echo "LITELLM_API_KEY=your_api_key_here" > .env ``` ## Basic Workflow -### Step 1: Extract Rubrics +### Step 1: Prepare Your Data -Process your raw CSV files to extract rubrics and ground truth: +Ensure you have: +1. `data/researchrubrics/processed_data.jsonl` - Contains rubrics and prompts +2. Markdown files in `agent_responses/` - Named with sample IDs (e.g., `683a58c9a7e7fe4e7695846f.md`) -```bash -cd src/extract_rubrics -python extract_rubrics_batch.py -``` - -**Output**: `data/processed_df/compiled_dataset.csv` and `compiled_dataset.parquet` - -### Step 2: Evaluate with LLMs - -Evaluate documents against rubric criteria: +### Step 2: Evaluate Reports ```bash -cd ../evaluate_rubrics -python evaluate_rubrics_batch.py +cd src/evaluate_rubrics +python evaluate_reports_batch.py ``` -**Output**: `results///processed_df/compiled_dataset.csv` - -### Step 3: Calculate Metrics +This will: +- Process all `.md` files in `agent_responses/` +- Evaluate each report against its rubrics +- Save results to `results/batch_evaluation_YYYYMMDD_HHMMSS.jsonl` -Compare ground truth vs predictions: +### Step 3: Calculate Compliance Scores ```bash cd ../calculate_metrics - -# Calculate F1 scores -python calculate_F1_score.py - -# Calculate weighted scores -python calculate_final_score.py - -# Analyze failure patterns -python calculate_failure_breakdown.py +python calculate_compliance_score.py ``` -## Example: Single Task Evaluation +This will display compliance scores for each evaluated report. 
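If you want to sanity-check the arithmetic, the sketch below recomputes the score directly from a results file. It only assumes the result fields shown in DATA_FORMAT.md (`sample_id`, `weight`, `score`) and the weighted formula described in the README (sum of weight times score over the sum of positive weights, with negative-weight rubrics excluded from the denominator); use `calculate_compliance_score.py` for the official numbers.

```python
# Illustrative re-computation of the compliance score; not part of the pipeline.
import json
from collections import defaultdict

def compliance_from_results(results_path):
    weighted_sum = defaultdict(float)
    positive_weight = defaultdict(float)
    with open(results_path) as f:
        for line in f:
            r = json.loads(line)
            weighted_sum[r["sample_id"]] += r["weight"] * r["score"]
            if r["weight"] > 0:
                positive_weight[r["sample_id"]] += r["weight"]
    # Compliance Score = sum(weight * score) / sum(positive weights)
    return {sid: weighted_sum[sid] / positive_weight[sid]
            for sid in weighted_sum if positive_weight[sid] > 0}

scores = compliance_from_results("results/batch_evaluation_20251113_093457.jsonl")
for sample_id, score in sorted(scores.items()):
    print(f"{sample_id}: {score:.2%}")
```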
+ +## Example: Single Report Evaluation -For testing or debugging, evaluate a single task: +For testing or debugging, evaluate a single report: ```python import asyncio -import sys from pathlib import Path +import sys # Add src directory to path if running from project root -sys.path.insert(0, str(Path(__file__).parent / 'src' / 'evaluate_rubrics')) +sys.path.insert(0, 'src/evaluate_rubrics') -from evaluate_rubrics_markitdown_onetask import evaluate_task_rubrics +from evaluate_single_report import evaluate_task_rubrics async def main(): - # Evaluate a specific task - results_df = await evaluate_task_rubrics( - task_name="683a58c9a7e7fe4e7695846f", - binary=False # Use ternary evaluation - ) + # Evaluate a specific markdown file + markdown_file = "agent_responses/683a58c9a7e7fe4e7695846f.md" + results_df, compliance_score = await evaluate_task_rubrics(markdown_file) # Display results + print(f"\nCompliance Score: {compliance_score:.2%}") print(f"Evaluated {len(results_df)} rubrics") - print(f"Average score: {results_df['score'].mean():.3f}") + print(f"Average confidence: {results_df['confidence'].mean():.2f}") + print(f"Total tokens used: {results_df['tokens_used'].sum()}") print(f"Total cost: ${results_df['cost'].sum():.4f}") + + # Show some rubric results + print("\nSample Results:") + for idx, row in results_df.head(3).iterrows(): + print(f"\n{idx+1}. {row['rubric_title'][:60]}...") + print(f" Verdict: {row['verdict']}") + print(f" Score: {row['score']}") + print(f" Confidence: {row['confidence']}") asyncio.run(main()) ``` -**Note**: When running scripts from their directories (e.g., `cd src/evaluate_rubrics && python evaluate_rubrics_batch.py`), imports work automatically. +**Note**: When running scripts from their directories (e.g., `cd src/evaluate_rubrics && python evaluate_reports_batch.py`), imports work automatically. ## Example: Custom Configuration @@ -104,180 +99,161 @@ import os import sys from pathlib import Path -# Add src directory to path if needed -sys.path.insert(0, str(Path(__file__).parent / 'src' / 'evaluate_rubrics')) +sys.path.insert(0, 'src/evaluate_rubrics') -from evaluate_rubrics_markitdown_onetask import RubricEvaluator +from evaluate_single_report import RubricEvaluator -# Initialize with custom settings -evaluator = RubricEvaluator( - api_key=os.getenv("OPENAI_API_KEY"), - base_url="https://example.com", # Custom endpoint - model="gpt-4o", # Use GPT-4o instead of GPT-5 - binary=True, # Binary evaluation mode - max_concurrent=10 # Limit concurrent requests -) - -# Process documents async def main(): - results = await evaluator.evaluate_all_rubrics( - rubrics=rubrics, - pdf_paths=pdf_paths, - save_results=True + # Initialize with custom settings + evaluator = RubricEvaluator( + api_key=os.getenv("LITELLM_API_KEY"), + model="litellm_proxy/gemini/gemini-2.5-pro-preview-06-05", + max_concurrent=10 # Reduce concurrent requests if hitting rate limits ) - print(f"Evaluation complete: {len(results)} results") + + # Evaluate manually + markdown_content = Path("agent_responses/683a58c9a7e7fe4e7695846f.md").read_text() + rubrics = [...] 
# Load from processed_data.jsonl + + results = [] + for rubric in rubrics: + result = await evaluator.evaluate_single_rubric( + document_content=markdown_content, + rubric_criterion=rubric['criterion'] + ) + results.append(result) + + print(f"Completed {len(results)} evaluations") asyncio.run(main()) ``` -## Example: Binary vs Ternary Evaluation +## Example: Analyzing Results -**Ternary Mode** (default - 3 classes): -```python -results = await evaluate_task_rubrics( - task_name="your_task", - binary=False # Satisfied / Partially Satisfied / Not Satisfied -) -``` +Process evaluation results: -**Binary Mode** (strict pass/fail): ```python -results = await evaluate_task_rubrics( - task_name="your_task", - binary=True # Satisfied / Not Satisfied -) -``` - -## Example: Batch Processing with Progress - -Monitor progress during batch processing: - -```python -from tqdm import tqdm +import json import pandas as pd -# Load all tasks -df = pd.read_csv("data/processed_df/compiled_dataset.csv") - +# Read evaluation results results = [] -for idx, task_row in tqdm(df.iterrows(), total=len(df)): - result = await evaluate_task_rubrics( - task_row=task_row, - save_results=False - ) - results.append(result) +with open('results/batch_evaluation_20251113_093457.jsonl', 'r') as f: + for line in f: + results.append(json.loads(line)) + +df = pd.DataFrame(results) + +# Group by sample_id +by_sample = df.groupby('sample_id').agg({ + 'score': 'mean', + 'cost': 'sum', + 'tokens_used': 'sum', + 'confidence': 'mean' +}) + +print("\nResults by Sample:") +print(by_sample) + +# Analyze by rubric axis +with open('data/researchrubrics/processed_data.jsonl', 'r') as f: + data = [json.loads(line) for line in f] + +# Find which rubric axes have the lowest scores +axis_scores = {} +for _, row in df.iterrows(): + # Find the rubric's axis + for task in data: + if task['sample_id'] == row['sample_id']: + for rubric in task['rubrics']: + if rubric['criterion'] == row['rubric_title']: + axis = rubric['axis'] + if axis not in axis_scores: + axis_scores[axis] = [] + axis_scores[axis].append(row['score']) + break + +print("\nAverage Scores by Axis:") +for axis, scores in axis_scores.items(): + print(f"{axis}: {sum(scores)/len(scores):.2%}") ``` -## Example: Calculate Custom Metrics - -Calculate metrics on your results: - -```python -import sys -from pathlib import Path - -# Add src directory to path if needed -sys.path.insert(0, str(Path(__file__).parent / 'src' / 'calculate_metrics')) - -from calculate_F1_score import ( - load_data, - calculate_model_f1_scores, - calculate_average_f1_scores -) - -# Define paths relative to project root -base_dir = Path(__file__).parent -ground_truth_path = base_dir / "data" / "processed_df" / "compiled_dataset.parquet" -predicted_path = base_dir / "results" / "11_04" / "20251104_034416" / "processed_df" / "compiled_dataset.parquet" - -# Load ground truth and predictions -ground_truth_df = load_data(ground_truth_path) -predicted_df = load_data(predicted_path) - -# Calculate F1 scores -f1_scores = calculate_model_f1_scores( - ground_truth_df, - predicted_df, - binary=False # Ternary evaluation -) - -# Get averages -avg_f1 = calculate_average_f1_scores(f1_scores) - -# Display results -for model, score in avg_f1.items(): - print(f"{model}: {score:.4f}") -``` - -**Note**: When running from `src/calculate_metrics/` directory, the scripts handle paths automatically. 
- ## Troubleshooting ### API Rate Limits If you hit rate limits, reduce concurrency: ```python -evaluator = RubricEvaluator(max_concurrent=5) # Lower concurrency +evaluator = RubricEvaluator(max_concurrent=5) ``` -### Missing PDFs -If PDFs fail to download, place them manually in: +### Missing Input Data +Ensure `data/researchrubrics/processed_data.jsonl` exists: +```bash +ls data/researchrubrics/processed_data.jsonl ``` -data/predownloaded_pdfs// -โ”œโ”€โ”€ gemini.pdf -โ”œโ”€โ”€ chatgpt.pdf -โ””โ”€โ”€ perplexity.pdf + +### Missing Markdown Files +Check that markdown files exist in `agent_responses/`: +```bash +ls agent_responses/*.md ``` -### Memory Issues -For large batches, process in smaller chunks: -```python -# Process first 10 tasks -limited_df = df.head(10) -results = await evaluate_batch(limited_df) +### API Key Issues +Verify `.env` file is in project root with correct key: +```bash +cat .env +# Should show: LITELLM_API_KEY=your_actual_key ``` ## Common Configurations -### Using Custom API Endpoint +### Using Custom Model + ```python evaluator = RubricEvaluator( - api_key="your_key", - base_url="https://your-endpoint.com", - model="your-model" + model="litellm_proxy/gemini/gemini-2.5-pro-preview-06-05" # Change model here ) ``` -### Adjusting File Paths +### Adjusting Concurrency + ```python -from pathlib import Path +# Conservative (for rate limit sensitive APIs) +evaluator = RubricEvaluator(max_concurrent=5) -# Set custom paths -base_path = Path("/custom/path") -ground_truth = base_path / "data" / "compiled_dataset.parquet" -predicted = base_path / "results" / "compiled_dataset.parquet" +# Aggressive (for higher throughput) +evaluator = RubricEvaluator(max_concurrent=30) ``` -### Saving Intermediate Results +### Custom Output Location + ```python -# Save after each task -for task in tasks: - result = await evaluate_task_rubrics( - task_name=task, - save_results=True # Save detailed results - ) +# In evaluate_reports_batch.py +await evaluate_all_reports( + agent_responses_dir="agent_responses", + output_file="results/my_custom_results.jsonl" +) ``` +## Expected Performance + +Typical performance metrics: +- **Single rubric evaluation**: ~5-15 seconds (depends on document length) +- **Batch processing**: 20 reports concurrently by default +- **Token usage**: 3,000-10,000 tokens per rubric evaluation +- **Cost**: ~$0.01-$0.05 per rubric evaluation (Gemini 2.5 Pro pricing) + ## Next Steps - Read the full [README.md](README.md) for comprehensive documentation -- Check [CONTRIBUTING.md](CONTRIBUTING.md) to contribute -- Review the code in `src/` for implementation details -- Explore example notebooks (if available) +- Check [DATA_FORMAT.md](DATA_FORMAT.md) for data format details +- Review [INSTALLATION.md](INSTALLATION.md) for detailed setup +- See [FOLDER_STRUCTURE.md](FOLDER_STRUCTURE.md) for project organization ## Getting Help -- Open an issue on GitHub -- Check existing issues and discussions -- Contact the maintainers +- Check existing issues on GitHub +- Open a new issue with your question +- Include error messages and relevant code snippets -Happy benchmarking! ๐Ÿš€ +Happy evaluating! ๐Ÿš€ diff --git a/README.md b/README.md index 3640fe0..b14fcc9 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ -# Deep Research Benchmarks +# ResearchRubrics -Official code release for the ICLR paper on Deep Research Benchmarks. 
This repository contains tools for extracting rubrics from evaluation datasets, evaluating documents against rubric criteria using Large Language Models (LLMs), and calculating comprehensive metrics. +Official code release for the ResearchRubrics project. This repository contains tools for evaluating AI-generated research documents against structured rubric criteria using Large Language Models (LLMs). ## Overview -This codebase provides a complete pipeline for evaluating AI-generated research documents against structured rubric criteria: +This codebase provides a complete pipeline for evaluating AI-generated research reports (in markdown format) against structured rubric criteria: -1. **Rubric Extraction**: Process raw CSV files to extract evaluation rubrics, prompts, and ground truth annotations -2. **Rubric Evaluation**: Use LLMs to evaluate whether documents satisfy specific rubric criteria -3. **Metrics Calculation**: Compute F1 scores, failure breakdowns, and weighted scores +1. **Rubric-Based Evaluation**: Use LLMs to evaluate whether markdown documents satisfy specific rubric criteria +2. **Batch Processing**: Evaluate multiple research reports efficiently with concurrent processing +3. **Compliance Scoring**: Calculate compliance scores based on weighted rubric evaluations ## Table of Contents @@ -16,9 +16,9 @@ This codebase provides a complete pipeline for evaluating AI-generated research - [Quick Start](#quick-start) - [Repository Structure](#repository-structure) - [Usage](#usage) - - [Extracting Rubrics](#extracting-rubrics) - - [Evaluating Rubrics](#evaluating-rubrics) - - [Calculating Metrics](#calculating-metrics) + - [Evaluating Single Reports](#evaluating-single-reports) + - [Batch Evaluation](#batch-evaluation) + - [Calculating Compliance Scores](#calculating-compliance-scores) - [Data Format](#data-format) - [Configuration](#configuration) - [Citation](#citation) @@ -30,13 +30,14 @@ This codebase provides a complete pipeline for evaluating AI-generated research - Python 3.8 or higher - pip package manager +- LiteLLM API key (for accessing Gemini 2.5 Pro) ### Setup 1. Clone the repository: ```bash git clone -cd public_release_experiments +cd researchrubrics ``` 2. Install dependencies: @@ -44,76 +45,75 @@ cd public_release_experiments pip install -r requirements.txt ``` -3. Configure API credentials: +3. Download the dataset: ```bash -cp .env.example .env -# Edit .env and add your OpenAI API key +mkdir -p data/researchrubrics +huggingface-cli download ScaleAI/researchrubrics processed_data.jsonl --local-dir data/researchrubrics +``` + +4. Configure API credentials: +```bash +# Create .env file in project root +echo "LITELLM_API_KEY=your_api_key_here" > .env ``` ## Quick Start ```bash # Navigate to project root -cd public_release_experiments +cd researchrubrics # 1. Install dependencies pip install -r requirements.txt -# 2. Configure API key -cp .env.example .env -# Edit .env and add your OPENAI_API_KEY +# 2. Download the dataset +mkdir -p data/researchrubrics +huggingface-cli download ScaleAI/researchrubrics processed_data.jsonl --local-dir data/researchrubrics -# 3. Extract rubrics from raw CSV files -cd src/extract_rubrics -python extract_rubrics_batch.py +# 3. Configure API key +echo "LITELLM_API_KEY=your_api_key_here" > .env -# 4. Evaluate rubrics using LLMs -cd ../evaluate_rubrics -python evaluate_rubrics_batch.py +# 4. Place markdown reports in agent_responses/ directory +# (Reports should be named with task IDs, e.g., 683a58c9a7e7fe4e7695846f.md) -# 5. 
Calculate metrics +# 5. Evaluate all reports +cd src/evaluate_rubrics +python evaluate_reports_batch.py + +# 6. Calculate compliance scores cd ../calculate_metrics -python calculate_F1_score.py -python calculate_final_score.py -python calculate_failure_breakdown.py +python calculate_compliance_score.py ``` ## Repository Structure ``` -public_release_experiments/ +researchrubrics/ โ”œโ”€โ”€ src/ -โ”‚ โ”œโ”€โ”€ extract_rubrics/ # Rubric extraction from CSV files -โ”‚ โ”‚ โ”œโ”€โ”€ extract_rubrics_batch.py -โ”‚ โ”‚ โ””โ”€โ”€ extract_rubrics_markitdown_onetask.py +โ”‚ โ”œโ”€โ”€ __init__.py โ”‚ โ”œโ”€โ”€ evaluate_rubrics/ # LLM-based rubric evaluation -โ”‚ โ”‚ โ”œโ”€โ”€ evaluate_rubrics_batch.py -โ”‚ โ”‚ โ”œโ”€โ”€ evaluate_rubrics_markitdown_onetask.py -โ”‚ โ”‚ โ””โ”€โ”€ prompts/ # Evaluation prompts -โ”‚ โ”‚ โ”œโ”€โ”€ binary/ # Binary classification prompts -โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ system_prompt.txt -โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ user_prompt_template.txt -โ”‚ โ”‚ โ””โ”€โ”€ ternary/ # Ternary classification prompts -โ”‚ โ”‚ โ”œโ”€โ”€ system_prompt.txt -โ”‚ โ”‚ โ””โ”€โ”€ user_prompt_template.txt -โ”‚ โ””โ”€โ”€ calculate_metrics/ # Metrics computation -โ”‚ โ”œโ”€โ”€ calculate_F1_score.py -โ”‚ โ”œโ”€โ”€ calculate_final_score.py -โ”‚ โ””โ”€โ”€ calculate_failure_breakdown.py +โ”‚ โ”‚ โ”œโ”€โ”€ evaluate_single_report.py # Single markdown evaluation +โ”‚ โ”‚ โ””โ”€โ”€ evaluate_reports_batch.py # Batch evaluation script +โ”‚ โ”œโ”€โ”€ calculate_metrics/ # Metrics computation +โ”‚ โ”‚ โ””โ”€โ”€ calculate_compliance_score.py +โ”‚ โ””โ”€โ”€ prompts/ # Evaluation prompt templates +โ”‚ โ”œโ”€โ”€ system_prompt.txt +โ”‚ โ”œโ”€โ”€ user_prompt.txt +โ”‚ โ”œโ”€โ”€ chunk_prompt_template.txt +โ”‚ โ””โ”€โ”€ synthesis_prompt_template.txt โ”œโ”€โ”€ data/ -โ”‚ โ”œโ”€โ”€ raw_csvs/ # Input: Raw evaluation CSV files -โ”‚ โ”œโ”€โ”€ processed_df/ # Output: Compiled datasets -โ”‚ โ”œโ”€โ”€ PDFs/ # Downloaded/processed PDFs -โ”‚ โ”‚ โ””โ”€โ”€ [task_name]/ # PDFs organized by task -โ”‚ โ”‚ โ”œโ”€โ”€ gemini.pdf -โ”‚ โ”‚ โ”œโ”€โ”€ chatgpt.pdf -โ”‚ โ”‚ โ””โ”€โ”€ perplexity.pdf -โ”‚ โ””โ”€โ”€ predownloaded_pdfs/ # Optional: Pre-downloaded PDFs -โ”œโ”€โ”€ results/ # Evaluation results by date/timestamp -โ”‚ โ””โ”€โ”€ [mm_dd]/[timestamp]/processed_df/ +โ”‚ โ””โ”€โ”€ researchrubrics/ # Input data +โ”‚ โ”œโ”€โ”€ processed_data.jsonl # Rubrics and metadata (JSONL format) +โ”‚ โ””โ”€โ”€ README.md # Dataset documentation +โ”œโ”€โ”€ agent_responses/ # Input: Markdown reports to evaluate +โ”‚ โ””โ”€โ”€ [task_id].md # One file per task +โ”œโ”€โ”€ results/ # Evaluation results (JSONL format) +โ”‚ โ””โ”€โ”€ batch_evaluation_YYYYMMDD_HHMMSS.jsonl โ”œโ”€โ”€ cache/ # Cached markdown conversions +โ”œโ”€โ”€ tests/ +โ”‚ โ””โ”€โ”€ __init__.py โ”œโ”€โ”€ requirements.txt -โ”œโ”€โ”€ .env.example +โ”œโ”€โ”€ .env # Your API key (DO NOT COMMIT) โ”œโ”€โ”€ .gitignore โ”œโ”€โ”€ setup.py โ”œโ”€โ”€ LICENSE @@ -123,178 +123,176 @@ public_release_experiments/ ## Usage -### Extracting Rubrics +### Evaluating Single Reports -Extract rubrics, prompts, and ground truth annotations from raw CSV files: +Evaluate a single markdown report against its rubrics: -#### Single Task ```python -from extract_rubrics_markitdown_onetask import RubricExtractor - -extractor = RubricExtractor() -results = extractor.process_task("data/raw_csvs/task_file.csv") +import asyncio +from pathlib import Path +from evaluate_single_report import evaluate_task_rubrics + +async def main(): + # Evaluate a specific markdown file + markdown_file = "agent_responses/683a58c9a7e7fe4e7695846f.md" + results_df, 
compliance_score = await evaluate_task_rubrics(markdown_file) + + # Display results + print(f"Compliance Score: {compliance_score:.2%}") + print(f"Evaluated {len(results_df)} rubrics") + print(f"Total cost: ${results_df['cost'].sum():.4f}") + +asyncio.run(main()) ``` -#### Batch Processing -```bash -cd src/extract_rubrics -python extract_rubrics_batch.py -``` - -**Output**: `data/processed_df/compiled_dataset.csv` and `compiled_dataset.parquet` +**Output**: Returns a DataFrame with evaluation results and a compliance score. -**Output Format**: -- `task_name`: Unique task identifier -- `prompt`: Original task prompt -- `rubrics`: List of rubric criteria (JSON) -- `pdf_paths`: Paths to model-generated PDFs -- `final_presence`: Ground truth annotations +### Batch Evaluation -### Evaluating Rubrics - -Evaluate documents against rubric criteria using LLMs: - -#### Single Task -```python -from evaluate_rubrics_markitdown_onetask import evaluate_task_rubrics +Evaluate all markdown reports in the `agent_responses/` directory: -# Evaluate a specific task -results_df = await evaluate_task_rubrics( - task_name="683a58c9a7e7fe4e7695846f", - binary=False # Use ternary prompts (Satisfied/Partially Satisfied/Not Satisfied) -) -``` - -#### Batch Processing ```bash cd src/evaluate_rubrics -python evaluate_rubrics_batch.py +python evaluate_reports_batch.py ``` +**Features**: +- Processes all `.md` files in `agent_responses/` +- Uses binary grading (Satisfied/Not Satisfied) +- Powered by Gemini 2.5 Pro via LiteLLM +- Concurrent processing (default: 20 concurrent requests) +- Automatic retry logic with exponential backoff + +**Output**: Results saved to `results/batch_evaluation_YYYYMMDD_HHMMSS.jsonl` + **Configuration Options**: -- `binary`: Set to `True` for binary evaluation (Satisfied/Not Satisfied), `False` for ternary (Satisfied/Partially Satisfied/Not Satisfied) -- `model`: LLM model to use (default: "gpt-5") +- `model`: LLM model to use (default: "litellm_proxy/gemini/gemini-2.5-pro-preview-06-05") - `max_concurrent`: Maximum concurrent API calls (default: 20) +- `agent_responses_dir`: Directory containing markdown files (default: "agent_responses/") +- `output_file`: Custom output file path (optional) -**Output**: Results saved to `results///processed_df/` - -### Calculating Metrics +### Calculating Compliance Scores -#### F1 Scores -Compare ground truth vs. 
predicted evaluations: +Calculate compliance scores from evaluation results: ```bash cd src/calculate_metrics -python calculate_F1_score.py -``` - -**Configuration**: -- Set `binary = True/False` for binary or ternary evaluation -- Update file paths for ground truth and predicted datasets - -**Output**: Macro F1 scores for each model (Gemini, ChatGPT, Perplexity) - -#### Weighted Scores -Calculate weighted scores based on rubric weights: - -```bash -python calculate_final_score.py +python calculate_compliance_score.py ``` **Scoring**: -- Satisfied: 1.0 -- Partially Satisfied: 0.5 -- Not Satisfied: 0.0 - -Score = ฮฃ(weight ร— score) / ฮฃ(positive weights) - -#### Failure Breakdown -Analyze failure patterns by rubric category: - -```bash -python calculate_failure_breakdown.py -``` +- Binary grading: Satisfied = 1.0, Not Satisfied = 0.0 +- Compliance Score = ฮฃ(weight ร— score) / ฮฃ(positive weights) +- Excludes negative-weight rubrics from denominator -**Output**: -- Per-task average failure ratios by category -- Aggregate failure distribution across all tasks -- Identification of common failure patterns +**Output**: Displays compliance scores for each evaluated report ## Data Format -### Input CSV Format - -Raw CSV files should contain: -- Row 0: Task prompt -- Rows 1+: Rubric evaluations with columns: - - `title`: Rubric criterion title - - `weight`: Rubric weight (numeric) - - `category`: Rubric category - - `gemini_present`: Ground truth for Gemini - - `chatgpt_present`: Ground truth for ChatGPT - - `perplexity_present`: Ground truth for Perplexity - -### Compiled Dataset Format - -The compiled dataset (`compiled_dataset.csv`) contains: - -``` -csv_filename,task_name,prompt,rubrics,rubrics_count,pdf_paths,final_presence +### Input Data (processed_data.jsonl) + +The input data file `data/researchrubrics/processed_data.jsonl` should be downloaded from the [ScaleAI/researchrubrics](https://huggingface.co/datasets/ScaleAI/researchrubrics) HuggingFace dataset and placed in the `data/researchrubrics/` directory. The file contains one JSON object per line: + +```json +{ + "prompt": "Task description...", + "sample_id": "683a58c9a7e7fe4e7695846f", + "domain": "AI & ML", + "conceptual_breadth": "Moderate", + "logical_nesting": "Intermediate", + "exploration": "Medium", + "rubrics": [ + { + "criterion": "Rubric description...", + "weight": 4.0, + "axis": "Explicit Criteria" + } + ] +} ``` -- `rubrics`: JSON array of rubric objects -- `pdf_paths`: JSON object with paths to PDFs for each model -- `final_presence`: JSON object with ground truth evaluations +**Fields**: +- `prompt`: The research task/question +- `sample_id`: Unique identifier matching markdown filename +- `domain`: Domain category +- `conceptual_breadth`, `logical_nesting`, `exploration`: Task complexity metrics +- `rubrics`: Array of evaluation criteria with weights and categories + +### Markdown Reports (agent_responses/) + +Input markdown files should be named with their `sample_id` (e.g., `683a58c9a7e7fe4e7695846f.md`) and contain the AI-generated research report to evaluate. 
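To see these fields in practice, the snippet below prints a small summary of the rubrics attached to one task. It assumes only the fields documented above (`sample_id`, `rubrics`, `criterion`, `weight`, `axis`); the sample ID is just the example used elsewhere in this README.

```python
# Illustrative inspection of one task's rubrics in processed_data.jsonl.
import json
from collections import Counter

sample_id = "683a58c9a7e7fe4e7695846f"  # example task ID

with open("data/researchrubrics/processed_data.jsonl") as f:
    tasks = [json.loads(line) for line in f]

task = next(t for t in tasks if t["sample_id"] == sample_id)
axes = Counter(r["axis"] for r in task["rubrics"])
positive_weight = sum(r["weight"] for r in task["rubrics"] if r["weight"] > 0)
penalties = sum(1 for r in task["rubrics"] if r["weight"] < 0)

print(f"{len(task['rubrics'])} rubrics, total positive weight: {positive_weight}")
print(f"Rubrics per axis: {dict(axes)}")
print(f"Penalty (negative-weight) rubrics: {penalties}")
```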
+ +### Evaluation Results (results/*.jsonl) + +Output JSONL file with one evaluation result per line: + +```json +{ + "sample_id": "683a58c9a7e7fe4e7695846f", + "rubric_title": "Rubric description...", + "verdict": "Satisfied", + "score": 1.0, + "confidence": 0.95, + "reasoning": "Detailed explanation...", + "tokens_used": 4567, + "cost": 0.0247, + "success": true, + "weight": 4.0 +} +``` ## Configuration ### Environment Variables -Create a `.env` file from the template: +Create a `.env` file in the project root: ```bash -cp .env.example .env +echo "LITELLM_API_KEY=your_api_key_here" > .env ``` Required variables: ``` -OPENAI_API_KEY=your_api_key_here +LITELLM_API_KEY=your_api_key_here ``` -Optional variables: +Optional variables (only needed if using a custom LiteLLM proxy): ``` API_BASE_URL=https://your-custom-endpoint.com -MODEL_NAME=gpt-5 ``` ### API Configuration -The evaluation scripts support: -- **OpenAI API**: Set `OPENAI_API_KEY` -- **Custom endpoints**: Set both `OPENAI_API_KEY` and `API_BASE_URL` -- **LiteLLM proxy**: Supported via custom base URL +The evaluation scripts use **LiteLLM** to access Gemini 2.5 Pro: +- Set `LITELLM_API_KEY` in your `.env` file +- Default model: `litellm_proxy/gemini/gemini-2.5-pro-preview-06-05` +- Custom base URL supported via `API_BASE_URL` environment variable + +### Model Configuration -### Model Support +In `evaluate_single_report.py`, you can customize: -Supported models (configurable in evaluation scripts): -- `gpt-5` (default) -- `gpt-4o` -- `gpt-4.1` -- `litellm_proxy/gemini/gemini-2.5-pro-preview-06-05` +```python +evaluator = RubricEvaluator( + model="litellm_proxy/gemini/gemini-2.5-pro-preview-06-05", + max_concurrent=20 # Adjust based on rate limits +) +``` ## Advanced Features ### Document Chunking For large documents exceeding token limits, the evaluator automatically: -1. Splits documents into chunks -2. Evaluates each chunk independently -3. Synthesizes findings into final verdict +1. Splits documents into manageable chunks (8000 tokens per chunk) +2. Evaluates each chunk independently using chunk-specific prompts +3. Synthesizes chunk findings into a final verdict +4. Uses separate prompt templates for chunking and synthesis ### Caching -- **Markdown Conversion**: Cached in `cache/` directory -- **Document Content**: Cached in memory during batch processing +- **Markdown Parsing**: Internal caching to avoid redundant parsing +- The `cache/` directory is reserved for future use ### Parallel Processing @@ -303,51 +301,66 @@ Batch evaluation uses asynchronous processing with configurable concurrency: evaluator = RubricEvaluator(max_concurrent=20) ``` -## Evaluation Modes +Adjust `max_concurrent` based on your API rate limits. 
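For reference, the sketch below shows the generic asyncio pattern this corresponds to: a semaphore caps how many evaluations are in flight at once. It is a simplified illustration, not the evaluator's internal code, and `evaluate_one` is a stand-in for whatever per-rubric coroutine you use.

```python
# Generic bounded-concurrency pattern (illustration only).
import asyncio

async def run_with_limit(items, evaluate_one, max_concurrent=20):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def bounded(item):
        async with semaphore:  # at most max_concurrent calls in flight
            return await evaluate_one(item)

    return await asyncio.gather(*(bounded(item) for item in items))
```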
-### Binary Mode -- **Classes**: Satisfied, Not Satisfied -- **Use Case**: Strict pass/fail evaluation -- **Configuration**: Set `binary=True` +### Retry Logic + +Automatic retry with exponential backoff: +- Maximum 3 retries per request +- Exponential backoff: 2^retry_count seconds +- Handles rate limits and transient errors gracefully -### Ternary Mode (Default) -- **Classes**: Satisfied, Partially Satisfied, Not Satisfied -- **Use Case**: Nuanced evaluation with partial credit -- **Configuration**: Set `binary=False` +## Evaluation Mode + +### Binary Grading (Current Implementation) +- **Classes**: Satisfied, Not Satisfied +- **Scoring**: 1.0 for Satisfied, 0.0 for Not Satisfied +- **Use Case**: Clear pass/fail evaluation +- **Prompts**: Uses prompts from `src/prompts/` directory ## Troubleshooting ### Common Issues -1. **API Rate Limits**: Reduce `max_concurrent` in evaluation scripts -2. **Missing PDFs**: Check `pdf_paths` errors in extraction output -3. **Empty Results**: Verify CSV format matches expected structure -4. **Markdown Conversion Fails**: Install `markitdown` or check PDF file validity +1. **API Rate Limits**: Reduce `max_concurrent` in `RubricEvaluator` initialization + ```python + evaluator = RubricEvaluator(max_concurrent=5) + ``` + +2. **Missing Input Data**: Ensure `data/researchrubrics/processed_data.jsonl` exists +3. **Missing Markdown Files**: Check that markdown files exist in `agent_responses/` with matching `sample_id` names +4. **API Key Issues**: Verify `.env` file is in project root with correct `LITELLM_API_KEY` ### Logging -Adjust logging level in scripts: +The scripts use Python's logging module. Adjust logging level: ```python logging.basicConfig(level=logging.DEBUG) # For detailed output +logging.basicConfig(level=logging.INFO) # For standard output +logging.basicConfig(level=logging.WARNING) # For minimal output ``` ## Performance -Typical performance on a standard dataset: -- **Extraction**: ~1-2 seconds per task -- **Evaluation**: ~5-10 seconds per task (depends on document size and API latency) -- **Metrics**: <1 second for full dataset +Typical performance metrics: +- **Single Report Evaluation**: ~10-30 seconds (depends on document length and rubric count) +- **Batch Processing**: Processes 20 reports concurrently (configurable) +- **Token Usage**: Varies by document length; typically 3,000-10,000 tokens per rubric evaluation +- **Cost**: Approximately $0.01-$0.05 per rubric evaluation (Gemini 2.5 Pro pricing) ## Citation If you use this code in your research, please cite: ```bibtex -@inproceedings{deepresearch2025, - title={Deep Research Benchmarks}, - author={[Authors]}, - booktitle={International Conference on Learning Representations (ICLR)}, - year={2025} +@misc{sharma2025researchrubricsbenchmarkpromptsrubrics, + title={ResearchRubrics: A Benchmark of Prompts and Rubrics For Evaluating Deep Research Agents}, + author={Manasi Sharma and Chen Bo Calvin Zhang and Chaithanya Bandi and Clinton Wang and Ankit Aich and Huy Nghiem and Tahseen Rabbani and Ye Htet and Brian Jang and Sumana Basu and Aishwarya Balwani and Denis Peskoff and Marcos Ayestaran and Sean M. Hendryx and Brad Kenstler and Bing Liu}, + year={2025}, + eprint={2511.07685}, + archivePrefix={arXiv}, + primaryClass={cs.AI}, + url={https://arxiv.org/abs/2511.07685} } ``` @@ -357,8 +370,8 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file ## Contact -For questions or issues, please open an issue on GitHub or contact [contact email]. 
+For questions or issues, please open an issue on GitHub. ## Acknowledgments -This research was conducted as part of [institution/project name]. We thank the contributors and reviewers for their valuable feedback. +We thank the contributors and reviewers for their valuable feedback. diff --git a/SETUP_GUIDE.md b/SETUP_GUIDE.md index cfb02e4..0d9733d 100644 --- a/SETUP_GUIDE.md +++ b/SETUP_GUIDE.md @@ -1,339 +1,313 @@ -# Deep Research Benchmarks - Complete Setup Guide +# Research Rubrics - Complete Setup Guide -This guide will walk you through setting up your Deep Research Benchmarks release from scratch. +This guide will walk you through setting up Research Rubrics from scratch. -## ๐Ÿ“ฆ What You Have +## ๐Ÿ“ฆ Prerequisites -Your release package contains **21 files**: +Before you begin, ensure you have: +- Python 3.8 or higher installed +- Git installed (for cloning the repository) +- A LiteLLM API key (for accessing Gemini 2.5 Pro) +- Command line access (Terminal/PowerShell/Command Prompt) -### ๐Ÿ“„ Documentation (9 files) -1. **README.md** - Main documentation -2. **QUICKSTART.md** - Quick start guide -3. **INSTALLATION.md** - Installation instructions -4. **DATA_FORMAT.md** - Data format specifications -5. **FOLDER_STRUCTURE.md** - Directory organization guide -6. **FILE_MANIFEST.md** - File index -7. **CONTRIBUTING.md** - Contribution guidelines -8. **CHANGELOG.md** - Version history -9. **RELEASE_CHECKLIST.md** - Pre-publication checklist +## ๐Ÿš€ Complete Setup Instructions -### โš™๏ธ Configuration (5 files) -10. **requirements.txt** - Python dependencies -11. **setup.py** - Package configuration -12. **.env.example** - Environment variables template -13. **.gitignore** - Git exclusions -14. **LICENSE** - MIT License +### Step 1: Clone the Repository -### ๐Ÿ“ Other (2 files) -15. **CITATION.bib** - BibTeX citation -16. **setup_structure.sh** - Unix setup script -17. **setup_structure.bat** - Windows setup script +```bash +# Clone the repository +git clone +cd researchrubrics +``` -### ๐ŸŽฏ Prompts (4 files in prompts/ directory) -18-19. **prompts/binary/** - Binary evaluation prompts (2 files) -20-21. **prompts/ternary/** - Ternary evaluation prompts (2 files) +### Step 2: Create Virtual Environment (Recommended) -## ๐Ÿš€ Complete Setup Instructions +```bash +# Create a virtual environment +python -m venv venv + +# Activate it +# On Unix/macOS: +source venv/bin/activate +# On Windows: +.\venv\Scripts\activate +``` + +### Step 3: Install Dependencies + +```bash +# Install all required packages +pip install -r requirements.txt -### Step 1: Organize Your Files +# Verify installation +python -c "import pandas, litellm, tqdm; print('โœ“ All dependencies installed')" +``` -Create your project directory structure: +### Step 4: Configure API Credentials ```bash -# Create project directory -mkdir -p public_release_experiments -cd public_release_experiments - -# Move all documentation files to root -mv /path/to/README.md . -mv /path/to/QUICKSTART.md . -mv /path/to/INSTALLATION.md . -mv /path/to/DATA_FORMAT.md . -mv /path/to/FOLDER_STRUCTURE.md . -mv /path/to/FILE_MANIFEST.md . -mv /path/to/CONTRIBUTING.md . -mv /path/to/CHANGELOG.md . -mv /path/to/RELEASE_CHECKLIST.md . - -# Move configuration files to root -mv /path/to/requirements.txt . -mv /path/to/setup.py . -mv /path/to/.env.example . -mv /path/to/.gitignore . -mv /path/to/LICENSE . -mv /path/to/CITATION.bib . - -# Move setup scripts to root -mv /path/to/setup_structure.sh . -mv /path/to/setup_structure.bat . 
+# Create .env file with your API key +echo "LITELLM_API_KEY=your_actual_api_key_here" > .env + +# Verify .env file was created +cat .env ``` -### Step 2: Run Setup Script +**Important**: Replace `your_actual_api_key_here` with your real API key! -Run the appropriate setup script for your OS: +### Step 5: Verify Directory Structure -**Unix/Linux/macOS:** ```bash -chmod +x setup_structure.sh -./setup_structure.sh +# Check that all necessary directories exist +ls -d src/*/ +ls -d data/ +ls -d agent_responses/ +ls -d results/ ``` -**Windows:** -```batch -setup_structure.bat +Expected output: +``` +src/calculate_metrics/ +src/evaluate_rubrics/ +src/prompts/ +data/ +agent_responses/ +results/ ``` -This creates all necessary directories: -- `src/extract_rubrics/` -- `src/evaluate_rubrics/prompts/binary/` and `prompts/ternary/` -- `src/calculate_metrics/` -- `data/raw_csvs/`, `data/processed_df/`, `data/PDFs/`, `data/predownloaded_pdfs/` -- `results/`, `cache/`, `tests/` +### Step 6: Prepare Input Data -### Step 3: Place Your Source Code +Ensure you have: -Copy your existing Python files to the correct locations: +1. **Rubrics data file**: `data/researchrubrics/processed_data.jsonl` +2. **Markdown reports**: Files in `agent_responses/` directory ```bash -# Extract rubrics module -cp /path/to/your/extract_rubrics_batch.py src/extract_rubrics/ -cp /path/to/your/extract_rubrics_markitdown_onetask.py src/extract_rubrics/ - -# Evaluate rubrics module -cp /path/to/your/evaluate_rubrics_batch.py src/evaluate_rubrics/ -cp /path/to/your/evaluate_rubrics_markitdown_onetask.py src/evaluate_rubrics/ - -# Calculate metrics module -cp /path/to/your/calculate_F1_score.py src/calculate_metrics/ -cp /path/to/your/calculate_final_score.py src/calculate_metrics/ -cp /path/to/your/calculate_failure_breakdown.py src/calculate_metrics/ -``` +# Check data file exists +ls data/researchrubrics/processed_data.jsonl -### Step 4: Place Prompt Files +# Check markdown files exist +ls agent_responses/*.md +``` -Move the prompt files to their correct locations: +### Step 7: Test the Installation ```bash -# Binary prompts -mv prompts/binary/system_prompt.txt src/evaluate_rubrics/prompts/binary/ -mv prompts/binary/user_prompt_template.txt src/evaluate_rubrics/prompts/binary/ +# Test the evaluation module +cd src/evaluate_rubrics +python -c "from evaluate_single_report import RubricEvaluator; print('โœ“ Evaluate module OK')" -# Ternary prompts -mv prompts/ternary/system_prompt.txt src/evaluate_rubrics/prompts/ternary/ -mv prompts/ternary/user_prompt_template.txt src/evaluate_rubrics/prompts/ternary/ +# Test the metrics module +cd ../calculate_metrics +python -c "from calculate_compliance_score import calculate_compliance_score; print('โœ“ Metrics module OK')" -# Remove the now-empty prompts directory from root -rm -rf prompts/ +# Return to project root +cd ../.. ``` -### Step 5: Configure Environment +### Step 8: Run a Test Evaluation (Optional) ```bash -# Create your .env file -cp .env.example .env - -# Edit .env and add your API key -nano .env # or vim .env, or code .env -# Add: OPENAI_API_KEY=your_actual_api_key_here +# Run evaluation on all reports (if you have data ready) +cd src/evaluate_rubrics +python evaluate_reports_batch.py ``` -### Step 6: Install Dependencies +This will evaluate all markdown files in `agent_responses/` and save results to `results/`. 
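Before a full batch run, it can help to confirm that the rubrics and reports line up. The check below is purely illustrative and assumes only the layout from Step 6 (`processed_data.jsonl` with a `sample_id` field, and markdown files named after those IDs):

```python
# Sanity check: compare sample_ids in processed_data.jsonl with agent_responses/.
import json
from pathlib import Path

task_ids = set()
with open("data/researchrubrics/processed_data.jsonl") as f:
    for line in f:
        task_ids.add(json.loads(line)["sample_id"])

report_ids = {p.stem for p in Path("agent_responses").glob("*.md")}

print(f"Tasks with rubrics: {len(task_ids)}")
print(f"Markdown reports:   {len(report_ids)}")
print(f"Reports without rubrics: {sorted(report_ids - task_ids)}")
print(f"Tasks without a report:  {len(task_ids - report_ids)}")
```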
-```bash -# Install Python dependencies -pip install -r requirements.txt +## ๐Ÿ“Š Understanding the Data Flow -# Verify installation -python -c "import pandas, litellm, markitdown, sklearn; print('โœ“ All dependencies installed')" +``` +Input Data: + data/researchrubrics/processed_data.jsonl (rubrics + metadata) + agent_responses/*.md (markdown reports) + โ†“ + Evaluation + (evaluate_reports_batch.py) + โ†“ + Results: + results/batch_evaluation_YYYYMMDD_HHMMSS.jsonl + โ†“ + Metrics: + (calculate_compliance_score.py) + โ†“ + Compliance Scores ``` -### Step 7: Verify Setup +## ๐ŸŽฏ Quick Start After Setup + +Once setup is complete: ```bash -# Check folder structure -tree -L 3 -I '__pycache__|*.pyc' +# 1. Evaluate all reports +cd src/evaluate_rubrics +python evaluate_reports_batch.py -# Or use ls to verify key directories -ls -d src/*/ -ls -d data/*/ +# 2. Calculate compliance scores +cd ../calculate_metrics +python calculate_compliance_score.py ``` -Expected output: -``` -src/extract_rubrics/ -src/evaluate_rubrics/ -src/calculate_metrics/ -data/raw_csvs/ -data/processed_df/ -data/PDFs/ -data/predownloaded_pdfs/ +## ๐Ÿ”ง Customization Options + +### Adjusting Concurrency + +Edit `src/evaluate_rubrics/evaluate_single_report.py`: + +```python +# Find this line (around line 78) +def __init__(self, api_key: str = None, base_url: str = None, + model: str = "litellm_proxy/gemini/gemini-2.5-pro-preview-06-05", + max_concurrent: int = 20): # Change this number + +# Examples: +# max_concurrent=5 # Conservative (avoid rate limits) +# max_concurrent=10 # Moderate +# max_concurrent=30 # Aggressive (higher throughput) ``` -### Step 8: Test Your Setup +### Using a Different Model -```bash -# Test imports -cd src/extract_rubrics -python -c "from extract_rubrics_markitdown_onetask import RubricExtractor; print('โœ“ Extract module OK')" +In the same file, change the `model` parameter: + +```python +model: str = "litellm_proxy/gemini/gemini-2.5-pro-preview-06-05" # Change this +``` -cd ../evaluate_rubrics -python -c "from evaluate_rubrics_markitdown_onetask import RubricEvaluator; print('โœ“ Evaluate module OK')" +### Custom Output Location -cd ../calculate_metrics -python -c "from calculate_F1_score import calculate_macro_f1_per_task; print('โœ“ Metrics module OK')" +In `src/evaluate_rubrics/evaluate_reports_batch.py`, modify: -cd ../.. 
# Back to project root +```python +await evaluate_all_reports( + agent_responses_dir="agent_responses", # Change input directory + output_file="results/my_custom_results.jsonl" # Change output file +) ``` -## ๐Ÿ“‚ Final Folder Structure +## ๐Ÿ“‹ Verification Checklist + +After setup, verify: + +- [ ] Python 3.8+ installed (`python --version`) +- [ ] All dependencies installed (`pip list | grep -E "pandas|litellm|tqdm"`) +- [ ] `.env` file created with LITELLM_API_KEY +- [ ] `data/researchrubrics/processed_data.jsonl` exists +- [ ] Markdown files in `agent_responses/` +- [ ] Source code files in `src/` directory +- [ ] Prompt templates in `src/prompts/` +- [ ] Test imports work (Step 7) -After setup, your directory should look like this: +## ๐Ÿ†˜ Troubleshooting + +### Issue: "No module named 'litellm'" +**Solution**: Install dependencies +```bash +pip install -r requirements.txt ``` -public_release_experiments/ -โ”œโ”€โ”€ README.md -โ”œโ”€โ”€ QUICKSTART.md -โ”œโ”€โ”€ INSTALLATION.md -โ”œโ”€โ”€ DATA_FORMAT.md -โ”œโ”€โ”€ FOLDER_STRUCTURE.md -โ”œโ”€โ”€ FILE_MANIFEST.md -โ”œโ”€โ”€ CONTRIBUTING.md -โ”œโ”€โ”€ CHANGELOG.md -โ”œโ”€โ”€ RELEASE_CHECKLIST.md -โ”œโ”€โ”€ LICENSE -โ”œโ”€โ”€ CITATION.bib -โ”œโ”€โ”€ requirements.txt -โ”œโ”€โ”€ setup.py -โ”œโ”€โ”€ .env.example -โ”œโ”€โ”€ .env (your API key - DO NOT COMMIT) -โ”œโ”€โ”€ .gitignore -โ”œโ”€โ”€ setup_structure.sh -โ”œโ”€โ”€ setup_structure.bat -โ”œโ”€โ”€ src/ -โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ”œโ”€โ”€ extract_rubrics/ -โ”‚ โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ”‚ โ”œโ”€โ”€ extract_rubrics_batch.py -โ”‚ โ”‚ โ””โ”€โ”€ extract_rubrics_markitdown_onetask.py -โ”‚ โ”œโ”€โ”€ evaluate_rubrics/ -โ”‚ โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ”‚ โ”œโ”€โ”€ evaluate_rubrics_batch.py -โ”‚ โ”‚ โ”œโ”€โ”€ evaluate_rubrics_markitdown_onetask.py -โ”‚ โ”‚ โ””โ”€โ”€ prompts/ -โ”‚ โ”‚ โ”œโ”€โ”€ binary/ -โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ system_prompt.txt -โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ user_prompt_template.txt -โ”‚ โ”‚ โ””โ”€โ”€ ternary/ -โ”‚ โ”‚ โ”œโ”€โ”€ system_prompt.txt -โ”‚ โ”‚ โ””โ”€โ”€ user_prompt_template.txt -โ”‚ โ””โ”€โ”€ calculate_metrics/ -โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ”œโ”€โ”€ calculate_F1_score.py -โ”‚ โ”œโ”€โ”€ calculate_final_score.py -โ”‚ โ””โ”€โ”€ calculate_failure_breakdown.py -โ”œโ”€โ”€ data/ -โ”‚ โ”œโ”€โ”€ raw_csvs/ -โ”‚ โ”œโ”€โ”€ processed_df/ -โ”‚ โ”œโ”€โ”€ PDFs/ -โ”‚ โ””โ”€โ”€ predownloaded_pdfs/ -โ”œโ”€โ”€ results/ -โ”œโ”€โ”€ cache/ -โ””โ”€โ”€ tests/ + +### Issue: "FileNotFoundError: processed_data.jsonl" + +**Solution**: Ensure data file exists in the correct location +```bash +ls data/researchrubrics/processed_data.jsonl ``` -## ๐ŸŽฏ Running Your First Workflow +If missing, obtain the data file from the appropriate source. -Once setup is complete: +### Issue: "No markdown files found" +**Solution**: Add markdown files to `agent_responses/` ```bash -# 1. Place your CSV files in data/raw_csvs/ -# (You need to do this manually with your data) +# Files should be named with sample IDs +# Example: 683a58c9a7e7fe4e7695846f.md +``` -# 2. Extract rubrics -cd src/extract_rubrics -python extract_rubrics_batch.py +### Issue: "API key not found" -# 3. Evaluate rubrics -cd ../evaluate_rubrics -python evaluate_rubrics_batch.py +**Solution**: Verify `.env` file +```bash +# Check .env exists +ls .env -# 4. 
Calculate metrics -cd ../calculate_metrics -python calculate_F1_score.py -python calculate_final_score.py -python calculate_failure_breakdown.py -``` +# Check contents +cat .env -## ๐Ÿ“‹ Before Publishing +# Should contain: LITELLM_API_KEY=your_key_here +``` -Before you push to GitHub, complete the **RELEASE_CHECKLIST.md**: +### Issue: Rate limit errors -1. **Update placeholders**: - - Replace `[Authors]` with actual names - - Replace `` with your GitHub URL - - Replace `[username]` in links - - Update contact information +**Solution**: Reduce concurrency +```python +# In evaluate_single_report.py +evaluator = RubricEvaluator(max_concurrent=5) +``` -2. **Review security**: - - Remove any API keys from code - - Verify `.env` is in `.gitignore` - - Check for sensitive information +## ๐Ÿ”„ Updating the Code -3. **Test everything**: - - Fresh install in new environment - - Run complete workflow - - Verify all documentation +To update to the latest version: -4. **Initialize Git**: ```bash -git init -git add . -git commit -m "Initial release: Deep Research Benchmarks v1.0.0" -git remote add origin -git push -u origin main +# Pull latest changes +git pull origin main + +# Update dependencies +pip install --upgrade -r requirements.txt ``` -## ๐Ÿ“– Documentation Overview +## ๐Ÿ“– Next Steps -- **Start here**: README.md -- **Quick examples**: QUICKSTART.md -- **Installation help**: INSTALLATION.md -- **Data formats**: DATA_FORMAT.md -- **File organization**: FOLDER_STRUCTURE.md -- **Before release**: RELEASE_CHECKLIST.md +Now that setup is complete: -## ๐Ÿ†˜ Troubleshooting +1. **Read Documentation**: + - [README.md](README.md) - Project overview + - [QUICKSTART.md](QUICKSTART.md) - Quick examples + - [DATA_FORMAT.md](DATA_FORMAT.md) - Data format details -### "Module not found" errors -**Solution**: Make sure you're running scripts from their directories or add to Python path. +2. **Run Evaluations**: + - Start with a small batch to test + - Monitor API costs and token usage + - Adjust concurrency based on rate limits -### "Can't find .env" errors -**Solution**: Ensure `.env` is in project root (`public_release_experiments/.env`), not in `src/`. +3. **Analyze Results**: + - Review evaluation results in `results/` + - Calculate compliance scores + - Identify patterns in rubric performance -### Prompt files not found -**Solution**: Verify prompts are in `src/evaluate_rubrics/prompts/binary/` and `.../ternary/`, not in project root. +## ๐Ÿ’ก Tips for Success -### Import errors between modules -**Solution**: `__init__.py` files should be present in all `src/` subdirectories. +1. **Start Small**: Test with 1-2 markdown files before processing large batches +2. **Monitor Costs**: Check token usage and API costs regularly +3. **Adjust Concurrency**: Balance between speed and rate limits +4. **Save Results**: Keep evaluation results for later analysis +5. **Version Control**: Don't commit `.env` file or results to Git -## โœ… Setup Complete! +## ๐Ÿ“ž Getting Help + +If you need assistance: -Your Deep Research Benchmarks codebase is now ready for: -- โœ… Development and testing -- โœ… Running experiments -- โœ… Publishing to GitHub -- โœ… Sharing with collaborators -- โœ… Paper submission +1. Check the troubleshooting section above +2. Review [INSTALLATION.md](INSTALLATION.md) for detailed setup help +3. Check existing GitHub issues +4. Open a new issue with: + - Your setup (OS, Python version) + - Error message (full traceback) + - Steps to reproduce + +## โœ… Setup Complete! -## ๐Ÿ“ž Need Help? 
+Once all checks pass, you're ready to: +- โœ… Evaluate markdown reports against rubrics +- โœ… Calculate compliance scores +- โœ… Analyze evaluation results +- โœ… Customize the evaluation pipeline -- Check **INSTALLATION.md** for detailed installation troubleshooting -- Review **FOLDER_STRUCTURE.md** for directory organization -- See **QUICKSTART.md** for usage examples -- Consult **RELEASE_CHECKLIST.md** before publishing +**Happy evaluating! ๐Ÿš€** --- **Setup Guide Version**: 1.0 -**Last Updated**: 2025-01-XX +**Last Updated**: 2025-11-13 **Estimated Setup Time**: 10-15 minutes diff --git a/data/PDFs/.gitkeep b/data/PDFs/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/data/predownloaded_pdfs/.gitkeep b/data/predownloaded_pdfs/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/data/processed_df/.gitkeep b/data/processed_df/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/data/raw_csvs/.gitkeep b/data/raw_csvs/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/requirements.txt b/requirements.txt index a29d971..352e286 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,6 +22,9 @@ requests>=2.25.0 pyarrow>=10.0.0 fastparquet>=0.8.0 +# HuggingFace dataset download +huggingface-hub>=0.16.0 + # Optional: for better async support aiofiles>=0.8.0 httpx>=0.23.0 diff --git a/setup.py b/setup.py index f0748f9..790617e 100644 --- a/setup.py +++ b/setup.py @@ -6,14 +6,14 @@ long_description = (this_directory / "README.md").read_text(encoding="utf-8") setup( - name="deep-research-benchmarks", + name="research-rubrics", version="1.0.0", - author="[Authors]", - author_email="[contact@email.com]", - description="Deep Research Benchmarks", + author="Manasi Sharma et al.", + author_email="manasi.sharma@scale.com", + description="ResearchRubrics: A Benchmark of Prompts and Rubrics For Evaluating Deep Research Agents", long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/[username]/deep-research-benchmarks", + url="https://github.com/scaleapi/researchrubrics", packages=find_packages(where="src"), package_dir={"": "src"}, classifiers=[ diff --git a/src/calculate_metrics/calculate_F1_score.py b/src/calculate_metrics/calculate_F1_score.py deleted file mode 100644 index ad07778..0000000 --- a/src/calculate_metrics/calculate_F1_score.py +++ /dev/null @@ -1,179 +0,0 @@ -import pandas as pd -import json -from pathlib import Path -import numpy as np -from collections import defaultdict -from sklearn.metrics import f1_score - - -def load_data(data_path): - """Load the processed dataframe from parquet file. - - Args: - data_path: Path to the parquet file - - Returns: - pd.DataFrame: The loaded dataframe - """ - df = pd.read_parquet(data_path) - return df - - -def calculate_macro_f1_per_task(true_labels, pred_labels, binary=False): - """Calculate macro F1 score for a single task using sklearn. 
- - Args: - true_labels: List of true labels (ground truth) - pred_labels: List of predicted labels (model predictions) - binary: If True, convert "Partially Satisfied" to "Not Satisfied" for binary evaluation - - Returns: - float: Macro F1 score for this task - """ - # Convert to binary labels if requested - if binary: - true_labels = ['Not Satisfied' if label == 'Partially Satisfied' else label - for label in true_labels] - pred_labels = ['Not Satisfied' if label == 'Partially Satisfied' else label - for label in pred_labels] - # Define labels for binary case - labels = ['Satisfied', 'Not Satisfied'] - else: - # Define the possible labels for ternary case - labels = ['Satisfied', 'Partially Satisfied', 'Not Satisfied'] - - # Use sklearn's f1_score with macro averaging - macro_f1 = f1_score(true_labels, pred_labels, labels=labels, average='macro', zero_division=0) - return macro_f1 - - -def calculate_model_f1_scores(ground_truth_df, predicted_df, limit_rows=None, binary=False): - """Calculate macro F1 scores for all models by comparing ground truth vs predicted data. - - Args: - ground_truth_df: DataFrame containing ground truth presence data - predicted_df: DataFrame containing predicted presence data - limit_rows: Optional limit on number of rows to process (for debugging) - binary: If True, convert "Partially Satisfied" to "Not Satisfied" for binary evaluation - - Returns: - dict: Dictionary with lists of F1 scores for each model per task - """ - # Limit rows if specified (for debugging) - if limit_rows is not None: - ground_truth_df = ground_truth_df.head(limit_rows) - predicted_df = predicted_df.head(limit_rows) - - # Initialize lists to store F1 scores for each model per task - model_f1_scores = { - 'gemini': [], - 'chatgpt': [], - 'perplexity': [] - } - - # Ensure both dataframes have the same number of rows - min_rows = min(len(ground_truth_df), len(predicted_df)) - - # Iterate through all rows (tasks) in both dataframes - for idx in range(min_rows): - gt_row = ground_truth_df.iloc[idx] - pred_row = predicted_df.iloc[idx] - - # Parse presence data from both dataframes - gt_presence = json.loads(gt_row['final_presence']) - pred_presence = json.loads(pred_row['final_presence']) - - # For each model, compare ground truth vs predicted - for model in ['gemini', 'chatgpt', 'perplexity']: - model_key = f'{model}_present' - - # Extract ground truth and predicted presence lists for this model - gt_values = gt_presence[model_key]['values'] - pred_values = pred_presence[model_key]['values'] - - # Ensure both lists have the same length - min_length = min(len(gt_values), len(pred_values)) - gt_values = gt_values[:min_length] - pred_values = pred_values[:min_length] - - # Calculate F1 score for this model on this task - f1 = calculate_macro_f1_per_task(gt_values, pred_values, binary) - model_f1_scores[model].append(f1) - - if limit_rows is not None: - print(f"Task {idx+1}: Gemini F1={model_f1_scores['gemini'][-1]:.4f}, " - f"ChatGPT F1={model_f1_scores['chatgpt'][-1]:.4f}, " - f"Perplexity F1={model_f1_scores['perplexity'][-1]:.4f}") - - return model_f1_scores - - -def calculate_average_f1_scores(model_f1_scores): - """Calculate average F1 scores across all tasks for each model. 
- - Args: - model_f1_scores: Dictionary with lists of F1 scores for each model - - Returns: - dict: Dictionary with average F1 scores for each model - """ - avg_f1_scores = {} - for model, f1_scores in model_f1_scores.items(): - avg_f1_scores[model] = np.mean(f1_scores) if f1_scores else 0.0 - - return avg_f1_scores - - - - -if __name__ == "__main__": - # Define paths to the two dataframes - base_path = Path(__file__).parent.parent.parent - ground_truth_path = base_path / "data" / "processed_df" / "compiled_dataset.parquet" - predicted_path = base_path / "results" / "11_04" / "20251104_034416" / "processed_df" / "compiled_dataset.parquet" - - # Set binary flag - change this to True for binary evaluation (Partially Satisfied -> Not Satisfied) - binary = True - - # For debugging, limit to first 2 entries - limit_rows = None # 2 - - print(f"Loading data from:") - print(f" Ground Truth: {ground_truth_path}") - print(f" Predicted: {predicted_path}") - print(f"Limiting to first {limit_rows} rows for debugging") - print(f"Evaluation mode: {'Binary' if binary else 'Ternary'}") - if binary: - print("(Converting 'Partially Satisfied' -> 'Not Satisfied')") - print() - - # Load both dataframes - ground_truth_df = load_data(ground_truth_path) - predicted_df = load_data(predicted_path) - - print(f"Ground Truth DF loaded: {len(ground_truth_df)} tasks") - print(f"Predicted DF loaded: {len(predicted_df)} tasks") - print() - - # Calculate F1 scores by comparing ground truth vs predicted - print("=== Calculating F1 Scores (Ground Truth vs Predicted) ===") - f1_scores = calculate_model_f1_scores(ground_truth_df, predicted_df, limit_rows, binary) - avg_f1_scores = calculate_average_f1_scores(f1_scores) - - # Print results - print("\n" + "="*60) - print("MACRO F1 SCORE RESULTS") - print("="*60) - - print(f"\nAverage F1 Scores across {min(len(ground_truth_df), len(predicted_df), limit_rows or float('inf'))} tasks:") - for model in ['gemini', 'chatgpt', 'perplexity']: - print(f" {model.capitalize():<12}: {avg_f1_scores[model]:.4f}") - - print(f"\nNote: F1 scores calculated by comparing ground truth vs predicted presence lists for each model.") - if binary: - print(f" Each task's macro F1 is calculated across the 2 classes: Satisfied, Not Satisfied") - print(f" 'Partially Satisfied' labels were converted to 'Not Satisfied' for binary evaluation") - else: - print(f" Each task's macro F1 is calculated across the 3 classes: Satisfied, Partially Satisfied, Not Satisfied") - print(f" Ground truth data from: {ground_truth_path.name}") - print(f" Predicted data from: {predicted_path.name}") diff --git a/src/calculate_metrics/calculate_failure_breakdown.py b/src/calculate_metrics/calculate_failure_breakdown.py deleted file mode 100644 index f10208a..0000000 --- a/src/calculate_metrics/calculate_failure_breakdown.py +++ /dev/null @@ -1,283 +0,0 @@ -import pandas as pd -import json -from pathlib import Path -from collections import defaultdict - - -def load_data(data_path): - """Load the processed dataframe from parquet file. - - Args: - data_path: Path to the parquet file - - Returns: - pd.DataFrame: The loaded dataframe - """ - df = pd.read_parquet(data_path) - return df - - -def calculate_task_failure_breakdown(rubrics, presence_values, binary=False): - """Calculate failure breakdown by category for a single task and model. - - Args: - rubrics: List of rubric dictionaries (each with 'category', 'weight', etc.) 
- presence_values: List of presence values ('Satisfied', 'Partially Satisfied', 'Not Satisfied') - binary: If True, convert "Partially Satisfied" to "Not Satisfied" for binary evaluation - - Returns: - dict: Dictionary mapping categories to their failure ratios for this task - Only includes categories that have at least one failure - Returns None if there are no failures - """ - # Convert to binary labels if requested - if binary: - presence_values = ['Not Satisfied' if presence == 'Partially Satisfied' else presence - for presence in presence_values] - - # Count failures by category - failures_by_category = defaultdict(int) - total_failures = 0 - - # Iterate through each rubric and its presence value - for rubric, presence_value in zip(rubrics, presence_values): - if presence_value == 'Not Satisfied': - category = rubric['category'] - failures_by_category[category] += 1 - total_failures += 1 - - # If there are no failures, return None - if total_failures == 0: - return None - - # Calculate ratio for each category that has failures - # Only include categories with at least one failure - category_ratios = {} - for category, count in failures_by_category.items(): - ratio = count / total_failures - category_ratios[category] = ratio - - return category_ratios - - -def calculate_failure_breakdown_by_category(df, binary=False): - """Calculate failure rate breakdown by category for each model across all tasks. - - For each model and each task, calculates: - - The ratio of "Not Satisfied" rubrics per category divided by total "Not Satisfied" rubrics - - Then averages these ratios across all tasks. - - Args: - df: DataFrame containing rubrics and presence data - binary: If True, convert "Partially Satisfied" to "Not Satisfied" for binary evaluation - - Returns: - dict: Dictionary with failure breakdown for each model - """ - models = ['gemini', 'chatgpt', 'perplexity'] - - # Initialize storage for each model - model_results = {model: defaultdict(list) for model in models} - - # Track total tasks processed for each model (tasks with at least one failure) - tasks_with_failures = {model: 0 for model in models} - - # Iterate through all rows (tasks) in the dataframe - for idx, row in df.iterrows(): - # Parse rubrics and presence data - rubrics = json.loads(row['rubrics']) - presence = json.loads(row['final_presence']) - - # Process each model - for model in models: - # Get presence values for this model - presence_key = f'{model}_present' - presence_values = presence[presence_key]['values'] - - # Calculate failure breakdown for this task and model - task_breakdown = calculate_task_failure_breakdown(rubrics, presence_values, binary) - - # If there are failures in this task, store the results - if task_breakdown is not None: - tasks_with_failures[model] += 1 - - # Store the ratio for each category - for category, ratio in task_breakdown.items(): - model_results[model][category].append(ratio) - - # Calculate averages across all tasks - model_averages = {} - for model in models: - model_averages[model] = {} - for category, ratios in model_results[model].items(): - avg_ratio = sum(ratios) / len(ratios) if ratios else 0.0 - model_averages[model][category] = { - 'average_ratio': avg_ratio, - 'num_tasks': len(ratios) - } - model_averages[model]['_metadata'] = { - 'tasks_with_failures': tasks_with_failures[model], - 'total_tasks': len(df) - } - - return model_averages - - -def calculate_aggregate_failure_breakdown(df, binary=False): - """Calculate aggregate failure breakdown by category for each model across 
all tasks. - - Unlike calculate_failure_breakdown_by_category which averages per-task ratios, - this function aggregates all failures across all tasks and then calculates ratios. - - Args: - df: DataFrame containing rubrics and presence data - binary: If True, convert "Partially Satisfied" to "Not Satisfied" for binary evaluation - - Returns: - dict: Dictionary with aggregate failure breakdown for each model - """ - models = ['gemini', 'chatgpt', 'perplexity'] - - # Initialize storage for each model - model_results = {model: defaultdict(int) for model in models} - total_failures = {model: 0 for model in models} - - # Iterate through all rows (tasks) in the dataframe - for idx, row in df.iterrows(): - # Parse rubrics and presence data - rubrics = json.loads(row['rubrics']) - presence = json.loads(row['final_presence']) - - # Process each model - for model in models: - # Get presence values for this model - presence_key = f'{model}_present' - presence_values = presence[presence_key]['values'] - - # Convert to binary labels if requested - if binary: - presence_values = ['Not Satisfied' if presence == 'Partially Satisfied' else presence - for presence in presence_values] - - # Count failures by category - for rubric, presence_value in zip(rubrics, presence_values): - if presence_value == 'Not Satisfied': - category = rubric['category'] - model_results[model][category] += 1 - total_failures[model] += 1 - - # Calculate ratios - model_ratios = {} - for model in models: - model_ratios[model] = {} - total = total_failures[model] - - if total > 0: - for category, count in model_results[model].items(): - ratio = count / total - model_ratios[model][category] = { - 'count': count, - 'ratio': ratio - } - - model_ratios[model]['_metadata'] = { - 'total_failures': total, - 'total_tasks': len(df) - } - - return model_ratios - - -def print_results(model_averages, model_aggregate, binary=False): - """Print the results in a readable format. 
- - Args: - model_averages: Dictionary with per-task averaged failure breakdown for each model - model_aggregate: Dictionary with aggregate failure breakdown for each model - binary: If True, indicates binary evaluation mode for display purposes - """ - models = ['gemini', 'chatgpt', 'perplexity'] - - print("\n" + "="*100) - print("FAILURE RATE BREAKDOWN BY CATEGORY") - print("="*100) - print("\nFor each model and category, this shows:") - print(" - Avg Ratio: average per-task ratio (# 'Not Satisfied' in category / total 'Not Satisfied' per task)") - print(" - Agg Ratio: aggregate ratio across all tasks (total 'Not Satisfied' in category / total 'Not Satisfied')") - print(" - Tasks: number of tasks with at least one failure in that category\n") - - for model in models: - print("\n" + "-"*100) - print(f"MODEL: {model.upper()}") - print("-"*100) - - metadata_avg = model_averages[model]['_metadata'] - metadata_agg = model_aggregate[model]['_metadata'] - print(f"\nTasks with failures: {metadata_avg['tasks_with_failures']} / {metadata_avg['total_tasks']}") - print(f"Total failures across all tasks: {metadata_agg['total_failures']}") - print(f"\nFailure breakdown by category:\n") - - # Get all unique categories from both sources - all_categories = set() - for cat in model_averages[model].keys(): - if cat != '_metadata': - all_categories.add(cat) - for cat in model_aggregate[model].keys(): - if cat != '_metadata': - all_categories.add(cat) - - # Prepare data for sorting by aggregate ratio - category_data = [] - for category in all_categories: - avg_data = model_averages[model].get(category, {'average_ratio': 0.0, 'num_tasks': 0}) - agg_data = model_aggregate[model].get(category, {'ratio': 0.0, 'count': 0}) - category_data.append((category, avg_data, agg_data)) - - # Sort by aggregate ratio (descending) - category_data.sort(key=lambda x: x[2]['ratio'], reverse=True) - - # Print table header - print(f"{'Category':<40} {'Avg Ratio':<12} {'Agg Ratio':<12} {'Tasks'}") - print(f"{'-'*40} {'-'*12} {'-'*12} {'-'*8}") - - total_avg_ratio = 0.0 - total_agg_ratio = 0.0 - for category, avg_data, agg_data in category_data: - avg_ratio = avg_data['average_ratio'] - num_tasks = avg_data['num_tasks'] - agg_ratio = agg_data['ratio'] - total_avg_ratio += avg_ratio - total_agg_ratio += agg_ratio - print(f"{category:<40} {avg_ratio:>10.4f} {agg_ratio:>10.4f} {num_tasks:>6}") - - print(f"{'-'*40} {'-'*12} {'-'*12} {'-'*8}") - print(f"{'TOTAL':<40} {total_avg_ratio:>10.4f} {total_agg_ratio:>10.4f}") - print(f"\nNote: Avg Ratio total may not sum to 1.0 because ratios are averaged across tasks.") - print(f" Agg Ratio total should sum to 1.0 (aggregate calculation).") - - -if __name__ == "__main__": - # data_path = Path(__file__).parent.parent.parent / "data" / "processed_df" / "compiled_dataset.parquet" - data_path = Path(__file__).parent.parent.parent / "results" / "11_04" / "20251104_034416" / "processed_df" / "compiled_dataset.parquet" - - # Set binary flag - change this to True for binary evaluation (Partially Satisfied -> Not Satisfied) - binary = False - - print(f"Loading data from: {data_path}") - df = load_data(data_path) - print(f"Loaded {len(df)} tasks") - print(f"Evaluation mode: {'Binary' if binary else 'Ternary'}") - if binary: - print("(Converting 'Partially Satisfied' -> 'Not Satisfied')") - print() - - # Calculate failure breakdown by category (per-task average) - model_averages = calculate_failure_breakdown_by_category(df, binary) - - # Calculate aggregate failure breakdown (across all tasks) - 
model_aggregate = calculate_aggregate_failure_breakdown(df, binary) - - # Print combined results - print_results(model_averages, model_aggregate, binary) - diff --git a/src/calculate_metrics/calculate_final_score.py b/src/calculate_metrics/calculate_final_score.py deleted file mode 100644 index faadc41..0000000 --- a/src/calculate_metrics/calculate_final_score.py +++ /dev/null @@ -1,128 +0,0 @@ -import pandas as pd -import pdb -from pathlib import Path -import json - - -def calculate_weighted_score(weights, presence_list, verbose=False, binary=False): - """Calculate weighted score based on presence values. - - Args: - weights: List of weight values for each rubric - presence_list: List of presence values (Satisfied/Partially Satisfied/Not Satisfied) - verbose: Whether to print the score details - binary: If True, convert "Partially Satisfied" to "Not Satisfied" for binary evaluation - - Returns: - float: The calculated score (numerator/denominator) - """ - # Convert to binary labels if requested - if binary: - presence_list = ['Not Satisfied' if presence == 'Partially Satisfied' else presence - for presence in presence_list] - - # Map presence values to scores - score_map = { - 'Satisfied': 1.0, - 'Partially Satisfied': 0.5, - 'Not Satisfied': 0.0 - } - - # Calculate weighted numerator - numerator = sum(weight * score_map.get(presence, 0) - for weight, presence in zip(weights, presence_list)) - - # Calculate denominator (sum of positive weights) - denominator = sum(weight for weight in weights if weight > 0) - - # Calculate and return score - score = numerator / denominator if denominator > 0 else 0 - - if verbose: - print(f"Weighted Score: {score:.4f} (numerator: {numerator:.4f}, denominator: {denominator:.4f})") - - return score - - -def load_data(data_path): - """Load the processed dataframe from parquet file. - - Args: - data_path: Path to the parquet file - - Returns: - pd.DataFrame: The loaded dataframe - """ - df = pd.read_parquet(data_path) - return df - - -def calculate_model_scores(df, binary=False): - """Calculate weighted scores for all models across all rows in the dataframe. 
- - Args: - df: DataFrame containing rubrics and presence data - binary: If True, convert "Partially Satisfied" to "Not Satisfied" for binary evaluation - - Returns: - dict: Dictionary with lists of scores for each model (gemini, chatgpt, perplexity) - """ - # Initialize lists to store scores for each model - gemini_scores = [] - chatgpt_scores = [] - perplexity_scores = [] - - # Iterate through all rows in the dataframe - for idx, row in df.iterrows(): - # Parse rubrics and presence data - rubrics = json.loads(row['rubrics']) - presence = json.loads(row['final_presence']) - - # Extract rubric weights - rubric_weights = [rubric.get('weight') for rubric in rubrics] - - # Extract presence lists for each model - gemini_present = presence['gemini_present']['values'] - chatgpt_present = presence['chatgpt_present']['values'] - perplexity_present = presence['perplexity_present']['values'] - - # Calculate scores for each model - gemini_score = calculate_weighted_score(rubric_weights, gemini_present, binary=binary) - chatgpt_score = calculate_weighted_score(rubric_weights, chatgpt_present, binary=binary) - perplexity_score = calculate_weighted_score(rubric_weights, perplexity_present, binary=binary) - - # Append to lists - gemini_scores.append(gemini_score) - chatgpt_scores.append(chatgpt_score) - perplexity_scores.append(perplexity_score) - - return { - 'gemini': gemini_scores, - 'chatgpt': chatgpt_scores, - 'perplexity': perplexity_scores - } - -if __name__ == "__main__": - data_path = Path(__file__).parent.parent.parent / "results" / "11_04" / "20251104_034416" / "processed_df" / "compiled_dataset.parquet" - - # Set binary flag - change this to True for binary evaluation (Partially Satisfied -> Not Satisfied) - binary = False - - print(f"Loading data from: {data_path}") - df = load_data(data_path) - print(f"Loaded {len(df)} tasks") - print(f"Evaluation mode: {'Binary' if binary else 'Ternary'}") - if binary: - print("(Converting 'Partially Satisfied' -> 'Not Satisfied')") - - # Calculate scores for all models - scores = calculate_model_scores(df, binary) - - # Calculate and print averages - print(f"\nAverage Scores across {len(df)} rows:") - print(f"Gemini: {sum(scores['gemini']) / len(scores['gemini']):.4f}") - print(f"ChatGPT: {sum(scores['chatgpt']) / len(scores['chatgpt']):.4f}") - print(f"Perplexity: {sum(scores['perplexity']) / len(scores['perplexity']):.4f}") - - # pdb.set_trace() - diff --git a/src/evaluate_rubrics/evaluate_rubrics_batch.py b/src/evaluate_rubrics/evaluate_rubrics_batch.py deleted file mode 100644 index 96ef5ef..0000000 --- a/src/evaluate_rubrics/evaluate_rubrics_batch.py +++ /dev/null @@ -1,217 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple batch evaluation script for processing all tasks in compiled_dataset.csv - -This script: -1. Loads all tasks from compiled_dataset.csv -2. Calls evaluate_task_rubrics() for each task -3. Updates final_presence values with new evaluation results -4. 
Saves output to results//{timestamp}/processed_df/ in CSV and Parquet formats - -Usage: - python evaluate_rubrics_batch.py -""" - -import os -import sys -import json -import logging -import time -import asyncio -from pathlib import Path -import pandas as pd -from datetime import datetime -from tqdm import tqdm - -# Import the single-task evaluator -try: - from evaluate_rubrics_markitdown_onetask import evaluate_task_rubrics -except ImportError: - # Try adding the current directory to path - current_dir = Path(__file__).parent - sys.path.insert(0, str(current_dir)) - from evaluate_rubrics_markitdown_onetask import evaluate_task_rubrics - -# Setup logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -def generate_final_presence_format(results_df: pd.DataFrame) -> dict: - """Convert evaluation results to final_presence format - - Args: - results_df: Results DataFrame from evaluate_task_rubrics - - Returns: - Dictionary in final_presence format - """ - final_presence = {} - - # Get task name for error reporting - task_name = results_df['task_name'].iloc[0] if not results_df.empty else "Unknown" - - # Get unique models/PDFs - models = sorted(results_df['pdf'].unique()) if not results_df.empty else [] - - for model in models: - model_results = results_df[results_df['pdf'] == model] - - # Extract verdict values (should be in order of rubrics) - verdict_values = model_results['verdict'].tolist() - - # Count null/error values - null_count = len([v for v in verdict_values if v in ['Error', 'Unknown', None]]) - total_count = len(verdict_values) - - # Check that null_count must be 0 - if null_count > 0: - raise ValueError(f"Task '{task_name}' has {null_count} null/error evaluations for {model}. 
All evaluations must be successful.") - - # Store in the format expected - final_presence[f'{model}_present'] = { - 'values': verdict_values, - 'null_count': null_count, - 'total_count': total_count - } - - return final_presence - -async def evaluate_all_tasks(binary=False): - """Evaluate all tasks in compiled_dataset.csv and update final_presence values (async with parallelization) - - Args: - binary: If True, use binary prompts; if False, use ternary prompts (default: False) - - Results are automatically saved to results//{timestamp}/processed_df/ - - Returns: - DataFrame in same format as compiled_dataset.csv but with updated final_presence values - """ - - # Get base directory - base_dir = Path(__file__).parent.parent.parent - compiled_csv = base_dir / 'data' / 'processed_df' / 'compiled_dataset.csv' - - if not compiled_csv.exists(): - raise FileNotFoundError(f"Compiled dataset not found: {compiled_csv}") - - # Load all tasks once - df = pd.read_csv(compiled_csv) - logger.info(f"Found {len(df)} tasks to evaluate") - - # Evaluate each task and collect results - successful_task_rows = [] - evaluation_details = [] - - try: - for idx, task_row in tqdm(df.iterrows(), total=len(df), desc="Evaluating tasks"): - - task_name = task_row['task_name'] - logger.info(f"Processing task: {task_name}") - - task_start_time = time.time() - # Evaluate this task (pass task_row for efficiency - no need to reload CSV) - results_df = await evaluate_task_rubrics(save_results=False, task_row=task_row, binary=binary) - task_time = time.time() - task_start_time - - if results_df.empty: - logger.error(f"No results for task {task_name}") - raise ValueError(f"Task {task_name} returned empty results") - - # Generate final_presence format and create updated row - final_presence = generate_final_presence_format(results_df) - updated_row = task_row.copy() - updated_row['final_presence'] = json.dumps(final_presence) - - successful_task_rows.append(updated_row) - evaluation_details.append(results_df) - logger.info(f"Task {task_name} completed: {len(results_df)} evaluations in {task_time:.2f}s") - finally: - # Save results if we have any successful tasks - if successful_task_rows: - is_partial = len(successful_task_rows) < len(df) - if is_partial: - logger.warning(f"Saving partial results for {len(successful_task_rows)}/{len(df)} completed tasks") - else: - logger.info(f"Saving results for {len(successful_task_rows)} completed tasks") - output_df = pd.DataFrame(successful_task_rows) - - # Create directory structure: results//{timestamp}/processed_df/ - now = datetime.now() - date_dir = now.strftime("%m_%d") - timestamp = now.strftime("%Y%m%d_%H%M%S") - - results_dir = base_dir / 'results' / date_dir / timestamp / 'processed_df' - results_dir.mkdir(parents=True, exist_ok=True) - - # Save both CSV and Parquet versions - csv_file = results_dir / 'compiled_dataset.csv' - parquet_file = results_dir / 'compiled_dataset.parquet' - - output_df.to_csv(csv_file, index=False) - - try: - output_df.to_parquet(parquet_file, index=False) - except ImportError: - logger.warning("Parquet support not available. 
Install pyarrow: pip install pyarrow") - parquet_file = None - - # logger.info(f"Results saved to: {results_dir}") - # logger.info(f" CSV: {csv_file}") - # if parquet_file: - # logger.info(f" Parquet: {parquet_file}") - - # Create main output dataframe - output_df = pd.DataFrame(successful_task_rows) - # logger.info(f"Successfully evaluated {len(output_df)} tasks") - - # Print summary (create compiled_results only for statistics) - print("\n" + "="*60) - print("BATCH EVALUATION SUMMARY") - print("="*60) - - print(f"\nSuccessfully Processed: {len(output_df)} tasks") - - if evaluation_details: - # Only create compiled_results when we need detailed statistics - compiled_results = pd.concat(evaluation_details, ignore_index=True) - total_evaluations = len(compiled_results) - total_cost = compiled_results['cost'].sum() - total_tokens = compiled_results['tokens_used'].sum() - - print(f"Total Evaluations: {total_evaluations:,}") - print(f"Total Cost: ${total_cost:.4f}") - print(f"Total Tokens: {total_tokens:,}") - - # Per-model summary - print(f"\nPer-Model Results:") - for model in compiled_results['pdf'].unique(): - model_data = compiled_results[compiled_results['pdf'] == model] - verdict_counts = model_data['verdict'].value_counts() - print(f"\n{model.upper()}: {len(model_data):,} evaluations, Avg Score: {model_data['score'].mean():.3f}") - for verdict, count in verdict_counts.items(): - print(f" {verdict}: {count}") - - return output_df - -async def main(): - """Main execution function""" - - binary = False # Set to True for binary prompts, False for ternary prompts - - try: - start_time = time.time() - output_df = await evaluate_all_tasks(binary=binary) - total_time = time.time() - start_time - - print(f"\nTotal Time: {total_time:.2f}s ({total_time/60:.2f} minutes)") - # logger.info("Batch evaluation completed successfully!") - except Exception as e: - logger.error(f"Batch evaluation failed: {str(e)}") - sys.exit(1) - -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file diff --git a/src/evaluate_rubrics/evaluate_rubrics_markitdown_onetask.py b/src/evaluate_rubrics/evaluate_rubrics_markitdown_onetask.py deleted file mode 100644 index ef06c80..0000000 --- a/src/evaluate_rubrics/evaluate_rubrics_markitdown_onetask.py +++ /dev/null @@ -1,745 +0,0 @@ -#!/usr/bin/env python3 -""" -Enhanced rubric evaluation script with improved prompts and PDF processing. 
-Key improvements: -- Uses markdown conversion for better text processing -- Chunking strategy for large documents -- Improved prompts with better structure -- Retry logic and error handling -- Token optimization -- Comprehensive logging -""" - -import os -import sys -import json -import time -import logging -import asyncio -from pathlib import Path -from typing import Dict, List, Optional, Tuple, Any -from dataclasses import dataclass, asdict -import pandas as pd -import numpy as np -from datetime import datetime - -# Setup logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# Import LiteLLM client -try: - import litellm - from tqdm import tqdm - litellm.suppress_debug_info = True - litellm.set_verbose = False - # Suppress LiteLLM and httpx logging - logging.getLogger('LiteLLM').setLevel(logging.ERROR) - logging.getLogger('httpx').setLevel(logging.ERROR) - logging.getLogger('openai').setLevel(logging.ERROR) - OPENAI_AVAILABLE = True -except ImportError: - OPENAI_AVAILABLE = False - logger.error("LiteLLM library not available. Install with: pip install litellm") - -# Import sklearn for metrics -try: - from sklearn.metrics import f1_score, classification_report, confusion_matrix - SKLEARN_AVAILABLE = True -except ImportError: - SKLEARN_AVAILABLE = False - logger.warning("scikit-learn not available for metrics calculation") - -@dataclass -class EvaluationResult: - """Data class for evaluation results""" - rubric_title: str - pdf_name: str - verdict: str - score: float - confidence: float - reasoning: str - tokens_used: int - cost: float - duration: float - success: bool - error: Optional[str] = None - -class ImprovedPromptTemplates: - """Improved prompt templates for rubric evaluation""" - - def __init__(self, prompt_type): - prompts_dir = Path(__file__).parent / 'prompts' / prompt_type - self.SYSTEM_PROMPT = (prompts_dir / 'system_prompt.txt').read_text(encoding='utf-8') - self.USER_PROMPT_TEMPLATE = (prompts_dir / 'user_prompt_template.txt').read_text(encoding='utf-8') - - CHUNK_PROMPT_TEMPLATE = """You are evaluating a large document in chunks. This is chunk {chunk_num} of {total_chunks}. - -## Previous Context Summary -{context_summary} - -## Current Chunk Content -{chunk_content} - -## Rubric Criterion -**Title**: {rubric_title} -**Category**: {rubric_category} - -Please evaluate this chunk for evidence related to the rubric criterion. 
Your response should be in JSON format: - -```json -{{ - "relevant_evidence": ["Evidence point 1", "Evidence point 2", ...], - "partial_satisfaction": true/false, - "confidence_for_chunk": [0.0-1.0], - "notes": "Any important observations" -}} -```""" - -class RubricEvaluator: - """Enhanced rubric evaluation system""" - - def __init__(self, api_key: str = None, base_url: str = None, model: str = "gpt-4o", binary: bool = False, max_concurrent: int = 20): - """Initialize the evaluator - - Args: - api_key: OpenAI API key - base_url: API base URL - model: Model to use - binary: If True, use binary prompts; if False, use ternary prompts (default: False) - max_concurrent: Maximum number of concurrent API calls (default: 20) - """ - if not OPENAI_AVAILABLE: - raise ImportError("LiteLLM library required") - - self.model = model - self.max_concurrent = max_concurrent - self.semaphore = None # Will be initialized when needed in async context - prompt_type = 'binary' if binary else 'ternary' - self.prompts = ImprovedPromptTemplates(prompt_type=prompt_type) - - # Load .env file if api_key not provided and OPENAI_API_KEY not in environment - if not api_key and not os.getenv("OPENAI_API_KEY"): - # Try to find .env file in public_release_experiments directory - # Script is at: src/evaluate_rubrics/evaluate_rubrics_markitdown_onetask.py - env_file = Path(__file__).parent.parent.parent / '.env' - if env_file.exists(): - # logger.info(f"Loading environment variables from: {env_file}") - with open(env_file, 'r') as f: - for line in f: - line = line.strip() - if line and not line.startswith('#') and '=' in line: - key, value = line.split('=', 1) - os.environ[key.strip()] = value.strip() - - # Store API configuration for LiteLLM - self.api_key = api_key or os.getenv("OPENAI_API_KEY") - self.base_url = base_url - - # Token limits by model - self.token_limits = { - "gpt-5": 200000, - "litellm_proxy/gemini/gemini-2.5-pro-preview-06-05": 200000, - "gpt-4o": 128000, - "gpt-4.1": 128000, - } - - # Pricing per 1M tokens - self.pricing = { - "gpt-5": {"input": 1.25, "output": 10.0}, - "litellm_proxy/gemini/gemini-2.5-pro-preview-06-05": {"input": 1.25, "output": 10.0}, - "gpt-4o": {"input": 2.5, "output": 10.0}, - "gpt-4.1": {"input": 2.0, "output": 8.0}, - } - - # Cache for processed documents - self.document_cache = {} - - def load_document(self, file_path: Path, use_cache: bool = True) -> str: - """Load document content (markdown or PDF) - - Args: - file_path: Path to document - use_cache: Whether to use cached content - - Returns: - Document content as string - """ - # Check cache - cache_key = str(file_path) - if use_cache and cache_key in self.document_cache: - # logger.info(f"Using cached content for {file_path.name}") - return self.document_cache[cache_key] - - # Try markdown version first - markdown_path = file_path.with_suffix('.md') - if markdown_path.exists(): - # logger.info(f"Loading markdown: {markdown_path}") - content = markdown_path.read_text(encoding='utf-8') - else: - # Fallback to PDF text extraction - # logger.warning(f"Markdown not found, attempting PDF extraction: {file_path}") - content = self._extract_pdf_text(file_path) - - # Cache the content - if use_cache: - self.document_cache[cache_key] = content - - return content - - def _extract_pdf_text(self, pdf_path: Path) -> str: - """Extract text from PDF using fallback method - - Args: - pdf_path: Path to PDF - - Returns: - Extracted text - """ - try: - import PyPDF2 - with open(pdf_path, 'rb') as file: - reader = PyPDF2.PdfReader(file) - text 
= "" - for page in reader.pages: - text += page.extract_text() + "\n" - return text - except ImportError: - logger.error("PyPDF2 not available for PDF extraction") - return "" - except Exception as e: - logger.error(f"Failed to extract PDF text: {e}") - return "" - - def chunk_document(self, content: str, max_tokens: int = 100000) -> List[str]: - """Split document into chunks for processing - - Args: - content: Document content - max_tokens: Maximum tokens per chunk - - Returns: - List of content chunks - """ - # Rough estimate: 1 token โ‰ˆ 4 characters - max_chars = max_tokens * 4 - - if len(content) <= max_chars: - return [content] - - # Split by paragraphs first - paragraphs = content.split('\n\n') - - chunks = [] - current_chunk = "" - - for para in paragraphs: - if len(current_chunk) + len(para) < max_chars: - current_chunk += para + "\n\n" - else: - if current_chunk: - chunks.append(current_chunk.strip()) - current_chunk = para + "\n\n" - - if current_chunk: - chunks.append(current_chunk.strip()) - - # logger.info(f"Document split into {len(chunks)} chunks") - return chunks - - async def evaluate_rubric(self, rubric: Dict[str, Any], document_content: str, - use_chunking: bool = True) -> EvaluationResult: - """Evaluate a single rubric against document content - - Args: - rubric: Rubric information - document_content: Document content - use_chunking: Whether to use chunking for large documents - - Returns: - EvaluationResult object - """ - start_time = time.time() - - # Use semaphore to limit concurrent API calls - async with self.semaphore: - try: - # Check if chunking is needed - estimated_tokens = len(document_content) // 4 - context_limit = self.token_limits.get(self.model) - 4096 # Reserve tokens for response - - if use_chunking and estimated_tokens > context_limit: - result = await self._evaluate_with_chunks(rubric, document_content) - else: - result = await self._evaluate_single(rubric, document_content) - - result.duration = time.time() - start_time - return result - - except Exception as e: - logger.error(f"Evaluation failed: {e}") - return EvaluationResult( - rubric_title=rubric.get('title', 'Unknown'), - pdf_name='', - verdict='Error', - score=0.0, - confidence=0.0, - reasoning=f"Evaluation failed: {str(e)}", - tokens_used=0, - cost=0.0, - duration=time.time() - start_time, - success=False, - error=str(e) - ) - - async def _evaluate_single(self, rubric: Dict[str, Any], content: str) -> EvaluationResult: - """Evaluate rubric on single content (no chunking) - - Args: - rubric: Rubric information - content: Document content - - Returns: - EvaluationResult - """ - # Prepare prompt - user_prompt = self.prompts.USER_PROMPT_TEMPLATE.format( - document_content=content, - rubric_title=rubric.get('title', ''), - rubric_category=rubric.get('category', 'General'), - rubric_weight=rubric.get('weight', 1.0) - ) - - # Make API call with retry logic - for attempt in range(3): - try: - response = await litellm.acompletion( - model=self.model, - messages=[ - {"role": "system", "content": self.prompts.SYSTEM_PROMPT}, - {"role": "user", "content": user_prompt} - ], - # temperature=0.1, # Low temperature for consistency - max_tokens=4096, - response_format={"type": "json_object"}, # Ensure JSON response - api_key=self.api_key, - base_url=self.base_url - ) - - # Parse response - response_text = response.choices[0].message.content - result_data = json.loads(response_text) - - # Calculate cost - tokens_used = response.usage.total_tokens if response.usage else 0 - cost = 
self._calculate_cost(response.usage) - - return EvaluationResult( - rubric_title=rubric.get('title', ''), - pdf_name='', - verdict=result_data.get('verdict', 'Unknown'), - score=float(result_data.get('score', 0.0)), - confidence=float(result_data.get('confidence', 0.0)), - reasoning=result_data.get('reasoning', ''), - tokens_used=tokens_used, - cost=cost, - duration=0, - success=True - ) - - except json.JSONDecodeError as e: - logger.warning(f"JSON decode error on attempt {attempt + 1}: {e}") - if attempt == 2: - raise - except Exception as e: - logger.warning(f"API call failed on attempt {attempt + 1}: {e}") - if attempt == 2: - raise - await asyncio.sleep(2 ** attempt) # Exponential backoff - - async def _evaluate_with_chunks(self, rubric: Dict[str, Any], content: str) -> EvaluationResult: - """Evaluate rubric using document chunks - - Args: - rubric: Rubric information - content: Full document content - - Returns: - EvaluationResult - """ - chunks = self.chunk_document(content) - # logger.info(f"Evaluating rubric across {len(chunks)} chunks") - - # Process chunks and collect evidence - all_evidence = [] - chunk_results = [] - total_tokens = 0 - total_cost = 0.0 - - for i, chunk in enumerate(chunks, 1): - # logger.info(f"Processing chunk {i}/{len(chunks)}") - - # Evaluate chunk - chunk_prompt = self.prompts.CHUNK_PROMPT_TEMPLATE.format( - chunk_num=i, - total_chunks=len(chunks), - context_summary="Previous chunks evaluated" if i > 1 else "First chunk", - chunk_content=chunk, - rubric_title=rubric.get('title', ''), - rubric_category=rubric.get('category', 'General') - ) - - response = await litellm.acompletion( - model=self.model, - messages=[ - {"role": "system", "content": "You are evaluating document chunks for rubric criteria."}, - {"role": "user", "content": chunk_prompt} - ], - # temperature=0.1, - max_tokens=4096, - response_format={"type": "json_object"}, - api_key=self.api_key, - base_url=self.base_url - ) - - chunk_data = json.loads(response.choices[0].message.content) - chunk_results.append(chunk_data) - - if chunk_data.get('relevant_evidence'): - all_evidence.extend(chunk_data['relevant_evidence']) - - total_tokens += response.usage.total_tokens if response.usage else 0 - total_cost += self._calculate_cost(response.usage) - - # Synthesize final evaluation - synthesis_prompt = f"""Based on the following evidence collected from the document: - -Evidence points: -{json.dumps(all_evidence, indent=2)} - -Evaluate whether the document satisfies the rubric criterion: -**Title**: {rubric.get('title', '')} -**Category**: {rubric.get('category', 'General')} - -Provide your final evaluation in JSON format: -{{ - "verdict": "[Not Satisfied/Partially Satisfied/Satisfied]", - "score": [0.0/0.5/1.0], - "confidence": [0.0-1.0], - "reasoning": "Synthesis of evidence" -}}""" - - final_response = await litellm.acompletion( - model=self.model, - messages=[ - {"role": "system", "content": self.prompts.SYSTEM_PROMPT}, - {"role": "user", "content": synthesis_prompt} - ], - # temperature=0.1, - max_tokens=4096, - response_format={"type": "json_object"}, - api_key=self.api_key, - base_url=self.base_url - ) - - final_data = json.loads(final_response.choices[0].message.content) - total_tokens += final_response.usage.total_tokens if final_response.usage else 0 - total_cost += self._calculate_cost(final_response.usage) - - return EvaluationResult( - rubric_title=rubric.get('title', ''), - pdf_name='', - verdict=final_data.get('verdict', 'Unknown'), - score=float(final_data.get('score', 0.0)), - 
confidence=float(final_data.get('confidence', 0.0)), - reasoning=final_data.get('reasoning', ''), - tokens_used=total_tokens, - cost=total_cost, - duration=0, - success=True - ) - - def _calculate_cost(self, usage: Any) -> float: - """Calculate API call cost - - Args: - usage: Usage information from API - - Returns: - Cost in USD - """ - if not usage: - return 0.0 - - model_pricing = self.pricing.get(self.model) - - input_tokens = getattr(usage, 'prompt_tokens', 0) - output_tokens = getattr(usage, 'completion_tokens', 0) - - input_cost = (input_tokens / 1_000_000) * model_pricing["input"] - output_cost = (output_tokens / 1_000_000) * model_pricing["output"] - - return input_cost + output_cost - - async def evaluate_all_rubrics(self, rubrics: List[Dict], pdf_paths: Dict[str, Path], - save_results: bool = True) -> pd.DataFrame: - """Evaluate all rubrics against all PDFs (with parallelization) - - Args: - rubrics: List of rubric dictionaries - pdf_paths: Dictionary mapping PDF names to paths - save_results: Whether to save results to file - - Returns: - Results dataframe - """ - # Initialize semaphore for concurrent request limiting - self.semaphore = asyncio.Semaphore(self.max_concurrent) - - results = [] - - # Track overall progress - total_evaluations = len(rubrics) * len(pdf_paths) - # logger.info(f"Starting evaluation: {len(rubrics)} rubrics ร— {len(pdf_paths)} PDFs = {total_evaluations} evaluations") - - for pdf_name, pdf_path in pdf_paths.items(): - # logger.info(f"\nProcessing PDF: {pdf_name}") - - # Load document once - document_content = self.load_document(pdf_path) - - if not document_content: - logger.error(f"Failed to load content for {pdf_name}") - continue - - # logger.info(f"Document loaded: {len(document_content)} characters") - - # Evaluate all rubrics in parallel for this PDF - pbar = tqdm(total=len(rubrics), desc=f"Evaluating {pdf_name}") - - # Create tasks for parallel evaluation - async def evaluate_and_update(rubric): - result = await self.evaluate_rubric(rubric, document_content) - result.pdf_name = pdf_name - pbar.update(1) - pbar.set_postfix({'verdict': result.verdict, 'score': result.score}) - return result - - # Run all rubric evaluations in parallel - evaluation_results = await asyncio.gather(*[evaluate_and_update(rubric) for rubric in rubrics]) - pbar.close() - - # Add all results - for result in evaluation_results: - results.append({ - 'pdf': pdf_name, - 'rubric_title': result.rubric_title, - 'verdict': result.verdict, - 'score': result.score, - 'confidence': result.confidence, - 'reasoning': result.reasoning[:500], # Truncate for dataframe - 'tokens_used': result.tokens_used, - 'cost': result.cost, - 'duration': result.duration, - 'success': result.success, - 'error': result.error - }) - - # Create dataframe - results_df = pd.DataFrame(results) - - # Save results - if save_results: - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - results_file = Path(f"evaluation_results_{timestamp}.csv") - results_df.to_csv(results_file, index=False) - logger.info(f"\nResults saved to: {results_file}") - - return results_df - -async def evaluate_task_rubrics(task_name: str = None, save_results: bool = False, task_row: pd.Series = None, compiled_df: pd.DataFrame = None, binary: bool = False) -> pd.DataFrame: - """Evaluate rubrics for a specific task (async with parallelization) - - Args: - task_name: Name of the task to evaluate (hash string like "683a58c9a7e7fe4e7695846f") - save_results: Whether to save detailed results to file - task_row: Optional pandas Series with 
task data (for efficiency) - compiled_df: Optional compiled dataset DataFrame (for efficiency) - binary: If True, use binary prompts; if False, use ternary prompts (default: False) - - Returns: - Results dataframe - - Raises: - FileNotFoundError: If compiled dataset is not found - ValueError: If task is not found or PDF files are missing - """ - # Configuration - go up to public_release_experiments directory - # Script is at: src/evaluate_rubrics/evaluate_rubrics_markitdown_onetask.py - base_dir = Path(__file__).parent.parent.parent - - # Handle different calling patterns for efficiency - if task_row is not None: - # Most efficient: task_row already provided - task_name = task_row['task_name'] - # logger.info(f"Processing task: {task_name} (using provided task_row)") - elif compiled_df is not None and task_name is not None: - # Efficient: search in provided DataFrame - task_rows = compiled_df[compiled_df['task_name'] == task_name] - if len(task_rows) == 0: - raise ValueError(f"Task '{task_name}' not found in provided DataFrame") - task_row = task_rows.iloc[0] - # logger.info(f"Processing task: {task_name} (using provided compiled_df)") - elif task_name is not None: - # Backward compatibility: load CSV file - compiled_csv = base_dir / 'data' / 'processed_df' / 'compiled_dataset.csv' - if not compiled_csv.exists(): - raise FileNotFoundError(f"Compiled dataset not found: {compiled_csv}. Run extract_rubrics_dataset.py first to generate compiled_dataset.csv") - - # Read the CSV and find the specific task - df = pd.read_csv(compiled_csv) - # logger.info(f"Loaded compiled dataset with {len(df)} tasks") - - if len(df) == 0: - raise ValueError("No tasks found in compiled dataset") - - # Find the specific task - task_rows = df[df['task_name'] == task_name] - if len(task_rows) != 1: - raise ValueError(f"Task '{task_name}' not found in compiled dataset. 
Available tasks: {list(df['task_name'].head())}") - - task_row = task_rows.iloc[0] - # logger.info(f"Processing task: {task_name} (loaded from CSV)") - else: - raise ValueError("Must provide either task_name, or task_row, or (task_name + compiled_df)") - - # Parse rubrics from JSON string - rubrics = json.loads(task_row['rubrics']) - # logger.info(f"Loaded {len(rubrics)} rubrics") - - # Parse PDF paths from JSON string (not used directly, but kept for reference) - pdf_paths_data = json.loads(task_row['pdf_paths']) - - # Setup PDF paths - PDFs are organized by task_name directory - pdf_base_dir = base_dir / 'data' / 'PDFs' - task_pdf_dir = pdf_base_dir / task_name - - if not task_pdf_dir.exists(): - raise ValueError(f"PDF directory not found for task {task_name}: {task_pdf_dir}") - - pdf_paths = { - 'gemini': task_pdf_dir / 'gemini.pdf', - 'chatgpt': task_pdf_dir / 'chatgpt.pdf', - 'perplexity': task_pdf_dir / 'perplexity.pdf' - } - - # Verify ALL PDF files exist - missing_pdfs = [] - for model, path in pdf_paths.items(): - if not path.exists(): - missing_pdfs.append(f"{model}: {path}") - # else: - # logger.info(f"Found PDF for {model}: {path}") - - if missing_pdfs: - raise ValueError(f"Missing required PDF files for task {task_name}:\n" + "\n".join(missing_pdfs) + - f"\nAll three PDFs (gemini, chatgpt, perplexity) are required.") - - # Initialize evaluator - # You'll need to set your API key here or in environment variable - evaluator = RubricEvaluator( - api_key=os.getenv("OPENAI_API_KEY"), - base_url="https://example.com", - model="gpt-5", - binary=binary - ) - - # Run evaluation (async) - results_df = await evaluator.evaluate_all_rubrics( - rubrics=rubrics, - pdf_paths=pdf_paths, - save_results=save_results - ) - - # Add task_name to results - results_df['task_name'] = task_name - - return results_df - -async def main(): - """Main execution function""" - - # Hardcoded task name - change this to evaluate different tasks - TASK_NAME = "683a58c9a7e7fe4e7695846f" # First task from compiled dataset - binary = False # Set to True for binary prompts, False for ternary prompts - - try: - # Evaluate the specific task - results_df = await evaluate_task_rubrics(TASK_NAME, save_results=False, binary=binary) - - # Display summary - print("\n" + "="*60) - print(f"EVALUATION SUMMARY FOR TASK: {TASK_NAME}") - print("="*60) - - # Overall statistics - total_evaluations = len(results_df) - successful = results_df['success'].sum() - total_cost = results_df['cost'].sum() - total_tokens = results_df['tokens_used'].sum() - avg_confidence = results_df['confidence'].mean() - - print(f"\nTotal Evaluations: {total_evaluations}") - print(f"Successful: {successful}/{total_evaluations}") - print(f"Total Cost: ${total_cost:.4f}") - print(f"Total Tokens: {total_tokens:,}") - print(f"Average Confidence: {avg_confidence:.2%}") - - # Per-PDF summary - print("\nPer-PDF Results:") - for pdf_name in results_df['pdf'].unique(): - pdf_data = results_df[results_df['pdf'] == pdf_name] - avg_score = pdf_data['score'].mean() - - verdict_counts = pdf_data['verdict'].value_counts() - print(f"\n{pdf_name.upper()}:") - print(f" Average Score: {avg_score:.3f}") - print(f" Verdict Distribution:") - for verdict, count in verdict_counts.items(): - print(f" {verdict}: {count}") - - # Save detailed report - # report_file = Path(f"evaluation_report_{TASK_NAME}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json") - report_data = { - 'task_name': TASK_NAME, - 'summary': { - 'total_evaluations': total_evaluations, - 'successful': successful, - 
'total_cost': total_cost, - 'total_tokens': total_tokens, - 'average_confidence': avg_confidence - }, - 'results': results_df.to_dict(orient='records') - } - - # with open(report_file, 'w') as f: - # json.dump(report_data, f, indent=2, default=str) - - # logger.info(f"\nDetailed report saved to: {report_file}") - - except (FileNotFoundError, ValueError) as e: - logger.error(f"Error: {e}") - sys.exit(1) - except Exception as e: - logger.error(f"Unexpected error: {e}") - sys.exit(1) - -if __name__ == "__main__": - if not OPENAI_AVAILABLE: - print("LiteLLM library is required. Install with: pip install litellm") - sys.exit(1) - - asyncio.run(main()) diff --git a/src/extract_rubrics/extract_rubrics_batch.py b/src/extract_rubrics/extract_rubrics_batch.py deleted file mode 100644 index 055927c..0000000 --- a/src/extract_rubrics/extract_rubrics_batch.py +++ /dev/null @@ -1,230 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to process all evaluation CSV files and compile them into a dataset. -Iterates through data/raw_csvs directory and extracts rubrics, prompts, PDFs, and presence data. -""" - -import pandas as pd -import logging -from pathlib import Path -from typing import List, Dict, Any -from dataclasses import asdict -import json -from tqdm import tqdm - -from extract_rubrics_markitdown_onetask import RubricExtractor - -# Setup logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - - -class DatasetCompiler: - """Class to compile multiple CSV extractions into a dataset""" - - def __init__(self, base_dir: str = None): - """Initialize the DatasetCompiler - - Args: - base_dir: Base directory for operations (should be public_release_experiments) - """ - if base_dir is None: - # Go up 3 levels: extract_rubrics_dataset.py -> extract_rubrics -> src -> public_release_experiments - self.base_dir = Path(__file__).parent.parent.parent - else: - self.base_dir = Path(base_dir) - - self.extractor = RubricExtractor(base_dir=str(self.base_dir)) - self.raw_csvs_dir = self.base_dir / 'data' / 'raw_csvs' - # self.output_dir = self.base_dir / 'data' / 'compiled' - # self.output_dir.mkdir(parents=True, exist_ok=True) - - def find_all_csvs(self) -> List[Path]: - """Find all CSV files in the raw_csvs directory - - Returns: - List of paths to CSV files - """ - if not self.raw_csvs_dir.exists(): - logger.error(f"Raw CSVs directory not found: {self.raw_csvs_dir}") - return [] - - csv_files = list(self.raw_csvs_dir.rglob('*.csv')) - logger.info(f"Found {len(csv_files)} CSV files") - return csv_files - - def process_single_csv(self, csv_path: Path) -> Dict[str, Any]: - """Process a single CSV file and return structured data - - Args: - csv_path: Path to CSV file - - Returns: - Dictionary with processed data or error information - """ - # Get relative path from base_dir for processing - try: - relative_path = csv_path.relative_to(self.base_dir) - except ValueError: - # If csv_path is not relative to base_dir, use absolute path - relative_path = csv_path - - result = { - 'csv_file': str(relative_path), - 'csv_filename': csv_path.name, - 'success': False, - 'error': None, - 'task_name': None, - 'prompt': None, - 'rubrics': None, - 'rubrics_count': 0, - 'pdf_paths': None, - 'final_presence': None, - } - - try: - logger.info(f"Processing: {csv_path.name}") - - # Process the task - task_result = self.extractor.process_task(str(relative_path)) - - # Check if there was an error in processing - if 'error' in task_result: - 
result['error'] = task_result['error'] - result['success'] = False - return result - - # Extract and structure the data - result['task_name'] = task_result['task_name'] - result['prompt'] = task_result['prompt'] - result['rubrics'] = [asdict(r) for r in task_result['rubrics']] # Convert to dict for serialization - result['rubrics_count'] = len(task_result['rubrics']) - result['pdf_paths'] = task_result['pdf_path'] # Includes path and error (error=None means success) - result['final_presence'] = task_result['final_presence'] - result['success'] = True - - logger.info(f"โœ“ Successfully processed {csv_path.name} ({result['rubrics_count']} rubrics)") - - except Exception as e: - result['error'] = str(e) - logger.error(f"โœ— Failed to process {csv_path.name}: {e}") - - return result - - def compile_dataset(self) -> pd.DataFrame: - """Compile all CSVs into a single dataset - - Returns: - DataFrame with compiled data (only successful tasks) - """ - csv_files = self.find_all_csvs() - - if not csv_files: - logger.warning("No CSV files found to process") - return pd.DataFrame() - - logger.info(f"Starting to process {len(csv_files)} CSV files...") - - successful_results = [] - failed_results = [] - - for csv_path in tqdm(csv_files, desc="Processing CSVs"): - result = self.process_single_csv(csv_path) - - if result['success']: - # Only keep the columns we want for successful tasks - successful_results.append({ - 'csv_filename': result['csv_filename'], - 'task_name': result['task_name'], - 'prompt': result['prompt'], - 'rubrics': result['rubrics'], - 'rubrics_count': result['rubrics_count'], - 'pdf_paths': result['pdf_paths'], - 'final_presence': result['final_presence'] - }) - else: - # Track failed tasks separately - failed_results.append({ - 'csv_filename': result['csv_filename'], - 'error': result['error'] - }) - - # Create DataFrame with only successful results - df = pd.DataFrame(successful_results) - - # Log summary statistics - success_count = len(successful_results) - error_count = len(failed_results) - - logger.info("\n" + "="*60) - logger.info("COMPILATION COMPLETE") - logger.info("="*60) - logger.info(f"Total files: {len(csv_files)}") - logger.info(f"Successful: {success_count}") - logger.info(f"Failed: {error_count}") - - # Print errors separately if any - if error_count > 0: - print("\n" + "="*60) - print("TASKS WITH ERRORS") - print("="*60) - for failed in failed_results: - print(f"\n{failed['csv_filename']}:") - print(f" Error: {failed['error']}") - print("\n" + "="*60) - - # Save the compiled dataframe - processed_dir = self.base_dir / 'data' / 'processed_df' - processed_dir.mkdir(parents=True, exist_ok=True) - - # Save as both CSV and parquet - csv_path = processed_dir / 'compiled_dataset.csv' - parquet_path = processed_dir / 'compiled_dataset.parquet' - - # For CSV, serialize complex columns - df_for_csv = df.copy() - for col in ['rubrics', 'pdf_paths', 'final_presence']: - if col in df_for_csv.columns: - df_for_csv[col] = df_for_csv[col].apply(lambda x: json.dumps(x) if x is not None else None) - df_for_csv.to_csv(csv_path, index=False) - logger.info(f"\nSaved CSV to: {csv_path}") - - # For Parquet, serialize complex columns - df_for_parquet = df.copy() - for col in ['rubrics', 'pdf_paths', 'final_presence']: - if col in df_for_parquet.columns: - df_for_parquet[col] = df_for_parquet[col].apply(lambda x: json.dumps(x) if x is not None else None) - df_for_parquet.to_parquet(parquet_path, index=False) - logger.info(f"Saved Parquet to: {parquet_path}") - - return df - - -def main(): - 
"""Main function""" - compiler = DatasetCompiler() - df = compiler.compile_dataset() - - # Display summary - print("\n" + "="*60) - print("DATASET SUMMARY") - print("="*60) - print(f"Total records: {len(df)}") - - if len(df) > 0: - print(f"\nAverage rubrics per task: {df['rubrics_count'].mean():.1f}") - print(f"Total rubrics extracted: {df['rubrics_count'].sum()}") - - # Show a sample - print("\nSample records:") - print(df[['csv_filename', 'task_name', 'rubrics_count']].head(10).to_string()) - - return df - - -if __name__ == "__main__": - main() - diff --git a/src/extract_rubrics/extract_rubrics_markitdown_onetask.py b/src/extract_rubrics/extract_rubrics_markitdown_onetask.py deleted file mode 100644 index aeacea4..0000000 --- a/src/extract_rubrics/extract_rubrics_markitdown_onetask.py +++ /dev/null @@ -1,525 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to process evaluation CSV files and extract rubrics. -Changes: -- PDF to Markdown conversion using markitdown -- Caching of converted documents - to save redoing -- More detailed logging -""" - -import pandas as pd -import os -import requests -import json -import logging -import hashlib -import pdb -import shutil -from pathlib import Path -from typing import Dict, List, Optional, Tuple, Any -from urllib.parse import urlparse -from dataclasses import dataclass, asdict - -# Setup logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -try: - from markitdown import MarkItDown - MARKITDOWN_AVAILABLE = True -except ImportError: - MARKITDOWN_AVAILABLE = False - logger.warning("MarkItDown not available. Install with: pip install markitdown") - -@dataclass -class RubricItem: - """Data class for rubric items""" - title: str - weight: float - category: str - row_index: int - -@dataclass -class PDFLinks: - """Data class for PDF links""" - gemini_pdf: Optional[str] - chatgpt_pdf: Optional[str] - perplexity_pdf: Optional[str] - -@dataclass -class FinalPresence: - """Data class for final presence""" - gemini_present: Optional[Dict[str, Any]] - chatgpt_present: Optional[Dict[str, Any]] - perplexity_present: Optional[Dict[str, Any]] - -class RubricExtractor: - """Class to handle rubric extraction and PDF processing""" - - def __init__(self, base_dir: Optional[str] = None): - """Initialize the RubricExtractor - - Args: - base_dir: Base directory for operations - """ - if base_dir is None: - # Go up 3 levels: extract_rubrics_markitdown.py -> extract_rubrics -> src -> public_release_experiments - self.base_dir = Path(__file__).parent.parent.parent - else: - self.base_dir = Path(base_dir) - - self.cache_dir = self.base_dir / 'cache' - self.data_dir = self.base_dir / 'data' - self.cache_dir.mkdir(exist_ok=True) - self.data_dir.mkdir(exist_ok=True) - - if MARKITDOWN_AVAILABLE: - self.md_converter = MarkItDown() - else: - self.md_converter = None - - def load_dataframe(self, csv_filename: str) -> Dict[str, Any]: - """Load CSV file as dataframe with error handling - - Args: - csv_filename: Name of the CSV file - - Returns: - Dictionary with 'data' (dataframe) and 'error' keys - """ - csv_path = self.base_dir / csv_filename - - if not csv_path.exists(): - return {'data': None, 'error': f"CSV file not found: {csv_path}"} - - try: - df = pd.read_csv(csv_path) - logger.info(f"Loaded dataframe with shape: {df.shape}") - - if df is None or len(df) == 0: - return {'data': None, 'error': "Dataframe is empty"} - - return {'data': df, 'error': None} - except Exception as 
e: - logger.error(f"Failed to load CSV: {e}") - return {'data': None, 'error': f"Failed to load CSV: {str(e)}"} - - def extract_pdf_links_and_presence(self, df: pd.DataFrame) -> Dict[str, Any]: - """Extract PDF links and presence from dataframe with validation - - Args: - df: Input dataframe - - Returns: - Dictionary with 'data' (pdf_links and final_presence) and 'error' keys - """ - errors = [] - - # Extract PDF URLs from specific rows - pdf_links = PDFLinks( - gemini_pdf=self._clean_url(df.iloc[3]['prompt'] if len(df) > 3 else None), - chatgpt_pdf=self._clean_url(df.iloc[6]['prompt'] if len(df) > 6 else None), - perplexity_pdf=self._clean_url(df.iloc[9]['prompt'] if len(df) > 9 else None) - ) - - # Extract final presence - final_presence = FinalPresence( - gemini_present=self._extract_presence(df, 'gemini_present'), - chatgpt_present=self._extract_presence(df, 'chatgpt_present'), - perplexity_present=self._extract_presence(df, 'perplexity_present') - ) - - # Validate presence data - final_presence_dict = asdict(final_presence) - for col_name, col_data in final_presence_dict.items(): - if not col_data: - errors.append(f"{col_name} column is missing") - elif col_data['null_count'] > 0: - errors.append(f"{col_name} column has {col_data['null_count']} empty values") - - data = { - 'pdf_links': asdict(pdf_links), - 'final_presence': final_presence_dict - } - - if errors: - return {'data': data, 'error': "; ".join(errors)} - - return {'data': data, 'error': None} - - def _clean_url(self, url: Any) -> Optional[str]: - """Clean and validate URL - - Args: - url: URL to clean - - Returns: - Cleaned URL or None - """ - if not url or pd.isna(url): - return None - - url = str(url).strip() - - if not url.startswith(('http://', 'https://')): - return None - - return url - - def _extract_score(self, df: pd.DataFrame, column: str) -> Optional[str]: - """Extract score containing % symbol - - Args: - df: Dataframe - column: Column name - - Returns: - Score string or None - """ - if column not in df.columns: - return None - - mask = df[column].astype(str).str.contains('%', na=False) - if mask.any(): - return df.loc[mask, column].iloc[0] - - return None - - def _extract_presence(self, df: pd.DataFrame, column: str) -> Optional[Dict[str, Any]]: - """Extract presence data from column - - Args: - df: Dataframe - column: Column name - - Returns: - Dict with values and null_count, or None if column missing - """ - if column not in df.columns: - return None - - # Get all values (including nulls for counting) - all_values = df[column] - non_null_values = all_values.dropna() - null_count = all_values.isnull().sum() - - return { - 'values': [str(val) for val in non_null_values.tolist()] if len(non_null_values) > 0 else [], - 'null_count': int(null_count), - 'total_count': len(all_values) - } - - def _truncate_to_last_valid_row(self, df: pd.DataFrame) -> Dict[str, Any]: - """Truncate dataframe to last row with valid rubric title - - Args: - df: Original dataframe - - Returns: - Dictionary with 'data' (truncated dataframe) and 'error' keys - """ - last_valid_row = -1 - - # Find the latest row with a valid title (rubric) - if 'title' in df.columns: - # Find last non-null, non-empty title - for idx in df.index[::-1]: # Iterate backwards - title = df.loc[idx, 'title'] - if pd.notna(title) and str(title).strip() != '': - last_valid_row = idx - break - - if last_valid_row >= 0: - # Truncate to include rows 0 through last_valid_row - truncated_df = df.iloc[:last_valid_row + 1].copy() - logger.info(f"Truncated dataframe from 
{len(df)} to {len(truncated_df)} rows (last valid rubric title at row {last_valid_row})") - return {'data': truncated_df, 'error': None} - else: - logger.warning("No valid rubric titles found") - return {'data': None, 'error': "No valid rubric titles found"} - - def download_pdf(self, url: str, save_path: Path, convert_to_markdown: bool = True, task_name: Optional[str] = None) -> Dict[str, Any]: - """Download PDF and optionally convert to markdown - - Args: - url: PDF URL - save_path: Where to save PDF - convert_to_markdown: Whether to convert to markdown - task_name: Task name for fallback to predownloaded PDFs - - Returns: - Dictionary with download results - """ - result = { - 'url': url, - 'pdf_path': str(save_path), - 'markdown_path': None, - 'success': False, - 'error': None - } - - if not url: - # Try to copy from predownloaded PDFs - if task_name: - predownloaded_dir = self.data_dir / 'predownloaded_pdfs' / task_name - predownloaded_pdf = predownloaded_dir / save_path.name - - if not predownloaded_pdf.exists(): - result['error'] = f'Empty URL and predownloaded PDF not found: {predownloaded_pdf}' - return result - - try: - # Copy the predownloaded PDF - save_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(predownloaded_pdf, save_path) - logger.info(f"Copied predownloaded PDF: {predownloaded_pdf} -> {save_path}") - result['success'] = True - except Exception as e: - result['error'] = f'Failed to copy predownloaded PDF: {str(e)}' - raise Exception(f'Failed to copy predownloaded PDF: {str(e)}') - return result - else: - result['error'] = 'Empty URL' - raise Exception('Empty URL') - return result - else: - try: - # Download PDF - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' - } - - response = requests.get(url, headers=headers, timeout=30, allow_redirects=True) - response.raise_for_status() - - if len(response.content) == 0: - result['error'] = 'Empty response' - return result - - # Save PDF - save_path.parent.mkdir(parents=True, exist_ok=True) - save_path.write_bytes(response.content) - - logger.info(f"Downloaded PDF: {save_path} ({len(response.content)} bytes)") - result['success'] = True - - except Exception as e: - result['error'] = str(e) - logger.error(f"Failed to download {url}: {e}") - raise Exception(f'Failed to download {url}: {str(e)}') - return result - - # Convert to Markdown if requested (common for both download and copy) - if convert_to_markdown and self.md_converter: - markdown_path = save_path.with_suffix('.md') - try: - markdown_content = self._convert_pdf_to_markdown(save_path) - markdown_path.write_text(markdown_content, encoding='utf-8') - result['markdown_path'] = str(markdown_path) - logger.info(f"Converted to Markdown: {markdown_path}") - except Exception as e: - logger.warning(f"Failed to convert PDF to Markdown: {e}") - - return result - - def _convert_pdf_to_markdown(self, pdf_path: Path) -> str: - """Convert PDF to Markdown using markitdown - - Args: - pdf_path: Path to PDF file - - Returns: - Markdown content - """ - if not self.md_converter: - raise RuntimeError("MarkItDown not available") - - # Check cache first - cache_key = self._get_file_hash(pdf_path) - cache_file = self.cache_dir / f"{cache_key}.md" - - if cache_file.exists(): - logger.info(f"Using cached markdown: {cache_file}") - return cache_file.read_text(encoding='utf-8') - - # Convert PDF - result = self.md_converter.convert(str(pdf_path)) - markdown_content = result.text_content - - # Cache the result - 
cache_file.write_text(markdown_content, encoding='utf-8') - - return markdown_content - - def _get_file_hash(self, file_path: Path) -> str: - """Get SHA256 hash of file - - Args: - file_path: Path to file - - Returns: - Hash string - """ - sha256_hash = hashlib.sha256() - with open(file_path, "rb") as f: - for byte_block in iter(lambda: f.read(4096), b""): - sha256_hash.update(byte_block) - return sha256_hash.hexdigest() - - def get_all_rubrics(self, df: pd.DataFrame) -> Dict[str, Any]: - """Extract all rubrics from dataframe with validation - - Args: - df: Cleaned dataframe - - Returns: - Dictionary with 'data' (list of RubricItem objects) and 'error' keys - """ - rubrics = [] - errors = [] - - for index, row in df.iterrows(): - rubric = RubricItem( - title=row['title'], - weight=row['weight'], - category=row.get('category'), - row_index=index - ) - - # Validate weight - try: - float(rubric.weight) - except (ValueError, TypeError): - errors.append(f"Rubric {index+1} has invalid weight: {rubric.weight}") - - # Validate category - if rubric.category is None or str(rubric.category).strip() == '': - errors.append(f"Rubric {index+1} has missing category") - - rubrics.append(rubric) - - if not rubrics: - return {'data': None, 'error': "No rubrics found"} - - if errors: - return {'data': rubrics, 'error': "; ".join(errors)} - - logger.info(f"Extracted {len(rubrics)} rubrics") - return {'data': rubrics, 'error': None} - - def process_task(self, csv_filename: str) -> Dict[str, Any]: - """Process entire task: load CSV, extract data, download PDFs - - Args: - csv_filename: CSV filename - - Returns: - Dictionary with all results or error - """ - errors = [] - - # Load dataframe - df_result = self.load_dataframe(csv_filename) - if df_result['error']: - errors.append(df_result['error']) - return {"error": "; ".join(errors)} # Fatal error, can't continue - df = df_result['data'] - - # Truncate to last valid rubric row - truncate_result = self._truncate_to_last_valid_row(df) - if truncate_result['error']: - errors.append(truncate_result['error']) - return {"error": "; ".join(errors)} # Fatal error, can't continue - df = truncate_result['data'] - - # Get task name (remove .csv extension first, then take first part before underscore) - filename_only = Path(csv_filename).stem # Gets filename without extension - task_name = filename_only.split('_')[0] - - # Extract prompt - prompt = df.iloc[0, 0] if len(df) > 0 and len(df.columns) > 0 else None - if not prompt or str(prompt).strip() == '': - errors.append("Prompt extraction failed") - - # Extract data (PDFs and presence) - extracted_result = self.extract_pdf_links_and_presence(df) - if extracted_result['error']: - errors.append(extracted_result['error']) - extracted_data = extracted_result['data'] - - # Get rubrics from truncated dataframe - rubrics_result = self.get_all_rubrics(df) - if rubrics_result['error']: - errors.append(rubrics_result['error']) - rubrics = rubrics_result['data'] - - # Download PDFs and structure results - task_data_dir = self.data_dir / 'PDFs' / task_name - pdf_names = { - 'gemini_pdf': 'gemini.pdf', - 'chatgpt_pdf': 'chatgpt.pdf', - 'perplexity_pdf': 'perplexity.pdf' - } - - pdf_path = {} - for pdf_key, pdf_filename in pdf_names.items(): - url = extracted_data['pdf_links'].get(pdf_key) - save_path = task_data_dir / pdf_filename - - download_result = self.download_pdf(url, save_path, task_name=task_name) - pdf_path[pdf_key] = { - 'path': download_result['pdf_path'], - 'error': download_result.get('error') - } - - # Check for PDF 
download errors - for pdf_name, pdf_info in pdf_path.items(): - if pdf_info['error'] is not None: - errors.append(f"{pdf_name}: {pdf_info['error']}") - - # Return error if any issues found - if errors: - return {"error": "; ".join(errors)} - - return { - 'task_name': task_name, - 'dataframe': df, - 'rubrics': rubrics, - 'prompt': prompt, - 'pdf_links': extracted_data['pdf_links'], - 'final_presence': extracted_data['final_presence'], - 'pdf_path': pdf_path - } - -def main(): - """Main function""" - csv_filename = "data/raw_csvs/Fixed_Extracts_2025-10-30T01-29-46-357Z/683a58c9a7e7fe4e7695846f_683a58c9a7e7fe4e7695846f_fixed_A-G_row7.csv" - - extractor = RubricExtractor() - - try: - results = extractor.process_task(csv_filename) - - # Check if there was an error - if 'error' in results: - return results - - # Add debugging breakpoint - pdb.set_trace() - - return { - 'prompt': results['prompt'], - 'rubrics': results['rubrics'], - 'pdf_paths': results['pdf_path'], - 'final_presence': results['final_presence'] - } - - except Exception as e: - logger.error(f"Processing failed: {e}") - raise - -if __name__ == "__main__": - main()