VectorInstitute · afkanpour · Nov 18, 2025 · Aug 26, 2025 · Sep 5, 2025 · Sep 5, 2025
diff --git a/README.md b/README.md
@@ -73,18 +73,77 @@ Utilize the capability and the corresponding subject LLM score to select or gene
 ```bash
 python -m src.run_lbo
 ```
-
 ### Agentic Generation Scripts
 
-Generate areas, capabilities, and tasks using multi-agent debate systems. Configure parameters in `src/cfg/agentic_config.yaml`.
+These scripts implement the multi-agent debate workflow for automated generation of areas, capabilities, tasks, and solutions.
+All configurable parameters are defined in `src/cfg/agentic_config.yaml`.
+
+---
 
+#### 1. Generate Areas
+Generate domain areas using the scientist–moderator debate system:
 ```bash
-# Generate capability areas
 python -m src.agentic_area_generator
+```
+
+**Output location:**
+```
+~/<output_dir>/<domain>/<exp_id>/areas/<areas_tag>/areas.json
+```
+Where:
+- `<output_dir>` comes from `global_cfg.output_dir`
+- `<domain>` comes from `global_cfg.domain` (spaces replaced with underscores)
+- `<exp_id>` comes from `exp_cfg.exp_id`
+- `<areas_tag>` is the tag used for the generated areas
+
+#### 2. Generate Capabilities
+Generate capabilities for each area:
+```bash
+python -m src.agentic_capability_generator pipeline_tags.areas_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_capabilities_tag=_YYYYMMDD_HHMMSS
+```
+
+**Options:**
+- `pipeline_tags.areas_tag` specifies which set of areas to use when generating capabilities.
+- `pipeline_tags.resume_capabilities_tag` (optional) resumes a previous capability generation run.
+
+**Output location:**
+```
+~/<output_dir>/<domain>/<exp_id>/capabilities/<capabilities_tag>/<area>/capabilities.json
+```
+Where:
+- `<capabilities_tag>` is the tag used for the generated capabilities (either resumed or auto-generated)
 
-# Generate capabilities for each area
-python -m src.agentic_capability_generator
 
-# Generate tasks for each capability
-python -m src.agentic_task_generator
+#### 3. Generate Tasks
+Generate evaluation tasks for a specific capabilities tag:
+```bash
+python -m src.agentic_task_generator pipeline_tags.capabilities_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_tasks_tag=_YYYYMMDD_HHMMSS
+```
+
+**Options:**
+- `pipeline_tags.capabilities_tag` specifies which set of capabilities to use when generating tasks.
+- `pipeline_tags.resume_tasks_tag` (optional) resumes a previous task generation run.
+
+**Output location:**
+```
+~/<output_dir>/<domain>/<exp_id>/tasks/<tasks_tag>/[<area>]-[<capability>]/tasks.json
+```
+Where:
+- `<tasks_tag>` is the tag used for the generated tasks (either resumed or auto-generated)
+
+#### 4. Generate Solutions
+Solve generated tasks using the multi-agent debate system:
+```bash
+python -m src.agentic_task_solver pipeline_tags.tasks_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_solutions_tag=_YYYYMMDD_HHMMSS
+```
+
+**Options:**
+- `pipeline_tags.tasks_tag` specifies which set of tasks to solve.
+- `pipeline_tags.resume_solutions_tag` (optional) resumes a previous solution generation run.
+
+**Output location:**
+```
+~/<output_dir>/<domain>/<exp_id>/task_solutions/<solutions_tag>/[<area>]-[<capability>]/<task_id>_solution.json
 ```
+Where:
+- `<solutions_tag>` is the tag used for the generated solutions (either resumed or auto-generated)
diff --git a/experimental/diverse_task_config.yaml b/experimental/diverse_task_config.yaml
@@ -0,0 +1,37 @@
+# Configuration for Diverse Task Generator
+
+# Model settings
+model:
+  name: gpt-4o  # OpenAI model to use
+  temperature: 1.0  # Temperature for all steps
+  max_tokens: 8192  # Max tokens for all steps
+  max_retries: 3  # Number of retry attempts for API calls
+  retry_delay: 2.0  # Initial delay between retries in seconds (exponential backoff)
+
+# Task generation settings
+generation:
+  tasks_per_blueprint: 3  # Number of tasks to generate per blueprint
+  min_subtopics: 3  # Suggested minimum number of sub-topics
+  max_subtopics: 8  # Suggested maximum number of sub-topics
+
+# Output settings
+output:
+  base_dir: diverse_task_outputs
+  save_intermediate_steps: true  # Save each step's output
+  pretty_print_json: true  # Indent JSON files
+
+# Input settings
+input:
+  capability_json_path: capability.json  # Default capability JSON file path
+
+# Verification criteria
+verification:
+  pass_threshold: 0.8  # Minimum pass rate to consider successful
+  strict_mode: false  # If true, all alignment criteria must pass
+
+# Example capability for quick testing
+example_capability:
+  name: "compound_interest_calculations"
+  description: "The ability to calculate compound interest for various scenarios, including different compounding frequencies (annually, semi-annually, quarterly, monthly), different time periods, and understanding how changes in principal, rate, or time affect the final amount."
+  domain: "personal_finance"
+  area: "investing_and_savings"
diff --git a/experimental/diverse_task_constants.py b/experimental/diverse_task_constants.py
@@ -0,0 +1,34 @@
+"""Constants for diverse task generation."""
+
+BLOOMS_TAXONOMY = {  # Maps each level name to a dict holding a "description" string; six levels, Remember through Create.
+    "Remember": {
+        "description": "Recall or recognize facts, terms, and basic concepts. Example verbs: define, list, identify."
+    },
+    "Understand": {
+        "description": "Explain ideas or concepts and interpret information in one's own words. Example verbs: summarize, describe, classify."
+    },
+    "Apply": {
+        "description": "Use knowledge or methods in new but familiar situations. Example verbs: calculate, demonstrate, use, implement."
+    },
+    "Analyze": {
+        "description": "Break information into parts and examine relationships or patterns. Example verbs: differentiate, compare, examine, infer."
+    },
+    "Evaluate": {
+        "description": "Make judgments based on criteria and standards. Example verbs: justify, critique, assess, argue."
+    },
+    "Create": {
+        "description": "Combine elements to form a new pattern, structure, or product. Example verbs: design, compose, formulate, generate."
+    },
+}
+
+DIFFICULTY_LEVELS = {  # Maps each difficulty label ("easy", "medium", "hard") to a dict holding a "description" string.
+    "easy": {
+        "description": "Involves direct recall, recognition, or simple application of knowledge and procedures."
+    },
+    "medium": {
+        "description": "Requires connecting multiple ideas, performing multi-step reasoning, or applying knowledge in new but familiar contexts."
+    },
+    "hard": {
+        "description": "Involves complex reasoning, integration of several sub-topics, or solving non-trivial problems that demand deeper conceptual understanding."
+    },
+}
diff --git a/experimental/diverse_task_dataclasses.py b/experimental/diverse_task_dataclasses.py
@@ -0,0 +1,77 @@
+"""Dataclasses for the diverse task generation pipeline."""
+
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+
+@dataclass
+class Capability:
+    """Represents a capability to be tested."""
+
+    name: str
+    description: str
+    domain: str
+    area: Optional[str] = None  # Optional; None when no area is supplied.
+    example_tasks: List[Dict] = field(default_factory=list)  # default_factory gives each instance its own empty list.
+
+
+@dataclass
+class SubTopic:
+    """Represents a sub-topic within a capability."""
+
+    name: str
+    description: Optional[str] = None  # Optional; defaults to None.
+
+
+@dataclass
+class Combination:
+    """Represents a valid (content, difficulty, reasoning) combination."""
+
+    content: str  # NOTE(review): presumably the sub-topic name (Blueprint names the matching field `subtopic`) - confirm.
+    difficulty: str  # NOTE(review): presumably a key of DIFFICULTY_LEVELS in diverse_task_constants.py - confirm.
+    reasoning: str  # NOTE(review): presumably a key of BLOOMS_TAXONOMY in diverse_task_constants.py - confirm.
+    rationale: Optional[str] = None  # Optional; defaults to None.
+
+
+@dataclass
+class Blueprint:
+    """Represents a task blueprint for a specific combination."""
+
+    combination_id: int  # NOTE(review): presumably identifies the Combination this blueprint was built for - confirm.
+    subtopic: str
+    difficulty: str
+    reasoning: str
+    blueprint: str
+    key_characteristics: List[str] = field(default_factory=list)  # default_factory gives each instance its own empty list.
+    example_question_outline: Optional[str] = None  # Optional; defaults to None.
+    rationale: Optional[str] = None  # Optional; defaults to None.
+
+
+@dataclass
+class Task:
+    """Represents a generated multiple-choice task."""
+
+    task_id: str
+    blueprint_id: int  # NOTE(review): presumably identifies the Blueprint this task was generated from - confirm.
+    subtopic: str
+    difficulty: str
+    reasoning: str
+    question: str
+    choices: Dict[str, str]  # NOTE(review): presumably option label -> option text - confirm.
+    correct_answer: str  # NOTE(review): presumably one of the keys of `choices` - confirm.
+    explanation: Optional[str] = None  # Optional; defaults to None.
+    alignment_notes: Optional[str] = None  # Optional; defaults to None.
+
+
+@dataclass
+class VerificationResult:
+    """Represents the verification result for a task."""
+
+    task_id: str  # NOTE(review): presumably matches Task.task_id of the verified task - confirm.
+    subtopic_aligned: bool
+    difficulty_aligned: bool
+    reasoning_aligned: bool
+    choices_appropriate: bool
+    overall_aligned: bool  # NOTE(review): how this is derived from the four flags above is not defined here - confirm.
+    feedback: str
+    suggested_improvements: Optional[str] = None  # Optional; defaults to None.