From f0cf76002f08e11e277fa9a1ca93e1657b8f63f2 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Tue, 26 Aug 2025 13:47:06 -0400 Subject: [PATCH 01/19] adding refactored task generation. updated prompts to ask for json outputs, and updated corresponding output parser. --- src/agentic_task_generator.py | 110 ++++++-- src/task_generation/__init__.py | 12 + src/task_generation/generator.py | 341 +++++++++++++++++++++++ src/task_generation/messages.py | 74 +++++ src/task_generation/moderator.py | 462 +++++++++++++++++++++++++++++++ src/task_generation/scientist.py | 244 ++++++++++++++++ src/utils/agentic_prompts.py | 58 ++-- 7 files changed, 1263 insertions(+), 38 deletions(-) create mode 100644 src/task_generation/__init__.py create mode 100644 src/task_generation/generator.py create mode 100644 src/task_generation/messages.py create mode 100644 src/task_generation/moderator.py create mode 100644 src/task_generation/scientist.py diff --git a/src/agentic_task_generator.py b/src/agentic_task_generator.py index 62a6a10..439f7a1 100644 --- a/src/agentic_task_generator.py +++ b/src/agentic_task_generator.py @@ -2,39 +2,111 @@ import asyncio import logging +import os import traceback import hydra +import openlit +from langfuse import Langfuse from omegaconf import DictConfig, OmegaConf -from .task_generation import generate_tasks +from src.task_generation import generate_tasks +# Suppress OpenTelemetry console output +os.environ["OTEL_LOG_LEVEL"] = "ERROR" +os.environ["OTEL_METRICS_EXPORTER"] = "none" +os.environ["OTEL_PYTHON_LOG_CORRELATION"] = "false" +os.environ["OTEL_PYTHON_LOG_LEVEL"] = "ERROR" + log = logging.getLogger("agentic_task_gen") @hydra.main(version_base=None, config_path="cfg", config_name="agentic_config") def main(cfg: DictConfig) -> None: """Run the multi-agent task generation system.""" - log.info("Starting multi-agent task generation") - log.info("Configuration:\n%s", OmegaConf.to_yaml(cfg, resolve=True)) - - # Check for capabilities_tag parameter 
capabilities_tag = cfg.pipeline_tags.capabilities_tag - if capabilities_tag: - log.info(f"Using capabilities from tag: {capabilities_tag}") - else: - log.warning( - "No capabilities_tag provided. Please provide --pipeline_tags.capabilities_tag= to specify which capabilities to use." - ) - return - - try: - asyncio.run(generate_tasks(cfg, capabilities_tag)) - except Exception as e: - log.error(f"Task generation failed: {e}") - log.error(f"Full traceback: {traceback.format_exc()}") - raise + domain_name = cfg.global_cfg.domain + exp_id = cfg.exp_cfg.exp_id + + langfuse_client = Langfuse() + openlit.init( + tracer=langfuse_client._otel_tracer, disable_batch=True, disable_metrics=True + ) + + with langfuse_client.start_as_current_span( + name=f"ace_agentic_task_generation:{domain_name}:{exp_id}" + ) as span: + try: + msg = "Starting multi-agent task generation" + log.info(msg) + span.update(metadata={"system_started": msg}) + + config_yaml = OmegaConf.to_yaml(cfg, resolve=True) + msg = "Configuration loaded" + log.info("Configuration:\n%s", config_yaml) + span.update( + metadata={ + "configuration_loaded": msg, + "config": config_yaml, + "domain": domain_name, + "exp_id": exp_id, + } + ) + + if capabilities_tag: + msg = f"Using capabilities from tag: {capabilities_tag}" + log.info(msg) + span.update( + metadata={ + "capabilities_tag_found": msg, + "capabilities_tag": capabilities_tag, + } + ) + else: + error_msg = "No capabilities_tag provided. Please provide pipeline_tags.capabilities_tag= to specify which capabilities to use." 
+ log.warning(error_msg) + span.update( + level="WARNING", + status_message="Missing capabilities_tag", + metadata={"capabilities_tag_missing": error_msg}, + ) + return + + span.update_trace( + metadata={ + "domain": domain_name, + "exp_id": exp_id, + "capabilities_tag": capabilities_tag, + "config": config_yaml, + }, + tags=["agentic_task_generation", exp_id], + ) + + asyncio.run(generate_tasks(cfg, capabilities_tag, langfuse_client)) + + msg = "Multi-agent task generation completed successfully" + log.info(msg) + span.update(metadata={"system_completed": msg}) + + except Exception as e: + error_msg = f"Task generation failed: {e}" + traceback_msg = f"Full traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "system_error": error_msg, + "error": str(e), + "traceback": traceback_msg, + }, + ) + + raise if __name__ == "__main__": diff --git a/src/task_generation/__init__.py b/src/task_generation/__init__.py new file mode 100644 index 0000000..2598dec --- /dev/null +++ b/src/task_generation/__init__.py @@ -0,0 +1,12 @@ +"""Task generation package for multi-agent debate-based task generation.""" + +from .generator import generate_tasks +from .moderator import TaskModerator +from .scientist import TaskScientist + + +__all__ = [ + "generate_tasks", + "TaskModerator", + "TaskScientist", +] diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py new file mode 100644 index 0000000..5be2742 --- /dev/null +++ b/src/task_generation/generator.py @@ -0,0 +1,341 @@ +"""Main task generation orchestration functions.""" + +import asyncio +import json +import logging +import traceback +from datetime import datetime +from pathlib import Path + +from autogen_core import ( + EVENT_LOGGER_NAME, + ROOT_LOGGER_NAME, + TRACE_LOGGER_NAME, + DefaultTopicId, + SingleThreadedAgentRuntime, +) +from autogen_ext.models.openai import OpenAIChatCompletionClient 
+from langfuse import Langfuse +from omegaconf import DictConfig + +from src.task_generation.messages import Capability +from src.task_generation.moderator import TaskModerator +from src.task_generation.scientist import TaskScientist + + +log = logging.getLogger("agentic_task_gen.generator") +logging.getLogger(ROOT_LOGGER_NAME).setLevel(logging.WARNING) +logging.getLogger(TRACE_LOGGER_NAME).setLevel(logging.WARNING) +logging.getLogger(EVENT_LOGGER_NAME).setLevel(logging.WARNING) + + +async def generate_tasks_for_capability( + cfg: DictConfig, capability: Capability, output_dir: Path, langfuse_client: Langfuse +) -> None: + """Generate tasks for a single capability.""" + with langfuse_client.start_as_current_span( + name=f"task_generation_for_capability:{capability.name}" + ) as span: + try: + msg = f"Generating tasks for capability: {capability.name}" + log.info(msg) + span.update( + metadata={ + "capability_generation_started": msg, + "capability_name": capability.name, + "capability_description": capability.description, + } + ) + + domain_name = cfg.global_cfg.domain + + runtime = SingleThreadedAgentRuntime() + + # Register scientists + await TaskScientist.register( + runtime, + "TaskScientistA", + lambda: TaskScientist( + model_client=OpenAIChatCompletionClient( + model=cfg.agents.scientist_a.model_name, + seed=cfg.agents.scientist_a.seed, + ), + scientist_id="A", + domain=domain_name, + langfuse_client=langfuse_client, + ), + ) + + await TaskScientist.register( + runtime, + "TaskScientistB", + lambda: TaskScientist( + model_client=OpenAIChatCompletionClient( + model=cfg.agents.scientist_b.model_name, + seed=cfg.agents.scientist_b.seed, + ), + scientist_id="B", + domain=domain_name, + langfuse_client=langfuse_client, + ), + ) + + # Register moderator + await TaskModerator.register( + runtime, + "TaskModerator", + lambda: TaskModerator( + model_client=OpenAIChatCompletionClient( + model=cfg.agents.moderator.model_name, + seed=cfg.agents.moderator.seed, + ), + 
num_scientists=2, + num_final_problems=cfg.task_generation.num_final_problems_per_capability, + buffer_param=cfg.task_generation.buffer_param, + agreement_threshold=cfg.task_generation.agreement_threshold, + output_dir=output_dir, + domain=domain_name, + langfuse_client=langfuse_client, + ), + ) + + span.update( + metadata={ + "agents_registered": "All task agents registered successfully", + "scientists": ["A", "B"], + "moderator": True, + } + ) + + # Start runtime and process the capability + runtime.start() + await runtime.publish_message(capability, DefaultTopicId()) + + msg = f"Capability message published: {capability.name}" + log.info(msg) + span.update( + metadata={ + "capability_published": msg, + "capability_name": capability.name, + } + ) + + # Wait for the runtime to stop when idle + try: + await runtime.stop_when_idle() + + msg = f"Completed generating tasks for capability: {capability.name}" + log.info(msg) + span.update(metadata={"runtime_completed": msg}) + except Exception as e: + msg = f"Error while generating tasks for capability {capability.name}: {e}" + log.error(msg) + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "runtime_error": msg, + "error": str(e), + "capability_name": capability.name, + }, + ) + raise + + except Exception as e: + error_msg = f"Error in generating tasks for {capability.name}: {e}" + traceback_msg = f"Traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "capability_generation_error": error_msg, + "error": str(e), + "traceback": traceback_msg, + }, + ) + raise + + +async def generate_tasks( + cfg: DictConfig, capabilities_tag: str, langfuse_client: Langfuse +) -> None: + """Generate tasks for all capabilities.""" + domain_name = cfg.global_cfg.domain + exp_id = cfg.exp_cfg.exp_id + tasks_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + with langfuse_client.start_as_current_span( + 
name=f"ace_task_generation:{domain_name}:{exp_id}:{tasks_tag}" + ) as span: + try: + msg = f"Tasks will be saved with tag: {tasks_tag}" + log.info(msg) + span.update( + metadata={ + "generation_started": msg, + "tasks_tag": tasks_tag, + "domain": domain_name, + "exp_id": exp_id, + } + ) + + msg = "Starting task generation process" + log.info(msg) + span.update(metadata={"process_started": msg}) + + span.update_trace( + metadata={ + "domain": domain_name, + "exp_id": exp_id, + "tasks_tag": tasks_tag, + "capabilities_tag": capabilities_tag, + "num_problems_per_capability": cfg.task_generation.num_final_problems_per_capability, + }, + tags=["task_generation_process", exp_id], + ) + + # Read capabilities from the timestamped capabilities directory + capabilities_dir = ( + Path.home() + / cfg.global_cfg.output_dir + / domain_name.replace(" ", "_") + / exp_id + / "capabilities" + / capabilities_tag + ) + + if not capabilities_dir.exists(): + error_msg = f"Capabilities directory not found: {capabilities_dir}" + log.error(error_msg) + span.update( + level="ERROR", + status_message="Capabilities directory not found", + metadata={ + "directory_not_found_error": error_msg, + "capabilities_dir": str(capabilities_dir), + }, + ) + raise FileNotFoundError(error_msg) + + capabilities = [] + + # Iterate through area directories + for area_dir in capabilities_dir.iterdir(): + if area_dir.is_dir(): + capabilities_file = area_dir / "capabilities.json" + if capabilities_file.exists(): + with open(capabilities_file, "r", encoding="utf-8") as f: + capabilities_data = json.load(f) + + if ( + isinstance(capabilities_data, dict) + and "capabilities" in capabilities_data + ): + for cap_dict in capabilities_data["capabilities"]: + if ( + isinstance(cap_dict, dict) + and "name" in cap_dict + and "description" in cap_dict + ): + capabilities.append( + Capability( + name=cap_dict["name"], + description=cap_dict["description"], + domain=cap_dict.get("domain", domain_name), + 
area=cap_dict.get("area", area_dir.name), + ) + ) + + if not capabilities: + error_msg = f"No valid capabilities found in {capabilities_dir}" + span.update( + level="ERROR", + status_message="No valid capabilities found", + metadata={ + "no_capabilities_error": error_msg, + "capabilities_dir": str(capabilities_dir), + }, + ) + raise ValueError(error_msg) + + msg = f"Found {len(capabilities)} capabilities to process" + log.info(msg) + span.update( + metadata={ + "capabilities_loaded": msg, + "num_capabilities": len(capabilities), + "capability_names": [cap.name for cap in capabilities], + } + ) + + # Create timestamped output directory for tasks + output_dir = ( + Path.home() + / cfg.global_cfg.output_dir + / domain_name.replace(" ", "_") + / exp_id + / "tasks" + / tasks_tag + ) + + msg = f"Output directory: {output_dir}" + log.info(msg) + span.update( + metadata={ + "output_directory_configured": msg, + "output_dir": str(output_dir), + } + ) + + # Print the timestamp for future reference + print(f"Tasks generated with tag: {tasks_tag}") + + # Process each capability individually + for i, capability in enumerate(capabilities): + msg = f"Processing capability {i + 1}/{len(capabilities)}: {capability.name}" + log.info(msg) + span.update( + metadata={ + f"capability_{i + 1}_started": msg, + "current_capability": capability.name, + "progress": f"{i + 1}/{len(capabilities)}", + } + ) + + await generate_tasks_for_capability( + cfg, capability, output_dir, langfuse_client + ) + + msg = f"Completed capability {i + 1}/{len(capabilities)}: {capability.name}" + log.info(msg) + span.update( + metadata={ + f"capability_{i + 1}_completed": msg, + "completed_capability": capability.name, + } + ) + + await asyncio.sleep(1) + + except Exception as e: + error_msg = f"Error in generate_tasks: {e}" + traceback_msg = f"Traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + 
"generation_error": error_msg, + "error": str(e), + "traceback": traceback_msg, + }, + ) + + raise diff --git a/src/task_generation/messages.py b/src/task_generation/messages.py new file mode 100644 index 0000000..09b5e9d --- /dev/null +++ b/src/task_generation/messages.py @@ -0,0 +1,74 @@ +"""Message types and data classes for task generation.""" + +from dataclasses import dataclass +from typing import Dict, List + + +@dataclass +class Capability: + """A capability with name, description, domain, and area.""" + + name: str + description: str + domain: str + area: str + + +@dataclass +class ProblemProposalRequest: + """Request for problem proposals from scientists.""" + + capability_name: str + capability_description: str + capability_domain: str + capability_area: str + num_problems: int + sample_tasks: List[str] + + +@dataclass +class ScientistProblemProposal: + """Problem proposal from a scientist.""" + + scientist_id: str + capability_name: str + problems: Dict[str, str] # task_id -> task_text + iteration: int + + +@dataclass +class ModeratorProblemReview: + """Moderator's review and filtering of problems.""" + + capability_name: str + final_problems: Dict[str, str] # task_id -> task_text + rejected_problems: Dict[str, str] # task_id -> rejection_reason + iteration: int + + +@dataclass +class SolutionRequest: + """Request for scientists to solve problems.""" + + capability_name: str + capability_description: str + capability_domain: str + capability_area: str + problems: Dict[str, str] # task_id -> task_text + + +@dataclass +class ScientistSolutionProposal: + """Solution proposal from a scientist.""" + + scientist_id: str + capability_name: str + solutions: Dict[str, str] # task_id -> solution + + +@dataclass +class FinalTaskSet: + """Final task set with problems and solutions.""" + + capability_name: str + tasks: Dict[str, Dict[str, str]] # task_id -> {problem, answer} diff --git a/src/task_generation/moderator.py b/src/task_generation/moderator.py new file 
mode 100644 index 0000000..9e5c96b --- /dev/null +++ b/src/task_generation/moderator.py @@ -0,0 +1,462 @@ +"""Task moderator agent for managing task generation workflow.""" + +import json +import logging +import math +import traceback +from pathlib import Path +from typing import Dict, List + +from autogen_core import ( + DefaultTopicId, + MessageContext, + RoutedAgent, + default_subscription, + message_handler, +) +from autogen_core.models import ( + ChatCompletionClient, + SystemMessage, + UserMessage, +) +from langfuse import Langfuse + +from src.task_generation.messages import ( + Capability, + ProblemProposalRequest, + ScientistProblemProposal, + ScientistSolutionProposal, + SolutionRequest, +) +from src.utils.agentic_prompts import ( + TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT, + TASK_MODERATOR_PROBLEM_USER_PROMPT, +) +from src.utils.json_utils import parse_llm_json_response + + +log = logging.getLogger("agentic_task_gen.moderator") + + +@default_subscription +class TaskModerator(RoutedAgent): + """Moderator that merges scientist task proposals and manages iteration.""" + + def __init__( + self, + model_client: ChatCompletionClient, + num_scientists: int, + num_final_problems: int, + buffer_param: int, + agreement_threshold: float, + output_dir: Path, + domain: str, + langfuse_client: Langfuse, + ) -> None: + super().__init__("Task Moderator") + self._model_client = model_client + self._num_scientists = num_scientists + self._num_final_problems = num_final_problems + self._buffer_param = buffer_param + self._agreement_threshold = agreement_threshold + self._output_dir = output_dir + self._domain = domain + self._langfuse_client = langfuse_client + + # Algorithm 1 state + self._num_remaining: Dict[str, int] = {} + self._final_problems: Dict[ + str, Dict[str, str] + ] = {} # capability -> {task_id: problem_text} + self._capabilities: Dict[str, Capability] = {} # Store original capability info + + # Problem design state + self._problem_proposals: Dict[ + str, 
List[ScientistProblemProposal] + ] = {} # capability -> proposals + + # Solution design state + self._solution_proposals: Dict[ + str, List[ScientistSolutionProposal] + ] = {} # capability -> solutions + + @message_handler + async def handle_capability(self, message: Capability, ctx: MessageContext) -> None: + """Handle capability and start Algorithm 1 for problem design.""" + with self._langfuse_client.start_as_current_span( + name="task_moderator_handle_capability" + ) as span: + try: + msg = f"Task Moderator starting problem design for capability: {message.name}" + log.info(msg) + span.update( + metadata={ + "capability_received": msg, + "capability_name": message.name, + "capability_description": message.description, + "capability_area": message.area, + } + ) + + # Initialize Algorithm 1 state + self._num_remaining[message.name] = self._num_final_problems + self._final_problems[message.name] = {} + self._capabilities[message.name] = ( + message # Store original capability info + ) + + await self._start_problem_iteration(message) + + except Exception as e: + error_msg = f"Error in Task Moderator handle_capability: {e}" + traceback_msg = f"Traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "handle_capability_error": error_msg, + "error": str(e), + "traceback": traceback_msg, + }, + ) + raise + + async def _start_problem_iteration(self, capability: Capability) -> None: + """Start a problem generation iteration (Algorithm 1).""" + try: + num_remaining = self._num_remaining[capability.name] + if num_remaining <= 0: + log.info( + f"Problem design completed for capability: {capability.name}, starting solution design" + ) + await self._start_solution_design(capability) + return + + # Calculate problems per scientist: ceil(num_remaining / M) + B + problems_per_scientist = ( + math.ceil(num_remaining / self._num_scientists) + self._buffer_param + ) + + 
log.info( + f"Task Moderator requesting {problems_per_scientist} problems per scientist for capability: {capability.name} (remaining: {num_remaining})" + ) + + # Get sample tasks from existing final problems + sample_tasks = list(self._final_problems[capability.name].values())[ + :3 + ] # Use up to 3 existing problems as samples + + # Send problem proposal requests to all scientists + await self.publish_message( + ProblemProposalRequest( + capability_name=capability.name, + capability_description=capability.description, + capability_domain=capability.domain, + capability_area=capability.area, + num_problems=problems_per_scientist, + sample_tasks=sample_tasks, + ), + topic_id=DefaultTopicId(), + ) + + except Exception as e: + log.error(f"Error in Task Moderator _start_problem_iteration: {e}") + log.error(f"Traceback: {traceback.format_exc()}") + raise + + @message_handler + async def handle_scientist_problem_proposal( + self, message: ScientistProblemProposal, ctx: MessageContext + ) -> None: + """Handle problem proposals from scientists.""" + try: + log.info( + f"Task Moderator received problem proposal from Scientist {message.scientist_id} for capability: {message.capability_name}" + ) + + capability_name = message.capability_name + if capability_name not in self._problem_proposals: + self._problem_proposals[capability_name] = [] + + self._problem_proposals[capability_name].append(message) + + # Check if we have all proposals for this iteration + current_proposals = [ + p + for p in self._problem_proposals[capability_name] + if p.iteration == message.iteration + ] + if len(current_proposals) == self._num_scientists: + log.info( + f"Task Moderator received all problem proposals for capability: {capability_name}, proceeding to filter" + ) + await self._filter_and_select_problems( + capability_name, message.iteration + ) + + except Exception as e: + log.error(f"Error in Task Moderator handle_scientist_problem_proposal: {e}") + log.error(f"Traceback: 
{traceback.format_exc()}") + raise + + async def _filter_and_select_problems( + self, capability_name: str, iteration: int + ) -> None: + """Filter and select problems using moderator LLM.""" + try: + log.info( + f"Task Moderator filtering problems for capability: {capability_name}" + ) + + # Collect all proposed problems + current_proposals = [ + p + for p in self._problem_proposals[capability_name] + if p.iteration == iteration + ] + all_problems = {} + scientist_attribution = {} + + for proposal in current_proposals: + for task_id, problem_text in proposal.problems.items(): + unique_id = f"{proposal.scientist_id}_{task_id}" + all_problems[unique_id] = problem_text + scientist_attribution[unique_id] = proposal.scientist_id + + if not all_problems: + log.warning(f"No problems received for capability: {capability_name}") + return + + # Format problems for moderator + problems_text = "" + for scientist_id in set(scientist_attribution.values()): + problems_text += f"Scientist {scientist_id}:\n" + for task_id, problem in all_problems.items(): + if scientist_attribution[task_id] == scientist_id: + task_name = task_id.split("_", 1)[1] # Remove scientist prefix + problems_text += f"- {task_name}: {problem}\n" + problems_text += "\n" + + system_prompt = TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT + + capability_info = self._capabilities[capability_name] + user_prompt = TASK_MODERATOR_PROBLEM_USER_PROMPT.format( + capability_name=capability_info.name, + capability_description=capability_info.description, + capability_domain=capability_info.domain, + problems_text=problems_text, + ) + + system_message = SystemMessage(content=system_prompt) + user_message = UserMessage(content=user_prompt, source="user") + + model_result = await self._model_client.create( + [system_message, user_message] + ) + + raw_content = model_result.content + if not isinstance(raw_content, str): + raw_content = str(raw_content) + + # Extract JSON from response using robust parser + try: + parsed = 
parse_llm_json_response(raw_content) + final_tasks = parsed.get("final_tasks", {}) + rejected_tasks = parsed.get("rejected_tasks", {}) + except Exception as e: + log.error( + f"Error parsing JSON from moderator: {e}\nOutput: {raw_content}" + ) + final_tasks = {} + rejected_tasks = {} + + # Update Algorithm 1 state + num_remaining = self._num_remaining[capability_name] + num_selected = min(len(final_tasks), num_remaining) + + # Add selected problems to final set + selected_count = 0 + for _, problem_text in final_tasks.items(): + if selected_count < num_selected: + final_task_id = ( + f"task_{len(self._final_problems[capability_name]) + 1}" + ) + self._final_problems[capability_name][final_task_id] = problem_text + selected_count += 1 + + # Update remaining count + self._num_remaining[capability_name] = num_remaining - selected_count + + log.info( + f"Task Moderator selected {selected_count} problems for {capability_name}, {self._num_remaining[capability_name]} remaining" + ) + log.info( + f"Rejected {len(rejected_tasks)} problems: {list(rejected_tasks.keys())}" + ) + + # Continue Algorithm 1 or move to solution design + if self._num_remaining[capability_name] > 0: + # Need more problems, start another iteration + capability = self._capabilities[capability_name] + await self._start_problem_iteration(capability) + else: + # Problem design complete, start solution design + capability = self._capabilities[capability_name] + await self._start_solution_design(capability) + + except Exception as e: + log.error(f"Error in Task Moderator _filter_and_select_problems: {e}") + log.error(f"Traceback: {traceback.format_exc()}") + raise + + async def _start_solution_design(self, capability: Capability) -> None: + """Start solution design phase.""" + try: + log.info( + f"Task Moderator starting solution design for capability: {capability.name}" + ) + + final_problems = self._final_problems[capability.name] + if not final_problems: + log.error( + f"No final problems available for 
capability: {capability.name}" + ) + return + + # Send solution requests to all scientists + await self.publish_message( + SolutionRequest( + capability_name=capability.name, + capability_description=capability.description, + capability_domain=capability.domain, + capability_area=capability.area, + problems=final_problems, + ), + topic_id=DefaultTopicId(), + ) + + except Exception as e: + log.error(f"Error in Task Moderator _start_solution_design: {e}") + log.error(f"Traceback: {traceback.format_exc()}") + raise + + @message_handler + async def handle_scientist_solution_proposal( + self, message: ScientistSolutionProposal, ctx: MessageContext + ) -> None: + """Handle solution proposals from scientists.""" + try: + log.info( + f"Task Moderator received solution proposal from Scientist {message.scientist_id} for capability: {message.capability_name}" + ) + + capability_name = message.capability_name + if capability_name not in self._solution_proposals: + self._solution_proposals[capability_name] = [] + + self._solution_proposals[capability_name].append(message) + + # Check if we have all solutions + if len(self._solution_proposals[capability_name]) == self._num_scientists: + log.info( + f"Task Moderator received all solutions for capability: {capability_name}, determining consensus" + ) + await self._determine_solution_consensus(capability_name) + + except Exception as e: + log.error( + f"Error in Task Moderator handle_scientist_solution_proposal: {e}" + ) + log.error(f"Traceback: {traceback.format_exc()}") + raise + + async def _determine_solution_consensus(self, capability_name: str) -> None: + """Determine solution consensus and finalize tasks.""" + try: + log.info( + f"Task Moderator determining solution consensus for capability: {capability_name}" + ) + + solutions_by_task: Dict[ + str, Dict[str, str] + ] = {} # task_id -> [scientist_id -> solution] + + for proposal in self._solution_proposals[capability_name]: + for task_id, solution in 
proposal.solutions.items(): + if task_id not in solutions_by_task: + solutions_by_task[task_id] = {} + solutions_by_task[task_id][proposal.scientist_id] = solution + + final_tasks = {} + + for task_id, problem_text in self._final_problems[capability_name].items(): + if task_id in solutions_by_task: + scientist_solutions = solutions_by_task[task_id] + + # Simple consensus: find most common solution + solution_counts: Dict[str, int] = {} + for solution in scientist_solutions.values(): + solution_counts[solution] = solution_counts.get(solution, 0) + 1 + + if solution_counts: + most_common_solution = max( + solution_counts.keys(), key=lambda x: solution_counts[x] + ) + agreement_rate = solution_counts[most_common_solution] / len( + scientist_solutions + ) + + if agreement_rate >= self._agreement_threshold: + final_tasks[task_id] = { + "problem": problem_text, + "answer": most_common_solution, + } + log.info( + f"Task {task_id}: consensus achieved ({agreement_rate:.2f} agreement)" + ) + else: + log.warning( + f"Task {task_id}: low agreement ({agreement_rate:.2f}), requires human review" + ) + # For now, use most common solution but mark it + final_tasks[task_id] = { + "problem": problem_text, + "answer": most_common_solution, + "requires_human_review": "true", + "agreement_rate": str(agreement_rate), + } + + # Save final tasks + await self._save_tasks_to_file(capability_name, final_tasks) + log.info(f"Task generation completed for capability: {capability_name}") + + except Exception as e: + log.error(f"Error in Task Moderator _determine_solution_consensus: {e}") + log.error(f"Traceback: {traceback.format_exc()}") + raise + + async def _save_tasks_to_file( + self, capability_name: str, tasks: Dict[str, Dict[str, str]] + ) -> None: + """Save final tasks to file.""" + try: + # Create capability directory + capability_dir = self._output_dir / capability_name + capability_dir.mkdir(parents=True, exist_ok=True) + + # Save tasks + tasks_file = capability_dir / "tasks.json" + 
with open(tasks_file, "w", encoding="utf-8") as f: + json.dump({"tasks": tasks}, f, indent=2, ensure_ascii=False) + + log.info( + f"Saved {len(tasks)} tasks for capability '{capability_name}' to {tasks_file}" + ) + except Exception as e: + log.error(f"Error saving tasks for capability {capability_name}: {e}") + log.error(f"Traceback: {traceback.format_exc()}") + raise diff --git a/src/task_generation/scientist.py b/src/task_generation/scientist.py new file mode 100644 index 0000000..2daa571 --- /dev/null +++ b/src/task_generation/scientist.py @@ -0,0 +1,244 @@ +"""Task scientist agent for generating problems and solutions.""" + +import json +import logging +import traceback + +from autogen_core import ( + DefaultTopicId, + MessageContext, + RoutedAgent, + default_subscription, + message_handler, +) +from autogen_core.models import ( + ChatCompletionClient, + SystemMessage, + UserMessage, +) +from langfuse import Langfuse + +from src.task_generation.messages import ( + ProblemProposalRequest, + ScientistProblemProposal, + ScientistSolutionProposal, + SolutionRequest, +) +from src.utils.agentic_prompts import ( + TASK_SCIENTIST_PROBLEM_SYSTEM_PROMPT, + TASK_SCIENTIST_PROBLEM_USER_PROMPT, + TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT, + TASK_SCIENTIST_SOLUTION_USER_PROMPT, +) +from src.utils.json_utils import parse_llm_json_response + + +log = logging.getLogger("agentic_task_gen.scientist") + + +@default_subscription +class TaskScientist(RoutedAgent): + """Scientist that generates problems and solutions.""" + + def __init__( + self, + model_client: ChatCompletionClient, + scientist_id: str, + langfuse_client: Langfuse, + domain: str = "", + ) -> None: + super().__init__(f"Task Scientist {scientist_id}") + self._scientist_id = scientist_id + self._model_client = model_client + self._domain = domain + self._langfuse_client = langfuse_client + + @message_handler + async def handle_problem_proposal_request( + self, message: ProblemProposalRequest, ctx: MessageContext + ) -> 
None: + """Handle problem proposal request.""" + with self._langfuse_client.start_as_current_span( + name=f"task_scientist_{self._scientist_id}_problem_proposal" + ) as span: + try: + msg = f"Task Scientist {self._scientist_id} generating {message.num_problems} problems for capability: {message.capability_name}" + log.info(msg) + span.update( + metadata={ + "problem_request_received": msg, + "scientist_id": self._scientist_id, + "capability_name": message.capability_name, + "capability_description": message.capability_description, + "num_problems": message.num_problems, + } + ) + + sample_tasks_text = "" + if message.sample_tasks: + sample_tasks_text = "\n".join( + [f"- {task}" for task in message.sample_tasks] + ) + else: + sample_tasks_text = "(No sample tasks provided)" + + system_prompt = TASK_SCIENTIST_PROBLEM_SYSTEM_PROMPT.format( + scientist_id=self._scientist_id, + ) + + user_prompt = TASK_SCIENTIST_PROBLEM_USER_PROMPT.format( + num_problems=message.num_problems, + capability_name=message.capability_name, + capability_description=message.capability_description, + capability_domain=message.capability_domain, + sample_tasks_text=sample_tasks_text, + ) + + system_message = SystemMessage(content=system_prompt) + user_message = UserMessage(content=user_prompt, source="user") + + model_result = await self._model_client.create( + [system_message, user_message] + ) + + msg = f"Task Scientist {self._scientist_id} is parsing LLM response" + log.info(msg) + span.update( + metadata={ + "llm_response_received": msg, + "scientist_id": self._scientist_id, + } + ) + + parsed = parse_llm_json_response(model_result.content) + problems = parsed.get("problems", {}) + + msg = f"Task Scientist {self._scientist_id} proposing {len(problems)} problems for capability: {message.capability_name}" + log.info(msg) + span.update( + metadata={ + "problem_proposal_published": msg, + "scientist_id": self._scientist_id, + "capability_name": message.capability_name, + 
"num_problems_generated": len(problems), + } + ) + + await self.publish_message( + ScientistProblemProposal( + scientist_id=self._scientist_id, + capability_name=message.capability_name, + problems=problems, + iteration=0, + ), + topic_id=DefaultTopicId(), + ) + + except Exception as e: + error_msg = f"Error in Task Scientist {self._scientist_id} handle_problem_proposal_request: {e}" + traceback_msg = f"Traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "problem_request_error": error_msg, + "scientist_id": self._scientist_id, + "error": str(e), + "traceback": traceback_msg, + }, + ) + raise + + @message_handler + async def handle_solution_request( + self, message: SolutionRequest, ctx: MessageContext + ) -> None: + """Handle solution request for problems.""" + with self._langfuse_client.start_as_current_span( + name=f"task_scientist_{self._scientist_id}_solution_proposal" + ) as span: + try: + msg = f"Task Scientist {self._scientist_id} solving {len(message.problems)} problems for capability: {message.capability_name}" + log.info(msg) + span.update( + metadata={ + "solution_request_received": msg, + "scientist_id": self._scientist_id, + "capability_name": message.capability_name, + "num_problems": len(message.problems), + } + ) + + problems_json = json.dumps(message.problems, indent=2) + + system_prompt = TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT.format( + scientist_id=self._scientist_id, + capability_domain=message.capability_domain, + capability_name=message.capability_name, + ) + + user_prompt = TASK_SCIENTIST_SOLUTION_USER_PROMPT.format( + problems=problems_json, + ) + + system_message = SystemMessage(content=system_prompt) + user_message = UserMessage(content=user_prompt, source="user") + + model_result = await self._model_client.create( + [system_message, user_message] + ) + + msg = f"Task Scientist {self._scientist_id} is parsing LLM response" + 
log.info(msg) + span.update( + metadata={ + "llm_response_received": msg, + "scientist_id": self._scientist_id, + } + ) + + parsed = parse_llm_json_response(model_result.content) + solutions = parsed.get("solutions", {}) + + msg = f"Task Scientist {self._scientist_id} publishing solutions for capability: {message.capability_name}" + log.info(msg) + span.update( + metadata={ + "solution_proposal_published": msg, + "scientist_id": self._scientist_id, + "capability_name": message.capability_name, + "num_solutions_generated": len(solutions), + } + ) + + await self.publish_message( + ScientistSolutionProposal( + scientist_id=self._scientist_id, + capability_name=message.capability_name, + solutions=solutions, + ), + topic_id=DefaultTopicId(), + ) + + except Exception as e: + error_msg = f"Error in Task Scientist {self._scientist_id} handle_solution_request: {e}" + traceback_msg = f"Traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "solution_request_error": error_msg, + "scientist_id": self._scientist_id, + "error": str(e), + "traceback": traceback_msg, + }, + ) + raise diff --git a/src/utils/agentic_prompts.py b/src/utils/agentic_prompts.py index 00d1f86..b4a0d26 100644 --- a/src/utils/agentic_prompts.py +++ b/src/utils/agentic_prompts.py @@ -202,13 +202,16 @@ - Avoiding overlap or redundancy, - Proposing tasks that vary in difficulty and structure. -Your response must follow this format exactly: -THOUGHT: -RESPONSE JSON: +IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. The JSON should be directly parseable. + +Please return your proposal and your thoughts and reasoning in the following format: {{ - "task_1": "", - "task_2": "", - ... 
+ "thought": "Your reasoning and thought process about the kind of tasks you're proposing", + "problems": {{ + "problem_0": "TASK_TEXT_1", + "problem_1": "TASK_TEXT_2", + ... + }} }} Make sure: @@ -227,13 +230,25 @@ TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT = """You are Scientist {scientist_id}, an expert in {capability_domain}. You are solving a task related to the capability: {capability_name}. -Provide a clear, accurate, and complete solution to the given problem. Your solution should be correct and well-reasoned.""" +IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. The JSON should be directly parseable. -TASK_SCIENTIST_SOLUTION_USER_PROMPT = """Solve the following problem: +Please return your solution and your thoughts and reasoning in the following format: +{{ + "thought": "Your reasoning and thought process about solving this problem", + "solutions": {{ + "solution_0": "SOLUTION_TEXT_1", + "solution_1": "SOLUTION_TEXT_2", + ... + }} +}} + +Provide clear, accurate, and complete solutions. Your solutions should be correct and well-reasoned.""" -{problem_text} +TASK_SCIENTIST_SOLUTION_USER_PROMPT = """Solve the following problems: -Provide your solution clearly and concisely.""" +{problems} + +Provide your solutions clearly and concisely.""" TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT = """You are the Moderator overseeing capability-based task design. Your task is to review proposed tasks from multiple scientist agents and synthesize a final, high-quality task set for the capability. @@ -243,22 +258,27 @@ - Ensure that the final set of tasks is diverse, non-trivial, and tests different facets of the capability. - Include a brief justification for each rejected or significantly modified task. -Your response should follow this format exactly: +IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. Do not include any prefixes or prose. 
The JSON should be directly parseable. -THOUGHT: -RESPONSE JSON: -{{ - "final_tasks": {{ +CRITICAL: When including LaTeX expressions or backslashes in your JSON strings, you must properly escape them by using double backslashes (\\\\). For example: +- Write \\\\(x^2\\\\) instead of \\(x^2\\) +- Write \\\\[equation\\\\] instead of \\[equation\\] +- Write \\\\times instead of \\times + +Please return your curation and your thoughts and reasoning in the following format: +{ + "thought": "Your reasoning and curation plan here", + "final_tasks": { "task_1": "", "task_2": "", ... - }}, - "rejected_tasks": {{ + }, + "rejected_tasks": { "task_from_scientist_A": "Reason for rejection or modification", "task_from_scientist_B": "Reason for rejection or modification", ... - }} -}}""" + } +}""" TASK_MODERATOR_PROBLEM_USER_PROMPT = """Below is a capability and task proposals from multiple scientist agents. Curate the final task set by filtering, editing, or merging as needed. From 06da9107960f19cc163f9b10a89022f787c3a15d Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 5 Sep 2025 01:03:28 -0400 Subject: [PATCH 02/19] fixed retry, json processing, and max token. 
--- src/utils/json_utils.py | 29 ++++++++++++++++++++++++++--- src/utils/model_client_utils.py | 31 ++++++++++++++++++------------- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/src/utils/json_utils.py b/src/utils/json_utils.py index 26c14ae..2a57c0a 100644 --- a/src/utils/json_utils.py +++ b/src/utils/json_utils.py @@ -13,7 +13,13 @@ def extract_json_from_markdown(content: str) -> str: """Extract JSON from markdown if present and clean control characters.""" content = content.strip() - if content.startswith("```json") and content.endswith("```"): + # Handle Gemini's format: "```json\n...\n```" + if content.startswith('"```json') and content.endswith('```"'): + content = content[8:-4].strip() + elif content.startswith('"```') and content.endswith('```"'): + content = content[4:-4].strip() + # Handle standard markdown format: ```json\n...\n``` + elif content.startswith("```json") and content.endswith("```"): content = content[7:-3].strip() elif content.startswith("```") and content.endswith("```"): content = content[3:-3].strip() @@ -21,6 +27,18 @@ def extract_json_from_markdown(content: str) -> str: return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", content) +def fix_common_json_errors(content: str) -> str: + """Fix common JSON syntax errors.""" + # Fix extra equals signs (e.g., "area":="value" -> "area":"value") + content = re.sub(r':\s*=\s*"', ':"', content) + + # Fix missing quotes around keys + content = re.sub(r'(\w+):\s*"', r'"\1":"', content) + + # Fix trailing commas + return re.sub(r",(\s*[}\]])", r"\1", content) + + def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: """Parse LLM JSON response.""" try: @@ -31,8 +49,12 @@ def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: # Clean the content first cleaned_content = extract_json_from_markdown(raw_content) + # Fix common JSON errors + cleaned_content = fix_common_json_errors(cleaned_content) + # Parse the JSON - return json.loads(cleaned_content) + 
result = json.loads(cleaned_content) + return result if isinstance(result, dict) else {} except json.JSONDecodeError as e: log.error(f"Failed to parse JSON response: {e}") @@ -50,7 +72,8 @@ def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: log.warning( "Attempting to fix unterminated JSON by truncating to last complete entry" ) - return json.loads(fixed_content) + result = json.loads(fixed_content) + return result if isinstance(result, dict) else {} except Exception as fix_error: log.error(f"Failed to fix JSON: {fix_error}") diff --git a/src/utils/model_client_utils.py b/src/utils/model_client_utils.py index a650ee6..c1fdea4 100644 --- a/src/utils/model_client_utils.py +++ b/src/utils/model_client_utils.py @@ -20,6 +20,8 @@ ) +MAX_TOKENS = 1024 * 10 + logger = logging.getLogger(__name__) GEMINI_STUDIO_BASE = "https://generativelanguage.googleapis.com/v1beta/openai/" @@ -48,30 +50,31 @@ def __init__(self, client: Any, max_retries: int = 3): before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True, ) - async def create(self, *args, **kwargs): + async def create(self, *args: Any, **kwargs: Any) -> Any: """Create with retry logic for transient errors.""" return await self.client.create(*args, **kwargs) - def __getattr__(self, name): + def __getattr__(self, name: str) -> Any: """Delegate all other attributes to the wrapped client.""" return getattr(self.client, name) -def get_model_client(model_name: str, seed: Optional[int] = None, **kwargs) -> Any: - """Return a model client for the given model name with retry logic.""" +def get_model_client(model_name: str, seed: Optional[int] = None, **kwargs: Any) -> Any: + """Get a model client for the given model name.""" n = model_name.lower() - if n.startswith(("gpt-", "o1-", "o3-")): - # Add max_tokens to prevent truncated responses - kwargs.setdefault("max_tokens", 4096) - client = OpenAIChatCompletionClient(model=model_name, seed=seed, **kwargs) - return RetryableModelClient(client) + 
if n.startswith(("gpt-", "o1-", "o3-", "gpt-5")): + kwargs.setdefault("max_completion_tokens", MAX_TOKENS) + openai_client = OpenAIChatCompletionClient( + model=model_name, seed=seed, **kwargs + ) + return RetryableModelClient(openai_client) if "claude" in n: - # Add max_tokens to prevent truncated responses - kwargs.setdefault("max_tokens", 4096) - client = AnthropicChatCompletionClient(model=model_name, **kwargs) - return RetryableModelClient(client) + kwargs.setdefault("max_tokens", MAX_TOKENS) + kwargs.setdefault("timeout", None) + anthropic_client = AnthropicChatCompletionClient(model=model_name, **kwargs) + return RetryableModelClient(anthropic_client) if "gemini" in n: api_key = kwargs.pop("api_key", os.getenv("GOOGLE_API_KEY")) @@ -89,6 +92,8 @@ def get_model_client(model_name: str, seed: Optional[int] = None, **kwargs) -> A ), ) + kwargs.setdefault("max_completion_tokens", MAX_TOKENS) + client = OpenAIChatCompletionClient( model=model_name, base_url=GEMINI_STUDIO_BASE, From 0ca1c2202234c099ffa09b378573bff1522a8ea5 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 5 Sep 2025 03:54:57 -0400 Subject: [PATCH 03/19] switching to two-phase task generation.
--- src/agentic_task_generator.py | 23 ++- src/agentic_task_solver.py | 125 +++++++++++ src/task_generation/generator.py | 77 +++++-- src/task_generation/messages.py | 32 --- src/task_generation/moderator.py | 140 ++----------- src/task_generation/scientist.py | 92 --------- src/task_solving/__init__.py | 17 ++ src/task_solving/generator.py | 225 ++++++++++++++++++++ src/task_solving/messages.py | 64 ++++++ src/task_solving/moderator.py | 342 +++++++++++++++++++++++++++++++ src/task_solving/scientist.py | 186 +++++++++++++++++ 11 files changed, 1053 insertions(+), 270 deletions(-) create mode 100644 src/agentic_task_solver.py create mode 100644 src/task_solving/__init__.py create mode 100644 src/task_solving/generator.py create mode 100644 src/task_solving/messages.py create mode 100644 src/task_solving/moderator.py create mode 100644 src/task_solving/scientist.py diff --git a/src/agentic_task_generator.py b/src/agentic_task_generator.py index 439f7a1..ffacd99 100644 --- a/src/agentic_task_generator.py +++ b/src/agentic_task_generator.py @@ -21,20 +21,20 @@ log = logging.getLogger("agentic_task_gen") +lf = Langfuse() +openlit.init( + tracer=lf._otel_tracer, disable_batch=True, disable_metrics=True +) @hydra.main(version_base=None, config_path="cfg", config_name="agentic_config") def main(cfg: DictConfig) -> None: """Run the multi-agent task generation system.""" capabilities_tag = cfg.pipeline_tags.capabilities_tag + resume_tag = getattr(cfg.pipeline_tags, "resume_tasks_tag", None) domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id - langfuse_client = Langfuse() - openlit.init( - tracer=langfuse_client._otel_tracer, disable_batch=True, disable_metrics=True - ) - - with langfuse_client.start_as_current_span( + with lf.start_as_current_span( name=f"ace_agentic_task_generation:{domain_name}:{exp_id}" ) as span: try: @@ -67,23 +67,30 @@ def main(cfg: DictConfig) -> None: error_msg = "No capabilities_tag provided. 
Please provide pipeline_tags.capabilities_tag= to specify which capabilities to use." log.warning(error_msg) span.update( - level="WARNING", + level="ERROR", status_message="Missing capabilities_tag", metadata={"capabilities_tag_missing": error_msg}, ) return + + if resume_tag: + msg = f"Resuming task generation from tag: {resume_tag}" + log.info(msg) + span.update(metadata={"resume_tag_found": msg, "resume_tag": resume_tag}) + span.update_trace( metadata={ "domain": domain_name, "exp_id": exp_id, "capabilities_tag": capabilities_tag, + "resume_tag": resume_tag, "config": config_yaml, }, tags=["agentic_task_generation", exp_id], ) - asyncio.run(generate_tasks(cfg, capabilities_tag, langfuse_client)) + asyncio.run(generate_tasks(cfg, capabilities_tag, lf, resume_tag)) msg = "Multi-agent task generation completed successfully" log.info(msg) diff --git a/src/agentic_task_solver.py b/src/agentic_task_solver.py new file mode 100644 index 0000000..355c0df --- /dev/null +++ b/src/agentic_task_solver.py @@ -0,0 +1,125 @@ +"""Multi-agent debate system for solving generated tasks.""" + +import asyncio +import logging +import os +import traceback +from pathlib import Path + +import hydra +import openlit +from langfuse import Langfuse +from omegaconf import DictConfig, OmegaConf + +from src.task_solving.generator import solve_tasks_with_debate, load_tasks_from_file + + +# Suppress OpenTelemetry console output +os.environ["OTEL_LOG_LEVEL"] = "ERROR" +os.environ["OTEL_METRICS_EXPORTER"] = "none" +os.environ["OTEL_PYTHON_LOG_CORRELATION"] = "false" +os.environ["OTEL_PYTHON_LOG_LEVEL"] = "ERROR" + +log = logging.getLogger("agentic_task_solving") + +lf = Langfuse() +openlit.init(tracer=lf._otel_tracer, disable_batch=True, disable_metrics=True) + + +@hydra.main(version_base=None, config_path="cfg", config_name="agentic_config") +def main(cfg: DictConfig) -> None: + """Run the multi-agent debate-based task solving system.""" + domain_name = cfg.global_cfg.domain + exp_id = 
cfg.exp_cfg.exp_id + output_dir = cfg.global_cfg.output_dir + max_tasks = cfg.task_solving.get("max_tasks", 0) + + with lf.start_as_current_span( + name=f"ace_agentic_task_solving:{domain_name}:{exp_id}" + ) as span: + try: + msg = "Starting multi-agent debate-based task solving" + log.info(msg) + span.update(metadata={"system_started": msg}) + + config_yaml = OmegaConf.to_yaml(cfg, resolve=True) + msg = "Configuration loaded" + log.info("Configuration:\n%s", config_yaml) + span.update( + metadata={ + "configuration_loaded": msg, + "config": config_yaml, + "domain": domain_name, + "exp_id": exp_id, + } + ) + + # Load tasks from the specified file or use pipeline tags to find them + tasks_file = None + if cfg.pipeline_tags.get("tasks_tag"): + # Look for tasks file using the tag + tasks_dir = Path(output_dir) / domain_name / "tasks" + tasks_file = tasks_dir / f"tasks_{cfg.pipeline_tags.tasks_tag}.json" + elif cfg.task_solving.get("input_file"): + tasks_file = Path(cfg.task_solving.input_file) + else: + raise ValueError("Either pipeline_tags.tasks_tag or task_solving.input_file must be specified") + + if not tasks_file.exists(): + raise FileNotFoundError(f"Tasks file not found: {tasks_file}") + + log.info(f"Loading tasks from: {tasks_file}") + tasks = load_tasks_from_file(tasks_file) + log.info(f"Loaded {len(tasks)} tasks") + + # Limit number of tasks if specified + if max_tasks > 0: + tasks = tasks[:max_tasks] + log.info(f"Limited to {len(tasks)} tasks") + + # Run task solving + msg = f"Running task solving for {len(tasks)} tasks" + log.info(msg) + span.update(metadata={"task_solving_started": msg}) + + results = asyncio.run(solve_tasks_with_debate( + cfg=cfg, + tasks=tasks, + langfuse_client=lf, + )) + + # Print summary + consensus_count = sum(1 for result in results.values() if result.get("consensus_reached", False)) + no_consensus_count = len(results) - consensus_count + + msg = f"Task solving completed. 
Consensus: {consensus_count}, No consensus: {no_consensus_count}" + log.info(msg) + span.update( + metadata={ + "task_solving_completed": msg, + "total_tasks": len(results), + "consensus_reached": consensus_count, + "no_consensus": no_consensus_count, + } + ) + + # Print detailed results if requested + if cfg.task_solving.get("print_results", False): + for task_id, result in results.items(): + log.info(f"\nTask {task_id}:") + log.info(f" Solution: {result['solution'][:100]}...") + log.info(f" Consensus: {result['consensus_reached']}") + log.info(f" Rounds: {result['total_rounds']}") + + except Exception as e: + error_msg = f"Error in agentic task solving: {str(e)}" + log.error(error_msg) + log.error(traceback.format_exc()) + span.update(metadata={"error": error_msg}) + raise + finally: + lf.flush() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py index 5be2742..0504aa6 100644 --- a/src/task_generation/generator.py +++ b/src/task_generation/generator.py @@ -14,7 +14,7 @@ DefaultTopicId, SingleThreadedAgentRuntime, ) -from autogen_ext.models.openai import OpenAIChatCompletionClient +from src.utils.model_client_utils import get_model_client from langfuse import Langfuse from omegaconf import DictConfig @@ -56,8 +56,8 @@ async def generate_tasks_for_capability( runtime, "TaskScientistA", lambda: TaskScientist( - model_client=OpenAIChatCompletionClient( - model=cfg.agents.scientist_a.model_name, + model_client=get_model_client( + model_name=cfg.agents.scientist_a.model_name, seed=cfg.agents.scientist_a.seed, ), scientist_id="A", @@ -70,8 +70,8 @@ async def generate_tasks_for_capability( runtime, "TaskScientistB", lambda: TaskScientist( - model_client=OpenAIChatCompletionClient( - model=cfg.agents.scientist_b.model_name, + model_client=get_model_client( + model_name=cfg.agents.scientist_b.model_name, seed=cfg.agents.scientist_b.seed, ), scientist_id="B", @@ -85,14 +85,13 
@@ async def generate_tasks_for_capability( runtime, "TaskModerator", lambda: TaskModerator( - model_client=OpenAIChatCompletionClient( - model=cfg.agents.moderator.model_name, + model_client=get_model_client( + model_name=cfg.agents.moderator.model_name, seed=cfg.agents.moderator.seed, ), num_scientists=2, num_final_problems=cfg.task_generation.num_final_problems_per_capability, buffer_param=cfg.task_generation.buffer_param, - agreement_threshold=cfg.task_generation.agreement_threshold, output_dir=output_dir, domain=domain_name, langfuse_client=langfuse_client, @@ -161,12 +160,20 @@ async def generate_tasks_for_capability( async def generate_tasks( - cfg: DictConfig, capabilities_tag: str, langfuse_client: Langfuse + cfg: DictConfig, + capabilities_tag: str, + langfuse_client: Langfuse, + resume_tag: str = None, ) -> None: """Generate tasks for all capabilities.""" domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id - tasks_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + if resume_tag: + tasks_tag = resume_tag + log.info(f"Resuming task generation with existing tag: {tasks_tag}") + else: + tasks_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}" with langfuse_client.start_as_current_span( name=f"ace_task_generation:{domain_name}:{exp_id}:{tasks_tag}" @@ -230,7 +237,6 @@ async def generate_tasks( if capabilities_file.exists(): with open(capabilities_file, "r", encoding="utf-8") as f: capabilities_data = json.load(f) - if ( isinstance(capabilities_data, dict) and "capabilities" in capabilities_data @@ -291,11 +297,41 @@ async def generate_tasks( } ) - # Print the timestamp for future reference - print(f"Tasks generated with tag: {tasks_tag}") + # Check for existing tasks if resuming + existing_tasks = set() + if resume_tag and output_dir.exists(): + for cap_dir in output_dir.iterdir(): + if cap_dir.is_dir() and (cap_dir / "tasks.json").exists(): + existing_tasks.add(cap_dir.name) + + if existing_tasks: + msg = f"Found {len(existing_tasks)} 
existing task sets: {list(existing_tasks)}" + log.info(msg) + span.update(metadata={"existing_tasks": msg}) + else: + log.info("No existing tasks found, will generate tasks all capabilities") + + processed_capabilities = 0 + skipped_capabilities = 0 # Process each capability individually for i, capability in enumerate(capabilities): + capability_dir_name = capability.name.replace(" ", "_") + + # Skip if tasks already exist for this capability + if resume_tag and capability_dir_name in existing_tasks: + msg = f"Skipping capability {i + 1}/{len(capabilities)}: {capability.name} (already exists)" + log.info(msg) + span.update( + metadata={ + f"capability_{i + 1}_skipped": msg, + "skipped_capability": capability.name, + "progress": f"{i + 1}/{len(capabilities)}", + } + ) + skipped_capabilities += 1 + continue + msg = f"Processing capability {i + 1}/{len(capabilities)}: {capability.name}" log.info(msg) span.update( @@ -318,8 +354,21 @@ async def generate_tasks( "completed_capability": capability.name, } ) - + + processed_capabilities += 1 await asyncio.sleep(1) + + # Final summary + msg = f"Task generation completed. 
Processed: {processed_capabilities}, Skipped: {skipped_capabilities}, Total: {len(capabilities)}" + log.info(msg) + span.update( + metadata={ + "final_summary": msg, + "processed_capabilities": processed_capabilities, + "skipped_capabilities": skipped_capabilities, + "total_capabilities": len(capabilities), + } + ) except Exception as e: error_msg = f"Error in generate_tasks: {e}" diff --git a/src/task_generation/messages.py b/src/task_generation/messages.py index 09b5e9d..38daaa9 100644 --- a/src/task_generation/messages.py +++ b/src/task_generation/messages.py @@ -36,39 +36,7 @@ class ScientistProblemProposal: iteration: int -@dataclass -class ModeratorProblemReview: - """Moderator's review and filtering of problems.""" - - capability_name: str - final_problems: Dict[str, str] # task_id -> task_text - rejected_problems: Dict[str, str] # task_id -> rejection_reason - iteration: int - - -@dataclass -class SolutionRequest: - """Request for scientists to solve problems.""" - - capability_name: str - capability_description: str - capability_domain: str - capability_area: str - problems: Dict[str, str] # task_id -> task_text - - -@dataclass -class ScientistSolutionProposal: - """Solution proposal from a scientist.""" - scientist_id: str - capability_name: str - solutions: Dict[str, str] # task_id -> solution -@dataclass -class FinalTaskSet: - """Final task set with problems and solutions.""" - capability_name: str - tasks: Dict[str, Dict[str, str]] # task_id -> {problem, answer} diff --git a/src/task_generation/moderator.py b/src/task_generation/moderator.py index 9e5c96b..0238d44 100644 --- a/src/task_generation/moderator.py +++ b/src/task_generation/moderator.py @@ -25,8 +25,6 @@ Capability, ProblemProposalRequest, ScientistProblemProposal, - ScientistSolutionProposal, - SolutionRequest, ) from src.utils.agentic_prompts import ( TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT, @@ -48,7 +46,6 @@ def __init__( num_scientists: int, num_final_problems: int, buffer_param: int, - 
agreement_threshold: float, output_dir: Path, domain: str, langfuse_client: Langfuse, @@ -58,7 +55,6 @@ def __init__( self._num_scientists = num_scientists self._num_final_problems = num_final_problems self._buffer_param = buffer_param - self._agreement_threshold = agreement_threshold self._output_dir = output_dir self._domain = domain self._langfuse_client = langfuse_client @@ -75,10 +71,7 @@ def __init__( str, List[ScientistProblemProposal] ] = {} # capability -> proposals - # Solution design state - self._solution_proposals: Dict[ - str, List[ScientistSolutionProposal] - ] = {} # capability -> solutions + @message_handler async def handle_capability(self, message: Capability, ctx: MessageContext) -> None: @@ -266,13 +259,11 @@ async def _filter_and_select_problems( try: parsed = parse_llm_json_response(raw_content) final_tasks = parsed.get("final_tasks", {}) - rejected_tasks = parsed.get("rejected_tasks", {}) except Exception as e: log.error( f"Error parsing JSON from moderator: {e}\nOutput: {raw_content}" ) final_tasks = {} - rejected_tasks = {} # Update Algorithm 1 state num_remaining = self._num_remaining[capability_name] @@ -294,9 +285,6 @@ async def _filter_and_select_problems( log.info( f"Task Moderator selected {selected_count} problems for {capability_name}, {self._num_remaining[capability_name]} remaining" ) - log.info( - f"Rejected {len(rejected_tasks)} problems: {list(rejected_tasks.keys())}" - ) # Continue Algorithm 1 or move to solution design if self._num_remaining[capability_name] > 0: @@ -304,138 +292,42 @@ async def _filter_and_select_problems( capability = self._capabilities[capability_name] await self._start_problem_iteration(capability) else: - # Problem design complete, start solution design - capability = self._capabilities[capability_name] - await self._start_solution_design(capability) + # Problem design complete, finalize tasks without solutions + await self._finalize_tasks_without_solutions(capability_name) except Exception as e: 
log.error(f"Error in Task Moderator _filter_and_select_problems: {e}") log.error(f"Traceback: {traceback.format_exc()}") raise - async def _start_solution_design(self, capability: Capability) -> None: - """Start solution design phase.""" + async def _finalize_tasks_without_solutions(self, capability_name: str) -> None: + """Finalize tasks with problems only (no solutions).""" try: log.info( - f"Task Moderator starting solution design for capability: {capability.name}" + f"Task Moderator finalizing tasks for capability: {capability_name}" ) - final_problems = self._final_problems[capability.name] + final_problems = self._final_problems[capability_name] if not final_problems: log.error( - f"No final problems available for capability: {capability.name}" + f"No final problems available for capability: {capability_name}" ) return - # Send solution requests to all scientists - await self.publish_message( - SolutionRequest( - capability_name=capability.name, - capability_description=capability.description, - capability_domain=capability.domain, - capability_area=capability.area, - problems=final_problems, - ), - topic_id=DefaultTopicId(), - ) - - except Exception as e: - log.error(f"Error in Task Moderator _start_solution_design: {e}") - log.error(f"Traceback: {traceback.format_exc()}") - raise - - @message_handler - async def handle_scientist_solution_proposal( - self, message: ScientistSolutionProposal, ctx: MessageContext - ) -> None: - """Handle solution proposals from scientists.""" - try: - log.info( - f"Task Moderator received solution proposal from Scientist {message.scientist_id} for capability: {message.capability_name}" - ) - - capability_name = message.capability_name - if capability_name not in self._solution_proposals: - self._solution_proposals[capability_name] = [] - - self._solution_proposals[capability_name].append(message) - - # Check if we have all solutions - if len(self._solution_proposals[capability_name]) == self._num_scientists: - log.info( - 
f"Task Moderator received all solutions for capability: {capability_name}, determining consensus" - ) - await self._determine_solution_consensus(capability_name) - - except Exception as e: - log.error( - f"Error in Task Moderator handle_scientist_solution_proposal: {e}" - ) - log.error(f"Traceback: {traceback.format_exc()}") - raise - - async def _determine_solution_consensus(self, capability_name: str) -> None: - """Determine solution consensus and finalize tasks.""" - try: - log.info( - f"Task Moderator determining solution consensus for capability: {capability_name}" - ) - - solutions_by_task: Dict[ - str, Dict[str, str] - ] = {} # task_id -> [scientist_id -> solution] - - for proposal in self._solution_proposals[capability_name]: - for task_id, solution in proposal.solutions.items(): - if task_id not in solutions_by_task: - solutions_by_task[task_id] = {} - solutions_by_task[task_id][proposal.scientist_id] = solution - + # Create tasks with problems only final_tasks = {} - - for task_id, problem_text in self._final_problems[capability_name].items(): - if task_id in solutions_by_task: - scientist_solutions = solutions_by_task[task_id] - - # Simple consensus: find most common solution - solution_counts: Dict[str, int] = {} - for solution in scientist_solutions.values(): - solution_counts[solution] = solution_counts.get(solution, 0) + 1 - - if solution_counts: - most_common_solution = max( - solution_counts.keys(), key=lambda x: solution_counts[x] - ) - agreement_rate = solution_counts[most_common_solution] / len( - scientist_solutions - ) - - if agreement_rate >= self._agreement_threshold: - final_tasks[task_id] = { - "problem": problem_text, - "answer": most_common_solution, - } - log.info( - f"Task {task_id}: consensus achieved ({agreement_rate:.2f} agreement)" - ) - else: - log.warning( - f"Task {task_id}: low agreement ({agreement_rate:.2f}), requires human review" - ) - # For now, use most common solution but mark it - final_tasks[task_id] = { - "problem": 
problem_text, - "answer": most_common_solution, - "requires_human_review": "true", - "agreement_rate": str(agreement_rate), - } + for task_id, problem_text in final_problems.items(): + final_tasks[task_id] = { + "task": problem_text, + "capability_id": capability_name, + } # Save final tasks await self._save_tasks_to_file(capability_name, final_tasks) - log.info(f"Task generation completed for capability: {capability_name}") + log.info(f"Task generation completed for capability: {capability_name} ({len(final_tasks)} tasks)") except Exception as e: - log.error(f"Error in Task Moderator _determine_solution_consensus: {e}") + log.error(f"Error in Task Moderator _finalize_tasks_without_solutions: {e}") log.error(f"Traceback: {traceback.format_exc()}") raise diff --git a/src/task_generation/scientist.py b/src/task_generation/scientist.py index 2daa571..25b25d8 100644 --- a/src/task_generation/scientist.py +++ b/src/task_generation/scientist.py @@ -21,14 +21,10 @@ from src.task_generation.messages import ( ProblemProposalRequest, ScientistProblemProposal, - ScientistSolutionProposal, - SolutionRequest, ) from src.utils.agentic_prompts import ( TASK_SCIENTIST_PROBLEM_SYSTEM_PROMPT, TASK_SCIENTIST_PROBLEM_USER_PROMPT, - TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT, - TASK_SCIENTIST_SOLUTION_USER_PROMPT, ) from src.utils.json_utils import parse_llm_json_response @@ -153,92 +149,4 @@ async def handle_problem_proposal_request( ) raise - @message_handler - async def handle_solution_request( - self, message: SolutionRequest, ctx: MessageContext - ) -> None: - """Handle solution request for problems.""" - with self._langfuse_client.start_as_current_span( - name=f"task_scientist_{self._scientist_id}_solution_proposal" - ) as span: - try: - msg = f"Task Scientist {self._scientist_id} solving {len(message.problems)} problems for capability: {message.capability_name}" - log.info(msg) - span.update( - metadata={ - "solution_request_received": msg, - "scientist_id": self._scientist_id, - 
"capability_name": message.capability_name, - "num_problems": len(message.problems), - } - ) - - problems_json = json.dumps(message.problems, indent=2) - - system_prompt = TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT.format( - scientist_id=self._scientist_id, - capability_domain=message.capability_domain, - capability_name=message.capability_name, - ) - user_prompt = TASK_SCIENTIST_SOLUTION_USER_PROMPT.format( - problems=problems_json, - ) - - system_message = SystemMessage(content=system_prompt) - user_message = UserMessage(content=user_prompt, source="user") - - model_result = await self._model_client.create( - [system_message, user_message] - ) - - msg = f"Task Scientist {self._scientist_id} is parsing LLM response" - log.info(msg) - span.update( - metadata={ - "llm_response_received": msg, - "scientist_id": self._scientist_id, - } - ) - - parsed = parse_llm_json_response(model_result.content) - solutions = parsed.get("solutions", {}) - - msg = f"Task Scientist {self._scientist_id} publishing solutions for capability: {message.capability_name}" - log.info(msg) - span.update( - metadata={ - "solution_proposal_published": msg, - "scientist_id": self._scientist_id, - "capability_name": message.capability_name, - "num_solutions_generated": len(solutions), - } - ) - - await self.publish_message( - ScientistSolutionProposal( - scientist_id=self._scientist_id, - capability_name=message.capability_name, - solutions=solutions, - ), - topic_id=DefaultTopicId(), - ) - - except Exception as e: - error_msg = f"Error in Task Scientist {self._scientist_id} handle_solution_request: {e}" - traceback_msg = f"Traceback: {traceback.format_exc()}" - - log.error(error_msg) - log.error(traceback_msg) - - span.update( - level="ERROR", - status_message=str(e), - metadata={ - "solution_request_error": error_msg, - "scientist_id": self._scientist_id, - "error": str(e), - "traceback": traceback_msg, - }, - ) - raise diff --git a/src/task_solving/__init__.py b/src/task_solving/__init__.py new file 
log = logging.getLogger("task_solving.generator")


async def solve_tasks_with_debate(
    cfg: DictConfig,
    tasks: Dict[str, Dict],
    langfuse_client: Langfuse = None,
) -> Dict[str, Dict]:
    """
    Solve tasks using a multi-agent debate system.

    Args:
        cfg: Configuration containing debate and model settings.
        tasks: Mapping of task_id to task data; each value holds the task
            content and its capability_id. (The body iterates
            ``tasks.items()``, so a dict -- not a list -- is required; the
            previous ``List[Dict]`` annotation was wrong.)
        langfuse_client: Langfuse client for tracing.

    Returns:
        Dictionary mapping task_id to final solution data.
    """
    domain_name = cfg.global_cfg.domain
    exp_id = cfg.exp_cfg.exp_id
    max_rounds = cfg.debate_cfg.max_round
    num_solvers = 2  # scientist_a and scientist_b
    # Timestamp tag gives every solving run its own output directory.
    solutions_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    with langfuse_client.start_as_current_span(
        name=f"ace_task_solving:{domain_name}:{exp_id}:{solutions_tag}"
    ) as span:
        try:
            msg = f"Solutions will be saved with tag: {solutions_tag}"
            log.info(msg)
            span.update(
                metadata={
                    "solving_started": msg,
                    "solutions_tag": solutions_tag,
                    "domain": domain_name,
                    "exp_id": exp_id,
                    "num_tasks": len(tasks),
                    "num_solvers": num_solvers,
                    "max_rounds": max_rounds,
                }
            )

            # Create output directory for the per-task solution files.
            output_dir = (
                Path(cfg.global_cfg.output_dir)
                / "task_solutions"
                / f"{domain_name}_{exp_id}{solutions_tag}"
            )
            output_dir.mkdir(parents=True, exist_ok=True)

            # Set up the single-threaded agent runtime.
            runtime = SingleThreadedAgentRuntime()

            # Create a model client per agent (seeds differ for diversity).
            scientist_a_client = get_model_client(
                cfg.agents.scientist_a.model_name,
                seed=cfg.agents.scientist_a.get("seed"),
            )
            scientist_b_client = get_model_client(
                cfg.agents.scientist_b.model_name,
                seed=cfg.agents.scientist_b.get("seed"),
            )
            moderator_client = get_model_client(
                cfg.agents.moderator.model_name,
                seed=cfg.agents.moderator.get("seed"),
            )

            # Register the moderator and both scientist agents with the
            # runtime.  The returned agent types are not needed afterwards.
            await TaskSolvingModerator.register(
                runtime,
                "task_solving_moderator",
                lambda: TaskSolvingModerator(
                    model_client=moderator_client,
                    num_solvers=num_solvers,
                    max_rounds=max_rounds,
                    output_dir=output_dir,
                    langfuse_client=langfuse_client,
                ),
            )
            await TaskSolvingScientist.register(
                runtime,
                "task_scientist_a",
                lambda: TaskSolvingScientist(
                    model_client=scientist_a_client,
                    scientist_id="scientist_a",
                    langfuse_client=langfuse_client,
                ),
            )
            await TaskSolvingScientist.register(
                runtime,
                "task_scientist_b",
                lambda: TaskSolvingScientist(
                    model_client=scientist_b_client,
                    scientist_id="scientist_b",
                    langfuse_client=langfuse_client,
                ),
            )

            # Start runtime
            runtime.start()

            log.info(f"Starting task solving for {len(tasks)} tasks with {num_solvers} scientists")

            # Publish every task; the moderator drives the debate from here.
            for task_id, task_data in tasks.items():
                # Handle both old and new task formats.
                if isinstance(task_data, dict) and "task" in task_data:
                    # New format: {"task": "problem text", "capability_id": "cap_name"}
                    capability_id = task_data.get("capability_id", "unknown")
                    task_content = task_data
                else:
                    # Old format or other formats.
                    capability_id = task_data.get("capability_id", "unknown") if isinstance(task_data, dict) else "unknown"
                    task_content = {"task": str(task_data)} if not isinstance(task_data, dict) else task_data

                task = Task(
                    task_id=task_id,
                    task_content=task_content,
                    capability_id=capability_id,
                )

                await runtime.publish_message(task, topic_id=DefaultTopicId())

                log.info(f"Submitted task {task_id} for solving")

            # Wait for all tasks to complete.
            # NOTE(review): no timeout here -- a hung agent stalls the run;
            # consider adding a completion-status check.
            await runtime.stop_when_idle()

            # Collect the solution files the moderator wrote.
            results = {}
            for solution_file in output_dir.glob("task_*_solution.json"):
                try:
                    with open(solution_file, "r") as f:
                        solution_data = json.load(f)
                    results[solution_data["task_id"]] = solution_data
                except Exception as e:
                    log.error(f"Error loading solution from {solution_file}: {e}")

            log.info(f"Task solving completed. Processed {len(results)} tasks.")

            span.update(
                metadata={
                    "solving_completed": f"Processed {len(results)} tasks",
                    "output_dir": str(output_dir),
                    "results_count": len(results),
                }
            )

            return results

        except Exception as e:
            error_msg = f"Error in task solving: {str(e)}"
            log.error(error_msg)
            log.error(traceback.format_exc())
            span.update(metadata={"error": error_msg})
            raise


def load_tasks_from_file(tasks_file: Path) -> Dict[str, Dict]:
    """
    Load tasks from a JSON file.

    Args:
        tasks_file: Path to the tasks file.

    Returns:
        Dictionary mapping task_id to task data.  Legacy list-format files
        are converted to {"task_1": ..., "task_2": ...} (the previous
        ``List[Dict]`` annotation did not match this behavior).

    Raises:
        ValueError: If the file content is neither a list nor a dict.
    """
    try:
        with open(tasks_file, "r") as f:
            tasks_data = json.load(f)

        if isinstance(tasks_data, list):
            # Old format: a bare list of tasks -> synthesize sequential ids.
            return {f"task_{i + 1}": task for i, task in enumerate(tasks_data)}
        if isinstance(tasks_data, dict):
            if "tasks" in tasks_data:
                # New format: {"tasks": {"task_1": {...}, "task_2": {...}}}
                return tasks_data["tasks"]
            # A plain dict is treated as a single task.
            return {"task_1": tasks_data}
        raise ValueError(f"Unexpected task file format: {type(tasks_data)}")

    except Exception as e:
        log.error(f"Error loading tasks from {tasks_file}: {e}")
        raise
@default_subscription
class TaskSolvingModerator(RoutedAgent):
    """Moderator that manages the task solving debate and checks for consensus.

    Collects one AgentSolution per solver per round, first tries an exact
    answer match, then falls back to an LLM judgement, and either finalizes
    the task or starts another debate round (bounded by ``max_rounds``).
    """

    def __init__(
        self,
        model_client: ChatCompletionClient,
        num_solvers: int,
        max_rounds: int,
        output_dir: Path,
        langfuse_client: Langfuse = None,
    ) -> None:
        """
        Args:
            model_client: Chat client used for LLM-based consensus checks.
            num_solvers: Number of solver agents expected each round.
            max_rounds: Maximum debate rounds before giving up on consensus.
            output_dir: Directory where final solution JSON files are written.
            langfuse_client: Langfuse client for tracing.
        """
        super().__init__("Task Solving Moderator")
        self._model_client = model_client
        self._num_solvers = num_solvers
        self._max_rounds = max_rounds
        self._output_dir = output_dir
        self._langfuse_client = langfuse_client

        # Track solutions by task_id and round
        self._solutions_buffer: Dict[str, Dict[int, List[AgentSolution]]] = {}
        self._current_round: Dict[str, int] = {}
        self._final_solutions: Dict[str, FinalSolution] = {}
        # Original Task messages by task_id, kept so later rounds and the
        # LLM consensus prompt see the real problem text (the previous
        # implementation substituted an empty placeholder here).
        self._tasks: Dict[str, Task] = {}

    def _extract_consensus_components(self, response: str) -> tuple[bool, str, str]:
        """Extract consensus decision, solution, and reasoning from response.

        Missing sections degrade gracefully: no consensus, solution "NONE",
        reasoning "No reasoning provided".
        """
        consensus_match = re.search(r"CONSENSUS_REACHED:\s*(true|false)", response, re.IGNORECASE)
        solution_match = re.search(r"FINAL_SOLUTION:\s*(.*?)(?=REASONING:|$)", response, re.DOTALL | re.IGNORECASE)
        reasoning_match = re.search(r"REASONING:\s*(.*?)$", response, re.DOTALL | re.IGNORECASE)

        consensus_reached = consensus_match.group(1).lower() == "true" if consensus_match else False
        final_solution = solution_match.group(1).strip() if solution_match else "NONE"
        reasoning = reasoning_match.group(1).strip() if reasoning_match else "No reasoning provided"

        return consensus_reached, final_solution, reasoning

    def _check_simple_consensus(self, solutions: List[AgentSolution]) -> tuple[bool, str]:
        """Simple consensus check - if all agents have the same final answer."""
        if not solutions:
            return False, ""

        # Case-insensitive comparison of the trimmed final answers.
        answers = [sol.final_answer.strip().lower() for sol in solutions]

        # Check if all answers are the same
        if len(set(answers)) == 1:
            return True, solutions[0].final_answer

        return False, ""

    @message_handler
    async def handle_task(self, message: Task, ctx: MessageContext) -> None:
        """Handle a task and initiate the solving process."""
        with self._langfuse_client.start_as_current_span(
            name=f"moderator_handle_task_{message.task_id}"
        ) as span:
            try:
                msg = f"Moderator received task: {message.task_id}"
                log.info(msg)
                span.update(
                    metadata={
                        "task_received": msg,
                        "task_id": message.task_id,
                        "capability_id": message.capability_id,
                    }
                )

                # Initialize tracking for this task and remember the original
                # message for later rounds / the consensus prompt.
                self._tasks[message.task_id] = message
                self._solutions_buffer[message.task_id] = {}
                self._current_round[message.task_id] = 1

                # Send initial solution request to all solvers
                await self.publish_message(
                    TaskSolutionRequest(task=message, round_number=1),
                    topic_id=DefaultTopicId(),
                )

                span.update(
                    metadata={"solution_request_sent": f"Round 1 solution request sent for task {message.task_id}"}
                )

            except Exception as e:
                # NOTE: errors are logged, not re-raised, so one bad task
                # does not bring down the whole runtime.
                error_msg = f"Error handling task {message.task_id}: {str(e)}"
                log.error(error_msg)
                log.error(traceback.format_exc())
                span.update(metadata={"error": error_msg})

    @message_handler
    async def handle_agent_solution(self, message: AgentSolution, ctx: MessageContext) -> None:
        """Handle solution from an agent; advance when the round is complete."""
        with self._langfuse_client.start_as_current_span(
            name=f"moderator_handle_solution_{message.task_id}_round_{message.round_number}"
        ) as span:
            try:
                task_id = message.task_id
                round_num = message.round_number

                msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, round {round_num}"
                log.info(msg)
                span.update(
                    metadata={
                        "solution_received": msg,
                        "task_id": task_id,
                        "agent_id": message.agent_id,
                        "round": round_num,
                    }
                )

                # Initialize round buffer if needed
                if round_num not in self._solutions_buffer[task_id]:
                    self._solutions_buffer[task_id][round_num] = []

                # Add solution to buffer
                self._solutions_buffer[task_id][round_num].append(message)

                # Once every solver has answered this round, check consensus.
                if len(self._solutions_buffer[task_id][round_num]) == self._num_solvers:
                    await self._check_consensus_and_proceed(task_id, round_num, ctx)

                span.update(
                    metadata={
                        "solutions_collected": f"{len(self._solutions_buffer[task_id][round_num])}/{self._num_solvers} for round {round_num}"
                    }
                )

            except Exception as e:
                error_msg = f"Error handling solution from agent {message.agent_id}: {str(e)}"
                log.error(error_msg)
                log.error(traceback.format_exc())
                span.update(metadata={"error": error_msg})

    async def _check_consensus_and_proceed(self, task_id: str, round_num: int, ctx: MessageContext) -> None:
        """Check for consensus and either finalize or start next round."""
        with self._langfuse_client.start_as_current_span(
            name=f"moderator_consensus_check_{task_id}_round_{round_num}"
        ) as span:
            try:
                solutions = self._solutions_buffer[task_id][round_num]

                # First try simple consensus check
                simple_consensus, simple_solution = self._check_simple_consensus(solutions)

                if simple_consensus:
                    # Simple consensus reached
                    final_solution = FinalSolution(
                        task_id=task_id,
                        solution=simple_solution,
                        reasoning="All agents provided the same answer",
                        consensus_reached=True,
                        total_rounds=round_num,
                        all_solutions=self._get_all_solutions_for_task(task_id),
                    )

                    self._final_solutions[task_id] = final_solution
                    await self._save_final_solution(final_solution)

                    span.update(
                        metadata={
                            "consensus_reached": True,
                            "method": "simple",
                            "final_solution": simple_solution[:100],
                        }
                    )
                    return

                # If no simple consensus and we haven't reached max rounds, use LLM to check
                if round_num < self._max_rounds:
                    # Recover the original task so the LLM sees the real
                    # problem text (fixes the previous empty placeholder).
                    original_task = self._tasks.get(task_id)
                    task_content = (
                        original_task.task_content.get("task", "")
                        if original_task is not None
                        else ""
                    )

                    # Format solutions for LLM
                    all_solutions_text = "\n\n".join([
                        f"Agent {sol.agent_id}:\nReasoning: {sol.thought}\nFinal Answer: {sol.final_answer}"
                        for sol in solutions
                    ])

                    prompt = TASK_MODERATOR_CONSENSUS_PROMPT.format(
                        problem_text=task_content,
                        all_solutions=all_solutions_text
                    )

                    system_message = SystemMessage(content=TASK_MODERATOR_SYSTEM_MESSAGE)
                    user_message = UserMessage(content=prompt, source="user")

                    response = await self._model_client.create(
                        messages=[system_message, user_message],
                        cancellation_token=ctx.cancellation_token,
                    )

                    consensus_reached, final_solution_text, reasoning = self._extract_consensus_components(response.content)

                    if consensus_reached:
                        # LLM found consensus
                        final_solution = FinalSolution(
                            task_id=task_id,
                            solution=final_solution_text,
                            reasoning=reasoning,
                            consensus_reached=True,
                            total_rounds=round_num,
                            all_solutions=self._get_all_solutions_for_task(task_id),
                        )

                        self._final_solutions[task_id] = final_solution
                        await self._save_final_solution(final_solution)

                        span.update(
                            metadata={
                                "consensus_reached": True,
                                "method": "llm_moderator",
                                "final_solution": final_solution_text[:100],
                            }
                        )
                        return
                    else:
                        # No consensus, start next round
                        next_round = round_num + 1
                        self._current_round[task_id] = next_round

                        # Re-send the ORIGINAL task (or a minimal stand-in if
                        # it was somehow not recorded) with peers' solutions.
                        task = original_task if original_task is not None else Task(
                            task_id=task_id,
                            task_content={"task": task_content},
                            capability_id="",
                        )

                        await self.publish_message(
                            AgentRevisionRequest(
                                task=task,
                                other_solutions=solutions,
                                round_number=next_round,
                            ),
                            topic_id=DefaultTopicId(),
                        )

                        span.update(
                            metadata={
                                "consensus_reached": False,
                                "next_round_started": next_round,
                            }
                        )
                else:
                    # Max rounds reached, no consensus
                    final_solution = FinalSolution(
                        task_id=task_id,
                        solution="No consensus reached",
                        reasoning=f"Maximum rounds ({self._max_rounds}) reached without consensus",
                        consensus_reached=False,
                        total_rounds=round_num,
                        all_solutions=self._get_all_solutions_for_task(task_id),
                    )

                    self._final_solutions[task_id] = final_solution
                    await self._save_final_solution(final_solution)

                    span.update(
                        metadata={
                            "consensus_reached": False,
                            "max_rounds_reached": True,
                        }
                    )

            except Exception as e:
                error_msg = f"Error checking consensus for task {task_id}: {str(e)}"
                log.error(error_msg)
                log.error(traceback.format_exc())
                span.update(metadata={"error": error_msg})

    def _get_all_solutions_for_task(self, task_id: str) -> List[AgentSolution]:
        """Get all solutions for a task across all rounds."""
        all_solutions = []
        for round_solutions in self._solutions_buffer[task_id].values():
            all_solutions.extend(round_solutions)
        return all_solutions

    async def _save_final_solution(self, final_solution: FinalSolution) -> None:
        """Save the final solution to a per-task JSON file in output_dir."""
        try:
            output_file = self._output_dir / f"task_{final_solution.task_id}_solution.json"

            solution_data = {
                "task_id": final_solution.task_id,
                "solution": final_solution.solution,
                "reasoning": final_solution.reasoning,
                "consensus_reached": final_solution.consensus_reached,
                "total_rounds": final_solution.total_rounds,
                "all_solutions": [
                    {
                        "agent_id": sol.agent_id,
                        "thought": sol.thought,
                        "final_answer": sol.final_answer,
                        "round_number": sol.round_number,
                    }
                    for sol in final_solution.all_solutions
                ],
            }

            with open(output_file, "w") as f:
                json.dump(solution_data, f, indent=2)

            log.info(f"Saved final solution for task {final_solution.task_id} to {output_file}")

        except Exception as e:
            log.error(f"Error saving final solution for task {final_solution.task_id}: {str(e)}")
            log.error(traceback.format_exc())
@default_subscription
class TaskSolvingScientist(RoutedAgent):
    """A debate participant that proposes and revises task solutions.

    Round 1 answers the task directly; later rounds revise the answer in
    light of the other scientists' reasoning.
    """

    def __init__(
        self,
        model_client: ChatCompletionClient,
        scientist_id: str,
        langfuse_client: Langfuse = None,
    ) -> None:
        super().__init__(f"Task Solving Scientist {scientist_id}")
        self._model_client = model_client
        self._scientist_id = scientist_id
        self._langfuse_client = langfuse_client

    def _extract_solution_components(self, response: str) -> tuple[str, str]:
        """Split a model reply into (thought, final answer).

        Falls back to the whole reply as the thought and a fixed marker as
        the answer when the THOUGHT / FINAL ANSWER sections are missing.
        """
        flags = re.DOTALL | re.IGNORECASE
        thought_match = re.search(r"THOUGHT:\s*(.*?)(?=FINAL ANSWER:|$)", response, flags)
        answer_match = re.search(r"FINAL ANSWER:\s*(.*?)$", response, flags)

        if thought_match:
            thought = thought_match.group(1).strip()
        else:
            thought = response.strip()

        if answer_match:
            final_answer = answer_match.group(1).strip()
        else:
            final_answer = "No clear answer provided"

        return thought, final_answer

    @message_handler
    async def handle_task_solution_request(
        self, message: TaskSolutionRequest, ctx: MessageContext
    ) -> None:
        """Handle initial task solution request (Round 1)."""
        with self._langfuse_client.start_as_current_span(
            name=f"scientist_{self._scientist_id}_round_1"
        ) as span:
            try:
                problem = message.task.task_content.get("task", "")

                note = f"Scientist {self._scientist_id} handling initial solution request for task: {message.task.task_id}"
                log.info(note)
                span.update(
                    metadata={
                        "solution_request_received": note,
                        "scientist_id": self._scientist_id,
                        "task_id": message.task.task_id,
                        "round": message.round_number,
                    }
                )

                # Ask the model for a structured THOUGHT / FINAL ANSWER reply.
                reply = await self._model_client.create(
                    messages=[
                        SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE),
                        UserMessage(
                            content=TASK_SOLVER_ROUND_1_PROMPT.format(problem_text=problem),
                            source="user",
                        ),
                    ],
                    cancellation_token=ctx.cancellation_token,
                )
                thought, final_answer = self._extract_solution_components(reply.content)

                # Broadcast our solution so the moderator can collect it.
                await self.publish_message(
                    AgentSolution(
                        agent_id=self._scientist_id,
                        task_id=message.task.task_id,
                        thought=thought,
                        final_answer=final_answer,
                        round_number=message.round_number,
                    ),
                    topic_id=DefaultTopicId(),
                )

                span.update(
                    metadata={
                        "solution_generated": f"Scientist {self._scientist_id} generated solution for task {message.task.task_id}",
                        "final_answer": final_answer[:100],  # Truncate for logging
                    }
                )

            except Exception as e:
                error_msg = f"Error in scientist {self._scientist_id} round 1: {str(e)}"
                log.error(error_msg)
                log.error(traceback.format_exc())
                span.update(metadata={"error": error_msg})

    @message_handler
    async def handle_agent_revision_request(
        self, message: AgentRevisionRequest, ctx: MessageContext
    ) -> None:
        """Handle revision request with other agents' solutions."""
        with self._langfuse_client.start_as_current_span(
            name=f"scientist_{self._scientist_id}_round_{message.round_number}"
        ) as span:
            try:
                problem = message.task.task_content.get("task", "")

                note = f"Scientist {self._scientist_id} handling revision request for task: {message.task.task_id}, round: {message.round_number}"
                log.info(note)
                span.update(
                    metadata={
                        "revision_request_received": note,
                        "scientist_id": self._scientist_id,
                        "task_id": message.task.task_id,
                        "round": message.round_number,
                        "num_other_solutions": len(message.other_solutions),
                    }
                )

                # Summarize each peer's latest answer; skip our own entry.
                peer_summaries = []
                for sol in message.other_solutions:
                    if sol.agent_id == self._scientist_id:
                        continue
                    peer_summaries.append(
                        f"Scientist {sol.agent_id}: Reasoning: {sol.thought}, Final solution: {sol.final_answer}"
                    )

                reply = await self._model_client.create(
                    messages=[
                        SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE),
                        UserMessage(
                            content=TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT.format(
                                other_solutions="\n\n".join(peer_summaries),
                                problem_text=problem,
                            ),
                            source="user",
                        ),
                    ],
                    cancellation_token=ctx.cancellation_token,
                )
                thought, final_answer = self._extract_solution_components(reply.content)

                await self.publish_message(
                    AgentSolution(
                        agent_id=self._scientist_id,
                        task_id=message.task.task_id,
                        thought=thought,
                        final_answer=final_answer,
                        round_number=message.round_number,
                    ),
                    topic_id=DefaultTopicId(),
                )

                span.update(
                    metadata={
                        "revision_generated": f"Scientist {self._scientist_id} generated revision for task {message.task.task_id}",
                        "final_answer": final_answer[:100],  # Truncate for logging
                    }
                )

            except Exception as e:
                error_msg = f"Error in scientist {self._scientist_id} round {message.round_number}: {str(e)}"
                log.error(error_msg)
                log.error(traceback.format_exc())
                span.update(metadata={"error": error_msg})
--- src/agentic_task_generator.py | 12 +++--- src/agentic_task_solver.py | 10 ++--- src/utils/agentic_prompts.py | 78 +++++++++++++++++++++++------------ 3 files changed, 62 insertions(+), 38 deletions(-) diff --git a/src/agentic_task_generator.py b/src/agentic_task_generator.py index ffacd99..96a221a 100644 --- a/src/agentic_task_generator.py +++ b/src/agentic_task_generator.py @@ -22,9 +22,8 @@ log = logging.getLogger("agentic_task_gen") lf = Langfuse() -openlit.init( - tracer=lf._otel_tracer, disable_batch=True, disable_metrics=True -) +openlit.init(tracer=lf._otel_tracer, disable_batch=True, disable_metrics=True) + @hydra.main(version_base=None, config_path="cfg", config_name="agentic_config") def main(cfg: DictConfig) -> None: @@ -72,12 +71,13 @@ def main(cfg: DictConfig) -> None: metadata={"capabilities_tag_missing": error_msg}, ) return - + if resume_tag: msg = f"Resuming task generation from tag: {resume_tag}" log.info(msg) - span.update(metadata={"resume_tag_found": msg, "resume_tag": resume_tag}) - + span.update( + metadata={"resume_tag_found": msg, "resume_tag": resume_tag} + ) span.update_trace( metadata={ diff --git a/src/agentic_task_solver.py b/src/agentic_task_solver.py index 355c0df..43c7935 100644 --- a/src/agentic_task_solver.py +++ b/src/agentic_task_solver.py @@ -67,7 +67,7 @@ def main(cfg: DictConfig) -> None: if not tasks_file.exists(): raise FileNotFoundError(f"Tasks file not found: {tasks_file}") - + log.info(f"Loading tasks from: {tasks_file}") tasks = load_tasks_from_file(tasks_file) log.info(f"Loaded {len(tasks)} tasks") @@ -87,11 +87,11 @@ def main(cfg: DictConfig) -> None: tasks=tasks, langfuse_client=lf, )) - + # Print summary consensus_count = sum(1 for result in results.values() if result.get("consensus_reached", False)) no_consensus_count = len(results) - consensus_count - + msg = f"Task solving completed. 
Consensus: {consensus_count}, No consensus: {no_consensus_count}" log.info(msg) span.update( @@ -102,7 +102,7 @@ def main(cfg: DictConfig) -> None: "no_consensus": no_consensus_count, } ) - + # Print detailed results if requested if cfg.task_solving.get("print_results", False): for task_id, result in results.items(): @@ -122,4 +122,4 @@ def main(cfg: DictConfig) -> None: if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/utils/agentic_prompts.py b/src/utils/agentic_prompts.py index b4a0d26..a65df8f 100644 --- a/src/utils/agentic_prompts.py +++ b/src/utils/agentic_prompts.py @@ -228,27 +228,6 @@ Sample tasks: {sample_tasks_text}""" -TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT = """You are Scientist {scientist_id}, an expert in {capability_domain}. You are solving a task related to the capability: {capability_name}. - -IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. The JSON should be directly parseable. - -Please return your solution and your thoughts and reasoning in the following format: -{{ - "thought": "Your reasoning and thought process about solving this problem", - "solutions": {{ - "solution_0": "SOLUTION_TEXT_1", - "solution_1": "SOLUTION_TEXT_2", - ... - }} -}} - -Provide clear, accurate, and complete solutions. Your solutions should be correct and well-reasoned.""" - -TASK_SCIENTIST_SOLUTION_USER_PROMPT = """Solve the following problems: - -{problems} - -Provide your solutions clearly and concisely.""" TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT = """You are the Moderator overseeing capability-based task design. Your task is to review proposed tasks from multiple scientist agents and synthesize a final, high-quality task set for the capability. @@ -256,7 +235,7 @@ - Eliminate any task that is not clearly aligned with the capability. - Merge or remove tasks that are redundant or overly similar. 
- Ensure that the final set of tasks is diverse, non-trivial, and tests different facets of the capability. -- Include a brief justification for each rejected or significantly modified task. +- Select only the highest quality tasks that best represent the capability. IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. Do not include any prefixes or prose. The JSON should be directly parseable. @@ -272,11 +251,6 @@ "task_1": "", "task_2": "", ... - }, - "rejected_tasks": { - "task_from_scientist_A": "Reason for rejection or modification", - "task_from_scientist_B": "Reason for rejection or modification", - ... } }""" @@ -289,6 +263,56 @@ Proposed Tasks: {problems_text}""" +# ============================================================================= +# TASK SOLVING DEBATE PROMPTS +# ============================================================================= + +TASK_SOLVER_SYSTEM_MESSAGE = """You are an expert problem solver participating in a collaborative debate to solve tasks. You will work with other agents to find the best solution through structured discussion and reasoning.""" + +TASK_SOLVER_ROUND_1_PROMPT = """Can you solve the following problem? + +PROBLEM: {problem_text} + +Explain your reasoning step by step. Your final answer should be clearly stated at the end of your response. + +Respond using this format: +THOUGHT: +FINAL ANSWER: """ + +TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT = """These are the reasoning and solutions to the problem from other agents: + +{other_solutions} + +Using the solutions from other agents as additional information, can you provide your answer to the problem? + +The original problem is: {problem_text} + +Explain your reasoning step by step. Your final answer should be clearly stated at the end of your response. 
+ +Respond using this format: +THOUGHT: +FINAL ANSWER: """ + +TASK_MODERATOR_SYSTEM_MESSAGE = """You are a moderator overseeing a collaborative problem-solving debate. Your role is to check for consensus among agents and determine the final solution.""" + +TASK_MODERATOR_CONSENSUS_PROMPT = """Review the following solutions from different agents for the same problem: + +PROBLEM: {problem_text} + +SOLUTIONS: +{all_solutions} + +Determine if there is consensus among the agents. Consensus is reached when: +1. All agents provide the same final answer, OR +2. The majority of agents agree on the same answer with similar reasoning + +If consensus is reached, provide the agreed-upon solution. If not, indicate that another round of debate is needed. + +Respond using this format: +CONSENSUS_REACHED: +FINAL_SOLUTION: +REASONING: """ + # ============================================================================= # SYSTEM MESSAGES # ============================================================================= From b166e4cea29bcd35b4aab1dce00db117ebd81dc7 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 5 Sep 2025 04:06:58 -0400 Subject: [PATCH 05/19] updated agentic config and readme. 
--- README.md | 9 ++++++++- src/cfg/agentic_config.yaml | 26 +++++++++++++++----------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 311eb4f..07f0bf2 100644 --- a/README.md +++ b/README.md @@ -86,5 +86,12 @@ python -m src.agentic_area_generator python -m src.agentic_capability_generator # Generate tasks for each capability -python -m src.agentic_task_generator +python -m src.agentic_task_generator + +# Generate tasks for all capabilities +python -m src.agentic_task_generator pipeline_tags.capabilities_tag=_20250902_030203 + +# Generate solutions for tasks +# python -m sr + ``` diff --git a/src/cfg/agentic_config.yaml b/src/cfg/agentic_config.yaml index aff027d..8b8bb9f 100644 --- a/src/cfg/agentic_config.yaml +++ b/src/cfg/agentic_config.yaml @@ -4,7 +4,7 @@ defaults: # Global configuration global_cfg: domain: math - output_dir: /fs01/projects/aieng/public/ace/agentic_outputs/ + output_dir: agentic_outputs/ # Debate configuration (shared across all stages) debate_cfg: @@ -12,35 +12,39 @@ debate_cfg: # Agent configurations (shared across all stages) agents: - scientist_a: - model_name: o3-mini + scientist_a: + model_name: gpt-5 seed: 8 scientist_b: - model_name: claude-3-5-sonnet-20241022 + model_name: gemini-2.5-pro seed: 88 # If using same model as scientist_a, use different seed for diversity moderator: - model_name: gpt-4o + model_name: claude-opus-4-1-20250805 seed: 888 # Stage 1: Area Generation Configuration area_generation: - num_areas: 2 # Number of top-level areas to generate + num_areas: 20 # Number of top-level areas to generate # Stage 2: Capability Generation Configuration capability_generation: - num_capabilities_per_area: 3 # Number of capabilities to generate per area + num_capabilities_per_area: 20 # Number of capabilities to generate per area # Stage 3: Task Generation Configuration task_generation: - num_final_problems_per_capability: 3 # N: Number of final problems per capability - buffer_param: 2 # 
B: Buffer parameter (extra problems each agent proposes) - agreement_threshold: 0.6 # S: Agreement threshold for solution consensus + num_final_tasks_per_capability: 10 # N: Number of final problems per capability + buffer_param: 5 # B: Buffer parameter (extra problems each agent proposes) + +# Stage 4: Task Solving Configuration +task_solving: + max_tasks: 0 # Maximum number of tasks to process (0 = all) + print_results: false # Whether to print detailed results to console # Experiment configuration exp_cfg: - exp_id: test + exp_id: r0_20x20 # Pipeline tags for chaining stages pipeline_tags: From 084b68c12d823158eb71187dca58fa593d6e6574 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 5 Sep 2025 04:40:59 -0400 Subject: [PATCH 06/19] simplified task generations. --- src/cfg/agentic_config.yaml | 8 ++++---- src/task_generation/__init__.py | 11 ----------- src/task_generation/generator.py | 25 +++++++++++++++---------- src/task_generation/moderator.py | 28 +++++++++------------------- 4 files changed, 28 insertions(+), 44 deletions(-) diff --git a/src/cfg/agentic_config.yaml b/src/cfg/agentic_config.yaml index 8b8bb9f..68a3d9a 100644 --- a/src/cfg/agentic_config.yaml +++ b/src/cfg/agentic_config.yaml @@ -8,11 +8,11 @@ global_cfg: # Debate configuration (shared across all stages) debate_cfg: - max_round: 3 + max_round: 5 # Agent configurations (shared across all stages) agents: - scientist_a: + scientist_a: model_name: gpt-5 seed: 8 @@ -34,10 +34,10 @@ capability_generation: # Stage 3: Task Generation Configuration task_generation: - num_final_tasks_per_capability: 10 # N: Number of final problems per capability + num_final_problems_per_capability: 10 # N: Number of final problems per capability buffer_param: 5 # B: Buffer parameter (extra problems each agent proposes) -# Stage 4: Task Solving Configuration +# Stage 4: Task Solving Configuration task_solving: max_tasks: 0 # Maximum number of tasks to process (0 = all) print_results: false # Whether to print 
detailed results to console diff --git a/src/task_generation/__init__.py b/src/task_generation/__init__.py index 2598dec..8d54a1b 100644 --- a/src/task_generation/__init__.py +++ b/src/task_generation/__init__.py @@ -1,12 +1 @@ """Task generation package for multi-agent debate-based task generation.""" - -from .generator import generate_tasks -from .moderator import TaskModerator -from .scientist import TaskScientist - - -__all__ = [ - "generate_tasks", - "TaskModerator", - "TaskScientist", -] diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py index 0504aa6..a0169ed 100644 --- a/src/task_generation/generator.py +++ b/src/task_generation/generator.py @@ -14,13 +14,13 @@ DefaultTopicId, SingleThreadedAgentRuntime, ) -from src.utils.model_client_utils import get_model_client from langfuse import Langfuse from omegaconf import DictConfig from src.task_generation.messages import Capability from src.task_generation.moderator import TaskModerator from src.task_generation.scientist import TaskScientist +from src.utils.model_client_utils import get_model_client log = logging.getLogger("agentic_task_gen.generator") @@ -160,15 +160,16 @@ async def generate_tasks_for_capability( async def generate_tasks( - cfg: DictConfig, - capabilities_tag: str, + cfg: DictConfig, + capabilities_tag: str, langfuse_client: Langfuse, - resume_tag: str = None, + resume_tag: str, ) -> None: """Generate tasks for all capabilities.""" domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id - + + # Use resume_tag if provided, otherwise create new tag if resume_tag: tasks_tag = resume_tag log.info(f"Resuming task generation with existing tag: {tasks_tag}") @@ -237,6 +238,7 @@ async def generate_tasks( if capabilities_file.exists(): with open(capabilities_file, "r", encoding="utf-8") as f: capabilities_data = json.load(f) + if ( isinstance(capabilities_data, dict) and "capabilities" in capabilities_data @@ -297,6 +299,9 @@ async def generate_tasks( } ) + # Print 
the timestamp for future reference + print(f"Tasks generated with tag: {tasks_tag}") + # Check for existing tasks if resuming existing_tasks = set() if resume_tag and output_dir.exists(): @@ -309,7 +314,7 @@ async def generate_tasks( log.info(msg) span.update(metadata={"existing_tasks": msg}) else: - log.info("No existing tasks found, will generate tasks all capabilities") + log.info("No existing tasks found, will generate all capabilities") processed_capabilities = 0 skipped_capabilities = 0 @@ -317,7 +322,7 @@ async def generate_tasks( # Process each capability individually for i, capability in enumerate(capabilities): capability_dir_name = capability.name.replace(" ", "_") - + # Skip if tasks already exist for this capability if resume_tag and capability_dir_name in existing_tasks: msg = f"Skipping capability {i + 1}/{len(capabilities)}: {capability.name} (already exists)" @@ -331,7 +336,7 @@ async def generate_tasks( ) skipped_capabilities += 1 continue - + msg = f"Processing capability {i + 1}/{len(capabilities)}: {capability.name}" log.info(msg) span.update( @@ -354,10 +359,10 @@ async def generate_tasks( "completed_capability": capability.name, } ) - + processed_capabilities += 1 await asyncio.sleep(1) - + # Final summary msg = f"Task generation completed. 
Processed: {processed_capabilities}, Skipped: {skipped_capabilities}, Total: {len(capabilities)}" log.info(msg) diff --git a/src/task_generation/moderator.py b/src/task_generation/moderator.py index 0238d44..711abe0 100644 --- a/src/task_generation/moderator.py +++ b/src/task_generation/moderator.py @@ -59,7 +59,6 @@ def __init__( self._domain = domain self._langfuse_client = langfuse_client - # Algorithm 1 state self._num_remaining: Dict[str, int] = {} self._final_problems: Dict[ str, Dict[str, str] @@ -71,11 +70,9 @@ def __init__( str, List[ScientistProblemProposal] ] = {} # capability -> proposals - - @message_handler async def handle_capability(self, message: Capability, ctx: MessageContext) -> None: - """Handle capability and start Algorithm 1 for problem design.""" + """Start problem design for a capability.""" with self._langfuse_client.start_as_current_span( name="task_moderator_handle_capability" ) as span: @@ -91,12 +88,9 @@ async def handle_capability(self, message: Capability, ctx: MessageContext) -> N } ) - # Initialize Algorithm 1 state self._num_remaining[message.name] = self._num_final_problems self._final_problems[message.name] = {} - self._capabilities[message.name] = ( - message # Store original capability info - ) + self._capabilities[message.name] = message await self._start_problem_iteration(message) @@ -119,14 +113,12 @@ async def handle_capability(self, message: Capability, ctx: MessageContext) -> N raise async def _start_problem_iteration(self, capability: Capability) -> None: - """Start a problem generation iteration (Algorithm 1).""" + """Start a problem generation iteration.""" try: num_remaining = self._num_remaining[capability.name] if num_remaining <= 0: - log.info( - f"Problem design completed for capability: {capability.name}, starting solution design" - ) - await self._start_solution_design(capability) + log.info(f"Problem design completed for capability: {capability.name}") + await 
self._finalize_tasks_without_solutions(capability.name) return # Calculate problems per scientist: ceil(num_remaining / M) + B @@ -265,7 +257,6 @@ async def _filter_and_select_problems( ) final_tasks = {} - # Update Algorithm 1 state num_remaining = self._num_remaining[capability_name] num_selected = min(len(final_tasks), num_remaining) @@ -286,13 +277,10 @@ async def _filter_and_select_problems( f"Task Moderator selected {selected_count} problems for {capability_name}, {self._num_remaining[capability_name]} remaining" ) - # Continue Algorithm 1 or move to solution design if self._num_remaining[capability_name] > 0: - # Need more problems, start another iteration capability = self._capabilities[capability_name] await self._start_problem_iteration(capability) else: - # Problem design complete, finalize tasks without solutions await self._finalize_tasks_without_solutions(capability_name) except Exception as e: @@ -301,7 +289,7 @@ async def _filter_and_select_problems( raise async def _finalize_tasks_without_solutions(self, capability_name: str) -> None: - """Finalize tasks with problems only (no solutions).""" + """Finalize tasks with problems only.""" try: log.info( f"Task Moderator finalizing tasks for capability: {capability_name}" @@ -324,7 +312,9 @@ async def _finalize_tasks_without_solutions(self, capability_name: str) -> None: # Save final tasks await self._save_tasks_to_file(capability_name, final_tasks) - log.info(f"Task generation completed for capability: {capability_name} ({len(final_tasks)} tasks)") + log.info( + f"Task generation completed for capability: {capability_name} ({len(final_tasks)} tasks)" + ) except Exception as e: log.error(f"Error in Task Moderator _finalize_tasks_without_solutions: {e}") From c155d7430b5bc6142fc7521f0b341f356b0f4e33 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 5 Sep 2025 16:39:38 -0400 Subject: [PATCH 07/19] simplified task generation. 
--- src/task_generation/__init__.py | 5 + src/task_generation/generator.py | 1 + src/task_generation/messages.py | 7 +- src/task_generation/moderator.py | 159 +++++++++++++++---------------- src/task_generation/scientist.py | 5 +- 5 files changed, 83 insertions(+), 94 deletions(-) diff --git a/src/task_generation/__init__.py b/src/task_generation/__init__.py index 8d54a1b..ebcd01c 100644 --- a/src/task_generation/__init__.py +++ b/src/task_generation/__init__.py @@ -1 +1,6 @@ """Task generation package for multi-agent debate-based task generation.""" + +from .generator import generate_tasks + + +__all__ = ["generate_tasks"] diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py index a0169ed..56094cf 100644 --- a/src/task_generation/generator.py +++ b/src/task_generation/generator.py @@ -95,6 +95,7 @@ async def generate_tasks_for_capability( output_dir=output_dir, domain=domain_name, langfuse_client=langfuse_client, + max_round=cfg.task_generation.max_rounds, ), ) diff --git a/src/task_generation/messages.py b/src/task_generation/messages.py index 38daaa9..0ae4692 100644 --- a/src/task_generation/messages.py +++ b/src/task_generation/messages.py @@ -24,6 +24,7 @@ class ProblemProposalRequest: capability_area: str num_problems: int sample_tasks: List[str] + iteration: int = 1 @dataclass @@ -34,9 +35,3 @@ class ScientistProblemProposal: capability_name: str problems: Dict[str, str] # task_id -> task_text iteration: int - - - - - - diff --git a/src/task_generation/moderator.py b/src/task_generation/moderator.py index 711abe0..16e6193 100644 --- a/src/task_generation/moderator.py +++ b/src/task_generation/moderator.py @@ -49,6 +49,7 @@ def __init__( output_dir: Path, domain: str, langfuse_client: Langfuse, + max_round: int = 5, ) -> None: super().__init__("Task Moderator") self._model_client = model_client @@ -58,17 +59,17 @@ def __init__( self._output_dir = output_dir self._domain = domain self._langfuse_client = langfuse_client + 
self._max_round = max_round - self._num_remaining: Dict[str, int] = {} - self._final_problems: Dict[ - str, Dict[str, str] - ] = {} # capability -> {task_id: problem_text} - self._capabilities: Dict[str, Capability] = {} # Store original capability info + self._num_remaining = self._num_final_problems + self._final_problems: Dict[str, str] = {} # {task_id: problem_text} + self._capability: ( + Capability # Store original capability info (set in first message) + ) + self._current_round = 0 # Problem design state - self._problem_proposals: Dict[ - str, List[ScientistProblemProposal] - ] = {} # capability -> proposals + self._problem_proposals: Dict[int, List[ScientistProblemProposal]] = {} @message_handler async def handle_capability(self, message: Capability, ctx: MessageContext) -> None: @@ -77,22 +78,22 @@ async def handle_capability(self, message: Capability, ctx: MessageContext) -> N name="task_moderator_handle_capability" ) as span: try: - msg = f"Task Moderator starting problem design for capability: {message.name}" + capability_name = message.name + msg = f"Task Moderator starting problem design for capability: {capability_name}" log.info(msg) span.update( metadata={ "capability_received": msg, - "capability_name": message.name, + "capability_name": capability_name, "capability_description": message.description, "capability_area": message.area, } ) - self._num_remaining[message.name] = self._num_final_problems - self._final_problems[message.name] = {} - self._capabilities[message.name] = message + self._capability = message + self._problem_proposals[self._current_round] = [] - await self._start_problem_iteration(message) + await self._start_problem_iteration() except Exception as e: error_msg = f"Error in Task Moderator handle_capability: {e}" @@ -112,38 +113,50 @@ async def handle_capability(self, message: Capability, ctx: MessageContext) -> N ) raise - async def _start_problem_iteration(self, capability: Capability) -> None: + async def 
_start_problem_iteration(self) -> None: """Start a problem generation iteration.""" try: - num_remaining = self._num_remaining[capability.name] - if num_remaining <= 0: - log.info(f"Problem design completed for capability: {capability.name}") - await self._finalize_tasks_without_solutions(capability.name) + # Check if we've reached the maximum number of rounds + if self._current_round >= self._max_round: + log.info( + f"Maximum rounds ({self._max_round}) reached for capability: {self._capability.name}.\ + Finalizing with {len(self._final_problems)} problems." + ) + await self._finalize_tasks_without_solutions() + return + + if self._num_remaining <= 0: + log.info( + f"Problem design completed for capability: {self._capability.name}" + ) + await self._finalize_tasks_without_solutions() return # Calculate problems per scientist: ceil(num_remaining / M) + B problems_per_scientist = ( - math.ceil(num_remaining / self._num_scientists) + self._buffer_param + math.ceil(self._num_remaining / self._num_scientists) + + self._buffer_param ) log.info( - f"Task Moderator requesting {problems_per_scientist} problems per scientist for capability: {capability.name} (remaining: {num_remaining})" + f"Task Moderator requesting {problems_per_scientist} problems per scientist for capability: {self._capability.name} (remaining: {self._num_remaining}, round: {self._current_round}/{self._max_round})" ) # Get sample tasks from existing final problems - sample_tasks = list(self._final_problems[capability.name].values())[ + sample_tasks = list(self._final_problems.values())[ :3 ] # Use up to 3 existing problems as samples # Send problem proposal requests to all scientists await self.publish_message( ProblemProposalRequest( - capability_name=capability.name, - capability_description=capability.description, - capability_domain=capability.domain, - capability_area=capability.area, + capability_name=self._capability.name, + capability_description=self._capability.description, + 
capability_domain=self._capability.domain, + capability_area=self._capability.area, num_problems=problems_per_scientist, sample_tasks=sample_tasks, + iteration=self._current_round, ), topic_id=DefaultTopicId(), ) @@ -163,46 +176,30 @@ async def handle_scientist_problem_proposal( f"Task Moderator received problem proposal from Scientist {message.scientist_id} for capability: {message.capability_name}" ) - capability_name = message.capability_name - if capability_name not in self._problem_proposals: - self._problem_proposals[capability_name] = [] - - self._problem_proposals[capability_name].append(message) + self._problem_proposals[self._current_round].append(message) # Check if we have all proposals for this iteration - current_proposals = [ - p - for p in self._problem_proposals[capability_name] - if p.iteration == message.iteration - ] + current_proposals = self._problem_proposals[self._current_round] if len(current_proposals) == self._num_scientists: log.info( - f"Task Moderator received all problem proposals for capability: {capability_name}, proceeding to filter" - ) - await self._filter_and_select_problems( - capability_name, message.iteration + f"Task Moderator received all problem proposals for capability: {self._capability.name}, proceeding to filter" ) + await self._filter_and_select_problems() except Exception as e: log.error(f"Error in Task Moderator handle_scientist_problem_proposal: {e}") log.error(f"Traceback: {traceback.format_exc()}") raise - async def _filter_and_select_problems( - self, capability_name: str, iteration: int - ) -> None: + async def _filter_and_select_problems(self) -> None: """Filter and select problems using moderator LLM.""" try: log.info( - f"Task Moderator filtering problems for capability: {capability_name}" + f"Task Moderator filtering problems for capability: {self._capability.name}" ) # Collect all proposed problems - current_proposals = [ - p - for p in self._problem_proposals[capability_name] - if p.iteration == iteration 
- ] + current_proposals = self._problem_proposals[self._current_round] all_problems = {} scientist_attribution = {} @@ -213,7 +210,9 @@ async def _filter_and_select_problems( scientist_attribution[unique_id] = proposal.scientist_id if not all_problems: - log.warning(f"No problems received for capability: {capability_name}") + log.warning( + f"No problems received for capability: {self._capability.name}" + ) return # Format problems for moderator @@ -226,17 +225,14 @@ async def _filter_and_select_problems( problems_text += f"- {task_name}: {problem}\n" problems_text += "\n" - system_prompt = TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT - - capability_info = self._capabilities[capability_name] user_prompt = TASK_MODERATOR_PROBLEM_USER_PROMPT.format( - capability_name=capability_info.name, - capability_description=capability_info.description, - capability_domain=capability_info.domain, + capability_name=self._capability.name, + capability_description=self._capability.description, + capability_domain=self._capability.domain, problems_text=problems_text, ) - system_message = SystemMessage(content=system_prompt) + system_message = SystemMessage(content=TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT) user_message = UserMessage(content=user_prompt, source="user") model_result = await self._model_client.create( @@ -257,63 +253,60 @@ async def _filter_and_select_problems( ) final_tasks = {} - num_remaining = self._num_remaining[capability_name] - num_selected = min(len(final_tasks), num_remaining) + num_selected = min(len(final_tasks), self._num_remaining) # Add selected problems to final set selected_count = 0 for _, problem_text in final_tasks.items(): if selected_count < num_selected: - final_task_id = ( - f"task_{len(self._final_problems[capability_name]) + 1}" - ) - self._final_problems[capability_name][final_task_id] = problem_text + final_task_id = f"task_{len(self._final_problems) + 1}" + self._final_problems[final_task_id] = problem_text selected_count += 1 # Update remaining count - 
self._num_remaining[capability_name] = num_remaining - selected_count + self._num_remaining = self._num_remaining - selected_count log.info( - f"Task Moderator selected {selected_count} problems for {capability_name}, {self._num_remaining[capability_name]} remaining" + f"Task Moderator selected {selected_count} problems for {self._capability.name}, {self._num_remaining} remaining" ) - if self._num_remaining[capability_name] > 0: - capability = self._capabilities[capability_name] - await self._start_problem_iteration(capability) + if self._num_remaining > 0: + # Increment round counter before starting next iteration + self._current_round += 1 + await self._start_problem_iteration() else: - await self._finalize_tasks_without_solutions(capability_name) + await self._finalize_tasks_without_solutions() except Exception as e: log.error(f"Error in Task Moderator _filter_and_select_problems: {e}") log.error(f"Traceback: {traceback.format_exc()}") raise - async def _finalize_tasks_without_solutions(self, capability_name: str) -> None: + async def _finalize_tasks_without_solutions(self) -> None: """Finalize tasks with problems only.""" try: log.info( - f"Task Moderator finalizing tasks for capability: {capability_name}" + f"Task Moderator finalizing tasks for capability: {self._capability.name}" ) - final_problems = self._final_problems[capability_name] - if not final_problems: + if not self._final_problems: log.error( - f"No final problems available for capability: {capability_name}" + f"No final problems available for capability: {self._capability.name}" ) return # Create tasks with problems only final_tasks = {} - for task_id, problem_text in final_problems.items(): + for task_id, problem_text in self._final_problems.items(): final_tasks[task_id] = { "task": problem_text, - "capability_id": capability_name, + "capability_id": self._capability.name, } # Save final tasks - await self._save_tasks_to_file(capability_name, final_tasks) + await 
self._save_tasks_to_file(final_tasks) log.info( - f"Task generation completed for capability: {capability_name} ({len(final_tasks)} tasks)" + f"Task generation completed for capability: {self._capability.name} ({len(final_tasks)} tasks)" ) except Exception as e: @@ -321,13 +314,11 @@ async def _finalize_tasks_without_solutions(self, capability_name: str) -> None: log.error(f"Traceback: {traceback.format_exc()}") raise - async def _save_tasks_to_file( - self, capability_name: str, tasks: Dict[str, Dict[str, str]] - ) -> None: + async def _save_tasks_to_file(self, tasks: Dict[str, Dict[str, str]]) -> None: """Save final tasks to file.""" try: # Create capability directory - capability_dir = self._output_dir / capability_name + capability_dir = self._output_dir / self._capability.name capability_dir.mkdir(parents=True, exist_ok=True) # Save tasks @@ -336,9 +327,9 @@ async def _save_tasks_to_file( json.dump({"tasks": tasks}, f, indent=2, ensure_ascii=False) log.info( - f"Saved {len(tasks)} tasks for capability '{capability_name}' to {tasks_file}" + f"Saved {len(tasks)} tasks for capability '{self._capability.name}' to {tasks_file}" ) except Exception as e: - log.error(f"Error saving tasks for capability {capability_name}: {e}") + log.error(f"Error saving tasks for capability {self._capability.name}: {e}") log.error(f"Traceback: {traceback.format_exc()}") raise diff --git a/src/task_generation/scientist.py b/src/task_generation/scientist.py index 25b25d8..e66a7eb 100644 --- a/src/task_generation/scientist.py +++ b/src/task_generation/scientist.py @@ -1,6 +1,5 @@ """Task scientist agent for generating problems and solutions.""" -import json import logging import traceback @@ -125,7 +124,7 @@ async def handle_problem_proposal_request( scientist_id=self._scientist_id, capability_name=message.capability_name, problems=problems, - iteration=0, + iteration=getattr(message, "iteration", 0), ), topic_id=DefaultTopicId(), ) @@ -148,5 +147,3 @@ async def 
handle_problem_proposal_request( }, ) raise - - From 52b4d2a8c5a4ec008701a167fe6c6570ae168713 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Sun, 7 Sep 2025 02:55:50 -0400 Subject: [PATCH 08/19] fixed mypy errors. --- README.md | 7 +- src/agentic_capability_generator.py | 7 +- src/agentic_task_solver.py | 126 ++--- src/capability_generation/generator.py | 7 +- src/capability_generation/messages.py | 2 +- src/cfg/agentic_config.yaml | 9 +- src/task_solver/__init__.py | 6 + src/task_solver/generator.py | 246 ++++++++++ src/task_solver/messages.py | 81 ++++ src/task_solver/moderator.py | 442 ++++++++++++++++++ .../scientist.py | 125 ++--- src/task_solving/__init__.py | 17 - src/task_solving/generator.py | 225 --------- src/task_solving/messages.py | 64 --- src/task_solving/moderator.py | 342 -------------- src/utils/agentic_prompts.py | 57 ++- 16 files changed, 970 insertions(+), 793 deletions(-) create mode 100644 src/task_solver/__init__.py create mode 100644 src/task_solver/generator.py create mode 100644 src/task_solver/messages.py create mode 100644 src/task_solver/moderator.py rename src/{task_solving => task_solver}/scientist.py (53%) delete mode 100644 src/task_solving/__init__.py delete mode 100644 src/task_solving/generator.py delete mode 100644 src/task_solving/messages.py delete mode 100644 src/task_solving/moderator.py diff --git a/README.md b/README.md index 07f0bf2..a88c7e3 100644 --- a/README.md +++ b/README.md @@ -86,12 +86,13 @@ python -m src.agentic_area_generator python -m src.agentic_capability_generator # Generate tasks for each capability -python -m src.agentic_task_generator +python -m src.agentic_task_generator # Generate tasks for all capabilities python -m src.agentic_task_generator pipeline_tags.capabilities_tag=_20250902_030203 -# Generate solutions for tasks -# python -m sr +# Generate solutions for tasks using multi-agent debate +python -m src.agentic_task_solver pipeline_tags.tasks_tag=_20250905_153532 + ``` diff --git 
a/src/agentic_capability_generator.py b/src/agentic_capability_generator.py index e9d9d80..835813e 100644 --- a/src/agentic_capability_generator.py +++ b/src/agentic_capability_generator.py @@ -4,6 +4,7 @@ import logging import os import traceback +from typing import Optional import hydra import openlit @@ -29,7 +30,9 @@ def main(cfg: DictConfig) -> None: """Run the multi-agent debate-based capability generation system.""" areas_tag = cfg.pipeline_tags.areas_tag - resume_tag = getattr(cfg.pipeline_tags, "resume_capabilities_tag", None) + resume_tag: Optional[str] = getattr( + cfg.pipeline_tags, "resume_capabilities_tag", None + ) domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id num_capabilities_per_area = cfg.capability_generation.num_capabilities_per_area @@ -63,7 +66,7 @@ def main(cfg: DictConfig) -> None: error_msg = "No areas_tag provided. Please provide pipeline_tags.areas_tag= to specify which areas to use." log.warning(error_msg) span.update( - level="WARNING", + level="ERROR", status_message="Missing areas_tag", metadata={"areas_tag_missing": error_msg}, ) diff --git a/src/agentic_task_solver.py b/src/agentic_task_solver.py index 43c7935..49a52f2 100644 --- a/src/agentic_task_solver.py +++ b/src/agentic_task_solver.py @@ -4,14 +4,13 @@ import logging import os import traceback -from pathlib import Path import hydra import openlit from langfuse import Langfuse from omegaconf import DictConfig, OmegaConf -from src.task_solving.generator import solve_tasks_with_debate, load_tasks_from_file +from src.task_solver import solve_tasks # Suppress OpenTelemetry console output @@ -20,25 +19,27 @@ os.environ["OTEL_PYTHON_LOG_CORRELATION"] = "false" os.environ["OTEL_PYTHON_LOG_LEVEL"] = "ERROR" -log = logging.getLogger("agentic_task_solving") +log = logging.getLogger("agentic_task_solver") -lf = Langfuse() -openlit.init(tracer=lf._otel_tracer, disable_batch=True, disable_metrics=True) +langfuse_client = Langfuse() +openlit.init( + 
tracer=langfuse_client._otel_tracer, disable_batch=True, disable_metrics=True +) @hydra.main(version_base=None, config_path="cfg", config_name="agentic_config") def main(cfg: DictConfig) -> None: """Run the multi-agent debate-based task solving system.""" + tasks_tag = cfg.pipeline_tags.get("tasks_tag") + resume_tag = getattr(cfg.pipeline_tags, "resume_solutions_tag", None) domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id - output_dir = cfg.global_cfg.output_dir - max_tasks = cfg.task_solving.get("max_tasks", 0) - with lf.start_as_current_span( - name=f"ace_agentic_task_solving:{domain_name}:{exp_id}" + with langfuse_client.start_as_current_span( + name=f"ace_agentic_task_solver:{domain_name}:{exp_id}" ) as span: try: - msg = "Starting multi-agent debate-based task solving" + msg = "Starting multi-agent debate-based task solver" log.info(msg) span.update(metadata={"system_started": msg}) @@ -54,71 +55,70 @@ def main(cfg: DictConfig) -> None: } ) - # Load tasks from the specified file or use pipeline tags to find them - tasks_file = None - if cfg.pipeline_tags.get("tasks_tag"): - # Look for tasks file using the tag - tasks_dir = Path(output_dir) / domain_name / "tasks" - tasks_file = tasks_dir / f"tasks_{cfg.pipeline_tags.tasks_tag}.json" - elif cfg.task_solving.get("input_file"): - tasks_file = Path(cfg.task_solving.input_file) + if tasks_tag: + msg = f"Using tasks from tag: {tasks_tag}" + log.info(msg) + span.update( + metadata={ + "tasks_tag_found": msg, + "tasks_tag": tasks_tag, + } + ) else: - raise ValueError("Either pipeline_tags.tasks_tag or task_solving.input_file must be specified") - - if not tasks_file.exists(): - raise FileNotFoundError(f"Tasks file not found: {tasks_file}") - - log.info(f"Loading tasks from: {tasks_file}") - tasks = load_tasks_from_file(tasks_file) - log.info(f"Loaded {len(tasks)} tasks") + error_msg = "No tasks_tag provided. Please provide pipeline_tags.tasks_tag= to specify which tasks to solve." 
+ log.warning(error_msg) + span.update( + level="ERROR", + status_message="Missing tasks_tag", + metadata={"tasks_tag_missing": error_msg}, + ) + return + + if resume_tag: + msg = f"Resuming task solving from tag: {resume_tag}" + log.info(msg) + span.update( + metadata={"resume_tag_found": msg, "resume_tag": resume_tag} + ) + + span.update_trace( + metadata={ + "domain": domain_name, + "exp_id": exp_id, + "tasks_tag": tasks_tag, + "resume_tag": resume_tag, + "config": config_yaml, + }, + tags=["agentic_task_solver", exp_id], + ) - # Limit number of tasks if specified - if max_tasks > 0: - tasks = tasks[:max_tasks] - log.info(f"Limited to {len(tasks)} tasks") + asyncio.run(solve_tasks(cfg, tasks_tag, langfuse_client, resume_tag)) - # Run task solving - msg = f"Running task solving for {len(tasks)} tasks" + msg = "Multi-agent debate-based task solving completed successfully" log.info(msg) - span.update(metadata={"task_solving_started": msg}) + span.update(metadata={"system_completed": msg}) - results = asyncio.run(solve_tasks_with_debate( - cfg=cfg, - tasks=tasks, - langfuse_client=lf, - )) + except Exception as e: + error_msg = f"Task solving failed: {e}" + traceback_msg = f"Full traceback: {traceback.format_exc()}" - # Print summary - consensus_count = sum(1 for result in results.values() if result.get("consensus_reached", False)) - no_consensus_count = len(results) - consensus_count + log.error(error_msg) + log.error(traceback_msg) - msg = f"Task solving completed. 
Consensus: {consensus_count}, No consensus: {no_consensus_count}" - log.info(msg) span.update( + level="ERROR", + status_message=str(e), metadata={ - "task_solving_completed": msg, - "total_tasks": len(results), - "consensus_reached": consensus_count, - "no_consensus": no_consensus_count, - } + "system_error": error_msg, + "error": str(e), + "traceback": traceback_msg, + }, ) - # Print detailed results if requested - if cfg.task_solving.get("print_results", False): - for task_id, result in results.items(): - log.info(f"\nTask {task_id}:") - log.info(f" Solution: {result['solution'][:100]}...") - log.info(f" Consensus: {result['consensus_reached']}") - log.info(f" Rounds: {result['total_rounds']}") + raise - except Exception as e: - error_msg = f"Error in agentic task solving: {str(e)}" - log.error(error_msg) - log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) - raise - finally: - lf.flush() + finally: + langfuse_client.flush() if __name__ == "__main__": diff --git a/src/capability_generation/generator.py b/src/capability_generation/generator.py index 54d6c0f..b8ffc65 100644 --- a/src/capability_generation/generator.py +++ b/src/capability_generation/generator.py @@ -6,6 +6,7 @@ import traceback from datetime import datetime from pathlib import Path +from typing import Optional from autogen_core import ( EVENT_LOGGER_NAME, @@ -30,7 +31,7 @@ async def generate_capabilities_for_area( - cfg: DictConfig, area: Area, output_dir: Path, langfuse_client: Langfuse = None + cfg: DictConfig, area: Area, output_dir: Path, langfuse_client: Langfuse ) -> None: """Generate capabilities for a single area.""" with langfuse_client.start_as_current_span( @@ -153,8 +154,8 @@ async def generate_capabilities_for_area( async def generate_capabilities( cfg: DictConfig, areas_tag: str, - langfuse_client: Langfuse = None, - resume_tag: str = None, + langfuse_client: Langfuse, + resume_tag: Optional[str] = None, ) -> None: """Generate capabilities using 
multi-agent debate system for each area.""" domain_name = cfg.global_cfg.domain diff --git a/src/capability_generation/messages.py b/src/capability_generation/messages.py index 5118ea4..32e5bba 100644 --- a/src/capability_generation/messages.py +++ b/src/capability_generation/messages.py @@ -37,4 +37,4 @@ class CapabilityRevisionRequest: scientist_id: str moderator_proposal: str area_name: str - round: int \ No newline at end of file + round: int diff --git a/src/cfg/agentic_config.yaml b/src/cfg/agentic_config.yaml index 68a3d9a..39b7db6 100644 --- a/src/cfg/agentic_config.yaml +++ b/src/cfg/agentic_config.yaml @@ -34,13 +34,14 @@ capability_generation: # Stage 3: Task Generation Configuration task_generation: - num_final_problems_per_capability: 10 # N: Number of final problems per capability - buffer_param: 5 # B: Buffer parameter (extra problems each agent proposes) + num_final_problems_per_capability: 5 # N: Number of final problems per capability + buffer_param: 2 # B: Buffer parameter (extra problems each agent proposes) + max_rounds: 2 # Maximum number of rounds for task generation # Stage 4: Task Solving Configuration -task_solving: +task_solver: max_tasks: 0 # Maximum number of tasks to process (0 = all) - print_results: false # Whether to print detailed results to console + max_rounds: 3 # Maximum number of debate rounds for task solving # Experiment configuration exp_cfg: diff --git a/src/task_solver/__init__.py b/src/task_solver/__init__.py new file mode 100644 index 0000000..ff8672d --- /dev/null +++ b/src/task_solver/__init__.py @@ -0,0 +1,6 @@ +"""Task solving module with debate-based approach.""" + +from .generator import solve_tasks + + +__all__ = ["solve_tasks"] diff --git a/src/task_solver/generator.py b/src/task_solver/generator.py new file mode 100644 index 0000000..85d12d5 --- /dev/null +++ b/src/task_solver/generator.py @@ -0,0 +1,246 @@ +"""Main task solver orchestration function.""" + +import json +import logging +import traceback +from 
datetime import datetime +from pathlib import Path +from typing import Optional + +from autogen_core import ( + EVENT_LOGGER_NAME, + ROOT_LOGGER_NAME, + TRACE_LOGGER_NAME, + DefaultTopicId, + SingleThreadedAgentRuntime, +) +from langfuse import Langfuse +from omegaconf import DictConfig + +from src.task_solver.messages import Task +from src.task_solver.moderator import TaskSolverModerator +from src.task_solver.scientist import TaskSolverScientist +from src.utils.model_client_utils import get_model_client + + +log = logging.getLogger("task_solver.generator") +logging.getLogger(ROOT_LOGGER_NAME).setLevel(logging.WARNING) +logging.getLogger(TRACE_LOGGER_NAME).setLevel(logging.WARNING) +logging.getLogger(EVENT_LOGGER_NAME).setLevel(logging.WARNING) + + +async def solve_task( + cfg: DictConfig, task: Task, output_dir: Path, langfuse_client: Langfuse +) -> None: + """Solve a task using multi-agent debate system.""" + max_rounds = cfg.task_solver.max_rounds + task_id = task.task_id + capability_name = task.capability_name + + with langfuse_client.start_as_current_span( + name=f"task_solver_for_task:{task_id}, capability:{capability_name}" + ) as span: + try: + msg = f"Generating solutions for task: {task_id}, capability: {capability_name}" + log.info(msg) + span.update( + metadata={ + "single_task_solver_started": msg, + "task_id": task_id, + "problem": task.problem, + "capability_name": capability_name, + } + ) + + runtime = SingleThreadedAgentRuntime() + + # Register moderator + await TaskSolverModerator.register( + runtime, + "TaskSolverModerator", + lambda: TaskSolverModerator( + model_client=get_model_client( + model_name=cfg.agents.moderator.model_name, + seed=cfg.agents.moderator.get("seed"), + ), + num_solvers=2, + max_rounds=max_rounds, + output_dir=output_dir, + langfuse_client=langfuse_client, + ), + ) + + # Register scientist agents + await TaskSolverScientist.register( + runtime, + "TaskSolverScientistA", + lambda: TaskSolverScientist( + 
model_client=get_model_client( + model_name=cfg.agents.scientist_a.model_name, + seed=cfg.agents.scientist_a.get("seed"), + ), + scientist_id="A", + langfuse_client=langfuse_client, + ), + ) + + await TaskSolverScientist.register( + runtime, + "TaskSolverScientistB", + lambda: TaskSolverScientist( + model_client=get_model_client( + model_name=cfg.agents.scientist_b.model_name, + seed=cfg.agents.scientist_b.get("seed"), + ), + scientist_id="B", + langfuse_client=langfuse_client, + ), + ) + + span.update( + metadata={ + "agents_registered": "All task agents registered successfully", + "scientists": ["A", "B"], + "moderator": True, + } + ) + + # Start runtime + runtime.start() + + await runtime.publish_message(task, DefaultTopicId()) + + msg = f"Task message published: {task_id}, capability: {capability_name}" + log.info(msg) + span.update( + metadata={ + "task_published": msg, + "task_id": task_id, + "capability_name": capability_name, + } + ) + + try: + await runtime.stop_when_idle() + msg = ( + f"Completed solving task: {task_id}, capability: {capability_name}" + ) + log.info(msg) + span.update(metadata={"runtime_completed": msg}) + except Exception as e: + msg = f"Error while solving task {task_id}, capability: {capability_name}: {e}" + log.error(msg) + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "runtime_error": msg, + "error": str(e), + "task_id": task_id, + "capability_name": capability_name, + }, + ) + raise + except Exception as e: + error_msg = f"Error in task solver: {str(e)}" + log.error(error_msg) + log.error(traceback.format_exc()) + span.update(metadata={"error": error_msg}) + raise + + +async def solve_tasks( + cfg: DictConfig, + tasks_tag: str, + langfuse_client: Langfuse, + resume_tag: Optional[str] = None, +) -> None: + """Solve tasks using multi-agent debate system.""" + domain_name = cfg.global_cfg.domain + exp_id = cfg.exp_cfg.exp_id + + if resume_tag: + solutions_tag = resume_tag + log.info(f"Resuming task solver with 
existing tag: {solutions_tag}") + else: + solutions_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + output_dir = ( + Path.home() + / cfg.global_cfg.output_dir + / domain_name.replace(" ", "_") + / exp_id + / "task_solutions" + / solutions_tag + ) + + with langfuse_client.start_as_current_span( + name=f"ace_task_solver:{domain_name}:{exp_id}:{solutions_tag}" + ) as span: + try: + msg = f"Solutions will be saved with tag: {solutions_tag}" + print(msg) + log.info(msg) + span.update( + metadata={ + "solver_started": msg, + "solutions_tag": solutions_tag, + "resume_tag": resume_tag, + "output_dir": output_dir, + "tasks_tag": tasks_tag, + "domain": domain_name, + "exp_id": exp_id, + }, + tags=["task_solver_process", exp_id], + ) + + tasks_dir = ( + Path.home() + / cfg.global_cfg.output_dir + / domain_name.replace(" ", "_") + / exp_id + / "tasks" + / tasks_tag + ) + + if not tasks_dir.exists(): + error_msg = f"Tasks directory not found: {tasks_dir}" + log.error(error_msg) + span.update( + level="ERROR", + status_message="Tasks directory not found", + metadata={ + "directory_not_found_error": error_msg, + "tasks_dir": str(tasks_dir), + }, + ) + raise FileNotFoundError(error_msg) + + for capability_dir in tasks_dir.iterdir(): + if capability_dir.is_dir(): + # Check if the last part of capability_dir exists in output_dir + output_solver_dir = Path(output_dir) / capability_dir.name + if output_solver_dir.exists(): + msg = f"Solutions for tasks under capability {capability_dir.name} already exist: {output_solver_dir}" + log.info(msg) + span.update(metadata={"task_solver_skipped": msg}) + continue + + tasks_file = capability_dir / "tasks.json" + if tasks_file.exists(): + with open(tasks_file, "r", encoding="utf-8") as f: + tasks = json.load(f)["tasks"] + for task_id, task_data in tasks.items(): + task = Task( + task_id=task_id, + problem=task_data["task"], + capability_name=task_data["capability_id"], + ) + await solve_task(cfg, task, output_dir,
langfuse_client) + + except Exception as e: + error_msg = f"Error in task solver: {str(e)}" + log.error(error_msg) + log.error(f"Traceback: {traceback.format_exc()}") + span.update(metadata={"error": error_msg}) + raise diff --git a/src/task_solver/messages.py b/src/task_solver/messages.py new file mode 100644 index 0000000..5187bcc --- /dev/null +++ b/src/task_solver/messages.py @@ -0,0 +1,81 @@ +"""Message types for task solving debate system.""" + +from dataclasses import dataclass +from typing import Dict, List + + +@dataclass +class Task: + """Task to be solved.""" + + task_id: str + problem: str + capability_name: str + + +@dataclass +class TaskSolutionRequest: + """Request to solve a task.""" + + task_id: str + problem: str + capability_name: str + round_number: int = 1 + + +@dataclass +class AgentSolution: + """Solution proposed by an agent.""" + + agent_id: str + task_id: str + thought: str + final_answer: str + numerical_answer: str + round_number: int + + def to_dict(self) -> Dict[str, str]: + """Convert to dictionary.""" + return { + "agent_id": self.agent_id, + "task_id": self.task_id, + "thought": self.thought, + "final_answer": self.final_answer, + "numerical_answer": self.numerical_answer, + "round_number": str(self.round_number), + } + + +@dataclass +class AgentRevisionRequest: + """Request for agent to revise solution based on other agents' solutions.""" + + task_id: str + problem: str + capability_name: str + other_solutions: List[Dict[str, str]] + round_number: int + + +@dataclass +class ConsensusCheck: + """Check if consensus has been reached.""" + + task_id: str + solutions: List[Dict[str, str]] + round_number: int + + +@dataclass +class FinalSolution: + """Final solution for a task.""" + + task_id: str + capability_name: str + problem: str + solution: str + numerical_answer: str + reasoning: str + consensus_reached: bool + total_rounds: int + all_solutions: List[Dict[str, str]] diff --git a/src/task_solver/moderator.py 
b/src/task_solver/moderator.py new file mode 100644 index 0000000..673baac --- /dev/null +++ b/src/task_solver/moderator.py @@ -0,0 +1,442 @@ +"""Task solver moderator agent for managing the debate process.""" + +import json +import logging +import re +import traceback +from pathlib import Path +from typing import Dict, List + +from autogen_core import ( + DefaultTopicId, + MessageContext, + RoutedAgent, + default_subscription, + message_handler, +) +from autogen_core.models import ( + ChatCompletionClient, + SystemMessage, + UserMessage, +) +from langfuse import Langfuse + +from src.task_solver.messages import ( + AgentRevisionRequest, + AgentSolution, + FinalSolution, + Task, + TaskSolutionRequest, +) +from src.utils.agentic_prompts import ( + TASK_MODERATOR_CONSENSUS_PROMPT, + TASK_MODERATOR_SYSTEM_MESSAGE, +) +from src.utils.json_utils import parse_llm_json_response + + +log = logging.getLogger("task_solver.moderator") + + +@default_subscription +class TaskSolverModerator(RoutedAgent): + """Moderator that manages task solver debate and checks for consensus.""" + + def __init__( + self, + model_client: ChatCompletionClient, + num_solvers: int, + max_rounds: int, + output_dir: Path, + langfuse_client: Langfuse, + ) -> None: + super().__init__("Task Solver Moderator") + self._model_client = model_client + self._num_solvers = num_solvers + self._max_rounds = max_rounds + self._output_dir = output_dir + self._langfuse_client = langfuse_client + + # Track solutions by task_id and round + self._solutions_buffer: Dict[int, List[AgentSolution]] + self._current_round = 0 + self._final_solutions: FinalSolution + self._tasks: Task # Store original tasks for consensus checking + + def _extract_consensus_components( + self, response: str + ) -> tuple[bool, str, str, str]: + """Extract consensus, solution, reasoning, and numerical answer from JSON.""" + try: + parsed = parse_llm_json_response(response) + consensus_reached = parsed.get("consensus_reached", False) + 
final_solution = parsed.get("final_solution", "NONE") + reasoning = parsed.get("reasoning", "No reasoning provided") + numerical_answer = parsed.get("numerical_answer") + + # Convert numerical_answer to string representation + if numerical_answer is not None: + numerical_answer = str(numerical_answer) + else: + numerical_answer = "null" + + return consensus_reached, final_solution, reasoning, numerical_answer + + except Exception as e: + # Fallback to old text parsing if JSON parsing fails + log.warning( + f"Failed to parse JSON response from moderator, falling back to text parsing: {e}" + ) + consensus_match = re.search( + r"CONSENSUS_REACHED:\s*(true|false)", response, re.IGNORECASE + ) + solution_match = re.search( + r"FINAL_SOLUTION:\s*(.*?)(?=REASONING:|$)", + response, + re.DOTALL | re.IGNORECASE, + ) + reasoning_match = re.search( + r"REASONING:\s*(.*?)$", response, re.DOTALL | re.IGNORECASE + ) + + consensus_reached = ( + consensus_match.group(1).lower() == "true" if consensus_match else False + ) + final_solution = ( + solution_match.group(1).strip() if solution_match else "NONE" + ) + reasoning = ( + reasoning_match.group(1).strip() + if reasoning_match + else "No reasoning provided" + ) + + return consensus_reached, final_solution, reasoning, "null" + + def _check_simple_consensus( + self, solutions: List[AgentSolution] + ) -> tuple[bool, str, str]: + """Check consensus; if all agents have the same final answer.""" + if not solutions: + return False, "", "null" + + # First check numerical answers if they exist + numerical_answers = [ + sol.numerical_answer for sol in solutions if sol.numerical_answer != "null" + ] + if ( + len(numerical_answers) == len(solutions) + and len(set(numerical_answers)) == 1 + ): + return True, solutions[0].final_answer, solutions[0].numerical_answer + + # Fallback to text-based consensus + answers = [sol.final_answer.strip().lower() for sol in solutions] + if len(set(answers)) == 1: + return True, solutions[0].final_answer, 
solutions[0].numerical_answer + + return False, "", "null" + + @message_handler + async def handle_task(self, message: Task, ctx: MessageContext) -> None: + """Handle a task and initiate the solver process.""" + with self._langfuse_client.start_as_current_span( + name=f"moderator_handle_task_{message.task_id}" + ) as span: + try: + msg = f"Moderator received task: {message.task_id}" + log.info(msg) + span.update( + metadata={ + "task_received": msg, + "task_id": message.task_id, + "capability_name": message.capability_name, + } + ) + + # Initialize tracking for this task + self._solutions_buffer = {} + self._tasks = message + + # Send initial solution request to all solvers + await self.publish_message( + TaskSolutionRequest( + task_id=message.task_id, + problem=message.problem, + capability_name=message.capability_name, + round_number=1, + ), + topic_id=DefaultTopicId(), + ) + + span.update( + metadata={ + "solution_request_sent": f"Round 1 solution request sent for task {message.task_id}" + } + ) + + except Exception as e: + error_msg = f"Error handling task {message.task_id}: {str(e)}" + log.error(error_msg) + log.error(traceback.format_exc()) + span.update(metadata={"error": error_msg}) + + @message_handler + async def handle_agent_solution( + self, message: AgentSolution, ctx: MessageContext + ) -> None: + """Handle solution from an agent.""" + with self._langfuse_client.start_as_current_span( + name=f"moderator_handle_solution_{message.task_id}_round_{message.round_number}" + ) as span: + try: + task_id = message.task_id + round_num = message.round_number + + msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, round {round_num}" + log.info(msg) + span.update( + metadata={ + "solution_received": msg, + "task_id": task_id, + "agent_id": message.agent_id, + "round": round_num, + } + ) + + # Initialize round buffer if needed + if round_num not in self._solutions_buffer: + self._solutions_buffer[round_num] = [] + + # Add solution 
to buffer + self._solutions_buffer[round_num].append(message) + + # Check if we have all solutions for this round + if len(self._solutions_buffer[round_num]) == self._num_solvers: + await self._check_consensus_and_proceed(task_id, round_num, ctx) + + span.update( + metadata={ + "solutions_collected": f"{len(self._solutions_buffer[round_num])}/{self._num_solvers} for round {round_num}" + } + ) + + except Exception as e: + error_msg = ( + f"Error handling solution from agent {message.agent_id}: {str(e)}" + ) + log.error(error_msg) + log.error(traceback.format_exc()) + span.update(metadata={"error": error_msg}) + + async def _check_consensus_and_proceed( + self, task_id: str, round_num: int, ctx: MessageContext + ) -> None: + """Check for consensus and either finalize or start next round.""" + with self._langfuse_client.start_as_current_span( + name=f"moderator_consensus_check_{task_id}_round_{round_num}" + ) as span: + try: + solutions = self._solutions_buffer[round_num] + + # First try simple consensus check + simple_consensus, simple_solution, simple_numerical = ( + self._check_simple_consensus(solutions) + ) + + if simple_consensus: + # Simple consensus reached + final_solution = FinalSolution( + task_id=task_id, + capability_name=self._tasks.capability_name, + problem=self._tasks.problem, + solution=simple_solution, + numerical_answer=simple_numerical, + reasoning="All agents provided the same answer", + consensus_reached=True, + total_rounds=round_num, + all_solutions=self._get_all_solutions(), + ) + + self._final_solutions = final_solution + await self._save_final_solution(final_solution) + + span.update( + metadata={ + "consensus_reached": True, + "method": "simple", + "final_solution": simple_solution[:100], + } + ) + return + + if round_num < self._max_rounds: + # Use LLM moderator to check for consensus + stored_task = self._tasks # Get original task + + # Format solutions for LLM + all_solutions_text = "\n\n".join( + [ + f"Agent {sol.agent_id}:\nReasoning: 
{sol.thought}\nFinal Answer: {sol.final_answer}" + for sol in solutions + ] + ) + + prompt = TASK_MODERATOR_CONSENSUS_PROMPT.format( + problem_text=stored_task.problem, + all_solutions=all_solutions_text, + ) + + system_message = SystemMessage( + content=TASK_MODERATOR_SYSTEM_MESSAGE + ) + user_message = UserMessage(content=prompt, source="user") + + response = await self._model_client.create( + messages=[system_message, user_message], + cancellation_token=ctx.cancellation_token, + ) + + ( + consensus_reached, + final_solution_text, + reasoning, + numerical_answer, + ) = self._extract_consensus_components(str(response.content)) + + if consensus_reached: + # LLM found consensus + final_solution = FinalSolution( + task_id=task_id, + capability_name=self._tasks.capability_name, + problem=self._tasks.problem, + solution=final_solution_text, + numerical_answer=numerical_answer, + reasoning=reasoning, + consensus_reached=True, + total_rounds=round_num, + all_solutions=self._get_all_solutions(), + ) + + self._final_solutions = final_solution + await self._save_final_solution(final_solution) + + span.update( + metadata={ + "consensus_reached": True, + "method": "llm_moderator", + "final_solution": final_solution_text[:100], + } + ) + return + # No consensus, start next round + next_round = round_num + 1 + self._current_round = next_round + + # Send revision request with flattened task data + stored_task = self._tasks # Get the original task + + await self.publish_message( + AgentRevisionRequest( + task_id=stored_task.task_id, + problem=stored_task.problem, + capability_name=stored_task.capability_name, + other_solutions=[ + { + "agent_id": sol.agent_id, + "task_id": sol.task_id, + "thought": sol.thought, + "final_answer": sol.final_answer, + "numerical_answer": sol.numerical_answer, + "round_number": str(sol.round_number), + } + for sol in solutions + ], + round_number=next_round, + ), + topic_id=DefaultTopicId(), + ) + + span.update( + metadata={ + "consensus_reached": 
False, + "next_round_started": next_round, + } + ) + else: + # Max rounds reached, no consensus + final_solution = FinalSolution( + task_id=task_id, + capability_name=self._tasks.capability_name, + problem=self._tasks.problem, + solution="No consensus reached", + numerical_answer="null", + reasoning=f"Maximum rounds ({self._max_rounds}) reached without consensus", + consensus_reached=False, + total_rounds=round_num, + all_solutions=self._get_all_solutions(), + ) + + self._final_solutions = final_solution + await self._save_final_solution(final_solution) + + span.update( + metadata={ + "consensus_reached": False, + "max_rounds_reached": True, + } + ) + + except Exception as e: + error_msg = f"Error checking consensus for task {task_id}: {str(e)}" + log.error(error_msg) + log.error(traceback.format_exc()) + span.update(metadata={"error": error_msg}) + + def _get_all_solutions(self) -> List[Dict[str, str]]: + return [ + sol.to_dict() for sols in self._solutions_buffer.values() for sol in sols + ] + + async def _save_final_solution(self, final_solution: FinalSolution) -> None: + """Save the final solution to a file.""" + try: + self._output_dir.mkdir(parents=True, exist_ok=True) + output_file = ( + self._output_dir / f"task_{final_solution.task_id}_solution.json" + ) + + solution_data = { + "task_id": final_solution.task_id, + "capability_name": final_solution.capability_name, + "problem": final_solution.problem, + "solution": final_solution.solution, + "numerical_answer": final_solution.numerical_answer, + "reasoning": final_solution.reasoning, + "consensus_reached": final_solution.consensus_reached, + "total_rounds": final_solution.total_rounds, + "all_solutions": [ + { + "agent_id": sol["agent_id"], + "task_id": sol["task_id"], + "thought": sol["thought"], + "final_answer": sol["final_answer"], + "numerical_answer": sol["numerical_answer"], + "round_number": sol["round_number"], + } + for sol in final_solution.all_solutions + ], + } + + with open(output_file, "w") 
as f: + json.dump(solution_data, f, indent=2) + + log.info( + f"Saved final solution for task {final_solution.task_id} to {output_file}" + ) + + except Exception as e: + log.error( + f"Error saving final solution for task {final_solution.task_id}: {str(e)}" + ) + log.error(traceback.format_exc()) diff --git a/src/task_solving/scientist.py b/src/task_solver/scientist.py similarity index 53% rename from src/task_solving/scientist.py rename to src/task_solver/scientist.py index c493625..957617f 100644 --- a/src/task_solving/scientist.py +++ b/src/task_solver/scientist.py @@ -1,7 +1,6 @@ -"""Task solver agent for solving tasks through debate.""" +"""Task solver scientist for solving tasks through debate.""" import logging -import re import traceback from autogen_core import ( @@ -18,7 +17,7 @@ ) from langfuse import Langfuse -from src.task_solving.messages import ( +from src.task_solver.messages import ( AgentRevisionRequest, AgentSolution, TaskSolutionRequest, @@ -28,76 +27,90 @@ TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT, TASK_SOLVER_SYSTEM_MESSAGE, ) +from src.utils.json_utils import parse_llm_json_response -log = logging.getLogger("task_solving.solver") +log = logging.getLogger("task_solver.scientist") @default_subscription -class TaskSolvingScientist(RoutedAgent): +class TaskSolverScientist(RoutedAgent): """A scientist that solves tasks through debate.""" def __init__( self, model_client: ChatCompletionClient, scientist_id: str, - langfuse_client: Langfuse = None, + langfuse_client: Langfuse, ) -> None: - super().__init__(f"Task Solving Scientist {scientist_id}") + super().__init__(f"Task Solver Scientist {scientist_id}") self._model_client = model_client self._scientist_id = scientist_id self._langfuse_client = langfuse_client - def _extract_solution_components(self, response: str) -> tuple[str, str]: - """Extract thought and final answer from the response.""" - thought_match = re.search(r"THOUGHT:\s*(.*?)(?=FINAL ANSWER:|$)", response, re.DOTALL | re.IGNORECASE) -
answer_match = re.search(r"FINAL ANSWER:\s*(.*?)$", response, re.DOTALL | re.IGNORECASE) - - thought = thought_match.group(1).strip() if thought_match else response.strip() - final_answer = answer_match.group(1).strip() if answer_match else "No clear answer provided" - - return thought, final_answer + def _extract_solution_components(self, response: str) -> tuple[str, str, str]: + """Extract thought, final answer, and numerical answer from JSON response.""" + try: + parsed = parse_llm_json_response(response) + thought = parsed.get("thought", response.strip()) + final_answer = parsed.get("final_answer", "No clear answer provided") + numerical_answer = parsed.get("numerical_answer") + + # Convert numerical_answer to string representation + if numerical_answer is not None: + numerical_answer = str(numerical_answer) + else: + numerical_answer = "null" + + return thought, final_answer, numerical_answer + + except Exception as e: + msg = f"Failed to parse JSON response: {e} \n Response: {response}" + log.error(msg) + log.error(traceback.format_exc()) + raise @message_handler async def handle_task_solution_request( self, message: TaskSolutionRequest, ctx: MessageContext ) -> None: - """Handle initial task solution request (Round 1).""" + """Handle initial task solution request.""" with self._langfuse_client.start_as_current_span( - name=f"scientist_{self._scientist_id}_round_1" + name=f"scientist_{self._scientist_id}_initial_solution_request" ) as span: try: - task_text = message.task.task_content.get("task", "") - - msg = f"Scientist {self._scientist_id} handling initial solution request for task: {message.task.task_id}" + msg = f"Scientist {self._scientist_id} handling initial solution request for task: {message.task_id}, capability: {message.capability_name} round: {message.round_number}" log.info(msg) span.update( metadata={ "solution_request_received": msg, "scientist_id": self._scientist_id, - "task_id": message.task.task_id, + "task_id": message.task_id, + 
"capability": message.capability_name, "round": message.round_number, } ) - prompt = TASK_SOLVER_ROUND_1_PROMPT.format(problem_text=task_text) - + prompt = TASK_SOLVER_ROUND_1_PROMPT.format(problem_text=message.problem) + system_message = SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE) user_message = UserMessage(content=prompt, source="user") response = await self._model_client.create( - messages=[system_message, user_message], - cancellation_token=ctx.cancellation_token, + [system_message, user_message] ) - response_content = response.content - thought, final_answer = self._extract_solution_components(response_content) + response_content = str(response.content) + thought, final_answer, numerical_answer = ( + self._extract_solution_components(response_content) + ) solution = AgentSolution( agent_id=self._scientist_id, - task_id=message.task.task_id, + task_id=message.task_id, thought=thought, final_answer=final_answer, + numerical_answer=numerical_answer, round_number=message.round_number, ) @@ -105,16 +118,15 @@ async def handle_task_solution_request( span.update( metadata={ - "solution_generated": f"Scientist {self._scientist_id} generated solution for task {message.task.task_id}", - "final_answer": final_answer[:100], # Truncate for logging + "solution_generated": f"Scientist {self._scientist_id} generated solution for task {message.task_id}, capability: {message.capability_name} round: {message.round_number}", } ) except Exception as e: - error_msg = f"Error in scientist {self._scientist_id} round 1: {str(e)}" - log.error(error_msg) + msg = f"Error in scientist {self._scientist_id} task solution request: {str(e)}" + log.error(msg) log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) + span.update(metadata={"error": msg}) @message_handler async def handle_agent_revision_request( @@ -125,48 +137,50 @@ async def handle_agent_revision_request( name=f"scientist_{self._scientist_id}_round_{message.round_number}" ) as span: try: - task_text 
= message.task.task_content.get("task", "") - - msg = f"Scientist {self._scientist_id} handling revision request for task: {message.task.task_id}, round: {message.round_number}" + msg = f"Scientist {self._scientist_id} handling revision request for task: {message.task_id}, capability: {message.capability_name} round: {message.round_number}" log.info(msg) span.update( metadata={ "revision_request_received": msg, "scientist_id": self._scientist_id, - "task_id": message.task.task_id, + "task_id": message.task_id, "round": message.round_number, "num_other_solutions": len(message.other_solutions), } ) # Format other scientists' solutions - other_solutions_text = "\n\n".join([ - f"Scientist {sol.agent_id}: Reasoning: {sol.thought}, Final solution: {sol.final_answer}" - for sol in message.other_solutions - if sol.agent_id != self._scientist_id # Don't include our own solution - ]) + other_solutions_text = "\n\n".join( + [ + f"Scientist {sol['agent_id']}: Reasoning: {sol['thought']}, Final solution: {sol['final_answer']}" + for sol in message.other_solutions + if sol["agent_id"] + != self._scientist_id # Don't include its own solution + ] + ) prompt = TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT.format( - other_solutions=other_solutions_text, - problem_text=task_text + other_solutions=other_solutions_text, problem_text=message.problem ) - + system_message = SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE) user_message = UserMessage(content=prompt, source="user") response = await self._model_client.create( - messages=[system_message, user_message], - cancellation_token=ctx.cancellation_token, + [system_message, user_message] ) - response_content = response.content - thought, final_answer = self._extract_solution_components(response_content) + response_content = str(response.content) + thought, final_answer, numerical_answer = ( + self._extract_solution_components(response_content) + ) solution = AgentSolution( agent_id=self._scientist_id, - task_id=message.task.task_id, + 
task_id=message.task_id, thought=thought, final_answer=final_answer, + numerical_answer=numerical_answer, round_number=message.round_number, ) @@ -174,13 +188,12 @@ async def handle_agent_revision_request( span.update( metadata={ - "revision_generated": f"Scientist {self._scientist_id} generated revision for task {message.task.task_id}", - "final_answer": final_answer[:100], # Truncate for logging + "revision_generated": f"Scientist {self._scientist_id} generated revision for task {message.task_id}, capability: {message.capability_name}, round: {message.round_number}", } ) except Exception as e: - error_msg = f"Error in scientist {self._scientist_id} round {message.round_number}: {str(e)}" - log.error(error_msg) + msg = f"Error in scientist {self._scientist_id} agent revision request: {str(e)}" + log.error(msg) log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) \ No newline at end of file + span.update(metadata={"error": msg}) diff --git a/src/task_solving/__init__.py b/src/task_solving/__init__.py deleted file mode 100644 index 51e8634..0000000 --- a/src/task_solving/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Task solving module with debate-based approach.""" - -from .generator import solve_tasks_with_debate, load_tasks_from_file -from .messages import Task, TaskSolutionRequest, AgentSolution, FinalSolution -from .moderator import TaskSolvingModerator -from .scientist import TaskSolvingScientist - -__all__ = [ - "solve_tasks_with_debate", - "load_tasks_from_file", - "Task", - "TaskSolutionRequest", - "AgentSolution", - "FinalSolution", - "TaskSolvingModerator", - "TaskSolvingScientist", -] \ No newline at end of file diff --git a/src/task_solving/generator.py b/src/task_solving/generator.py deleted file mode 100644 index 26f05b6..0000000 --- a/src/task_solving/generator.py +++ /dev/null @@ -1,225 +0,0 @@ -"""Main task solving orchestration function.""" - -import json -import logging -import traceback -from datetime import datetime 
-from pathlib import Path -from typing import Dict, List - -from autogen_core import ( - EVENT_LOGGER_NAME, - ROOT_LOGGER_NAME, - TRACE_LOGGER_NAME, - DefaultTopicId, - SingleThreadedAgentRuntime, -) -from langfuse import Langfuse -from omegaconf import DictConfig - -from src.task_solving.messages import Task -from src.task_solving.moderator import TaskSolvingModerator -from src.task_solving.scientist import TaskSolvingScientist -from src.utils.model_client_utils import get_model_client - - -log = logging.getLogger("task_solving.generator") -logging.getLogger(ROOT_LOGGER_NAME).setLevel(logging.WARNING) -logging.getLogger(TRACE_LOGGER_NAME).setLevel(logging.WARNING) -logging.getLogger(EVENT_LOGGER_NAME).setLevel(logging.WARNING) - - -async def solve_tasks_with_debate( - cfg: DictConfig, - tasks: List[Dict], - langfuse_client: Langfuse = None -) -> Dict[str, Dict]: - """ - Solve tasks using multi-agent debate system. - - Args: - cfg: Configuration containing debate and model settings - tasks: List of tasks to solve, each containing task_id, task content, and capability_id - langfuse_client: Langfuse client for tracing - - Returns: - Dictionary mapping task_id to final solution data - """ - domain_name = cfg.global_cfg.domain - exp_id = cfg.exp_cfg.exp_id - max_rounds = cfg.debate_cfg.max_round - num_solvers = 2 # scientist_a and scientist_b - solutions_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}" - - with langfuse_client.start_as_current_span( - name=f"ace_task_solving:{domain_name}:{exp_id}:{solutions_tag}" - ) as span: - try: - msg = f"Solutions will be saved with tag: {solutions_tag}" - log.info(msg) - span.update( - metadata={ - "solving_started": msg, - "solutions_tag": solutions_tag, - "domain": domain_name, - "exp_id": exp_id, - "num_tasks": len(tasks), - "num_solvers": num_solvers, - "max_rounds": max_rounds, - } - ) - - # Create output directory - output_dir = Path(cfg.global_cfg.output_dir) / "task_solutions" / 
f"{domain_name}_{exp_id}{solutions_tag}" - output_dir.mkdir(parents=True, exist_ok=True) - - # Set up runtime - runtime = SingleThreadedAgentRuntime() - - # Create model clients for each agent - scientist_a_client = get_model_client( - cfg.agents.scientist_a.model_name, - seed=cfg.agents.scientist_a.get("seed") - ) - scientist_b_client = get_model_client( - cfg.agents.scientist_b.model_name, - seed=cfg.agents.scientist_b.get("seed") - ) - moderator_client = get_model_client( - cfg.agents.moderator.model_name, - seed=cfg.agents.moderator.get("seed") - ) - - # Register moderator - moderator_agent_type = await TaskSolvingModerator.register( - runtime, - "task_solving_moderator", - lambda: TaskSolvingModerator( - model_client=moderator_client, - num_solvers=num_solvers, - max_rounds=max_rounds, - output_dir=output_dir, - langfuse_client=langfuse_client, - ), - ) - - # Register scientist agents - scientist_a_type = await TaskSolvingScientist.register( - runtime, - "task_scientist_a", - lambda: TaskSolvingScientist( - model_client=scientist_a_client, - scientist_id="scientist_a", - langfuse_client=langfuse_client, - ), - ) - - scientist_b_type = await TaskSolvingScientist.register( - runtime, - "task_scientist_b", - lambda: TaskSolvingScientist( - model_client=scientist_b_client, - scientist_id="scientist_b", - langfuse_client=langfuse_client, - ), - ) - - # Start runtime - runtime.start() - - log.info(f"Starting task solving for {len(tasks)} tasks with {num_solvers} scientists") - - # Process each task - for i, (task_id, task_data) in enumerate(tasks.items()): - # Handle both old and new task formats - if isinstance(task_data, dict) and "task" in task_data: - # New format: {"task": "problem text", "capability_id": "cap_name"} - capability_id = task_data.get("capability_id", "unknown") - task_content = task_data - else: - # Old format or other formats - capability_id = task_data.get("capability_id", "unknown") if isinstance(task_data, dict) else "unknown" - task_content 
= {"task": str(task_data)} if not isinstance(task_data, dict) else task_data - - # Create task message - task = Task( - task_id=task_id, - task_content=task_content, - capability_id=capability_id, - ) - - # Send task to moderator - await runtime.publish_message( - task, - topic_id=DefaultTopicId() - ) - - log.info(f"Submitted task {task_id} for solving") - - # Wait for all tasks to complete - # Note: In a real implementation, you might want to add a timeout - # and check for completion status - await runtime.stop_when_idle() - - # Collect results - results = {} - for solution_file in output_dir.glob("task_*_solution.json"): - try: - with open(solution_file, "r") as f: - solution_data = json.load(f) - results[solution_data["task_id"]] = solution_data - except Exception as e: - log.error(f"Error loading solution from {solution_file}: {e}") - - log.info(f"Task solving completed. Processed {len(results)} tasks.") - - span.update( - metadata={ - "solving_completed": f"Processed {len(results)} tasks", - "output_dir": str(output_dir), - "results_count": len(results), - } - ) - - return results - - except Exception as e: - error_msg = f"Error in task solving: {str(e)}" - log.error(error_msg) - log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) - raise - - -def load_tasks_from_file(tasks_file: Path) -> List[Dict]: - """ - Load tasks from a JSON file. 
- - Args: - tasks_file: Path to the tasks file - - Returns: - List of task dictionaries - """ - try: - with open(tasks_file, "r") as f: - tasks_data = json.load(f) - - # Handle different task file formats - if isinstance(tasks_data, list): - # Old format: list of tasks - return {f"task_{i+1}": task for i, task in enumerate(tasks_data)} - elif isinstance(tasks_data, dict): - # If it's a dict, try to extract tasks - if "tasks" in tasks_data: - # New format: {"tasks": {"task_1": {...}, "task_2": {...}}} - return tasks_data["tasks"] - else: - # Convert dict to single task - return {"task_1": tasks_data} - else: - raise ValueError(f"Unexpected task file format: {type(tasks_data)}") - - except Exception as e: - log.error(f"Error loading tasks from {tasks_file}: {e}") - raise \ No newline at end of file diff --git a/src/task_solving/messages.py b/src/task_solving/messages.py deleted file mode 100644 index a1af9d3..0000000 --- a/src/task_solving/messages.py +++ /dev/null @@ -1,64 +0,0 @@ -"""Message types for task solving debate system.""" - -from dataclasses import dataclass -from typing import Any, Dict, List - -from autogen_core import BaseMessage - - -@dataclass -class Task(BaseMessage): - """Task to be solved.""" - - task_id: str - task_content: Dict[str, Any] - capability_id: str - - -@dataclass -class TaskSolutionRequest(BaseMessage): - """Request to solve a task.""" - - task: Task - round_number: int = 1 - - -@dataclass -class AgentSolution(BaseMessage): - """Solution proposed by an agent.""" - - agent_id: str - task_id: str - thought: str - final_answer: str - round_number: int - - -@dataclass -class AgentRevisionRequest(BaseMessage): - """Request for agent to revise solution based on other agents' solutions.""" - - task: Task - other_solutions: List[AgentSolution] - round_number: int - - -@dataclass -class ConsensusCheck(BaseMessage): - """Check if consensus has been reached.""" - - task_id: str - solutions: List[AgentSolution] - round_number: int - - -@dataclass 
-class FinalSolution(BaseMessage): - """Final solution for a task.""" - - task_id: str - solution: str - reasoning: str - consensus_reached: bool - total_rounds: int - all_solutions: List[AgentSolution] \ No newline at end of file diff --git a/src/task_solving/moderator.py b/src/task_solving/moderator.py deleted file mode 100644 index 251bfd6..0000000 --- a/src/task_solving/moderator.py +++ /dev/null @@ -1,342 +0,0 @@ -"""Task solving moderator agent for managing the debate process.""" - -import json -import logging -import re -import traceback -from pathlib import Path -from typing import Dict, List - -from autogen_core import ( - DefaultTopicId, - MessageContext, - RoutedAgent, - default_subscription, - message_handler, -) -from autogen_core.models import ( - ChatCompletionClient, - SystemMessage, - UserMessage, -) -from langfuse import Langfuse - -from src.task_solving.messages import ( - AgentRevisionRequest, - AgentSolution, - ConsensusCheck, - FinalSolution, - Task, - TaskSolutionRequest, -) -from src.utils.agentic_prompts import ( - TASK_MODERATOR_CONSENSUS_PROMPT, - TASK_MODERATOR_SYSTEM_MESSAGE, -) - - -log = logging.getLogger("task_solving.moderator") - - -@default_subscription -class TaskSolvingModerator(RoutedAgent): - """Moderator that manages task solving debate and checks for consensus.""" - - def __init__( - self, - model_client: ChatCompletionClient, - num_solvers: int, - max_rounds: int, - output_dir: Path, - langfuse_client: Langfuse = None, - ) -> None: - super().__init__("Task Solving Moderator") - self._model_client = model_client - self._num_solvers = num_solvers - self._max_rounds = max_rounds - self._output_dir = output_dir - self._langfuse_client = langfuse_client - - # Track solutions by task_id and round - self._solutions_buffer: Dict[str, Dict[int, List[AgentSolution]]] = {} - self._current_round: Dict[str, int] = {} - self._final_solutions: Dict[str, FinalSolution] = {} - - def _extract_consensus_components(self, response: str) -> 
tuple[bool, str, str]: - """Extract consensus decision, solution, and reasoning from response.""" - consensus_match = re.search(r"CONSENSUS_REACHED:\s*(true|false)", response, re.IGNORECASE) - solution_match = re.search(r"FINAL_SOLUTION:\s*(.*?)(?=REASONING:|$)", response, re.DOTALL | re.IGNORECASE) - reasoning_match = re.search(r"REASONING:\s*(.*?)$", response, re.DOTALL | re.IGNORECASE) - - consensus_reached = consensus_match.group(1).lower() == "true" if consensus_match else False - final_solution = solution_match.group(1).strip() if solution_match else "NONE" - reasoning = reasoning_match.group(1).strip() if reasoning_match else "No reasoning provided" - - return consensus_reached, final_solution, reasoning - - def _check_simple_consensus(self, solutions: List[AgentSolution]) -> tuple[bool, str]: - """Simple consensus check - if all agents have the same final answer.""" - if not solutions: - return False, "" - - # Extract final answers and normalize them - answers = [sol.final_answer.strip().lower() for sol in solutions] - - # Check if all answers are the same - if len(set(answers)) == 1: - return True, solutions[0].final_answer - - return False, "" - - @message_handler - async def handle_task(self, message: Task, ctx: MessageContext) -> None: - """Handle a task and initiate the solving process.""" - with self._langfuse_client.start_as_current_span( - name=f"moderator_handle_task_{message.task_id}" - ) as span: - try: - msg = f"Moderator received task: {message.task_id}" - log.info(msg) - span.update( - metadata={ - "task_received": msg, - "task_id": message.task_id, - "capability_id": message.capability_id, - } - ) - - # Initialize tracking for this task - self._solutions_buffer[message.task_id] = {} - self._current_round[message.task_id] = 1 - - # Send initial solution request to all solvers - await self.publish_message( - TaskSolutionRequest(task=message, round_number=1), - topic_id=DefaultTopicId(), - ) - - span.update( - metadata={"solution_request_sent": 
f"Round 1 solution request sent for task {message.task_id}"} - ) - - except Exception as e: - error_msg = f"Error handling task {message.task_id}: {str(e)}" - log.error(error_msg) - log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) - - @message_handler - async def handle_agent_solution(self, message: AgentSolution, ctx: MessageContext) -> None: - """Handle solution from an agent.""" - with self._langfuse_client.start_as_current_span( - name=f"moderator_handle_solution_{message.task_id}_round_{message.round_number}" - ) as span: - try: - task_id = message.task_id - round_num = message.round_number - - msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, round {round_num}" - log.info(msg) - span.update( - metadata={ - "solution_received": msg, - "task_id": task_id, - "agent_id": message.agent_id, - "round": round_num, - } - ) - - # Initialize round buffer if needed - if round_num not in self._solutions_buffer[task_id]: - self._solutions_buffer[task_id][round_num] = [] - - # Add solution to buffer - self._solutions_buffer[task_id][round_num].append(message) - - # Check if we have all solutions for this round - if len(self._solutions_buffer[task_id][round_num]) == self._num_solvers: - await self._check_consensus_and_proceed(task_id, round_num, ctx) - - span.update( - metadata={ - "solutions_collected": f"{len(self._solutions_buffer[task_id][round_num])}/{self._num_solvers} for round {round_num}" - } - ) - - except Exception as e: - error_msg = f"Error handling solution from agent {message.agent_id}: {str(e)}" - log.error(error_msg) - log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) - - async def _check_consensus_and_proceed(self, task_id: str, round_num: int, ctx: MessageContext) -> None: - """Check for consensus and either finalize or start next round.""" - with self._langfuse_client.start_as_current_span( - name=f"moderator_consensus_check_{task_id}_round_{round_num}" - ) as 
span: - try: - solutions = self._solutions_buffer[task_id][round_num] - - # First try simple consensus check - simple_consensus, simple_solution = self._check_simple_consensus(solutions) - - if simple_consensus: - # Simple consensus reached - final_solution = FinalSolution( - task_id=task_id, - solution=simple_solution, - reasoning="All agents provided the same answer", - consensus_reached=True, - total_rounds=round_num, - all_solutions=self._get_all_solutions_for_task(task_id), - ) - - self._final_solutions[task_id] = final_solution - await self._save_final_solution(final_solution) - - span.update( - metadata={ - "consensus_reached": True, - "method": "simple", - "final_solution": simple_solution[:100], - } - ) - return - - # If no simple consensus and we haven't reached max rounds, use LLM to check - if round_num < self._max_rounds: - # Use LLM moderator to check for consensus - task_content = "" # We need to get the original task content - # For now, let's get it from the first solution's context or we need to store it - - # Format solutions for LLM - all_solutions_text = "\n\n".join([ - f"Agent {sol.agent_id}:\nReasoning: {sol.thought}\nFinal Answer: {sol.final_answer}" - for sol in solutions - ]) - - prompt = TASK_MODERATOR_CONSENSUS_PROMPT.format( - problem_text=task_content, # We need to store this from the original task - all_solutions=all_solutions_text - ) - - system_message = SystemMessage(content=TASK_MODERATOR_SYSTEM_MESSAGE) - user_message = UserMessage(content=prompt, source="user") - - response = await self._model_client.create( - messages=[system_message, user_message], - cancellation_token=ctx.cancellation_token, - ) - - consensus_reached, final_solution_text, reasoning = self._extract_consensus_components(response.content) - - if consensus_reached: - # LLM found consensus - final_solution = FinalSolution( - task_id=task_id, - solution=final_solution_text, - reasoning=reasoning, - consensus_reached=True, - total_rounds=round_num, - 
all_solutions=self._get_all_solutions_for_task(task_id), - ) - - self._final_solutions[task_id] = final_solution - await self._save_final_solution(final_solution) - - span.update( - metadata={ - "consensus_reached": True, - "method": "llm_moderator", - "final_solution": final_solution_text[:100], - } - ) - return - else: - # No consensus, start next round - next_round = round_num + 1 - self._current_round[task_id] = next_round - - # We need the original task to send revision requests - # For now, create a placeholder task - task = Task(task_id=task_id, task_content={"task": task_content}, capability_id="") - - await self.publish_message( - AgentRevisionRequest( - task=task, - other_solutions=solutions, - round_number=next_round, - ), - topic_id=DefaultTopicId(), - ) - - span.update( - metadata={ - "consensus_reached": False, - "next_round_started": next_round, - } - ) - else: - # Max rounds reached, no consensus - final_solution = FinalSolution( - task_id=task_id, - solution="No consensus reached", - reasoning=f"Maximum rounds ({self._max_rounds}) reached without consensus", - consensus_reached=False, - total_rounds=round_num, - all_solutions=self._get_all_solutions_for_task(task_id), - ) - - self._final_solutions[task_id] = final_solution - await self._save_final_solution(final_solution) - - span.update( - metadata={ - "consensus_reached": False, - "max_rounds_reached": True, - } - ) - - except Exception as e: - error_msg = f"Error checking consensus for task {task_id}: {str(e)}" - log.error(error_msg) - log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) - - def _get_all_solutions_for_task(self, task_id: str) -> List[AgentSolution]: - """Get all solutions for a task across all rounds.""" - all_solutions = [] - for round_solutions in self._solutions_buffer[task_id].values(): - all_solutions.extend(round_solutions) - return all_solutions - - async def _save_final_solution(self, final_solution: FinalSolution) -> None: - """Save the final 
solution to a file.""" - try: - output_file = self._output_dir / f"task_{final_solution.task_id}_solution.json" - - solution_data = { - "task_id": final_solution.task_id, - "solution": final_solution.solution, - "reasoning": final_solution.reasoning, - "consensus_reached": final_solution.consensus_reached, - "total_rounds": final_solution.total_rounds, - "all_solutions": [ - { - "agent_id": sol.agent_id, - "thought": sol.thought, - "final_answer": sol.final_answer, - "round_number": sol.round_number, - } - for sol in final_solution.all_solutions - ], - } - - with open(output_file, "w") as f: - json.dump(solution_data, f, indent=2) - - log.info(f"Saved final solution for task {final_solution.task_id} to {output_file}") - - except Exception as e: - log.error(f"Error saving final solution for task {final_solution.task_id}: {str(e)}") - log.error(traceback.format_exc()) \ No newline at end of file diff --git a/src/utils/agentic_prompts.py b/src/utils/agentic_prompts.py index a65df8f..ff3f6a7 100644 --- a/src/utils/agentic_prompts.py +++ b/src/utils/agentic_prompts.py @@ -273,25 +273,50 @@ PROBLEM: {problem_text} -Explain your reasoning step by step. Your final answer should be clearly stated at the end of your response. 
+Provide your solution in JSON format with the following structure: +- thought: Your detailed reasoning and step-by-step solution process +- final_answer: Your complete answer with explanation +- numerical_answer: The final numerical result (if applicable, otherwise null) -Respond using this format: -THOUGHT: -FINAL ANSWER: """ +Example for a math problem: +{{ + "thought": "To solve this problem, I need to...", + "final_answer": "The solution is 42 because...", + "numerical_answer": 42 +}} + +Example for a non-numerical problem: +{{ + "thought": "To approach this problem, I should consider...", + "final_answer": "The answer is that we should use method X because...", + "numerical_answer": null +}} + +Respond with valid JSON only.""" TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT = """These are the reasoning and solutions to the problem from other agents: {other_solutions} -Using the solutions from other agents as additional information, can you provide your answer to the problem? +Using the solutions from other agents as additional information, can you provide your answer to the problem? The original problem is: {problem_text} -Explain your reasoning step by step. Your final answer should be clearly stated at the end of your response. +Consider the other agents' approaches and reasoning. You may agree with them, disagree, or provide a synthesis of different approaches. -Respond using this format: -THOUGHT: -FINAL ANSWER: """ +Provide your solution in JSON format with the following structure: +- thought: Your detailed reasoning, considering other agents' solutions +- final_answer: Your complete answer with explanation +- numerical_answer: The final numerical result (if applicable, otherwise null) + +Example: +{{ + "thought": "Looking at the other solutions, Agent A used method X which is correct, but Agent B made an error in step 2. 
My approach is...", + "final_answer": "The solution is 42 because...", + "numerical_answer": 42 +}} + +Respond with valid JSON only.""" TASK_MODERATOR_SYSTEM_MESSAGE = """You are a moderator overseeing a collaborative problem-solving debate. Your role is to check for consensus among agents and determine the final solution.""" @@ -305,13 +330,19 @@ Determine if there is consensus among the agents. Consensus is reached when: 1. All agents provide the same final answer, OR 2. The majority of agents agree on the same answer with similar reasoning +3. For numerical problems, the numerical answers should match or be very close If consensus is reached, provide the agreed-upon solution. If not, indicate that another round of debate is needed. -Respond using this format: -CONSENSUS_REACHED: -FINAL_SOLUTION: -REASONING: """ +Provide your assessment in JSON format: +{{ + "consensus_reached": true/false, + "final_solution": "the agreed solution if consensus reached, otherwise null", + "numerical_answer": final_numerical_result_if_applicable_otherwise_null, + "reasoning": "explanation of your decision" +}} + +Respond with valid JSON only.""" # ============================================================================= # SYSTEM MESSAGES From d1e1812dd18baa874d08a5684e93777e3e742242 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Sun, 7 Sep 2025 02:57:53 -0400 Subject: [PATCH 09/19] ruff fix. 
--- src/agentic_capability_generator.py | 5 +---- src/task_generation/generator.py | 3 ++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/agentic_capability_generator.py b/src/agentic_capability_generator.py index 835813e..20052ff 100644 --- a/src/agentic_capability_generator.py +++ b/src/agentic_capability_generator.py @@ -4,7 +4,6 @@ import logging import os import traceback -from typing import Optional import hydra import openlit @@ -30,9 +29,7 @@ def main(cfg: DictConfig) -> None: """Run the multi-agent debate-based capability generation system.""" areas_tag = cfg.pipeline_tags.areas_tag - resume_tag: Optional[str] = getattr( - cfg.pipeline_tags, "resume_capabilities_tag", None - ) + resume_tag = getattr(cfg.pipeline_tags, "resume_capabilities_tag", None) domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id num_capabilities_per_area = cfg.capability_generation.num_capabilities_per_area diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py index 56094cf..9fb8aaa 100644 --- a/src/task_generation/generator.py +++ b/src/task_generation/generator.py @@ -6,6 +6,7 @@ import traceback from datetime import datetime from pathlib import Path +from typing import Optional from autogen_core import ( EVENT_LOGGER_NAME, @@ -164,7 +165,7 @@ async def generate_tasks( cfg: DictConfig, capabilities_tag: str, langfuse_client: Langfuse, - resume_tag: str, + resume_tag: Optional[str] = None, ) -> None: """Generate tasks for all capabilities.""" domain_name = cfg.global_cfg.domain From 4d237f7a3b1f059c075382e6145688416e16ca59 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Sun, 7 Sep 2025 04:09:53 -0400 Subject: [PATCH 10/19] updated saved file name for solutions. 
--- src/task_solver/moderator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/task_solver/moderator.py b/src/task_solver/moderator.py index 673baac..2789e89 100644 --- a/src/task_solver/moderator.py +++ b/src/task_solver/moderator.py @@ -403,7 +403,8 @@ async def _save_final_solution(self, final_solution: FinalSolution) -> None: try: self._output_dir.mkdir(parents=True, exist_ok=True) output_file = ( - self._output_dir / f"task_{final_solution.task_id}_solution.json" + self._output_dir + / f"{final_solution.task_id}_{final_solution.capability_name}_solution.json" ) solution_data = { From 38d825d56d38cbe002cbf85ade0c068293a0cf73 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Tue, 9 Sep 2025 00:28:13 -0400 Subject: [PATCH 11/19] added extra details to agent solution messages. --- src/task_solver/messages.py | 2 + src/task_solver/moderator.py | 95 ++++++++++++++---------------------- src/task_solver/scientist.py | 2 + src/utils/agentic_prompts.py | 14 ++++++ 4 files changed, 54 insertions(+), 59 deletions(-) diff --git a/src/task_solver/messages.py b/src/task_solver/messages.py index 5187bcc..b3c8694 100644 --- a/src/task_solver/messages.py +++ b/src/task_solver/messages.py @@ -33,6 +33,7 @@ class AgentSolution: final_answer: str numerical_answer: str round_number: int + capability_name: str def to_dict(self) -> Dict[str, str]: """Convert to dictionary.""" @@ -43,6 +44,7 @@ def to_dict(self) -> Dict[str, str]: "final_answer": self.final_answer, "numerical_answer": self.numerical_answer, "round_number": str(self.round_number), + "capability_name": self.capability_name, } diff --git a/src/task_solver/moderator.py b/src/task_solver/moderator.py index 2789e89..9f3cc62 100644 --- a/src/task_solver/moderator.py +++ b/src/task_solver/moderator.py @@ -2,7 +2,6 @@ import json import logging -import re import traceback from pathlib import Path from typing import Dict, List @@ -83,35 +82,10 @@ def _extract_consensus_components( return 
consensus_reached, final_solution, reasoning, numerical_answer except Exception as e: - # Fallback to old text parsing if JSON parsing fails - log.warning( - f"Failed to parse JSON response from moderator, falling back to text parsing: {e}" - ) - consensus_match = re.search( - r"CONSENSUS_REACHED:\s*(true|false)", response, re.IGNORECASE - ) - solution_match = re.search( - r"FINAL_SOLUTION:\s*(.*?)(?=REASONING:|$)", - response, - re.DOTALL | re.IGNORECASE, - ) - reasoning_match = re.search( - r"REASONING:\s*(.*?)$", response, re.DOTALL | re.IGNORECASE - ) - - consensus_reached = ( - consensus_match.group(1).lower() == "true" if consensus_match else False - ) - final_solution = ( - solution_match.group(1).strip() if solution_match else "NONE" - ) - reasoning = ( - reasoning_match.group(1).strip() - if reasoning_match - else "No reasoning provided" - ) - - return consensus_reached, final_solution, reasoning, "null" + msg = f"Error extracting consensus components: {e}" + log.error(msg) + log.error(traceback.format_exc()) + raise def _check_simple_consensus( self, solutions: List[AgentSolution] @@ -144,7 +118,7 @@ async def handle_task(self, message: Task, ctx: MessageContext) -> None: name=f"moderator_handle_task_{message.task_id}" ) as span: try: - msg = f"Moderator received task: {message.task_id}" + msg = f"Moderator received task: {message.task_id}, {message.capability_name} round {self._current_round}" log.info(msg) span.update( metadata={ @@ -164,14 +138,14 @@ async def handle_task(self, message: Task, ctx: MessageContext) -> None: task_id=message.task_id, problem=message.problem, capability_name=message.capability_name, - round_number=1, + round_number=self._current_round, ), topic_id=DefaultTopicId(), ) span.update( metadata={ - "solution_request_sent": f"Round 1 solution request sent for task {message.task_id}" + "solution_request_sent": f"Round {self._current_round} solution request sent for task {message.task_id}" } ) @@ -193,7 +167,7 @@ async def 
handle_agent_solution( task_id = message.task_id round_num = message.round_number - msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, round {round_num}" + msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name} round {round_num}" log.info(msg) span.update( metadata={ @@ -204,22 +178,28 @@ async def handle_agent_solution( } ) + if round_num != self._current_round: + msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name} round {round_num} but current round is {self._current_round}" + log.error(msg) + span.update(metadata={"error": msg}) + raise Exception(msg) + # Initialize round buffer if needed - if round_num not in self._solutions_buffer: - self._solutions_buffer[round_num] = [] + if self._current_round not in self._solutions_buffer: + self._solutions_buffer[self._current_round] = [] # Add solution to buffer - self._solutions_buffer[round_num].append(message) + self._solutions_buffer[self._current_round].append(message) - # Check if we have all solutions for this round - if len(self._solutions_buffer[round_num]) == self._num_solvers: - await self._check_consensus_and_proceed(task_id, round_num, ctx) + msg = f"{len(self._solutions_buffer[self._current_round])}/{self._num_solvers} solutions collected for round {self._current_round}" + log.info(msg) + span.update(metadata={"solutions_collected": msg}) - span.update( - metadata={ - "solutions_collected": f"{len(self._solutions_buffer[round_num])}/{self._num_solvers} for round {round_num}" - } - ) + if ( + len(self._solutions_buffer[self._current_round]) + == self._num_solvers + ): + await self._check_consensus_and_proceed(task_id, ctx) except Exception as e: error_msg = ( @@ -230,14 +210,14 @@ async def handle_agent_solution( span.update(metadata={"error": error_msg}) async def _check_consensus_and_proceed( - self, task_id: str, round_num: int, ctx: MessageContext + self, 
task_id: str, ctx: MessageContext ) -> None: """Check for consensus and either finalize or start next round.""" with self._langfuse_client.start_as_current_span( - name=f"moderator_consensus_check_{task_id}_round_{round_num}" + name=f"moderator_consensus_check_{task_id}_round_{self._current_round}" ) as span: try: - solutions = self._solutions_buffer[round_num] + solutions = self._solutions_buffer[self._current_round] # First try simple consensus check simple_consensus, simple_solution, simple_numerical = ( @@ -245,7 +225,6 @@ async def _check_consensus_and_proceed( ) if simple_consensus: - # Simple consensus reached final_solution = FinalSolution( task_id=task_id, capability_name=self._tasks.capability_name, @@ -254,7 +233,7 @@ async def _check_consensus_and_proceed( numerical_answer=simple_numerical, reasoning="All agents provided the same answer", consensus_reached=True, - total_rounds=round_num, + total_rounds=self._current_round, all_solutions=self._get_all_solutions(), ) @@ -270,9 +249,8 @@ async def _check_consensus_and_proceed( ) return - if round_num < self._max_rounds: - # Use LLM moderator to check for consensus - stored_task = self._tasks # Get original task + if self._current_round < self._max_rounds: + stored_task = self._tasks # Format solutions for LLM all_solutions_text = "\n\n".join( @@ -314,7 +292,7 @@ async def _check_consensus_and_proceed( numerical_answer=numerical_answer, reasoning=reasoning, consensus_reached=True, - total_rounds=round_num, + total_rounds=self._current_round, all_solutions=self._get_all_solutions(), ) @@ -330,8 +308,7 @@ async def _check_consensus_and_proceed( ) return # No consensus, start next round - next_round = round_num + 1 - self._current_round = next_round + self._current_round += 1 # Send revision request with flattened task data stored_task = self._tasks # Get the original task @@ -352,7 +329,7 @@ async def _check_consensus_and_proceed( } for sol in solutions ], - round_number=next_round, + 
round_number=self._current_round, ), topic_id=DefaultTopicId(), ) @@ -360,7 +337,7 @@ async def _check_consensus_and_proceed( span.update( metadata={ "consensus_reached": False, - "next_round_started": next_round, + "next_round_started": self._current_round, } ) else: @@ -373,7 +350,7 @@ async def _check_consensus_and_proceed( numerical_answer="null", reasoning=f"Maximum rounds ({self._max_rounds}) reached without consensus", consensus_reached=False, - total_rounds=round_num, + total_rounds=self._current_round, all_solutions=self._get_all_solutions(), ) diff --git a/src/task_solver/scientist.py b/src/task_solver/scientist.py index 957617f..8383c08 100644 --- a/src/task_solver/scientist.py +++ b/src/task_solver/scientist.py @@ -112,6 +112,7 @@ async def handle_task_solution_request( final_answer=final_answer, numerical_answer=numerical_answer, round_number=message.round_number, + capability_name=message.capability_name, ) await self.publish_message(solution, topic_id=DefaultTopicId()) @@ -182,6 +183,7 @@ async def handle_agent_revision_request( final_answer=final_answer, numerical_answer=numerical_answer, round_number=message.round_number, + capability_name=message.capability_name, ) await self.publish_message(solution, topic_id=DefaultTopicId()) diff --git a/src/utils/agentic_prompts.py b/src/utils/agentic_prompts.py index ff3f6a7..da62565 100644 --- a/src/utils/agentic_prompts.py +++ b/src/utils/agentic_prompts.py @@ -273,6 +273,13 @@ PROBLEM: {problem_text} +IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. Do not include any prefixes or prose. The JSON should be directly parseable. + +CRITICAL: When including LaTeX expressions or backslashes in your JSON strings, you must properly escape them by using double backslashes (\\\\). 
For example: +- Write \\\\(x^2\\\\) instead of \\(x^2\\) +- Write \\\\[equation\\\\] instead of \\[equation\\] +- Write \\\\times instead of \\times + Provide your solution in JSON format with the following structure: - thought: Your detailed reasoning and step-by-step solution process - final_answer: Your complete answer with explanation @@ -304,6 +311,13 @@ Consider the other agents' approaches and reasoning. You may agree with them, disagree, or provide a synthesis of different approaches. +IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. Do not include any prefixes or prose. The JSON should be directly parseable. + +CRITICAL: When including LaTeX expressions or backslashes in your JSON strings, you must properly escape them by using double backslashes (\\\\). For example: +- Write \\\\(x^2\\\\) instead of \\(x^2\\) +- Write \\\\[equation\\\\] instead of \\[equation\\] +- Write \\\\times instead of \\times + Provide your solution in JSON format with the following structure: - thought: Your detailed reasoning, considering other agents' solutions - final_answer: Your complete answer with explanation From c5afb81815c26cb7d338a0385a14d9f4896ac08b Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Thu, 9 Oct 2025 12:20:13 -0400 Subject: [PATCH 12/19] fixed prompts. 
--- src/utils/agentic_prompts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/utils/agentic_prompts.py b/src/utils/agentic_prompts.py index da62565..b0dd4cd 100644 --- a/src/utils/agentic_prompts.py +++ b/src/utils/agentic_prompts.py @@ -206,10 +206,10 @@ Please return your proposal and your thoughts and reasoning in the following format: {{ - "thought": "Your reasoning and thought process about the kind of tasks you're proposing", + "thought": "Your reasoning and thought process for designing the tasks and ensuring diversity in content and difficulty of tasks", "problems": {{ - "problem_0": "TASK_TEXT_1", - "problem_1": "TASK_TEXT_2", + "problem_0": "PROBLEM_0_DESCRIPTION", + "problem_1": "PROBLEM_1_DESCRIPTION", ... }} }} @@ -285,7 +285,7 @@ - final_answer: Your complete answer with explanation - numerical_answer: The final numerical result (if applicable, otherwise null) -Example for a math problem: +Example for a numerical problem: {{ "thought": "To solve this problem, I need to...", "final_answer": "The solution is 42 because...", From 9195b93864deb282031ce977c54644f9a59ff6de Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Thu, 16 Oct 2025 18:01:58 -0400 Subject: [PATCH 13/19] fixed output dir name to include area name. 
--- src/task_generation/generator.py | 15 ++++++++++----- src/task_generation/moderator.py | 8 ++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py index 9fb8aaa..7ff3468 100644 --- a/src/task_generation/generator.py +++ b/src/task_generation/generator.py @@ -31,7 +31,10 @@ async def generate_tasks_for_capability( - cfg: DictConfig, capability: Capability, output_dir: Path, langfuse_client: Langfuse + cfg: DictConfig, + capability: Capability, + task_output_dir_name: Path, + langfuse_client: Langfuse, ) -> None: """Generate tasks for a single capability.""" with langfuse_client.start_as_current_span( @@ -93,7 +96,7 @@ async def generate_tasks_for_capability( num_scientists=2, num_final_problems=cfg.task_generation.num_final_problems_per_capability, buffer_param=cfg.task_generation.buffer_param, - output_dir=output_dir, + output_dir=task_output_dir_name, domain=domain_name, langfuse_client=langfuse_client, max_round=cfg.task_generation.max_rounds, @@ -324,9 +327,11 @@ async def generate_tasks( # Process each capability individually for i, capability in enumerate(capabilities): capability_dir_name = capability.name.replace(" ", "_") - + area_dir_name = capability.area.replace(" ", "_").lower() + task_output_dir_name = f"[{area_dir_name}]-[{capability_dir_name}]" + tasks_output_dir = output_dir / task_output_dir_name # Skip if tasks already exist for this capability - if resume_tag and capability_dir_name in existing_tasks: + if resume_tag and task_output_dir_name in existing_tasks: msg = f"Skipping capability {i + 1}/{len(capabilities)}: {capability.name} (already exists)" log.info(msg) span.update( @@ -350,7 +355,7 @@ async def generate_tasks( ) await generate_tasks_for_capability( - cfg, capability, output_dir, langfuse_client + cfg, capability, tasks_output_dir, langfuse_client ) msg = f"Completed capability {i + 1}/{len(capabilities)}: {capability.name}" diff --git 
a/src/task_generation/moderator.py b/src/task_generation/moderator.py index 16e6193..3d9cf6e 100644 --- a/src/task_generation/moderator.py +++ b/src/task_generation/moderator.py @@ -301,6 +301,7 @@ async def _finalize_tasks_without_solutions(self) -> None: final_tasks[task_id] = { "task": problem_text, "capability_id": self._capability.name, + "area_id": self._capability.area, } # Save final tasks @@ -317,12 +318,11 @@ async def _finalize_tasks_without_solutions(self) -> None: async def _save_tasks_to_file(self, tasks: Dict[str, Dict[str, str]]) -> None: """Save final tasks to file.""" try: - # Create capability directory - capability_dir = self._output_dir / self._capability.name - capability_dir.mkdir(parents=True, exist_ok=True) + # Create task output directory + self._output_dir.mkdir(parents=True, exist_ok=True) # Save tasks - tasks_file = capability_dir / "tasks.json" + tasks_file = self._output_dir / "tasks.json" with open(tasks_file, "w", encoding="utf-8") as f: json.dump({"tasks": tasks}, f, indent=2, ensure_ascii=False) From 57d2d2a87c2b9c16efca79ff9d950c3088fc740c Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 17 Oct 2025 14:16:38 -0400 Subject: [PATCH 14/19] fixed task solver output dir name. 
--- src/task_solver/generator.py | 69 ++++++++++-------- src/task_solver/messages.py | 6 ++ src/task_solver/moderator.py | 18 +++-- src/task_solver/scientist.py | 132 +++++++++++++++++++++++++++-------- 4 files changed, 161 insertions(+), 64 deletions(-) diff --git a/src/task_solver/generator.py b/src/task_solver/generator.py index 85d12d5..0165c8b 100644 --- a/src/task_solver/generator.py +++ b/src/task_solver/generator.py @@ -36,12 +36,13 @@ async def solve_task( max_rounds = cfg.task_solver.max_rounds task_id = task.task_id capability_name = task.capability_name + area_name = task.area_name with langfuse_client.start_as_current_span( - name=f"task_solver_for_task:{task_id}, capability:{capability_name}" + name=f"task_solver_for_task:{task_id}, capability:{capability_name}, area: {area_name}" ) as span: try: - msg = f"Generating solutions for task: {task_id}, capability: {capability_name}" + msg = f"Generating solutions for task: {task_id}, capability: {capability_name}, area: {area_name}" log.info(msg) span.update( metadata={ @@ -49,6 +50,7 @@ async def solve_task( "task_id": task_id, "problem": task.problem, "capability_name": capability_name, + "area_name": area_name, } ) @@ -110,25 +112,24 @@ async def solve_task( await runtime.publish_message(task, DefaultTopicId()) - msg = f"Task message published: {task_id}, capability: {capability_name}" + msg = f"Task message published: {task_id}, capability: {capability_name}, area: {area_name}" log.info(msg) span.update( metadata={ "task_published": msg, "task_id": task_id, "capability_name": capability_name, + "area_name": area_name, } ) try: await runtime.stop_when_idle() - msg = ( - f"Completed solving task: {task_id}, capability: {capability_name}" - ) + msg = f"Completed solving task: {task_id}, capability: {capability_name}, area: {area_name}" log.info(msg) span.update(metadata={"runtime_completed": msg}) except Exception as e: - msg = f"Error while solving task {task_id}, capability: {capability_name}: {e}" + 
msg = f"Error while solving task {task_id}, capability: {capability_name}, area: {area_name}: {e}" log.error(msg) span.update( level="ERROR", @@ -138,6 +139,7 @@ async def solve_task( "error": str(e), "task_id": task_id, "capability_name": capability_name, + "area_name": {area_name}, }, ) raise @@ -208,7 +210,6 @@ async def solve_tasks( log.error(error_msg) span.update( level="ERROR", - status_message="Capabilities directory not found", metadata={ "directory_not_found_error": error_msg, "tasks_dir": str(tasks_dir), @@ -216,27 +217,37 @@ async def solve_tasks( ) raise FileNotFoundError(error_msg) - for capability_dir in tasks_dir.iterdir(): - if capability_dir.is_dir(): - # Check if the last part of capability_dir exists in output_dir - output_solver_dir = Path(output_dir) / capability_dir.name - if output_solver_dir.exists(): - msg = f"Solutions for tasks under capability {capability_dir.name} already exist: {output_solver_dir}" - log.info(msg) - span.update(metadata={"task_solver_skipped": msg}) - continue - - tasks_file = capability_dir / "tasks.json" - if tasks_file.exists(): - with open(tasks_file, "r", encoding="utf-8") as f: - tasks = json.load(f)["tasks"] - for task_id, task_data in tasks.items(): - task = Task( - task_id=task_id, - problem=task_data["task"], - capability_name=task_data["capability_id"], - ) - await solve_task(cfg, task, output_dir, langfuse_client) + for per_area_capability_dir in tasks_dir.iterdir(): + tasks_file = per_area_capability_dir / "tasks.json" + + if not tasks_file.exists(): + msg = f"Tasks file not found: {tasks_file}" + log.error(msg) + span.update(metadata={"warning": msg}) + continue + + with open(tasks_file, "r", encoding="utf-8") as f: + tasks = json.load(f)["tasks"] + output_solver_dir = Path(output_dir) / per_area_capability_dir.name + + for task_id, task_data in tasks.items(): + if ( + output_solver_dir.exists() + and f"{task_id}_solution.json" + in list(output_solver_dir.iterdir()) + ): + msg = f"Task {task_id} already 
solved" + log.info(msg) + span.update(metadata={"task_solver_skipped": msg}) + continue + + task = Task( + task_id=task_id, + problem=task_data["task"], + capability_name=task_data["capability_id"], + area_name=task_data["area_id"], + ) + await solve_task(cfg, task, output_solver_dir, langfuse_client) except Exception as e: error_msg = f"Error in task solver: {str(e)}" diff --git a/src/task_solver/messages.py b/src/task_solver/messages.py index b3c8694..36c196e 100644 --- a/src/task_solver/messages.py +++ b/src/task_solver/messages.py @@ -11,6 +11,7 @@ class Task: task_id: str problem: str capability_name: str + area_name: str @dataclass @@ -20,6 +21,7 @@ class TaskSolutionRequest: task_id: str problem: str capability_name: str + area_name: str round_number: int = 1 @@ -34,6 +36,7 @@ class AgentSolution: numerical_answer: str round_number: int capability_name: str + area_name: str def to_dict(self) -> Dict[str, str]: """Convert to dictionary.""" @@ -45,6 +48,7 @@ def to_dict(self) -> Dict[str, str]: "numerical_answer": self.numerical_answer, "round_number": str(self.round_number), "capability_name": self.capability_name, + "area_name": self.area_name, } @@ -55,6 +59,7 @@ class AgentRevisionRequest: task_id: str problem: str capability_name: str + area_name: str other_solutions: List[Dict[str, str]] round_number: int @@ -74,6 +79,7 @@ class FinalSolution: task_id: str capability_name: str + area_name: str problem: str solution: str numerical_answer: str diff --git a/src/task_solver/moderator.py b/src/task_solver/moderator.py index 9f3cc62..c46ab1c 100644 --- a/src/task_solver/moderator.py +++ b/src/task_solver/moderator.py @@ -91,7 +91,7 @@ def _check_simple_consensus( self, solutions: List[AgentSolution] ) -> tuple[bool, str, str]: """Check consensus; if all agents have the same final answer.""" - if not solutions: + if not solutions or len(solutions) < self._num_solvers: return False, "", "null" # First check numerical answers if they exist @@ -125,6 +125,7 @@ 
async def handle_task(self, message: Task, ctx: MessageContext) -> None: "task_received": msg, "task_id": message.task_id, "capability_name": message.capability_name, + "area_name": message.area_name, } ) @@ -138,6 +139,7 @@ async def handle_task(self, message: Task, ctx: MessageContext) -> None: task_id=message.task_id, problem=message.problem, capability_name=message.capability_name, + area_name=message.area_name, round_number=self._current_round, ), topic_id=DefaultTopicId(), @@ -167,7 +169,7 @@ async def handle_agent_solution( task_id = message.task_id round_num = message.round_number - msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name} round {round_num}" + msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name}, {message.area_name} round {round_num}" log.info(msg) span.update( metadata={ @@ -179,7 +181,7 @@ async def handle_agent_solution( ) if round_num != self._current_round: - msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name} round {round_num} but current round is {self._current_round}" + msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name}, {message.area_name} round {round_num} but current round is {self._current_round}" log.error(msg) span.update(metadata={"error": msg}) raise Exception(msg) @@ -228,6 +230,7 @@ async def _check_consensus_and_proceed( final_solution = FinalSolution( task_id=task_id, capability_name=self._tasks.capability_name, + area_name=self._tasks.area_name, problem=self._tasks.problem, solution=simple_solution, numerical_answer=simple_numerical, @@ -287,6 +290,7 @@ async def _check_consensus_and_proceed( final_solution = FinalSolution( task_id=task_id, capability_name=self._tasks.capability_name, + area_name=self._tasks.area_name, problem=self._tasks.problem, solution=final_solution_text, 
numerical_answer=numerical_answer, @@ -318,6 +322,7 @@ async def _check_consensus_and_proceed( task_id=stored_task.task_id, problem=stored_task.problem, capability_name=stored_task.capability_name, + area_name=stored_task.area_name, other_solutions=[ { "agent_id": sol.agent_id, @@ -345,6 +350,7 @@ async def _check_consensus_and_proceed( final_solution = FinalSolution( task_id=task_id, capability_name=self._tasks.capability_name, + area_name=self._tasks.area_name, problem=self._tasks.problem, solution="No consensus reached", numerical_answer="null", @@ -379,14 +385,12 @@ async def _save_final_solution(self, final_solution: FinalSolution) -> None: """Save the final solution to a file.""" try: self._output_dir.mkdir(parents=True, exist_ok=True) - output_file = ( - self._output_dir - / f"{final_solution.task_id}_{final_solution.capability_name}_solution.json" - ) + output_file = self._output_dir / f"{final_solution.task_id}_solution.json" solution_data = { "task_id": final_solution.task_id, "capability_name": final_solution.capability_name, + "area_name": final_solution.area_name, "problem": final_solution.problem, "solution": final_solution.solution, "numerical_answer": final_solution.numerical_answer, diff --git a/src/task_solver/scientist.py b/src/task_solver/scientist.py index 8383c08..262f255 100644 --- a/src/task_solver/scientist.py +++ b/src/task_solver/scientist.py @@ -1,5 +1,6 @@ """Task solver agent for solver tasks through debate.""" +import json import logging import traceback @@ -32,6 +33,8 @@ log = logging.getLogger("task_solver.scientist") +MAX_MODEL_ATTEMPTS = 3 + @default_subscription class TaskSolverScientist(RoutedAgent): @@ -52,11 +55,21 @@ def _extract_solution_components(self, response: str) -> tuple[str, str, str]: """Extract thought, final answer, and numerical answer from JSON response.""" try: parsed = parse_llm_json_response(response) - thought = parsed.get("thought", response.strip()) - final_answer = parsed.get("final_answer", "No clear 
answer provided") + thought_raw = parsed.get("thought", response.strip()) + final_answer_raw = parsed.get("final_answer", "No clear answer provided") numerical_answer = parsed.get("numerical_answer") - # Convert numerical_answer to string representation + thought = ( + json.dumps(thought_raw, ensure_ascii=False) + if isinstance(thought_raw, (dict, list)) + else str(thought_raw).strip() + ) + final_answer = ( + json.dumps(final_answer_raw, ensure_ascii=False, indent=2) + if isinstance(final_answer_raw, (dict, list)) + else str(final_answer_raw).strip() + ) + if numerical_answer is not None: numerical_answer = str(numerical_answer) else: @@ -70,6 +83,54 @@ def _extract_solution_components(self, response: str) -> tuple[str, str, str]: log.error(traceback.format_exc()) raise + async def _generate_solution_payload( + self, system_message: SystemMessage, user_message: UserMessage + ) -> tuple[str, str, str]: + """Call the model with retries until valid JSON is returned.""" + last_error: Exception | None = None + for attempt in range(1, MAX_MODEL_ATTEMPTS + 1): + try: + response = await self._model_client.create( + [system_message, user_message], + json_output=True, + ) + except Exception as exc: # pragma: no cover - network/SDK errors + last_error = exc + log.warning( + "Scientist %s failed to get response on attempt %d: %s", + self._scientist_id, + attempt, + exc, + ) + continue + + response_content = str(getattr(response, "content", "") or "").strip() + if not response_content: + last_error = ValueError("Empty response content") + log.warning( + "Scientist %s received empty response on attempt %d", + self._scientist_id, + attempt, + ) + continue + + try: + return self._extract_solution_components(response_content) + except Exception as exc: + last_error = exc + log.warning( + "Scientist %s failed to parse model response on attempt %d: %s", + self._scientist_id, + attempt, + exc, + ) + continue + + raise RuntimeError( + f"Scientist {self._scientist_id} could not obtain 
valid JSON " + f"after {MAX_MODEL_ATTEMPTS} attempts" + ) from last_error + @message_handler async def handle_task_solution_request( self, message: TaskSolutionRequest, ctx: MessageContext @@ -79,7 +140,11 @@ async def handle_task_solution_request( name=f"scientist_{self._scientist_id}_initial_solution_request" ) as span: try: - msg = f"Scientist {self._scientist_id} handling initial solution request for task: {message.task_id}, capability: {message.capability_name} round: {message.round_number}" + msg = ( + f"Scientist {self._scientist_id} handling initial solution request " + f"for task: {message.task_id}, capability: {message.capability_name}, area: {message.area_name}" + f"round: {message.round_number}" + ) log.info(msg) span.update( metadata={ @@ -87,6 +152,7 @@ async def handle_task_solution_request( "scientist_id": self._scientist_id, "task_id": message.task_id, "capability": message.capability_name, + "area": message.area_name, "round": message.round_number, } ) @@ -96,14 +162,11 @@ async def handle_task_solution_request( system_message = SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE) user_message = UserMessage(content=prompt, source="user") - response = await self._model_client.create( - [system_message, user_message] - ) - - response_content = str(response.content) - thought, final_answer, numerical_answer = ( - self._extract_solution_components(response_content) - ) + ( + thought, + final_answer, + numerical_answer, + ) = await self._generate_solution_payload(system_message, user_message) solution = AgentSolution( agent_id=self._scientist_id, @@ -113,13 +176,18 @@ async def handle_task_solution_request( numerical_answer=numerical_answer, round_number=message.round_number, capability_name=message.capability_name, + area_name=message.area_name, ) await self.publish_message(solution, topic_id=DefaultTopicId()) span.update( metadata={ - "solution_generated": f"Scientist {self._scientist_id} generated solution for task {message.task_id}, capability: 
{message.capability_name} round: {message.round_number}", + "solution_generated": ( + f"Scientist {self._scientist_id} generated solution for task " + f"{message.task_id}, capability: {message.capability_name}, area: {message.area_name}" + f"round: {message.round_number}" + ), } ) @@ -138,7 +206,11 @@ async def handle_agent_revision_request( name=f"scientist_{self._scientist_id}_round_{message.round_number}" ) as span: try: - msg = f"Scientist {self._scientist_id} handling revision request for task: {message.task_id}, capability: {message.capability_name} round: {message.round_number}" + msg = ( + f"Scientist {self._scientist_id} handling revision request for task: " + f"{message.task_id}, capability: {message.capability_name}, area: {message.area_name}" + f"round: {message.round_number}" + ) log.info(msg) span.update( metadata={ @@ -150,31 +222,30 @@ async def handle_agent_revision_request( } ) - # Format other scientists' solutions other_solutions_text = "\n\n".join( [ - f"Scientist {sol['agent_id']}: Reasoning: {sol['thought']}, Final solution: {sol['final_answer']}" + ( + f"Scientist {sol['agent_id']}: Reasoning: {sol['thought']}, " + f"Final solution: {sol['final_answer']}" + ) for sol in message.other_solutions - if sol["agent_id"] - != self._scientist_id # Don't include its own solution + if sol["agent_id"] != self._scientist_id ] ) prompt = TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT.format( - other_solutions=other_solutions_text, problem_text=message.problem + other_solutions=other_solutions_text, + problem_text=message.problem, ) system_message = SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE) user_message = UserMessage(content=prompt, source="user") - response = await self._model_client.create( - [system_message, user_message] - ) - - response_content = str(response.content) - thought, final_answer, numerical_answer = ( - self._extract_solution_components(response_content) - ) + ( + thought, + final_answer, + numerical_answer, + ) = await 
self._generate_solution_payload(system_message, user_message) solution = AgentSolution( agent_id=self._scientist_id, @@ -184,13 +255,18 @@ async def handle_agent_revision_request( numerical_answer=numerical_answer, round_number=message.round_number, capability_name=message.capability_name, + area_name=message.area_name, ) await self.publish_message(solution, topic_id=DefaultTopicId()) span.update( metadata={ - "revision_generated": f"Scientist {self._scientist_id} generated revision for task {message.task_id}, capability: {message.capability_name}, round: {message.round_number}", + "revision_generated": ( + f"Scientist {self._scientist_id} generated revision for task " + f"{message.task_id}, capability: {message.capability_name}, area: {message.area_name}" + f"round: {message.round_number}" + ), } ) From 32922994378c4143c33dcfab66ef13cbbbe0962b Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 17 Oct 2025 14:19:46 -0400 Subject: [PATCH 15/19] upgraded json handling, and model call. --- src/utils/json_utils.py | 35 ++++++++++++++++++--------------- src/utils/model_client_utils.py | 2 +- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/utils/json_utils.py b/src/utils/json_utils.py index 2a57c0a..3d2fd77 100644 --- a/src/utils/json_utils.py +++ b/src/utils/json_utils.py @@ -24,35 +24,42 @@ def extract_json_from_markdown(content: str) -> str: elif content.startswith("```") and content.endswith("```"): content = content[3:-3].strip() - return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", content) + content = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", content) + + if content and not content.lstrip().startswith(("{", "[")): + brace_start = content.find("{") + brace_end = content.rfind("}") + bracket_start = content.find("[") + bracket_end = content.rfind("]") + + if brace_start != -1 and brace_end > brace_start: + content = content[brace_start : brace_end + 1].strip() + elif bracket_start != -1 and bracket_end > bracket_start: + content = content[bracket_start : 
bracket_end + 1].strip() + + return content def fix_common_json_errors(content: str) -> str: """Fix common JSON syntax errors.""" - # Fix extra equals signs (e.g., "area":="value" -> "area":"value") content = re.sub(r':\s*=\s*"', ':"', content) - - # Fix missing quotes around keys content = re.sub(r'(\w+):\s*"', r'"\1":"', content) - - # Fix trailing commas + content = re.sub(r'\\(?!["\\/bfnrtu])', r"\\\\", content) return re.sub(r",(\s*[}\]])", r"\1", content) def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: """Parse LLM JSON response.""" try: - # Ensure content is a string if not isinstance(raw_content, str): raw_content = str(raw_content) - # Clean the content first cleaned_content = extract_json_from_markdown(raw_content) - - # Fix common JSON errors cleaned_content = fix_common_json_errors(cleaned_content) - # Parse the JSON + if not cleaned_content: + raise json.JSONDecodeError("Empty JSON content", cleaned_content or "", 0) + result = json.loads(cleaned_content) return result if isinstance(result, dict) else {} @@ -60,14 +67,10 @@ def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: log.error(f"Failed to parse JSON response: {e}") log.error(f"Content length: {len(cleaned_content)} characters") - # Try to fix common JSON issues try: - # Attempt to fix unterminated strings by finding the last complete entry if "Unterminated string" in str(e): - # Find the last complete capability entry last_complete = cleaned_content.rfind('"},') if last_complete > 0: - # Truncate to last complete entry and close the JSON fixed_content = cleaned_content[: last_complete + 2] + "\n }\n}" log.warning( "Attempting to fix unterminated JSON by truncating to last complete entry" @@ -77,9 +80,9 @@ def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: except Exception as fix_error: log.error(f"Failed to fix JSON: {fix_error}") - # If we can't fix it, log more details and re-raise log.error(f"Raw content (last 
500 chars): {raw_content[-500:]}") raise + except Exception as e: log.error(f"Unexpected error parsing JSON: {e}") log.error(f"Raw content: {raw_content}") diff --git a/src/utils/model_client_utils.py b/src/utils/model_client_utils.py index c1fdea4..c8c2ef6 100644 --- a/src/utils/model_client_utils.py +++ b/src/utils/model_client_utils.py @@ -20,7 +20,7 @@ ) -MAX_TOKENS = 1024 * 10 +MAX_TOKENS = 1024 * 30 logger = logging.getLogger(__name__) From df4860b597902f1916340a30808508a58778b158 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 17 Oct 2025 14:58:08 -0400 Subject: [PATCH 16/19] updated readme to include latest agentic changes. --- README.md | 73 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index a88c7e3..80f157f 100644 --- a/README.md +++ b/README.md @@ -73,26 +73,77 @@ Utilize the capability and the corresponding subject LLM score to select or gene ```bash python -m src.run_lbo ``` - ### Agentic Generation Scripts -Generate areas, capabilities, and tasks using multi-agent debate systems. Configure parameters in `src/cfg/agentic_config.yaml`. +These scripts implement the multi-agent debate workflow for automated generation of areas, capabilities, tasks, and solutions. +All configurable parameters are defined in `src/cfg/agentic_config.yaml`. + +--- +#### 1. Generate Areas +Generate domain areas using the scientist–moderator debate system: ```bash -# Generate capability areas python -m src.agentic_area_generator +``` + +Output location: +``` +~////areas//areas.json +``` +Where: +- comes from `global_cfg.output_dir` +- comes from `global_cfg.domain` (spaces replaced with underscores) +- comes from `exp_cfg.exp_id` +- is the tag used for the generated areas + +#### 2. 
Generate Capabilities +Generate capabilities for each area: +```bash +python -m src.agentic_capability_generator pipeline_tags.areas_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_capabilities_tag=_YYYYMMDD_HHMMSS +``` + +**Options:** +- `pipeline_tags.areas_tag` specifies which set of areas to use when generating capabilities. +- `pipeline_tags.resume_capabilities_tag` (optional) resumes a previous capability generation run. + +**Output location:** +``` +~////capabilities///capabilities.json +``` +Where: +- is the tag used for the generated capabilities (either resumed or auto-generated) + -# Generate capabilities for each area -python -m src.agentic_capability_generator +#### 3. Generate Tasks +Generate evaluation tasks for a specific capabilities tag: +```bash +python -m src.agentic_task_generator pipeline_tags.capabilities_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_tasks_tag=_YYYYMMDD_HHMMSS +``` -# Generate tasks for each capability -python -m src.agentic_task_generator +**Options:** +- `pipeline_tags.capabilities_tag` specifies which set of capabilities to use when generating tasks. +- `pipeline_tags.resume_tasks_tag` (optional) resumes a previous task generation run. -# Generate tasks for all capabilities -python -m src.agentic_task_generator pipeline_tags.capabilities_tag=_20250902_030203 +**Output location:** +``` +~////tasks//[]-[]/tasks.json +``` +Where: +- is the tag used for the generated tasks (either resumed or auto-generated) -# Generate solutions for tasks using multi-agent debate -python -m src.agentic_task_solver pipeline_tags.tasks_tag=_20250905_153532 +#### 4. Generate Solutions +Solve generated tasks using the multi-agent debate system: +```bash +python -m src.agentic_task_solver pipeline_tags.tasks_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_solutions_tag=_YYYYMMDD_HHMMSS +``` +**Options:** +- `pipeline_tags.tasks_tag` specifies which set of tasks to solve. +- `pipeline_tags.resume_solutions_tag` (optional) resumes a previous solution generation run. 
+**Output location:** +``` +~////task_solutions//[]-[]/_solution.json ``` +Where: +- is the tag used for the generated solutions (either resumed or auto-generated) From 653179baa5f855799f8a94b4af20c4dff2fa610a Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Sun, 2 Nov 2025 16:26:36 -0500 Subject: [PATCH 17/19] task diversity study scripts added. --- experimental/diverse_task_config.yaml | 89 +++++ experimental/diverse_task_dataclasses.py | 77 +++++ experimental/diverse_task_generator.py | 266 +++++++++++++++ experimental/diverse_task_prompts.py | 416 +++++++++++++++++++++++ experimental/extract_subtopics.py | 41 +++ experimental/find_combinations.py | 57 ++++ experimental/generate_blueprints.py | 66 ++++ experimental/generate_tasks.py | 68 ++++ experimental/model_utils.py | 41 +++ experimental/verify_tasks.py | 112 ++++++ 10 files changed, 1233 insertions(+) create mode 100644 experimental/diverse_task_config.yaml create mode 100644 experimental/diverse_task_dataclasses.py create mode 100644 experimental/diverse_task_generator.py create mode 100644 experimental/diverse_task_prompts.py create mode 100644 experimental/extract_subtopics.py create mode 100644 experimental/find_combinations.py create mode 100644 experimental/generate_blueprints.py create mode 100644 experimental/generate_tasks.py create mode 100644 experimental/model_utils.py create mode 100644 experimental/verify_tasks.py diff --git a/experimental/diverse_task_config.yaml b/experimental/diverse_task_config.yaml new file mode 100644 index 0000000..d0d1329 --- /dev/null +++ b/experimental/diverse_task_config.yaml @@ -0,0 +1,89 @@ +# Configuration for Diverse Task Generator + +# Model settings +model: + name: gpt-4o # OpenAI model to use + temperature: 1.0 # Temperature for all steps + max_tokens: 8192 # Max tokens for all steps + +# Task generation settings +generation: + tasks_per_blueprint: 3 # Number of tasks to generate per blueprint + min_subtopics: 3 # Suggested minimum number of sub-topics + 
max_subtopics: 8 # Suggested maximum number of sub-topics + +# Output settings +output: + base_dir: diverse_task_outputs + save_intermediate_steps: true # Save each step's output + pretty_print_json: true # Indent JSON files + +# Input settings +input: + capability_json_path: capability.json # Default capability JSON file path + +# Bloom's Taxonomy definitions +# Source: Revised Bloom's Taxonomy (Anderson & Krathwohl, 2001) +blooms_taxonomy: + Remember: + description: "Retrieving relevant knowledge from long-term memory. Involves recognizing and recalling facts, terms, basic concepts, or answers." + keywords: ["define", "list", "identify", "recall", "name", "state"] + + Understand: + description: "Constructing meaning from instructional messages. Involves interpreting, exemplifying, classifying, summarizing, inferring, comparing, and explaining." + keywords: ["explain", "describe", "interpret", "summarize", "compare", "contrast"] + + Apply: + description: "Carrying out or using a procedure in a given situation. Involves executing or implementing a method, technique, or process." + keywords: ["apply", "use", "implement", "execute", "solve", "demonstrate"] + + Analyze: + description: "Breaking material into constituent parts and determining how parts relate to one another and to an overall structure. Involves differentiating, organizing, and attributing." + keywords: ["analyze", "differentiate", "organize", "distinguish", "examine", "compare"] + + Evaluate: + description: "Making judgments based on criteria and standards. Involves checking for internal consistency or logical fallacies, and critiquing based on external criteria." + keywords: ["evaluate", "judge", "critique", "assess", "justify", "argue"] + + Create: + description: "Putting elements together to form a novel, coherent whole or make an original product. Involves generating, planning, and producing." 
+ keywords: ["create", "design", "construct", "develop", "formulate", "generate"] + +# Difficulty level definitions +difficulty_levels: + easy: + description: "Basic, straightforward problems requiring minimal steps and fundamental knowledge." + characteristics: + - "Single concept application" + - "Direct recall or simple calculation" + - "Clear and unambiguous" + - "Minimal prerequisite knowledge" + + medium: + description: "Moderate complexity requiring multiple steps, integration of concepts, or non-trivial reasoning." + characteristics: + - "Multiple concept integration" + - "Multi-step solution required" + - "Some prerequisite knowledge needed" + - "May involve edge cases" + + hard: + description: "Complex, challenging problems requiring deep understanding, multiple concepts, edge cases, or sophisticated reasoning." + characteristics: + - "Complex multi-concept integration" + - "Multiple challenging steps" + - "Deep domain knowledge required" + - "Edge cases and exceptions" + - "May require insight or creative approach" + +# Verification criteria +verification: + pass_threshold: 0.8 # Minimum pass rate to consider successful + strict_mode: false # If true, all alignment criteria must pass + +# Example capability for quick testing +example_capability: + name: "compound_interest_calculations" + description: "The ability to calculate compound interest for various scenarios, including different compounding frequencies (annually, semi-annually, quarterly, monthly), different time periods, and understanding how changes in principal, rate, or time affect the final amount." 
+ domain: "personal_finance" + area: "investing_and_savings" diff --git a/experimental/diverse_task_dataclasses.py b/experimental/diverse_task_dataclasses.py new file mode 100644 index 0000000..b03aa15 --- /dev/null +++ b/experimental/diverse_task_dataclasses.py @@ -0,0 +1,77 @@ +"""Dataclasses for the diverse task generation pipeline.""" + +from dataclasses import dataclass, field +from typing import Dict, List, Optional + + +@dataclass +class Capability: + """Represents a capability to be tested.""" + + name: str + description: str + domain: str + area: Optional[str] = None + example_tasks: List[Dict] = field(default_factory=list) + + +@dataclass +class SubTopic: + """Represents a sub-topic within a capability.""" + + name: str + description: Optional[str] = None + + +@dataclass +class Combination: + """Represents a valid (content, difficulty, reasoning) combination.""" + + content: str + difficulty: str + reasoning: str + rationale: Optional[str] = None + + +@dataclass +class Blueprint: + """Represents a task blueprint for a specific combination.""" + + combination_id: int + subtopic: str + difficulty: str + reasoning: str + blueprint: str + key_characteristics: List[str] = field(default_factory=list) + example_question_outline: Optional[str] = None + rationale: Optional[str] = None + + +@dataclass +class Task: + """Represents a generated multiple-choice task.""" + + task_id: str + blueprint_id: int + subtopic: str + difficulty: str + reasoning: str + question: str + choices: Dict[str, str] + correct_answer: str + explanation: Optional[str] = None + alignment_notes: Optional[str] = None + + +@dataclass +class VerificationResult: + """Represents the verification result for a task.""" + + task_id: str + subtopic_aligned: bool + difficulty_aligned: bool + reasoning_aligned: bool + choices_appropriate: bool + overall_aligned: bool + feedback: str + suggested_improvements: Optional[str] = None diff --git a/experimental/diverse_task_generator.py 
b/experimental/diverse_task_generator.py new file mode 100644 index 0000000..8518a1b --- /dev/null +++ b/experimental/diverse_task_generator.py @@ -0,0 +1,266 @@ +"""Standalone script for generating diverse tasks for a single capability.""" + +import argparse +import json +import logging +import os +from dataclasses import asdict +from datetime import datetime +from functools import partial +from pathlib import Path +from typing import Any + +import yaml +from diverse_task_dataclasses import ( + Blueprint, + Capability, + Combination, + SubTopic, + Task, + VerificationResult, +) +from extract_subtopics import extract_subtopics +from find_combinations import find_valid_combinations +from generate_blueprints import generate_blueprints +from model_utils import call_model +from openai import OpenAI +from verify_tasks import verify_tasks + +from generate_tasks import generate_tasks + + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + + +class DiverseTaskGenerator: + """Generate diverse tasks for a capability using multi-dimensional approach.""" + + def __init__( + self, + capability_dict: dict, + config: dict, + ) -> None: + """Initialize the diverse task generator.""" + # Extract example tasks from capability_data if present + example_tasks = ( + capability_dict.get("capability_data", [])[:3] + if "capability_data" in capability_dict + else [] + ) + + self.capability = Capability( + name=capability_dict["capability_name"], + description=capability_dict["capability_description"], + domain=capability_dict["capability_domain"], + area=capability_dict.get("capability_area"), + example_tasks=example_tasks, + ) + + # Store configuration + self.config = config + + # Use config values + self.model_name = self.config["model"]["name"] + self.temperature = self.config["model"]["temperature"] + self.max_tokens = self.config["model"]["max_tokens"] + 
self.output_dir = Path(self.config["output"]["base_dir"]) + + # Initialize OpenAI client + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + raise ValueError("OPENAI_API_KEY environment variable not set") + self.client = OpenAI(api_key=api_key) + + # Create output directory + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + self.run_output_dir = self.output_dir / f"{self.capability.name}_{timestamp}" + self.run_output_dir.mkdir(parents=True, exist_ok=True) + + logger.info("=" * 80) + logger.info(f"Initialized DiverseTaskGenerator for: {self.capability.name}") + logger.info(f"Model: {self.model_name}") + logger.info(f"Temperature: {self.temperature}") + logger.info(f"Max tokens: {self.max_tokens}") + logger.info(f"Output directory: {self.run_output_dir}") + logger.info("=" * 80) + + # Create API caller with pre-configured parameters + self._call_api = partial( + call_model, + self.client, + model_name=self.model_name, + temperature=self.temperature, + max_tokens=self.max_tokens, + ) + + def _save_json(self, filename: str, data_key: str, data: Any) -> Path: + """Save data to JSON file.""" + output_file = self.run_output_dir / filename + # Convert dataclass objects to dicts if needed + if data and hasattr( + data[0] if isinstance(data, list) else data, "__dataclass_fields__" + ): + data = ( + [asdict(item) for item in data] + if isinstance(data, list) + else asdict(data) + ) + + with open(output_file, "w") as f: + json.dump({data_key: data} if data_key else data, f, indent=2) + logger.info(f"Saved to: {output_file}") + return output_file + + def extract_and_save_subtopics(self) -> list[SubTopic]: + """Extract sub-topics and save results.""" + subtopics = extract_subtopics(self.capability, self._call_api) + self._save_json("subtopics.json", "sub_topics", subtopics) + return subtopics + + def find_and_save_combinations( + self, subtopics: list[SubTopic] + ) -> list[Combination]: + """Find valid combinations and save results.""" + combinations = 
find_valid_combinations( + self.capability, subtopics, self._call_api + ) + self._save_json("combinations.json", "valid_combinations", combinations) + return combinations + + def generate_and_save_blueprints( + self, combinations: list[Combination] + ) -> list[Blueprint]: + """Generate blueprints and save results.""" + blueprints = generate_blueprints( + self.capability, combinations, self._call_api, self.config + ) + self._save_json("blueprints.json", "blueprints", blueprints) + return blueprints + + def generate_and_save_tasks(self, blueprints: list[Blueprint]) -> list[Task]: + """Generate tasks and save results.""" + tasks_per_blueprint = self.config["generation"]["tasks_per_blueprint"] + tasks = generate_tasks( + self.capability, blueprints, self._call_api, tasks_per_blueprint + ) + self._save_json("tasks.json", "tasks", tasks) + return tasks + + def verify_and_save_tasks( + self, tasks: list[Task], blueprints: list[Blueprint] + ) -> VerificationResult: + """Verify tasks and save results.""" + verification = verify_tasks(self.capability, tasks, blueprints, self._call_api) + self._save_json("verification.json", None, verification) + return verification + + def run_full_pipeline(self) -> dict: + """Run the complete diverse task generation pipeline.""" + logger.info("=" * 80) + logger.info("Starting Diverse Task Generation Pipeline") + logger.info(f"Capability: {self.capability.name}") + logger.info(f"Model: {self.model_name}") + logger.info("=" * 80) + + # Extract sub-topics + subtopics = self.extract_and_save_subtopics() + + # Find valid combinations + combinations = self.find_and_save_combinations(subtopics) + + # Generate blueprints + blueprints = self.generate_and_save_blueprints(combinations) + + # Generate tasks + tasks = self.generate_and_save_tasks(blueprints) + + # Verify tasks + verification = self.verify_and_save_tasks(tasks, blueprints) + + # Compile final results + results = { + "capability_name": self.capability.name, + "capability_description": 
self.capability.description, + "capability_domain": self.capability.domain, + "model_name": self.model_name, + "timestamp": datetime.now().isoformat(), + "subtopics": [asdict(st) for st in subtopics], + "combinations": [asdict(c) for c in combinations], + "blueprints": [asdict(bp) for bp in blueprints], + "tasks": [asdict(t) for t in tasks], + "verification": verification, + } + + # Save final results + self._save_json("final_results.json", None, results) + + logger.info("=" * 80) + logger.info("Pipeline Complete!") + logger.info(f"All results saved to: {self.run_output_dir}") + logger.info("=" * 80) + + return results + + +def load_capability_from_json(capability_json_path: str) -> dict: + """Load capability information from a JSON file.""" + with open(capability_json_path, "r") as f: + return json.load(f) + + +def main() -> None: + """Generate diverse tasks for a single capability.""" + parser = argparse.ArgumentParser( + description="Generate diverse tasks for a capability from JSON file" + ) + parser.add_argument( + "--capability-json-path", + type=str, + help="Path to capability JSON file (default: from config file)", + ) + parser.add_argument( + "--model-name", + type=str, + help="OpenAI model name (default: from config file)", + ) + parser.add_argument( + "--output-dir", + type=str, + help="Output directory (default: from config file)", + ) + + args = parser.parse_args() + + # Load config + config_file = Path(__file__).parent / "diverse_task_config.yaml" + with open(config_file, "r") as f: + config = yaml.safe_load(f) + + # Override config with command-line arguments + if args.model_name: + config["model"]["name"] = args.model_name + if args.output_dir: + config["output"]["base_dir"] = args.output_dir + if args.capability_json_path: + config["input"]["capability_json_path"] = args.capability_json_path + + logger.info(f"Loading capability from: {config['input']['capability_json_path']}") + capability_dict = 
load_capability_from_json(config["input"]["capability_json_path"]) + + # Initialize and run generator + generator = DiverseTaskGenerator( + capability_dict=capability_dict, + config=config, + ) + generator.run_full_pipeline() + + logger.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/experimental/diverse_task_prompts.py b/experimental/diverse_task_prompts.py new file mode 100644 index 0000000..a2fa6fc --- /dev/null +++ b/experimental/diverse_task_prompts.py @@ -0,0 +1,416 @@ +""" +Prompts for the diverse task generation pipeline. + +Edit these prompts to customize the task generation behavior. +The main script can import these instead of using hardcoded prompts. +""" + +# ============================================================================= +# SUB-TOPIC EXTRACTION +# ============================================================================= + +SUBTOPIC_SYSTEM_PROMPT = """ +You are an expert educational scientist responsible for identifying comprehensible sub-topics for a given capability. + +The name, description, and domain/area of the capability will be provided. + +Your goal is to decompose the capability into meaningful sub-topics that together provide full and balanced coverage of testing the given capability. + +Respond precisely in the following format, including the JSON start and end markers: + +RESPONSE JSON: +{ + "sub_topics": [ + "", + "", + "" + ] +} + +List each sub-topic as a concise noun phrase (5–10 words). + +Avoid redundancy and ensure each sub-topic can be independently assessed through a test question. +""" + +SUBTOPIC_USER_PROMPT_TEMPLATE = """ +Identify the key sub-topics required to assess the following capability. + +Domain: {capability_domain} +Area: {area_text} +Capability Name: {capability_name} +Capability Description: {capability_description} + +Depending on the granularity of the capability, generate 2–10 sub-topics that comprehensively represent this capability. 
+""" + + +# ============================================================================= +# VALID COMBINATIONS +# ============================================================================= + +COMBINATION_SYSTEM_PROMPT = """ +You are an educational scientist responsible for determining which combinations of (Content, Difficulty, Reasoning) are valid and meaningful for task generation. + +The list of available sub-topics (Content dimension), difficulty levels, and reasoning categories (based on Bloom's taxonomy) will be provided. + +Your goal is to select combinations that make pedagogical sense — i.e., combinations where a valid and meaningful question could be designed for the given sub-topic, at the specified difficulty, requiring the indicated reasoning level. + +Respond precisely in the following format, including the JSON start and end markers: + +RESPONSE JSON: +{ + "valid_combinations": [ + { + "content": "", + "difficulty": "", + "reasoning": "" + }, + ... + ] +} + +For example, extremely high reasoning levels like "Create" may not apply to simple factual sub-topics, and very easy difficulties may not pair with "Evaluate" or "Analyze" levels. + +Guidelines: +- Select only combinations that would yield meaningful assessment tasks. + +- Ensure a balanced coverage across difficulties and reasoning levels if possible. + +- Avoid redundant combinations. +""" + +COMBINATION_USER_PROMPT_TEMPLATE = """ +Determine all valid and meaningful (Content, Difficulty, Reasoning) combinations for the given capability. + +Domain: {capability_domain} +Area: {capability_area} +Capability Name: {capability_name} +Capability Description: {capability_description} + +Sub-topics (Content dimension): +{subtopics_desc} + +Difficulty levels: +- Easy: Involves direct recall, recognition, or simple application of knowledge and procedures. +- Medium: Requires connecting multiple ideas, performing multi-step reasoning, or applying knowledge in new but familiar contexts. 
+- Hard: Involves complex reasoning, integration of several sub-topics, or solving non-trivial problems that demand deeper conceptual understanding. + +Reasoning types (Bloom's Taxonomy): +1. Remember – Recall or recognize facts, terms, and basic concepts. Example verbs: define, list, identify. +2. Understand – Explain ideas or concepts and interpret information in one's own words. Example verbs: summarize, describe, classify. +3. Apply – Use knowledge or methods in new but familiar situations. Example verbs: calculate, demonstrate, use, implement. +4. Analyze – Break information into parts and examine relationships or patterns. Example verbs: differentiate, compare, examine, infer. +5. Evaluate – Make judgments based on criteria and standards. Example verbs: justify, critique, assess, argue. +6. Create – Combine elements to form a new pattern, structure, or product. Example verbs: design, compose, formulate, generate. + +Your task: +Identify all combinations of (Content, Difficulty, Reasoning) that are valid and pedagogically meaningful for this capability. + +Avoid combinations that are unrealistic (e.g., "Remember" level with "Hard" difficulty) or redundant. + +Ensure each selected combination could correspond to a feasible assessment task. +""" + + +# ============================================================================= +# BLUEPRINT GENERATION +# ============================================================================= + +BLUEPRINT_SYSTEM_PROMPT = """ +You are an expert educational scientist designing task blueprints for an assessment generation framework. + +Given a (Content, Difficulty, Reasoning) combination for a specific capability, you must produce a clear and detailed blueprint describing what kind of question should be designed for that combination. + +A task blueprint is a natural-language description that specifies: +1. The core skill or concept being tested (based on the content/sub-topic). + +2. 
The expected cognitive process or reasoning level (based on Bloom's taxonomy). + +3. The intended level of challenge or complexity (based on difficulty). + +4. The type of task or question that would fit these criteria (e.g., conceptual explanation, computation, real-world application, analysis of case, critique, design, etc.). + +Respond precisely in the following format, including the JSON start and end markers: + +RESPONSE JSON: +{ + "blueprint": "" +} + +In , write a single coherent paragraph (3–5 sentences) describing how the task should look — what the student should be asked to do, what level of reasoning it should involve, and how difficulty manifests (e.g., unfamiliar data, abstract setting, multi-step reasoning, creative synthesis). + +Ensure the blueprint is descriptive, not a question itself. +""" + +BLUEPRINT_USER_PROMPT_TEMPLATE = """ +Generate a task blueprint for the following capability and combination. + +Domain: {capability_domain} +Area: {capability_area} +Capability Name: {capability_name} +Capability Description: {capability_description} + +Selected Combination: +- Content (Sub-topic): {subtopic} +- Difficulty: {difficulty} — {difficulty_description} +- Reasoning Type (Bloom's Taxonomy): {reasoning} — {reasoning_description} + +Write a detailed blueprint describing what kind of question should be generated for this combination. + +The blueprint should explain: +1. What the learner is expected to do. +2. What kind of reasoning the task requires. +3. How difficulty manifests in the structure or context of the task. +""" + + +# ============================================================================= +# TASK GENERATION +# ============================================================================= + +TASK_SYSTEM_PROMPT = """ +You are an expert educational scientist responsible for generating high-quality multiple-choice tasks. 
+ +Given a task blueprint that describes what the question should assess, your goal is to write a complete multiple-choice question that: + +1. Accurately reflects the blueprint and capability description. + +2. Includes exactly four answer options. + +3. Has ONLY one correct answer. + +4. Uses clear and unambiguous wording. + +5. Ensures that incorrect options (distractors) are plausible but clearly wrong when the concept is understood correctly. + +Respond precisely in the following format, including the JSON start and end markers: + +RESPONSE JSON: +{ + "question": "", + "options": { + "A": "