From f0cf76002f08e11e277fa9a1ca93e1657b8f63f2 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Tue, 26 Aug 2025 13:47:06 -0400 Subject: [PATCH 01/19] adding refactored task generation. updated prompts to ask for json outputs, and updated corresponding output parser. --- src/agentic_task_generator.py | 110 ++++++-- src/task_generation/__init__.py | 12 + src/task_generation/generator.py | 341 +++++++++++++++++++++++ src/task_generation/messages.py | 74 +++++ src/task_generation/moderator.py | 462 +++++++++++++++++++++++++++++++ src/task_generation/scientist.py | 244 ++++++++++++++++ src/utils/agentic_prompts.py | 58 ++-- 7 files changed, 1263 insertions(+), 38 deletions(-) create mode 100644 src/task_generation/__init__.py create mode 100644 src/task_generation/generator.py create mode 100644 src/task_generation/messages.py create mode 100644 src/task_generation/moderator.py create mode 100644 src/task_generation/scientist.py diff --git a/src/agentic_task_generator.py b/src/agentic_task_generator.py index 62a6a10..439f7a1 100644 --- a/src/agentic_task_generator.py +++ b/src/agentic_task_generator.py @@ -2,39 +2,111 @@ import asyncio import logging +import os import traceback import hydra +import openlit +from langfuse import Langfuse from omegaconf import DictConfig, OmegaConf -from .task_generation import generate_tasks +from src.task_generation import generate_tasks +# Suppress OpenTelemetry console output +os.environ["OTEL_LOG_LEVEL"] = "ERROR" +os.environ["OTEL_METRICS_EXPORTER"] = "none" +os.environ["OTEL_PYTHON_LOG_CORRELATION"] = "false" +os.environ["OTEL_PYTHON_LOG_LEVEL"] = "ERROR" + log = logging.getLogger("agentic_task_gen") @hydra.main(version_base=None, config_path="cfg", config_name="agentic_config") def main(cfg: DictConfig) -> None: """Run the multi-agent task generation system.""" - log.info("Starting multi-agent task generation") - log.info("Configuration:\n%s", OmegaConf.to_yaml(cfg, resolve=True)) - - # Check for capabilities_tag parameter 
capabilities_tag = cfg.pipeline_tags.capabilities_tag - if capabilities_tag: - log.info(f"Using capabilities from tag: {capabilities_tag}") - else: - log.warning( - "No capabilities_tag provided. Please provide --pipeline_tags.capabilities_tag= to specify which capabilities to use." - ) - return - - try: - asyncio.run(generate_tasks(cfg, capabilities_tag)) - except Exception as e: - log.error(f"Task generation failed: {e}") - log.error(f"Full traceback: {traceback.format_exc()}") - raise + domain_name = cfg.global_cfg.domain + exp_id = cfg.exp_cfg.exp_id + + langfuse_client = Langfuse() + openlit.init( + tracer=langfuse_client._otel_tracer, disable_batch=True, disable_metrics=True + ) + + with langfuse_client.start_as_current_span( + name=f"ace_agentic_task_generation:{domain_name}:{exp_id}" + ) as span: + try: + msg = "Starting multi-agent task generation" + log.info(msg) + span.update(metadata={"system_started": msg}) + + config_yaml = OmegaConf.to_yaml(cfg, resolve=True) + msg = "Configuration loaded" + log.info("Configuration:\n%s", config_yaml) + span.update( + metadata={ + "configuration_loaded": msg, + "config": config_yaml, + "domain": domain_name, + "exp_id": exp_id, + } + ) + + if capabilities_tag: + msg = f"Using capabilities from tag: {capabilities_tag}" + log.info(msg) + span.update( + metadata={ + "capabilities_tag_found": msg, + "capabilities_tag": capabilities_tag, + } + ) + else: + error_msg = "No capabilities_tag provided. Please provide pipeline_tags.capabilities_tag= to specify which capabilities to use." 
+ log.warning(error_msg) + span.update( + level="WARNING", + status_message="Missing capabilities_tag", + metadata={"capabilities_tag_missing": error_msg}, + ) + return + + span.update_trace( + metadata={ + "domain": domain_name, + "exp_id": exp_id, + "capabilities_tag": capabilities_tag, + "config": config_yaml, + }, + tags=["agentic_task_generation", exp_id], + ) + + asyncio.run(generate_tasks(cfg, capabilities_tag, langfuse_client)) + + msg = "Multi-agent task generation completed successfully" + log.info(msg) + span.update(metadata={"system_completed": msg}) + + except Exception as e: + error_msg = f"Task generation failed: {e}" + traceback_msg = f"Full traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "system_error": error_msg, + "error": str(e), + "traceback": traceback_msg, + }, + ) + + raise if __name__ == "__main__": diff --git a/src/task_generation/__init__.py b/src/task_generation/__init__.py new file mode 100644 index 0000000..2598dec --- /dev/null +++ b/src/task_generation/__init__.py @@ -0,0 +1,12 @@ +"""Task generation package for multi-agent debate-based task generation.""" + +from .generator import generate_tasks +from .moderator import TaskModerator +from .scientist import TaskScientist + + +__all__ = [ + "generate_tasks", + "TaskModerator", + "TaskScientist", +] diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py new file mode 100644 index 0000000..5be2742 --- /dev/null +++ b/src/task_generation/generator.py @@ -0,0 +1,341 @@ +"""Main task generation orchestration functions.""" + +import asyncio +import json +import logging +import traceback +from datetime import datetime +from pathlib import Path + +from autogen_core import ( + EVENT_LOGGER_NAME, + ROOT_LOGGER_NAME, + TRACE_LOGGER_NAME, + DefaultTopicId, + SingleThreadedAgentRuntime, +) +from autogen_ext.models.openai import OpenAIChatCompletionClient 
+from langfuse import Langfuse +from omegaconf import DictConfig + +from src.task_generation.messages import Capability +from src.task_generation.moderator import TaskModerator +from src.task_generation.scientist import TaskScientist + + +log = logging.getLogger("agentic_task_gen.generator") +logging.getLogger(ROOT_LOGGER_NAME).setLevel(logging.WARNING) +logging.getLogger(TRACE_LOGGER_NAME).setLevel(logging.WARNING) +logging.getLogger(EVENT_LOGGER_NAME).setLevel(logging.WARNING) + + +async def generate_tasks_for_capability( + cfg: DictConfig, capability: Capability, output_dir: Path, langfuse_client: Langfuse +) -> None: + """Generate tasks for a single capability.""" + with langfuse_client.start_as_current_span( + name=f"task_generation_for_capability:{capability.name}" + ) as span: + try: + msg = f"Generating tasks for capability: {capability.name}" + log.info(msg) + span.update( + metadata={ + "capability_generation_started": msg, + "capability_name": capability.name, + "capability_description": capability.description, + } + ) + + domain_name = cfg.global_cfg.domain + + runtime = SingleThreadedAgentRuntime() + + # Register scientists + await TaskScientist.register( + runtime, + "TaskScientistA", + lambda: TaskScientist( + model_client=OpenAIChatCompletionClient( + model=cfg.agents.scientist_a.model_name, + seed=cfg.agents.scientist_a.seed, + ), + scientist_id="A", + domain=domain_name, + langfuse_client=langfuse_client, + ), + ) + + await TaskScientist.register( + runtime, + "TaskScientistB", + lambda: TaskScientist( + model_client=OpenAIChatCompletionClient( + model=cfg.agents.scientist_b.model_name, + seed=cfg.agents.scientist_b.seed, + ), + scientist_id="B", + domain=domain_name, + langfuse_client=langfuse_client, + ), + ) + + # Register moderator + await TaskModerator.register( + runtime, + "TaskModerator", + lambda: TaskModerator( + model_client=OpenAIChatCompletionClient( + model=cfg.agents.moderator.model_name, + seed=cfg.agents.moderator.seed, + ), + 
num_scientists=2, + num_final_problems=cfg.task_generation.num_final_problems_per_capability, + buffer_param=cfg.task_generation.buffer_param, + agreement_threshold=cfg.task_generation.agreement_threshold, + output_dir=output_dir, + domain=domain_name, + langfuse_client=langfuse_client, + ), + ) + + span.update( + metadata={ + "agents_registered": "All task agents registered successfully", + "scientists": ["A", "B"], + "moderator": True, + } + ) + + # Start runtime and process the capability + runtime.start() + await runtime.publish_message(capability, DefaultTopicId()) + + msg = f"Capability message published: {capability.name}" + log.info(msg) + span.update( + metadata={ + "capability_published": msg, + "capability_name": capability.name, + } + ) + + # Wait for the runtime to stop when idle + try: + await runtime.stop_when_idle() + + msg = f"Completed generating tasks for capability: {capability.name}" + log.info(msg) + span.update(metadata={"runtime_completed": msg}) + except Exception as e: + msg = f"Error while generating tasks for capability {capability.name}: {e}" + log.error(msg) + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "runtime_error": msg, + "error": str(e), + "capability_name": capability.name, + }, + ) + raise + + except Exception as e: + error_msg = f"Error in generating tasks for {capability.name}: {e}" + traceback_msg = f"Traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "capability_generation_error": error_msg, + "error": str(e), + "traceback": traceback_msg, + }, + ) + raise + + +async def generate_tasks( + cfg: DictConfig, capabilities_tag: str, langfuse_client: Langfuse +) -> None: + """Generate tasks for all capabilities.""" + domain_name = cfg.global_cfg.domain + exp_id = cfg.exp_cfg.exp_id + tasks_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + with langfuse_client.start_as_current_span( + 
name=f"ace_task_generation:{domain_name}:{exp_id}:{tasks_tag}" + ) as span: + try: + msg = f"Tasks will be saved with tag: {tasks_tag}" + log.info(msg) + span.update( + metadata={ + "generation_started": msg, + "tasks_tag": tasks_tag, + "domain": domain_name, + "exp_id": exp_id, + } + ) + + msg = "Starting task generation process" + log.info(msg) + span.update(metadata={"process_started": msg}) + + span.update_trace( + metadata={ + "domain": domain_name, + "exp_id": exp_id, + "tasks_tag": tasks_tag, + "capabilities_tag": capabilities_tag, + "num_problems_per_capability": cfg.task_generation.num_final_problems_per_capability, + }, + tags=["task_generation_process", exp_id], + ) + + # Read capabilities from the timestamped capabilities directory + capabilities_dir = ( + Path.home() + / cfg.global_cfg.output_dir + / domain_name.replace(" ", "_") + / exp_id + / "capabilities" + / capabilities_tag + ) + + if not capabilities_dir.exists(): + error_msg = f"Capabilities directory not found: {capabilities_dir}" + log.error(error_msg) + span.update( + level="ERROR", + status_message="Capabilities directory not found", + metadata={ + "directory_not_found_error": error_msg, + "capabilities_dir": str(capabilities_dir), + }, + ) + raise FileNotFoundError(error_msg) + + capabilities = [] + + # Iterate through area directories + for area_dir in capabilities_dir.iterdir(): + if area_dir.is_dir(): + capabilities_file = area_dir / "capabilities.json" + if capabilities_file.exists(): + with open(capabilities_file, "r", encoding="utf-8") as f: + capabilities_data = json.load(f) + + if ( + isinstance(capabilities_data, dict) + and "capabilities" in capabilities_data + ): + for cap_dict in capabilities_data["capabilities"]: + if ( + isinstance(cap_dict, dict) + and "name" in cap_dict + and "description" in cap_dict + ): + capabilities.append( + Capability( + name=cap_dict["name"], + description=cap_dict["description"], + domain=cap_dict.get("domain", domain_name), + 
area=cap_dict.get("area", area_dir.name), + ) + ) + + if not capabilities: + error_msg = f"No valid capabilities found in {capabilities_dir}" + span.update( + level="ERROR", + status_message="No valid capabilities found", + metadata={ + "no_capabilities_error": error_msg, + "capabilities_dir": str(capabilities_dir), + }, + ) + raise ValueError(error_msg) + + msg = f"Found {len(capabilities)} capabilities to process" + log.info(msg) + span.update( + metadata={ + "capabilities_loaded": msg, + "num_capabilities": len(capabilities), + "capability_names": [cap.name for cap in capabilities], + } + ) + + # Create timestamped output directory for tasks + output_dir = ( + Path.home() + / cfg.global_cfg.output_dir + / domain_name.replace(" ", "_") + / exp_id + / "tasks" + / tasks_tag + ) + + msg = f"Output directory: {output_dir}" + log.info(msg) + span.update( + metadata={ + "output_directory_configured": msg, + "output_dir": str(output_dir), + } + ) + + # Print the timestamp for future reference + print(f"Tasks generated with tag: {tasks_tag}") + + # Process each capability individually + for i, capability in enumerate(capabilities): + msg = f"Processing capability {i + 1}/{len(capabilities)}: {capability.name}" + log.info(msg) + span.update( + metadata={ + f"capability_{i + 1}_started": msg, + "current_capability": capability.name, + "progress": f"{i + 1}/{len(capabilities)}", + } + ) + + await generate_tasks_for_capability( + cfg, capability, output_dir, langfuse_client + ) + + msg = f"Completed capability {i + 1}/{len(capabilities)}: {capability.name}" + log.info(msg) + span.update( + metadata={ + f"capability_{i + 1}_completed": msg, + "completed_capability": capability.name, + } + ) + + await asyncio.sleep(1) + + except Exception as e: + error_msg = f"Error in generate_tasks: {e}" + traceback_msg = f"Traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + 
"generation_error": error_msg, + "error": str(e), + "traceback": traceback_msg, + }, + ) + + raise diff --git a/src/task_generation/messages.py b/src/task_generation/messages.py new file mode 100644 index 0000000..09b5e9d --- /dev/null +++ b/src/task_generation/messages.py @@ -0,0 +1,74 @@ +"""Message types and data classes for task generation.""" + +from dataclasses import dataclass +from typing import Dict, List + + +@dataclass +class Capability: + """A capability with name, description, domain, and area.""" + + name: str + description: str + domain: str + area: str + + +@dataclass +class ProblemProposalRequest: + """Request for problem proposals from scientists.""" + + capability_name: str + capability_description: str + capability_domain: str + capability_area: str + num_problems: int + sample_tasks: List[str] + + +@dataclass +class ScientistProblemProposal: + """Problem proposal from a scientist.""" + + scientist_id: str + capability_name: str + problems: Dict[str, str] # task_id -> task_text + iteration: int + + +@dataclass +class ModeratorProblemReview: + """Moderator's review and filtering of problems.""" + + capability_name: str + final_problems: Dict[str, str] # task_id -> task_text + rejected_problems: Dict[str, str] # task_id -> rejection_reason + iteration: int + + +@dataclass +class SolutionRequest: + """Request for scientists to solve problems.""" + + capability_name: str + capability_description: str + capability_domain: str + capability_area: str + problems: Dict[str, str] # task_id -> task_text + + +@dataclass +class ScientistSolutionProposal: + """Solution proposal from a scientist.""" + + scientist_id: str + capability_name: str + solutions: Dict[str, str] # task_id -> solution + + +@dataclass +class FinalTaskSet: + """Final task set with problems and solutions.""" + + capability_name: str + tasks: Dict[str, Dict[str, str]] # task_id -> {problem, answer} diff --git a/src/task_generation/moderator.py b/src/task_generation/moderator.py new file 
mode 100644 index 0000000..9e5c96b --- /dev/null +++ b/src/task_generation/moderator.py @@ -0,0 +1,462 @@ +"""Task moderator agent for managing task generation workflow.""" + +import json +import logging +import math +import traceback +from pathlib import Path +from typing import Dict, List + +from autogen_core import ( + DefaultTopicId, + MessageContext, + RoutedAgent, + default_subscription, + message_handler, +) +from autogen_core.models import ( + ChatCompletionClient, + SystemMessage, + UserMessage, +) +from langfuse import Langfuse + +from src.task_generation.messages import ( + Capability, + ProblemProposalRequest, + ScientistProblemProposal, + ScientistSolutionProposal, + SolutionRequest, +) +from src.utils.agentic_prompts import ( + TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT, + TASK_MODERATOR_PROBLEM_USER_PROMPT, +) +from src.utils.json_utils import parse_llm_json_response + + +log = logging.getLogger("agentic_task_gen.moderator") + + +@default_subscription +class TaskModerator(RoutedAgent): + """Moderator that merges scientist task proposals and manages iteration.""" + + def __init__( + self, + model_client: ChatCompletionClient, + num_scientists: int, + num_final_problems: int, + buffer_param: int, + agreement_threshold: float, + output_dir: Path, + domain: str, + langfuse_client: Langfuse, + ) -> None: + super().__init__("Task Moderator") + self._model_client = model_client + self._num_scientists = num_scientists + self._num_final_problems = num_final_problems + self._buffer_param = buffer_param + self._agreement_threshold = agreement_threshold + self._output_dir = output_dir + self._domain = domain + self._langfuse_client = langfuse_client + + # Algorithm 1 state + self._num_remaining: Dict[str, int] = {} + self._final_problems: Dict[ + str, Dict[str, str] + ] = {} # capability -> {task_id: problem_text} + self._capabilities: Dict[str, Capability] = {} # Store original capability info + + # Problem design state + self._problem_proposals: Dict[ + str, 
List[ScientistProblemProposal] + ] = {} # capability -> proposals + + # Solution design state + self._solution_proposals: Dict[ + str, List[ScientistSolutionProposal] + ] = {} # capability -> solutions + + @message_handler + async def handle_capability(self, message: Capability, ctx: MessageContext) -> None: + """Handle capability and start Algorithm 1 for problem design.""" + with self._langfuse_client.start_as_current_span( + name="task_moderator_handle_capability" + ) as span: + try: + msg = f"Task Moderator starting problem design for capability: {message.name}" + log.info(msg) + span.update( + metadata={ + "capability_received": msg, + "capability_name": message.name, + "capability_description": message.description, + "capability_area": message.area, + } + ) + + # Initialize Algorithm 1 state + self._num_remaining[message.name] = self._num_final_problems + self._final_problems[message.name] = {} + self._capabilities[message.name] = ( + message # Store original capability info + ) + + await self._start_problem_iteration(message) + + except Exception as e: + error_msg = f"Error in Task Moderator handle_capability: {e}" + traceback_msg = f"Traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "handle_capability_error": error_msg, + "error": str(e), + "traceback": traceback_msg, + }, + ) + raise + + async def _start_problem_iteration(self, capability: Capability) -> None: + """Start a problem generation iteration (Algorithm 1).""" + try: + num_remaining = self._num_remaining[capability.name] + if num_remaining <= 0: + log.info( + f"Problem design completed for capability: {capability.name}, starting solution design" + ) + await self._start_solution_design(capability) + return + + # Calculate problems per scientist: ceil(num_remaining / M) + B + problems_per_scientist = ( + math.ceil(num_remaining / self._num_scientists) + self._buffer_param + ) + + 
log.info( + f"Task Moderator requesting {problems_per_scientist} problems per scientist for capability: {capability.name} (remaining: {num_remaining})" + ) + + # Get sample tasks from existing final problems + sample_tasks = list(self._final_problems[capability.name].values())[ + :3 + ] # Use up to 3 existing problems as samples + + # Send problem proposal requests to all scientists + await self.publish_message( + ProblemProposalRequest( + capability_name=capability.name, + capability_description=capability.description, + capability_domain=capability.domain, + capability_area=capability.area, + num_problems=problems_per_scientist, + sample_tasks=sample_tasks, + ), + topic_id=DefaultTopicId(), + ) + + except Exception as e: + log.error(f"Error in Task Moderator _start_problem_iteration: {e}") + log.error(f"Traceback: {traceback.format_exc()}") + raise + + @message_handler + async def handle_scientist_problem_proposal( + self, message: ScientistProblemProposal, ctx: MessageContext + ) -> None: + """Handle problem proposals from scientists.""" + try: + log.info( + f"Task Moderator received problem proposal from Scientist {message.scientist_id} for capability: {message.capability_name}" + ) + + capability_name = message.capability_name + if capability_name not in self._problem_proposals: + self._problem_proposals[capability_name] = [] + + self._problem_proposals[capability_name].append(message) + + # Check if we have all proposals for this iteration + current_proposals = [ + p + for p in self._problem_proposals[capability_name] + if p.iteration == message.iteration + ] + if len(current_proposals) == self._num_scientists: + log.info( + f"Task Moderator received all problem proposals for capability: {capability_name}, proceeding to filter" + ) + await self._filter_and_select_problems( + capability_name, message.iteration + ) + + except Exception as e: + log.error(f"Error in Task Moderator handle_scientist_problem_proposal: {e}") + log.error(f"Traceback: 
{traceback.format_exc()}") + raise + + async def _filter_and_select_problems( + self, capability_name: str, iteration: int + ) -> None: + """Filter and select problems using moderator LLM.""" + try: + log.info( + f"Task Moderator filtering problems for capability: {capability_name}" + ) + + # Collect all proposed problems + current_proposals = [ + p + for p in self._problem_proposals[capability_name] + if p.iteration == iteration + ] + all_problems = {} + scientist_attribution = {} + + for proposal in current_proposals: + for task_id, problem_text in proposal.problems.items(): + unique_id = f"{proposal.scientist_id}_{task_id}" + all_problems[unique_id] = problem_text + scientist_attribution[unique_id] = proposal.scientist_id + + if not all_problems: + log.warning(f"No problems received for capability: {capability_name}") + return + + # Format problems for moderator + problems_text = "" + for scientist_id in set(scientist_attribution.values()): + problems_text += f"Scientist {scientist_id}:\n" + for task_id, problem in all_problems.items(): + if scientist_attribution[task_id] == scientist_id: + task_name = task_id.split("_", 1)[1] # Remove scientist prefix + problems_text += f"- {task_name}: {problem}\n" + problems_text += "\n" + + system_prompt = TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT + + capability_info = self._capabilities[capability_name] + user_prompt = TASK_MODERATOR_PROBLEM_USER_PROMPT.format( + capability_name=capability_info.name, + capability_description=capability_info.description, + capability_domain=capability_info.domain, + problems_text=problems_text, + ) + + system_message = SystemMessage(content=system_prompt) + user_message = UserMessage(content=user_prompt, source="user") + + model_result = await self._model_client.create( + [system_message, user_message] + ) + + raw_content = model_result.content + if not isinstance(raw_content, str): + raw_content = str(raw_content) + + # Extract JSON from response using robust parser + try: + parsed = 
parse_llm_json_response(raw_content) + final_tasks = parsed.get("final_tasks", {}) + rejected_tasks = parsed.get("rejected_tasks", {}) + except Exception as e: + log.error( + f"Error parsing JSON from moderator: {e}\nOutput: {raw_content}" + ) + final_tasks = {} + rejected_tasks = {} + + # Update Algorithm 1 state + num_remaining = self._num_remaining[capability_name] + num_selected = min(len(final_tasks), num_remaining) + + # Add selected problems to final set + selected_count = 0 + for _, problem_text in final_tasks.items(): + if selected_count < num_selected: + final_task_id = ( + f"task_{len(self._final_problems[capability_name]) + 1}" + ) + self._final_problems[capability_name][final_task_id] = problem_text + selected_count += 1 + + # Update remaining count + self._num_remaining[capability_name] = num_remaining - selected_count + + log.info( + f"Task Moderator selected {selected_count} problems for {capability_name}, {self._num_remaining[capability_name]} remaining" + ) + log.info( + f"Rejected {len(rejected_tasks)} problems: {list(rejected_tasks.keys())}" + ) + + # Continue Algorithm 1 or move to solution design + if self._num_remaining[capability_name] > 0: + # Need more problems, start another iteration + capability = self._capabilities[capability_name] + await self._start_problem_iteration(capability) + else: + # Problem design complete, start solution design + capability = self._capabilities[capability_name] + await self._start_solution_design(capability) + + except Exception as e: + log.error(f"Error in Task Moderator _filter_and_select_problems: {e}") + log.error(f"Traceback: {traceback.format_exc()}") + raise + + async def _start_solution_design(self, capability: Capability) -> None: + """Start solution design phase.""" + try: + log.info( + f"Task Moderator starting solution design for capability: {capability.name}" + ) + + final_problems = self._final_problems[capability.name] + if not final_problems: + log.error( + f"No final problems available for 
capability: {capability.name}" + ) + return + + # Send solution requests to all scientists + await self.publish_message( + SolutionRequest( + capability_name=capability.name, + capability_description=capability.description, + capability_domain=capability.domain, + capability_area=capability.area, + problems=final_problems, + ), + topic_id=DefaultTopicId(), + ) + + except Exception as e: + log.error(f"Error in Task Moderator _start_solution_design: {e}") + log.error(f"Traceback: {traceback.format_exc()}") + raise + + @message_handler + async def handle_scientist_solution_proposal( + self, message: ScientistSolutionProposal, ctx: MessageContext + ) -> None: + """Handle solution proposals from scientists.""" + try: + log.info( + f"Task Moderator received solution proposal from Scientist {message.scientist_id} for capability: {message.capability_name}" + ) + + capability_name = message.capability_name + if capability_name not in self._solution_proposals: + self._solution_proposals[capability_name] = [] + + self._solution_proposals[capability_name].append(message) + + # Check if we have all solutions + if len(self._solution_proposals[capability_name]) == self._num_scientists: + log.info( + f"Task Moderator received all solutions for capability: {capability_name}, determining consensus" + ) + await self._determine_solution_consensus(capability_name) + + except Exception as e: + log.error( + f"Error in Task Moderator handle_scientist_solution_proposal: {e}" + ) + log.error(f"Traceback: {traceback.format_exc()}") + raise + + async def _determine_solution_consensus(self, capability_name: str) -> None: + """Determine solution consensus and finalize tasks.""" + try: + log.info( + f"Task Moderator determining solution consensus for capability: {capability_name}" + ) + + solutions_by_task: Dict[ + str, Dict[str, str] + ] = {} # task_id -> [scientist_id -> solution] + + for proposal in self._solution_proposals[capability_name]: + for task_id, solution in 
proposal.solutions.items(): + if task_id not in solutions_by_task: + solutions_by_task[task_id] = {} + solutions_by_task[task_id][proposal.scientist_id] = solution + + final_tasks = {} + + for task_id, problem_text in self._final_problems[capability_name].items(): + if task_id in solutions_by_task: + scientist_solutions = solutions_by_task[task_id] + + # Simple consensus: find most common solution + solution_counts: Dict[str, int] = {} + for solution in scientist_solutions.values(): + solution_counts[solution] = solution_counts.get(solution, 0) + 1 + + if solution_counts: + most_common_solution = max( + solution_counts.keys(), key=lambda x: solution_counts[x] + ) + agreement_rate = solution_counts[most_common_solution] / len( + scientist_solutions + ) + + if agreement_rate >= self._agreement_threshold: + final_tasks[task_id] = { + "problem": problem_text, + "answer": most_common_solution, + } + log.info( + f"Task {task_id}: consensus achieved ({agreement_rate:.2f} agreement)" + ) + else: + log.warning( + f"Task {task_id}: low agreement ({agreement_rate:.2f}), requires human review" + ) + # For now, use most common solution but mark it + final_tasks[task_id] = { + "problem": problem_text, + "answer": most_common_solution, + "requires_human_review": "true", + "agreement_rate": str(agreement_rate), + } + + # Save final tasks + await self._save_tasks_to_file(capability_name, final_tasks) + log.info(f"Task generation completed for capability: {capability_name}") + + except Exception as e: + log.error(f"Error in Task Moderator _determine_solution_consensus: {e}") + log.error(f"Traceback: {traceback.format_exc()}") + raise + + async def _save_tasks_to_file( + self, capability_name: str, tasks: Dict[str, Dict[str, str]] + ) -> None: + """Save final tasks to file.""" + try: + # Create capability directory + capability_dir = self._output_dir / capability_name + capability_dir.mkdir(parents=True, exist_ok=True) + + # Save tasks + tasks_file = capability_dir / "tasks.json" + 
with open(tasks_file, "w", encoding="utf-8") as f: + json.dump({"tasks": tasks}, f, indent=2, ensure_ascii=False) + + log.info( + f"Saved {len(tasks)} tasks for capability '{capability_name}' to {tasks_file}" + ) + except Exception as e: + log.error(f"Error saving tasks for capability {capability_name}: {e}") + log.error(f"Traceback: {traceback.format_exc()}") + raise diff --git a/src/task_generation/scientist.py b/src/task_generation/scientist.py new file mode 100644 index 0000000..2daa571 --- /dev/null +++ b/src/task_generation/scientist.py @@ -0,0 +1,244 @@ +"""Task scientist agent for generating problems and solutions.""" + +import json +import logging +import traceback + +from autogen_core import ( + DefaultTopicId, + MessageContext, + RoutedAgent, + default_subscription, + message_handler, +) +from autogen_core.models import ( + ChatCompletionClient, + SystemMessage, + UserMessage, +) +from langfuse import Langfuse + +from src.task_generation.messages import ( + ProblemProposalRequest, + ScientistProblemProposal, + ScientistSolutionProposal, + SolutionRequest, +) +from src.utils.agentic_prompts import ( + TASK_SCIENTIST_PROBLEM_SYSTEM_PROMPT, + TASK_SCIENTIST_PROBLEM_USER_PROMPT, + TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT, + TASK_SCIENTIST_SOLUTION_USER_PROMPT, +) +from src.utils.json_utils import parse_llm_json_response + + +log = logging.getLogger("agentic_task_gen.scientist") + + +@default_subscription +class TaskScientist(RoutedAgent): + """Scientist that generates problems and solutions.""" + + def __init__( + self, + model_client: ChatCompletionClient, + scientist_id: str, + langfuse_client: Langfuse, + domain: str = "", + ) -> None: + super().__init__(f"Task Scientist {scientist_id}") + self._scientist_id = scientist_id + self._model_client = model_client + self._domain = domain + self._langfuse_client = langfuse_client + + @message_handler + async def handle_problem_proposal_request( + self, message: ProblemProposalRequest, ctx: MessageContext + ) -> 
None: + """Handle problem proposal request.""" + with self._langfuse_client.start_as_current_span( + name=f"task_scientist_{self._scientist_id}_problem_proposal" + ) as span: + try: + msg = f"Task Scientist {self._scientist_id} generating {message.num_problems} problems for capability: {message.capability_name}" + log.info(msg) + span.update( + metadata={ + "problem_request_received": msg, + "scientist_id": self._scientist_id, + "capability_name": message.capability_name, + "capability_description": message.capability_description, + "num_problems": message.num_problems, + } + ) + + sample_tasks_text = "" + if message.sample_tasks: + sample_tasks_text = "\n".join( + [f"- {task}" for task in message.sample_tasks] + ) + else: + sample_tasks_text = "(No sample tasks provided)" + + system_prompt = TASK_SCIENTIST_PROBLEM_SYSTEM_PROMPT.format( + scientist_id=self._scientist_id, + ) + + user_prompt = TASK_SCIENTIST_PROBLEM_USER_PROMPT.format( + num_problems=message.num_problems, + capability_name=message.capability_name, + capability_description=message.capability_description, + capability_domain=message.capability_domain, + sample_tasks_text=sample_tasks_text, + ) + + system_message = SystemMessage(content=system_prompt) + user_message = UserMessage(content=user_prompt, source="user") + + model_result = await self._model_client.create( + [system_message, user_message] + ) + + msg = f"Task Scientist {self._scientist_id} is parsing LLM response" + log.info(msg) + span.update( + metadata={ + "llm_response_received": msg, + "scientist_id": self._scientist_id, + } + ) + + parsed = parse_llm_json_response(model_result.content) + problems = parsed.get("problems", {}) + + msg = f"Task Scientist {self._scientist_id} proposing {len(problems)} problems for capability: {message.capability_name}" + log.info(msg) + span.update( + metadata={ + "problem_proposal_published": msg, + "scientist_id": self._scientist_id, + "capability_name": message.capability_name, + 
"num_problems_generated": len(problems), + } + ) + + await self.publish_message( + ScientistProblemProposal( + scientist_id=self._scientist_id, + capability_name=message.capability_name, + problems=problems, + iteration=0, + ), + topic_id=DefaultTopicId(), + ) + + except Exception as e: + error_msg = f"Error in Task Scientist {self._scientist_id} handle_problem_proposal_request: {e}" + traceback_msg = f"Traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "problem_request_error": error_msg, + "scientist_id": self._scientist_id, + "error": str(e), + "traceback": traceback_msg, + }, + ) + raise + + @message_handler + async def handle_solution_request( + self, message: SolutionRequest, ctx: MessageContext + ) -> None: + """Handle solution request for problems.""" + with self._langfuse_client.start_as_current_span( + name=f"task_scientist_{self._scientist_id}_solution_proposal" + ) as span: + try: + msg = f"Task Scientist {self._scientist_id} solving {len(message.problems)} problems for capability: {message.capability_name}" + log.info(msg) + span.update( + metadata={ + "solution_request_received": msg, + "scientist_id": self._scientist_id, + "capability_name": message.capability_name, + "num_problems": len(message.problems), + } + ) + + problems_json = json.dumps(message.problems, indent=2) + + system_prompt = TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT.format( + scientist_id=self._scientist_id, + capability_domain=message.capability_domain, + capability_name=message.capability_name, + ) + + user_prompt = TASK_SCIENTIST_SOLUTION_USER_PROMPT.format( + problems=problems_json, + ) + + system_message = SystemMessage(content=system_prompt) + user_message = UserMessage(content=user_prompt, source="user") + + model_result = await self._model_client.create( + [system_message, user_message] + ) + + msg = f"Task Scientist {self._scientist_id} is parsing LLM response" + 
log.info(msg) + span.update( + metadata={ + "llm_response_received": msg, + "scientist_id": self._scientist_id, + } + ) + + parsed = parse_llm_json_response(model_result.content) + solutions = parsed.get("solutions", {}) + + msg = f"Task Scientist {self._scientist_id} publishing solutions for capability: {message.capability_name}" + log.info(msg) + span.update( + metadata={ + "solution_proposal_published": msg, + "scientist_id": self._scientist_id, + "capability_name": message.capability_name, + "num_solutions_generated": len(solutions), + } + ) + + await self.publish_message( + ScientistSolutionProposal( + scientist_id=self._scientist_id, + capability_name=message.capability_name, + solutions=solutions, + ), + topic_id=DefaultTopicId(), + ) + + except Exception as e: + error_msg = f"Error in Task Scientist {self._scientist_id} handle_solution_request: {e}" + traceback_msg = f"Traceback: {traceback.format_exc()}" + + log.error(error_msg) + log.error(traceback_msg) + + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "solution_request_error": error_msg, + "scientist_id": self._scientist_id, + "error": str(e), + "traceback": traceback_msg, + }, + ) + raise diff --git a/src/utils/agentic_prompts.py b/src/utils/agentic_prompts.py index 00d1f86..b4a0d26 100644 --- a/src/utils/agentic_prompts.py +++ b/src/utils/agentic_prompts.py @@ -202,13 +202,16 @@ - Avoiding overlap or redundancy, - Proposing tasks that vary in difficulty and structure. -Your response must follow this format exactly: -THOUGHT: -RESPONSE JSON: +IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. The JSON should be directly parseable. + +Please return your proposal and your thoughts and reasoning in the following format: {{ - "task_1": "", - "task_2": "", - ... 
+ "thought": "Your reasoning and thought process about the kind of tasks you're proposing", + "problems": {{ + "problem_0": "TASK_TEXT_1", + "problem_1": "TASK_TEXT_2", + ... + }} }} Make sure: @@ -227,13 +230,25 @@ TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT = """You are Scientist {scientist_id}, an expert in {capability_domain}. You are solving a task related to the capability: {capability_name}. -Provide a clear, accurate, and complete solution to the given problem. Your solution should be correct and well-reasoned.""" +IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. The JSON should be directly parseable. -TASK_SCIENTIST_SOLUTION_USER_PROMPT = """Solve the following problem: +Please return your solution and your thoughts and reasoning in the following format: +{{ + "thought": "Your reasoning and thought process about solving this problem", + "solutions": {{ + "solution_0": "SOLUTION_TEXT_1", + "solution_1": "SOLUTION_TEXT_2", + ... + }} +}} + +Provide clear, accurate, and complete solutions. Your solutions should be correct and well-reasoned.""" -{problem_text} +TASK_SCIENTIST_SOLUTION_USER_PROMPT = """Solve the following problems: -Provide your solution clearly and concisely.""" +{problems} + +Provide your solutions clearly and concisely.""" TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT = """You are the Moderator overseeing capability-based task design. Your task is to review proposed tasks from multiple scientist agents and synthesize a final, high-quality task set for the capability. @@ -243,22 +258,27 @@ - Ensure that the final set of tasks is diverse, non-trivial, and tests different facets of the capability. - Include a brief justification for each rejected or significantly modified task. -Your response should follow this format exactly: +IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. Do not include any prefixes or prose. 
The JSON should be directly parseable. -THOUGHT: -RESPONSE JSON: -{{ - "final_tasks": {{ +CRITICAL: When including LaTeX expressions or backslashes in your JSON strings, you must properly escape them by using double backslashes (\\\\). For example: +- Write \\\\(x^2\\\\) instead of \\(x^2\\) +- Write \\\\[equation\\\\] instead of \\[equation\\] +- Write \\\\times instead of \\times + +Please return your curation and your thoughts and reasoning in the following format: +{ + "thought": "Your reasoning and curation plan here", + "final_tasks": { "task_1": "", "task_2": "", ... - }}, - "rejected_tasks": {{ + }, + "rejected_tasks": { "task_from_scientist_A": "Reason for rejection or modification", "task_from_scientist_B": "Reason for rejection or modification", ... - }} -}}""" + } +}""" TASK_MODERATOR_PROBLEM_USER_PROMPT = """Below is a capability and task proposals from multiple scientist agents. Curate the final task set by filtering, editing, or merging as needed. From 06da9107960f19cc163f9b10a89022f787c3a15d Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 5 Sep 2025 01:03:28 -0400 Subject: [PATCH 02/19] fixed retry, json processing, and max token. 
--- src/utils/json_utils.py | 29 ++++++++++++++++++++++++++--- src/utils/model_client_utils.py | 31 ++++++++++++++++++------------- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/src/utils/json_utils.py b/src/utils/json_utils.py index 26c14ae..2a57c0a 100644 --- a/src/utils/json_utils.py +++ b/src/utils/json_utils.py @@ -13,7 +13,13 @@ def extract_json_from_markdown(content: str) -> str: """Extract JSON from markdown if present and clean control characters.""" content = content.strip() - if content.startswith("```json") and content.endswith("```"): + # Handle Gemini's format: "```json\n...\n```" + if content.startswith('"```json') and content.endswith('```"'): + content = content[8:-4].strip() + elif content.startswith('"```') and content.endswith('```"'): + content = content[4:-4].strip() + # Handle standard markdown format: ```json\n...\n``` + elif content.startswith("```json") and content.endswith("```"): content = content[7:-3].strip() elif content.startswith("```") and content.endswith("```"): content = content[3:-3].strip() @@ -21,6 +27,18 @@ def extract_json_from_markdown(content: str) -> str: return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", content) +def fix_common_json_errors(content: str) -> str: + """Fix common JSON syntax errors.""" + # Fix extra equals signs (e.g., "area":="value" -> "area":"value") + content = re.sub(r':\s*=\s*"', ':"', content) + + # Fix missing quotes around keys + content = re.sub(r'(\w+):\s*"', r'"\1":"', content) + + # Fix trailing commas + return re.sub(r",(\s*[}\]])", r"\1", content) + + def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: """Parse LLM JSON response.""" try: @@ -31,8 +49,12 @@ def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: # Clean the content first cleaned_content = extract_json_from_markdown(raw_content) + # Fix common JSON errors + cleaned_content = fix_common_json_errors(cleaned_content) + # Parse the JSON - return json.loads(cleaned_content) + 
result = json.loads(cleaned_content) + return result if isinstance(result, dict) else {} except json.JSONDecodeError as e: log.error(f"Failed to parse JSON response: {e}") @@ -50,7 +72,8 @@ def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: log.warning( "Attempting to fix unterminated JSON by truncating to last complete entry" ) - return json.loads(fixed_content) + result = json.loads(fixed_content) + return result if isinstance(result, dict) else {} except Exception as fix_error: log.error(f"Failed to fix JSON: {fix_error}") diff --git a/src/utils/model_client_utils.py b/src/utils/model_client_utils.py index a650ee6..c1fdea4 100644 --- a/src/utils/model_client_utils.py +++ b/src/utils/model_client_utils.py @@ -20,6 +20,8 @@ ) +MAX_TOKENS = 1024 * 10 + logger = logging.getLogger(__name__) GEMINI_STUDIO_BASE = "https://generativelanguage.googleapis.com/v1beta/openai/" @@ -48,30 +50,31 @@ def __init__(self, client: Any, max_retries: int = 3): before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True, ) - async def create(self, *args, **kwargs): + async def create(self, *args: Any, **kwargs: Any) -> Any: """Create with retry logic for transient errors.""" return await self.client.create(*args, **kwargs) - def __getattr__(self, name): + def __getattr__(self, name: str) -> Any: """Delegate all other attributes to the wrapped client.""" return getattr(self.client, name) -def get_model_client(model_name: str, seed: Optional[int] = None, **kwargs) -> Any: - """Return a model client for the given model name with retry logic.""" +def get_model_client(model_name: str, seed: Optional[int] = None, **kwargs: Any) -> Any: + """Get a model client for the given model name.""" n = model_name.lower() - if n.startswith(("gpt-", "o1-", "o3-")): - # Add max_tokens to prevent truncated responses - kwargs.setdefault("max_tokens", 4096) - client = OpenAIChatCompletionClient(model=model_name, seed=seed, **kwargs) - return RetryableModelClient(client) + 
if n.startswith(("gpt-", "o1-", "o3-", "gpt-5")): + kwargs.setdefault("max_completion_tokens", MAX_TOKENS) + openai_client = OpenAIChatCompletionClient( + model=model_name, seed=seed, **kwargs + ) + return RetryableModelClient(openai_client) if "claude" in n: - # Add max_tokens to prevent truncated responses - kwargs.setdefault("max_tokens", 4096) - client = AnthropicChatCompletionClient(model=model_name, **kwargs) - return RetryableModelClient(client) + kwargs.setdefault("max_tokens", MAX_TOKENS) + kwargs.setdefault("timeout", None) + anthropic_client = AnthropicChatCompletionClient(model=model_name, **kwargs) + return RetryableModelClient(anthropic_client) if "gemini" in n: api_key = kwargs.pop("api_key", os.getenv("GOOGLE_API_KEY")) @@ -89,6 +92,8 @@ def get_model_client(model_name: str, seed: Optional[int] = None, **kwargs) -> A ), ) + kwargs.setdefault("max_completion_tokens", MAX_TOKENS) + client = OpenAIChatCompletionClient( model=model_name, base_url=GEMINI_STUDIO_BASE, From 0ca1c2202234c099ffa09b378573bff1522a8ea5 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 5 Sep 2025 03:54:57 -0400 Subject: [PATCH 03/19] switching to two-phase task generation.
--- src/agentic_task_generator.py | 23 ++- src/agentic_task_solver.py | 125 +++++++++++ src/task_generation/generator.py | 77 +++++-- src/task_generation/messages.py | 32 --- src/task_generation/moderator.py | 140 ++----------- src/task_generation/scientist.py | 92 --------- src/task_solving/__init__.py | 17 ++ src/task_solving/generator.py | 225 ++++++++++++++++++++ src/task_solving/messages.py | 64 ++++++ src/task_solving/moderator.py | 342 +++++++++++++++++++++++++++++++ src/task_solving/scientist.py | 186 +++++++++++++++++ 11 files changed, 1053 insertions(+), 270 deletions(-) create mode 100644 src/agentic_task_solver.py create mode 100644 src/task_solving/__init__.py create mode 100644 src/task_solving/generator.py create mode 100644 src/task_solving/messages.py create mode 100644 src/task_solving/moderator.py create mode 100644 src/task_solving/scientist.py diff --git a/src/agentic_task_generator.py b/src/agentic_task_generator.py index 439f7a1..ffacd99 100644 --- a/src/agentic_task_generator.py +++ b/src/agentic_task_generator.py @@ -21,20 +21,20 @@ log = logging.getLogger("agentic_task_gen") +lf = Langfuse() +openlit.init( + tracer=lf._otel_tracer, disable_batch=True, disable_metrics=True +) @hydra.main(version_base=None, config_path="cfg", config_name="agentic_config") def main(cfg: DictConfig) -> None: """Run the multi-agent task generation system.""" capabilities_tag = cfg.pipeline_tags.capabilities_tag + resume_tag = getattr(cfg.pipeline_tags, "resume_tasks_tag", None) domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id - langfuse_client = Langfuse() - openlit.init( - tracer=langfuse_client._otel_tracer, disable_batch=True, disable_metrics=True - ) - - with langfuse_client.start_as_current_span( + with lf.start_as_current_span( name=f"ace_agentic_task_generation:{domain_name}:{exp_id}" ) as span: try: @@ -67,23 +67,30 @@ def main(cfg: DictConfig) -> None: error_msg = "No capabilities_tag provided. 
Please provide pipeline_tags.capabilities_tag= to specify which capabilities to use." log.warning(error_msg) span.update( - level="WARNING", + level="ERROR", status_message="Missing capabilities_tag", metadata={"capabilities_tag_missing": error_msg}, ) return + + if resume_tag: + msg = f"Resuming task generation from tag: {resume_tag}" + log.info(msg) + span.update(metadata={"resume_tag_found": msg, "resume_tag": resume_tag}) + span.update_trace( metadata={ "domain": domain_name, "exp_id": exp_id, "capabilities_tag": capabilities_tag, + "resume_tag": resume_tag, "config": config_yaml, }, tags=["agentic_task_generation", exp_id], ) - asyncio.run(generate_tasks(cfg, capabilities_tag, langfuse_client)) + asyncio.run(generate_tasks(cfg, capabilities_tag, lf, resume_tag)) msg = "Multi-agent task generation completed successfully" log.info(msg) diff --git a/src/agentic_task_solver.py b/src/agentic_task_solver.py new file mode 100644 index 0000000..355c0df --- /dev/null +++ b/src/agentic_task_solver.py @@ -0,0 +1,125 @@ +"""Multi-agent debate system for solving generated tasks.""" + +import asyncio +import logging +import os +import traceback +from pathlib import Path + +import hydra +import openlit +from langfuse import Langfuse +from omegaconf import DictConfig, OmegaConf + +from src.task_solving.generator import solve_tasks_with_debate, load_tasks_from_file + + +# Suppress OpenTelemetry console output +os.environ["OTEL_LOG_LEVEL"] = "ERROR" +os.environ["OTEL_METRICS_EXPORTER"] = "none" +os.environ["OTEL_PYTHON_LOG_CORRELATION"] = "false" +os.environ["OTEL_PYTHON_LOG_LEVEL"] = "ERROR" + +log = logging.getLogger("agentic_task_solving") + +lf = Langfuse() +openlit.init(tracer=lf._otel_tracer, disable_batch=True, disable_metrics=True) + + +@hydra.main(version_base=None, config_path="cfg", config_name="agentic_config") +def main(cfg: DictConfig) -> None: + """Run the multi-agent debate-based task solving system.""" + domain_name = cfg.global_cfg.domain + exp_id = 
cfg.exp_cfg.exp_id + output_dir = cfg.global_cfg.output_dir + max_tasks = cfg.task_solving.get("max_tasks", 0) + + with lf.start_as_current_span( + name=f"ace_agentic_task_solving:{domain_name}:{exp_id}" + ) as span: + try: + msg = "Starting multi-agent debate-based task solving" + log.info(msg) + span.update(metadata={"system_started": msg}) + + config_yaml = OmegaConf.to_yaml(cfg, resolve=True) + msg = "Configuration loaded" + log.info("Configuration:\n%s", config_yaml) + span.update( + metadata={ + "configuration_loaded": msg, + "config": config_yaml, + "domain": domain_name, + "exp_id": exp_id, + } + ) + + # Load tasks from the specified file or use pipeline tags to find them + tasks_file = None + if cfg.pipeline_tags.get("tasks_tag"): + # Look for tasks file using the tag + tasks_dir = Path(output_dir) / domain_name / "tasks" + tasks_file = tasks_dir / f"tasks_{cfg.pipeline_tags.tasks_tag}.json" + elif cfg.task_solving.get("input_file"): + tasks_file = Path(cfg.task_solving.input_file) + else: + raise ValueError("Either pipeline_tags.tasks_tag or task_solving.input_file must be specified") + + if not tasks_file.exists(): + raise FileNotFoundError(f"Tasks file not found: {tasks_file}") + + log.info(f"Loading tasks from: {tasks_file}") + tasks = load_tasks_from_file(tasks_file) + log.info(f"Loaded {len(tasks)} tasks") + + # Limit number of tasks if specified + if max_tasks > 0: + tasks = tasks[:max_tasks] + log.info(f"Limited to {len(tasks)} tasks") + + # Run task solving + msg = f"Running task solving for {len(tasks)} tasks" + log.info(msg) + span.update(metadata={"task_solving_started": msg}) + + results = asyncio.run(solve_tasks_with_debate( + cfg=cfg, + tasks=tasks, + langfuse_client=lf, + )) + + # Print summary + consensus_count = sum(1 for result in results.values() if result.get("consensus_reached", False)) + no_consensus_count = len(results) - consensus_count + + msg = f"Task solving completed. 
Consensus: {consensus_count}, No consensus: {no_consensus_count}" + log.info(msg) + span.update( + metadata={ + "task_solving_completed": msg, + "total_tasks": len(results), + "consensus_reached": consensus_count, + "no_consensus": no_consensus_count, + } + ) + + # Print detailed results if requested + if cfg.task_solving.get("print_results", False): + for task_id, result in results.items(): + log.info(f"\nTask {task_id}:") + log.info(f" Solution: {result['solution'][:100]}...") + log.info(f" Consensus: {result['consensus_reached']}") + log.info(f" Rounds: {result['total_rounds']}") + + except Exception as e: + error_msg = f"Error in agentic task solving: {str(e)}" + log.error(error_msg) + log.error(traceback.format_exc()) + span.update(metadata={"error": error_msg}) + raise + finally: + lf.flush() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py index 5be2742..0504aa6 100644 --- a/src/task_generation/generator.py +++ b/src/task_generation/generator.py @@ -14,7 +14,7 @@ DefaultTopicId, SingleThreadedAgentRuntime, ) -from autogen_ext.models.openai import OpenAIChatCompletionClient +from src.utils.model_client_utils import get_model_client from langfuse import Langfuse from omegaconf import DictConfig @@ -56,8 +56,8 @@ async def generate_tasks_for_capability( runtime, "TaskScientistA", lambda: TaskScientist( - model_client=OpenAIChatCompletionClient( - model=cfg.agents.scientist_a.model_name, + model_client=get_model_client( + model_name=cfg.agents.scientist_a.model_name, seed=cfg.agents.scientist_a.seed, ), scientist_id="A", @@ -70,8 +70,8 @@ async def generate_tasks_for_capability( runtime, "TaskScientistB", lambda: TaskScientist( - model_client=OpenAIChatCompletionClient( - model=cfg.agents.scientist_b.model_name, + model_client=get_model_client( + model_name=cfg.agents.scientist_b.model_name, seed=cfg.agents.scientist_b.seed, ), scientist_id="B", @@ -85,14 +85,13 
@@ async def generate_tasks_for_capability( runtime, "TaskModerator", lambda: TaskModerator( - model_client=OpenAIChatCompletionClient( - model=cfg.agents.moderator.model_name, + model_client=get_model_client( + model_name=cfg.agents.moderator.model_name, seed=cfg.agents.moderator.seed, ), num_scientists=2, num_final_problems=cfg.task_generation.num_final_problems_per_capability, buffer_param=cfg.task_generation.buffer_param, - agreement_threshold=cfg.task_generation.agreement_threshold, output_dir=output_dir, domain=domain_name, langfuse_client=langfuse_client, @@ -161,12 +160,20 @@ async def generate_tasks_for_capability( async def generate_tasks( - cfg: DictConfig, capabilities_tag: str, langfuse_client: Langfuse + cfg: DictConfig, + capabilities_tag: str, + langfuse_client: Langfuse, + resume_tag: str = None, ) -> None: """Generate tasks for all capabilities.""" domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id - tasks_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + if resume_tag: + tasks_tag = resume_tag + log.info(f"Resuming task generation with existing tag: {tasks_tag}") + else: + tasks_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}" with langfuse_client.start_as_current_span( name=f"ace_task_generation:{domain_name}:{exp_id}:{tasks_tag}" @@ -230,7 +237,6 @@ async def generate_tasks( if capabilities_file.exists(): with open(capabilities_file, "r", encoding="utf-8") as f: capabilities_data = json.load(f) - if ( isinstance(capabilities_data, dict) and "capabilities" in capabilities_data @@ -291,11 +297,41 @@ async def generate_tasks( } ) - # Print the timestamp for future reference - print(f"Tasks generated with tag: {tasks_tag}") + # Check for existing tasks if resuming + existing_tasks = set() + if resume_tag and output_dir.exists(): + for cap_dir in output_dir.iterdir(): + if cap_dir.is_dir() and (cap_dir / "tasks.json").exists(): + existing_tasks.add(cap_dir.name) + + if existing_tasks: + msg = f"Found {len(existing_tasks)} 
existing task sets: {list(existing_tasks)}" + log.info(msg) + span.update(metadata={"existing_tasks": msg}) + else: + log.info("No existing tasks found, will generate tasks all capabilities") + + processed_capabilities = 0 + skipped_capabilities = 0 # Process each capability individually for i, capability in enumerate(capabilities): + capability_dir_name = capability.name.replace(" ", "_") + + # Skip if tasks already exist for this capability + if resume_tag and capability_dir_name in existing_tasks: + msg = f"Skipping capability {i + 1}/{len(capabilities)}: {capability.name} (already exists)" + log.info(msg) + span.update( + metadata={ + f"capability_{i + 1}_skipped": msg, + "skipped_capability": capability.name, + "progress": f"{i + 1}/{len(capabilities)}", + } + ) + skipped_capabilities += 1 + continue + msg = f"Processing capability {i + 1}/{len(capabilities)}: {capability.name}" log.info(msg) span.update( @@ -318,8 +354,21 @@ async def generate_tasks( "completed_capability": capability.name, } ) - + + processed_capabilities += 1 await asyncio.sleep(1) + + # Final summary + msg = f"Task generation completed. 
Processed: {processed_capabilities}, Skipped: {skipped_capabilities}, Total: {len(capabilities)}" + log.info(msg) + span.update( + metadata={ + "final_summary": msg, + "processed_capabilities": processed_capabilities, + "skipped_capabilities": skipped_capabilities, + "total_capabilities": len(capabilities), + } + ) except Exception as e: error_msg = f"Error in generate_tasks: {e}" diff --git a/src/task_generation/messages.py b/src/task_generation/messages.py index 09b5e9d..38daaa9 100644 --- a/src/task_generation/messages.py +++ b/src/task_generation/messages.py @@ -36,39 +36,7 @@ class ScientistProblemProposal: iteration: int -@dataclass -class ModeratorProblemReview: - """Moderator's review and filtering of problems.""" - - capability_name: str - final_problems: Dict[str, str] # task_id -> task_text - rejected_problems: Dict[str, str] # task_id -> rejection_reason - iteration: int - - -@dataclass -class SolutionRequest: - """Request for scientists to solve problems.""" - - capability_name: str - capability_description: str - capability_domain: str - capability_area: str - problems: Dict[str, str] # task_id -> task_text - - -@dataclass -class ScientistSolutionProposal: - """Solution proposal from a scientist.""" - scientist_id: str - capability_name: str - solutions: Dict[str, str] # task_id -> solution -@dataclass -class FinalTaskSet: - """Final task set with problems and solutions.""" - capability_name: str - tasks: Dict[str, Dict[str, str]] # task_id -> {problem, answer} diff --git a/src/task_generation/moderator.py b/src/task_generation/moderator.py index 9e5c96b..0238d44 100644 --- a/src/task_generation/moderator.py +++ b/src/task_generation/moderator.py @@ -25,8 +25,6 @@ Capability, ProblemProposalRequest, ScientistProblemProposal, - ScientistSolutionProposal, - SolutionRequest, ) from src.utils.agentic_prompts import ( TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT, @@ -48,7 +46,6 @@ def __init__( num_scientists: int, num_final_problems: int, buffer_param: int, - 
agreement_threshold: float, output_dir: Path, domain: str, langfuse_client: Langfuse, @@ -58,7 +55,6 @@ def __init__( self._num_scientists = num_scientists self._num_final_problems = num_final_problems self._buffer_param = buffer_param - self._agreement_threshold = agreement_threshold self._output_dir = output_dir self._domain = domain self._langfuse_client = langfuse_client @@ -75,10 +71,7 @@ def __init__( str, List[ScientistProblemProposal] ] = {} # capability -> proposals - # Solution design state - self._solution_proposals: Dict[ - str, List[ScientistSolutionProposal] - ] = {} # capability -> solutions + @message_handler async def handle_capability(self, message: Capability, ctx: MessageContext) -> None: @@ -266,13 +259,11 @@ async def _filter_and_select_problems( try: parsed = parse_llm_json_response(raw_content) final_tasks = parsed.get("final_tasks", {}) - rejected_tasks = parsed.get("rejected_tasks", {}) except Exception as e: log.error( f"Error parsing JSON from moderator: {e}\nOutput: {raw_content}" ) final_tasks = {} - rejected_tasks = {} # Update Algorithm 1 state num_remaining = self._num_remaining[capability_name] @@ -294,9 +285,6 @@ async def _filter_and_select_problems( log.info( f"Task Moderator selected {selected_count} problems for {capability_name}, {self._num_remaining[capability_name]} remaining" ) - log.info( - f"Rejected {len(rejected_tasks)} problems: {list(rejected_tasks.keys())}" - ) # Continue Algorithm 1 or move to solution design if self._num_remaining[capability_name] > 0: @@ -304,138 +292,42 @@ async def _filter_and_select_problems( capability = self._capabilities[capability_name] await self._start_problem_iteration(capability) else: - # Problem design complete, start solution design - capability = self._capabilities[capability_name] - await self._start_solution_design(capability) + # Problem design complete, finalize tasks without solutions + await self._finalize_tasks_without_solutions(capability_name) except Exception as e: 
log.error(f"Error in Task Moderator _filter_and_select_problems: {e}") log.error(f"Traceback: {traceback.format_exc()}") raise - async def _start_solution_design(self, capability: Capability) -> None: - """Start solution design phase.""" + async def _finalize_tasks_without_solutions(self, capability_name: str) -> None: + """Finalize tasks with problems only (no solutions).""" try: log.info( - f"Task Moderator starting solution design for capability: {capability.name}" + f"Task Moderator finalizing tasks for capability: {capability_name}" ) - final_problems = self._final_problems[capability.name] + final_problems = self._final_problems[capability_name] if not final_problems: log.error( - f"No final problems available for capability: {capability.name}" + f"No final problems available for capability: {capability_name}" ) return - # Send solution requests to all scientists - await self.publish_message( - SolutionRequest( - capability_name=capability.name, - capability_description=capability.description, - capability_domain=capability.domain, - capability_area=capability.area, - problems=final_problems, - ), - topic_id=DefaultTopicId(), - ) - - except Exception as e: - log.error(f"Error in Task Moderator _start_solution_design: {e}") - log.error(f"Traceback: {traceback.format_exc()}") - raise - - @message_handler - async def handle_scientist_solution_proposal( - self, message: ScientistSolutionProposal, ctx: MessageContext - ) -> None: - """Handle solution proposals from scientists.""" - try: - log.info( - f"Task Moderator received solution proposal from Scientist {message.scientist_id} for capability: {message.capability_name}" - ) - - capability_name = message.capability_name - if capability_name not in self._solution_proposals: - self._solution_proposals[capability_name] = [] - - self._solution_proposals[capability_name].append(message) - - # Check if we have all solutions - if len(self._solution_proposals[capability_name]) == self._num_scientists: - log.info( - 
f"Task Moderator received all solutions for capability: {capability_name}, determining consensus" - ) - await self._determine_solution_consensus(capability_name) - - except Exception as e: - log.error( - f"Error in Task Moderator handle_scientist_solution_proposal: {e}" - ) - log.error(f"Traceback: {traceback.format_exc()}") - raise - - async def _determine_solution_consensus(self, capability_name: str) -> None: - """Determine solution consensus and finalize tasks.""" - try: - log.info( - f"Task Moderator determining solution consensus for capability: {capability_name}" - ) - - solutions_by_task: Dict[ - str, Dict[str, str] - ] = {} # task_id -> [scientist_id -> solution] - - for proposal in self._solution_proposals[capability_name]: - for task_id, solution in proposal.solutions.items(): - if task_id not in solutions_by_task: - solutions_by_task[task_id] = {} - solutions_by_task[task_id][proposal.scientist_id] = solution - + # Create tasks with problems only final_tasks = {} - - for task_id, problem_text in self._final_problems[capability_name].items(): - if task_id in solutions_by_task: - scientist_solutions = solutions_by_task[task_id] - - # Simple consensus: find most common solution - solution_counts: Dict[str, int] = {} - for solution in scientist_solutions.values(): - solution_counts[solution] = solution_counts.get(solution, 0) + 1 - - if solution_counts: - most_common_solution = max( - solution_counts.keys(), key=lambda x: solution_counts[x] - ) - agreement_rate = solution_counts[most_common_solution] / len( - scientist_solutions - ) - - if agreement_rate >= self._agreement_threshold: - final_tasks[task_id] = { - "problem": problem_text, - "answer": most_common_solution, - } - log.info( - f"Task {task_id}: consensus achieved ({agreement_rate:.2f} agreement)" - ) - else: - log.warning( - f"Task {task_id}: low agreement ({agreement_rate:.2f}), requires human review" - ) - # For now, use most common solution but mark it - final_tasks[task_id] = { - "problem": 
problem_text, - "answer": most_common_solution, - "requires_human_review": "true", - "agreement_rate": str(agreement_rate), - } + for task_id, problem_text in final_problems.items(): + final_tasks[task_id] = { + "task": problem_text, + "capability_id": capability_name, + } # Save final tasks await self._save_tasks_to_file(capability_name, final_tasks) - log.info(f"Task generation completed for capability: {capability_name}") + log.info(f"Task generation completed for capability: {capability_name} ({len(final_tasks)} tasks)") except Exception as e: - log.error(f"Error in Task Moderator _determine_solution_consensus: {e}") + log.error(f"Error in Task Moderator _finalize_tasks_without_solutions: {e}") log.error(f"Traceback: {traceback.format_exc()}") raise diff --git a/src/task_generation/scientist.py b/src/task_generation/scientist.py index 2daa571..25b25d8 100644 --- a/src/task_generation/scientist.py +++ b/src/task_generation/scientist.py @@ -21,14 +21,10 @@ from src.task_generation.messages import ( ProblemProposalRequest, ScientistProblemProposal, - ScientistSolutionProposal, - SolutionRequest, ) from src.utils.agentic_prompts import ( TASK_SCIENTIST_PROBLEM_SYSTEM_PROMPT, TASK_SCIENTIST_PROBLEM_USER_PROMPT, - TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT, - TASK_SCIENTIST_SOLUTION_USER_PROMPT, ) from src.utils.json_utils import parse_llm_json_response @@ -153,92 +149,4 @@ async def handle_problem_proposal_request( ) raise - @message_handler - async def handle_solution_request( - self, message: SolutionRequest, ctx: MessageContext - ) -> None: - """Handle solution request for problems.""" - with self._langfuse_client.start_as_current_span( - name=f"task_scientist_{self._scientist_id}_solution_proposal" - ) as span: - try: - msg = f"Task Scientist {self._scientist_id} solving {len(message.problems)} problems for capability: {message.capability_name}" - log.info(msg) - span.update( - metadata={ - "solution_request_received": msg, - "scientist_id": self._scientist_id, - 
"capability_name": message.capability_name, - "num_problems": len(message.problems), - } - ) - - problems_json = json.dumps(message.problems, indent=2) - - system_prompt = TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT.format( - scientist_id=self._scientist_id, - capability_domain=message.capability_domain, - capability_name=message.capability_name, - ) - user_prompt = TASK_SCIENTIST_SOLUTION_USER_PROMPT.format( - problems=problems_json, - ) - - system_message = SystemMessage(content=system_prompt) - user_message = UserMessage(content=user_prompt, source="user") - - model_result = await self._model_client.create( - [system_message, user_message] - ) - - msg = f"Task Scientist {self._scientist_id} is parsing LLM response" - log.info(msg) - span.update( - metadata={ - "llm_response_received": msg, - "scientist_id": self._scientist_id, - } - ) - - parsed = parse_llm_json_response(model_result.content) - solutions = parsed.get("solutions", {}) - - msg = f"Task Scientist {self._scientist_id} publishing solutions for capability: {message.capability_name}" - log.info(msg) - span.update( - metadata={ - "solution_proposal_published": msg, - "scientist_id": self._scientist_id, - "capability_name": message.capability_name, - "num_solutions_generated": len(solutions), - } - ) - - await self.publish_message( - ScientistSolutionProposal( - scientist_id=self._scientist_id, - capability_name=message.capability_name, - solutions=solutions, - ), - topic_id=DefaultTopicId(), - ) - - except Exception as e: - error_msg = f"Error in Task Scientist {self._scientist_id} handle_solution_request: {e}" - traceback_msg = f"Traceback: {traceback.format_exc()}" - - log.error(error_msg) - log.error(traceback_msg) - - span.update( - level="ERROR", - status_message=str(e), - metadata={ - "solution_request_error": error_msg, - "scientist_id": self._scientist_id, - "error": str(e), - "traceback": traceback_msg, - }, - ) - raise diff --git a/src/task_solving/__init__.py b/src/task_solving/__init__.py new file 
log = logging.getLogger("task_solving.generator")


async def solve_tasks_with_debate(
    cfg: DictConfig,
    tasks: Dict[str, Dict],
    langfuse_client: Langfuse = None,
) -> Dict[str, Dict]:
    """
    Solve tasks using a multi-agent debate system.

    Args:
        cfg: Configuration containing debate and model settings.
        tasks: Mapping of task_id to task data; each value holds the task
            content and its capability_id. (The body iterates
            ``tasks.items()``, so a dict -- not a list -- is required; the
            previous ``List[Dict]`` annotation was wrong.)
        langfuse_client: Langfuse client for tracing.

    Returns:
        Dictionary mapping task_id to final solution data.
    """
    domain_name = cfg.global_cfg.domain
    exp_id = cfg.exp_cfg.exp_id
    max_rounds = cfg.debate_cfg.max_round
    num_solvers = 2  # scientist_a and scientist_b
    # Timestamp tag gives every solving run its own output directory.
    solutions_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    with langfuse_client.start_as_current_span(
        name=f"ace_task_solving:{domain_name}:{exp_id}:{solutions_tag}"
    ) as span:
        try:
            msg = f"Solutions will be saved with tag: {solutions_tag}"
            log.info(msg)
            span.update(
                metadata={
                    "solving_started": msg,
                    "solutions_tag": solutions_tag,
                    "domain": domain_name,
                    "exp_id": exp_id,
                    "num_tasks": len(tasks),
                    "num_solvers": num_solvers,
                    "max_rounds": max_rounds,
                }
            )

            # Create output directory for the per-task solution files.
            output_dir = (
                Path(cfg.global_cfg.output_dir)
                / "task_solutions"
                / f"{domain_name}_{exp_id}{solutions_tag}"
            )
            output_dir.mkdir(parents=True, exist_ok=True)

            # Set up the single-threaded agent runtime.
            runtime = SingleThreadedAgentRuntime()

            # Create a model client per agent (seeds differ for diversity).
            scientist_a_client = get_model_client(
                cfg.agents.scientist_a.model_name,
                seed=cfg.agents.scientist_a.get("seed"),
            )
            scientist_b_client = get_model_client(
                cfg.agents.scientist_b.model_name,
                seed=cfg.agents.scientist_b.get("seed"),
            )
            moderator_client = get_model_client(
                cfg.agents.moderator.model_name,
                seed=cfg.agents.moderator.get("seed"),
            )

            # Register the moderator and both scientist agents with the
            # runtime.  The returned agent types are not needed afterwards.
            await TaskSolvingModerator.register(
                runtime,
                "task_solving_moderator",
                lambda: TaskSolvingModerator(
                    model_client=moderator_client,
                    num_solvers=num_solvers,
                    max_rounds=max_rounds,
                    output_dir=output_dir,
                    langfuse_client=langfuse_client,
                ),
            )
            await TaskSolvingScientist.register(
                runtime,
                "task_scientist_a",
                lambda: TaskSolvingScientist(
                    model_client=scientist_a_client,
                    scientist_id="scientist_a",
                    langfuse_client=langfuse_client,
                ),
            )
            await TaskSolvingScientist.register(
                runtime,
                "task_scientist_b",
                lambda: TaskSolvingScientist(
                    model_client=scientist_b_client,
                    scientist_id="scientist_b",
                    langfuse_client=langfuse_client,
                ),
            )

            # Start runtime
            runtime.start()

            log.info(f"Starting task solving for {len(tasks)} tasks with {num_solvers} scientists")

            # Publish every task; the moderator drives the debate from here.
            for task_id, task_data in tasks.items():
                # Handle both old and new task formats.
                if isinstance(task_data, dict) and "task" in task_data:
                    # New format: {"task": "problem text", "capability_id": "cap_name"}
                    capability_id = task_data.get("capability_id", "unknown")
                    task_content = task_data
                else:
                    # Old format or other formats.
                    capability_id = task_data.get("capability_id", "unknown") if isinstance(task_data, dict) else "unknown"
                    task_content = {"task": str(task_data)} if not isinstance(task_data, dict) else task_data

                task = Task(
                    task_id=task_id,
                    task_content=task_content,
                    capability_id=capability_id,
                )

                await runtime.publish_message(task, topic_id=DefaultTopicId())

                log.info(f"Submitted task {task_id} for solving")

            # Wait for all tasks to complete.
            # NOTE(review): no timeout here -- a hung agent stalls the run;
            # consider adding a completion-status check.
            await runtime.stop_when_idle()

            # Collect the solution files the moderator wrote.
            results = {}
            for solution_file in output_dir.glob("task_*_solution.json"):
                try:
                    with open(solution_file, "r") as f:
                        solution_data = json.load(f)
                    results[solution_data["task_id"]] = solution_data
                except Exception as e:
                    log.error(f"Error loading solution from {solution_file}: {e}")

            log.info(f"Task solving completed. Processed {len(results)} tasks.")

            span.update(
                metadata={
                    "solving_completed": f"Processed {len(results)} tasks",
                    "output_dir": str(output_dir),
                    "results_count": len(results),
                }
            )

            return results

        except Exception as e:
            error_msg = f"Error in task solving: {str(e)}"
            log.error(error_msg)
            log.error(traceback.format_exc())
            span.update(metadata={"error": error_msg})
            raise


def load_tasks_from_file(tasks_file: Path) -> Dict[str, Dict]:
    """
    Load tasks from a JSON file.

    Args:
        tasks_file: Path to the tasks file.

    Returns:
        Dictionary mapping task_id to task data.  Legacy list-format files
        are converted to {"task_1": ..., "task_2": ...} (the previous
        ``List[Dict]`` annotation did not match this behavior).

    Raises:
        ValueError: If the file content is neither a list nor a dict.
    """
    try:
        with open(tasks_file, "r") as f:
            tasks_data = json.load(f)

        if isinstance(tasks_data, list):
            # Old format: a bare list of tasks -> synthesize sequential ids.
            return {f"task_{i + 1}": task for i, task in enumerate(tasks_data)}
        if isinstance(tasks_data, dict):
            if "tasks" in tasks_data:
                # New format: {"tasks": {"task_1": {...}, "task_2": {...}}}
                return tasks_data["tasks"]
            # A plain dict is treated as a single task.
            return {"task_1": tasks_data}
        raise ValueError(f"Unexpected task file format: {type(tasks_data)}")

    except Exception as e:
        log.error(f"Error loading tasks from {tasks_file}: {e}")
        raise
@default_subscription
class TaskSolvingModerator(RoutedAgent):
    """Moderator that manages the task solving debate and checks for consensus.

    Collects one AgentSolution per solver per round, first tries an exact
    answer match, then falls back to an LLM judgement, and either finalizes
    the task or starts another debate round (bounded by ``max_rounds``).
    """

    def __init__(
        self,
        model_client: ChatCompletionClient,
        num_solvers: int,
        max_rounds: int,
        output_dir: Path,
        langfuse_client: Langfuse = None,
    ) -> None:
        """
        Args:
            model_client: Chat client used for LLM-based consensus checks.
            num_solvers: Number of solver agents expected each round.
            max_rounds: Maximum debate rounds before giving up on consensus.
            output_dir: Directory where final solution JSON files are written.
            langfuse_client: Langfuse client for tracing.
        """
        super().__init__("Task Solving Moderator")
        self._model_client = model_client
        self._num_solvers = num_solvers
        self._max_rounds = max_rounds
        self._output_dir = output_dir
        self._langfuse_client = langfuse_client

        # Track solutions by task_id and round
        self._solutions_buffer: Dict[str, Dict[int, List[AgentSolution]]] = {}
        self._current_round: Dict[str, int] = {}
        self._final_solutions: Dict[str, FinalSolution] = {}
        # Original Task messages by task_id, kept so later rounds and the
        # LLM consensus prompt see the real problem text (the previous
        # implementation substituted an empty placeholder here).
        self._tasks: Dict[str, Task] = {}

    def _extract_consensus_components(self, response: str) -> tuple[bool, str, str]:
        """Extract consensus decision, solution, and reasoning from response.

        Missing sections degrade gracefully: no consensus, solution "NONE",
        reasoning "No reasoning provided".
        """
        consensus_match = re.search(r"CONSENSUS_REACHED:\s*(true|false)", response, re.IGNORECASE)
        solution_match = re.search(r"FINAL_SOLUTION:\s*(.*?)(?=REASONING:|$)", response, re.DOTALL | re.IGNORECASE)
        reasoning_match = re.search(r"REASONING:\s*(.*?)$", response, re.DOTALL | re.IGNORECASE)

        consensus_reached = consensus_match.group(1).lower() == "true" if consensus_match else False
        final_solution = solution_match.group(1).strip() if solution_match else "NONE"
        reasoning = reasoning_match.group(1).strip() if reasoning_match else "No reasoning provided"

        return consensus_reached, final_solution, reasoning

    def _check_simple_consensus(self, solutions: List[AgentSolution]) -> tuple[bool, str]:
        """Simple consensus check - if all agents have the same final answer."""
        if not solutions:
            return False, ""

        # Case-insensitive comparison of the trimmed final answers.
        answers = [sol.final_answer.strip().lower() for sol in solutions]

        # Check if all answers are the same
        if len(set(answers)) == 1:
            return True, solutions[0].final_answer

        return False, ""

    @message_handler
    async def handle_task(self, message: Task, ctx: MessageContext) -> None:
        """Handle a task and initiate the solving process."""
        with self._langfuse_client.start_as_current_span(
            name=f"moderator_handle_task_{message.task_id}"
        ) as span:
            try:
                msg = f"Moderator received task: {message.task_id}"
                log.info(msg)
                span.update(
                    metadata={
                        "task_received": msg,
                        "task_id": message.task_id,
                        "capability_id": message.capability_id,
                    }
                )

                # Initialize tracking for this task and remember the original
                # message for later rounds / the consensus prompt.
                self._tasks[message.task_id] = message
                self._solutions_buffer[message.task_id] = {}
                self._current_round[message.task_id] = 1

                # Send initial solution request to all solvers
                await self.publish_message(
                    TaskSolutionRequest(task=message, round_number=1),
                    topic_id=DefaultTopicId(),
                )

                span.update(
                    metadata={"solution_request_sent": f"Round 1 solution request sent for task {message.task_id}"}
                )

            except Exception as e:
                # NOTE: errors are logged, not re-raised, so one bad task
                # does not bring down the whole runtime.
                error_msg = f"Error handling task {message.task_id}: {str(e)}"
                log.error(error_msg)
                log.error(traceback.format_exc())
                span.update(metadata={"error": error_msg})

    @message_handler
    async def handle_agent_solution(self, message: AgentSolution, ctx: MessageContext) -> None:
        """Handle solution from an agent; advance when the round is complete."""
        with self._langfuse_client.start_as_current_span(
            name=f"moderator_handle_solution_{message.task_id}_round_{message.round_number}"
        ) as span:
            try:
                task_id = message.task_id
                round_num = message.round_number

                msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, round {round_num}"
                log.info(msg)
                span.update(
                    metadata={
                        "solution_received": msg,
                        "task_id": task_id,
                        "agent_id": message.agent_id,
                        "round": round_num,
                    }
                )

                # Initialize round buffer if needed
                if round_num not in self._solutions_buffer[task_id]:
                    self._solutions_buffer[task_id][round_num] = []

                # Add solution to buffer
                self._solutions_buffer[task_id][round_num].append(message)

                # Once every solver has answered this round, check consensus.
                if len(self._solutions_buffer[task_id][round_num]) == self._num_solvers:
                    await self._check_consensus_and_proceed(task_id, round_num, ctx)

                span.update(
                    metadata={
                        "solutions_collected": f"{len(self._solutions_buffer[task_id][round_num])}/{self._num_solvers} for round {round_num}"
                    }
                )

            except Exception as e:
                error_msg = f"Error handling solution from agent {message.agent_id}: {str(e)}"
                log.error(error_msg)
                log.error(traceback.format_exc())
                span.update(metadata={"error": error_msg})

    async def _check_consensus_and_proceed(self, task_id: str, round_num: int, ctx: MessageContext) -> None:
        """Check for consensus and either finalize or start next round."""
        with self._langfuse_client.start_as_current_span(
            name=f"moderator_consensus_check_{task_id}_round_{round_num}"
        ) as span:
            try:
                solutions = self._solutions_buffer[task_id][round_num]

                # First try simple consensus check
                simple_consensus, simple_solution = self._check_simple_consensus(solutions)

                if simple_consensus:
                    # Simple consensus reached
                    final_solution = FinalSolution(
                        task_id=task_id,
                        solution=simple_solution,
                        reasoning="All agents provided the same answer",
                        consensus_reached=True,
                        total_rounds=round_num,
                        all_solutions=self._get_all_solutions_for_task(task_id),
                    )

                    self._final_solutions[task_id] = final_solution
                    await self._save_final_solution(final_solution)

                    span.update(
                        metadata={
                            "consensus_reached": True,
                            "method": "simple",
                            "final_solution": simple_solution[:100],
                        }
                    )
                    return

                # If no simple consensus and we haven't reached max rounds, use LLM to check
                if round_num < self._max_rounds:
                    # Recover the original task so the LLM sees the real
                    # problem text (fixes the previous empty placeholder).
                    original_task = self._tasks.get(task_id)
                    task_content = (
                        original_task.task_content.get("task", "")
                        if original_task is not None
                        else ""
                    )

                    # Format solutions for LLM
                    all_solutions_text = "\n\n".join([
                        f"Agent {sol.agent_id}:\nReasoning: {sol.thought}\nFinal Answer: {sol.final_answer}"
                        for sol in solutions
                    ])

                    prompt = TASK_MODERATOR_CONSENSUS_PROMPT.format(
                        problem_text=task_content,
                        all_solutions=all_solutions_text
                    )

                    system_message = SystemMessage(content=TASK_MODERATOR_SYSTEM_MESSAGE)
                    user_message = UserMessage(content=prompt, source="user")

                    response = await self._model_client.create(
                        messages=[system_message, user_message],
                        cancellation_token=ctx.cancellation_token,
                    )

                    consensus_reached, final_solution_text, reasoning = self._extract_consensus_components(response.content)

                    if consensus_reached:
                        # LLM found consensus
                        final_solution = FinalSolution(
                            task_id=task_id,
                            solution=final_solution_text,
                            reasoning=reasoning,
                            consensus_reached=True,
                            total_rounds=round_num,
                            all_solutions=self._get_all_solutions_for_task(task_id),
                        )

                        self._final_solutions[task_id] = final_solution
                        await self._save_final_solution(final_solution)

                        span.update(
                            metadata={
                                "consensus_reached": True,
                                "method": "llm_moderator",
                                "final_solution": final_solution_text[:100],
                            }
                        )
                        return
                    else:
                        # No consensus, start next round
                        next_round = round_num + 1
                        self._current_round[task_id] = next_round

                        # Re-send the ORIGINAL task (or a minimal stand-in if
                        # it was somehow not recorded) with peers' solutions.
                        task = original_task if original_task is not None else Task(
                            task_id=task_id,
                            task_content={"task": task_content},
                            capability_id="",
                        )

                        await self.publish_message(
                            AgentRevisionRequest(
                                task=task,
                                other_solutions=solutions,
                                round_number=next_round,
                            ),
                            topic_id=DefaultTopicId(),
                        )

                        span.update(
                            metadata={
                                "consensus_reached": False,
                                "next_round_started": next_round,
                            }
                        )
                else:
                    # Max rounds reached, no consensus
                    final_solution = FinalSolution(
                        task_id=task_id,
                        solution="No consensus reached",
                        reasoning=f"Maximum rounds ({self._max_rounds}) reached without consensus",
                        consensus_reached=False,
                        total_rounds=round_num,
                        all_solutions=self._get_all_solutions_for_task(task_id),
                    )

                    self._final_solutions[task_id] = final_solution
                    await self._save_final_solution(final_solution)

                    span.update(
                        metadata={
                            "consensus_reached": False,
                            "max_rounds_reached": True,
                        }
                    )

            except Exception as e:
                error_msg = f"Error checking consensus for task {task_id}: {str(e)}"
                log.error(error_msg)
                log.error(traceback.format_exc())
                span.update(metadata={"error": error_msg})

    def _get_all_solutions_for_task(self, task_id: str) -> List[AgentSolution]:
        """Get all solutions for a task across all rounds."""
        all_solutions = []
        for round_solutions in self._solutions_buffer[task_id].values():
            all_solutions.extend(round_solutions)
        return all_solutions

    async def _save_final_solution(self, final_solution: FinalSolution) -> None:
        """Save the final solution to a per-task JSON file in output_dir."""
        try:
            output_file = self._output_dir / f"task_{final_solution.task_id}_solution.json"

            solution_data = {
                "task_id": final_solution.task_id,
                "solution": final_solution.solution,
                "reasoning": final_solution.reasoning,
                "consensus_reached": final_solution.consensus_reached,
                "total_rounds": final_solution.total_rounds,
                "all_solutions": [
                    {
                        "agent_id": sol.agent_id,
                        "thought": sol.thought,
                        "final_answer": sol.final_answer,
                        "round_number": sol.round_number,
                    }
                    for sol in final_solution.all_solutions
                ],
            }

            with open(output_file, "w") as f:
                json.dump(solution_data, f, indent=2)

            log.info(f"Saved final solution for task {final_solution.task_id} to {output_file}")

        except Exception as e:
            log.error(f"Error saving final solution for task {final_solution.task_id}: {str(e)}")
            log.error(traceback.format_exc())
@default_subscription
class TaskSolvingScientist(RoutedAgent):
    """A debate participant that proposes and revises task solutions.

    Round 1 answers the task directly; later rounds revise the answer in
    light of the other scientists' reasoning.
    """

    def __init__(
        self,
        model_client: ChatCompletionClient,
        scientist_id: str,
        langfuse_client: Langfuse = None,
    ) -> None:
        super().__init__(f"Task Solving Scientist {scientist_id}")
        self._model_client = model_client
        self._scientist_id = scientist_id
        self._langfuse_client = langfuse_client

    def _extract_solution_components(self, response: str) -> tuple[str, str]:
        """Split a model reply into (thought, final answer).

        Falls back to the whole reply as the thought and a fixed marker as
        the answer when the THOUGHT / FINAL ANSWER sections are missing.
        """
        flags = re.DOTALL | re.IGNORECASE
        thought_match = re.search(r"THOUGHT:\s*(.*?)(?=FINAL ANSWER:|$)", response, flags)
        answer_match = re.search(r"FINAL ANSWER:\s*(.*?)$", response, flags)

        if thought_match:
            thought = thought_match.group(1).strip()
        else:
            thought = response.strip()

        if answer_match:
            final_answer = answer_match.group(1).strip()
        else:
            final_answer = "No clear answer provided"

        return thought, final_answer

    @message_handler
    async def handle_task_solution_request(
        self, message: TaskSolutionRequest, ctx: MessageContext
    ) -> None:
        """Handle initial task solution request (Round 1)."""
        with self._langfuse_client.start_as_current_span(
            name=f"scientist_{self._scientist_id}_round_1"
        ) as span:
            try:
                problem = message.task.task_content.get("task", "")

                note = f"Scientist {self._scientist_id} handling initial solution request for task: {message.task.task_id}"
                log.info(note)
                span.update(
                    metadata={
                        "solution_request_received": note,
                        "scientist_id": self._scientist_id,
                        "task_id": message.task.task_id,
                        "round": message.round_number,
                    }
                )

                # Ask the model for a structured THOUGHT / FINAL ANSWER reply.
                reply = await self._model_client.create(
                    messages=[
                        SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE),
                        UserMessage(
                            content=TASK_SOLVER_ROUND_1_PROMPT.format(problem_text=problem),
                            source="user",
                        ),
                    ],
                    cancellation_token=ctx.cancellation_token,
                )
                thought, final_answer = self._extract_solution_components(reply.content)

                # Broadcast our solution so the moderator can collect it.
                await self.publish_message(
                    AgentSolution(
                        agent_id=self._scientist_id,
                        task_id=message.task.task_id,
                        thought=thought,
                        final_answer=final_answer,
                        round_number=message.round_number,
                    ),
                    topic_id=DefaultTopicId(),
                )

                span.update(
                    metadata={
                        "solution_generated": f"Scientist {self._scientist_id} generated solution for task {message.task.task_id}",
                        "final_answer": final_answer[:100],  # Truncate for logging
                    }
                )

            except Exception as e:
                error_msg = f"Error in scientist {self._scientist_id} round 1: {str(e)}"
                log.error(error_msg)
                log.error(traceback.format_exc())
                span.update(metadata={"error": error_msg})

    @message_handler
    async def handle_agent_revision_request(
        self, message: AgentRevisionRequest, ctx: MessageContext
    ) -> None:
        """Handle revision request with other agents' solutions."""
        with self._langfuse_client.start_as_current_span(
            name=f"scientist_{self._scientist_id}_round_{message.round_number}"
        ) as span:
            try:
                problem = message.task.task_content.get("task", "")

                note = f"Scientist {self._scientist_id} handling revision request for task: {message.task.task_id}, round: {message.round_number}"
                log.info(note)
                span.update(
                    metadata={
                        "revision_request_received": note,
                        "scientist_id": self._scientist_id,
                        "task_id": message.task.task_id,
                        "round": message.round_number,
                        "num_other_solutions": len(message.other_solutions),
                    }
                )

                # Summarize each peer's latest answer; skip our own entry.
                peer_summaries = []
                for sol in message.other_solutions:
                    if sol.agent_id == self._scientist_id:
                        continue
                    peer_summaries.append(
                        f"Scientist {sol.agent_id}: Reasoning: {sol.thought}, Final solution: {sol.final_answer}"
                    )

                reply = await self._model_client.create(
                    messages=[
                        SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE),
                        UserMessage(
                            content=TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT.format(
                                other_solutions="\n\n".join(peer_summaries),
                                problem_text=problem,
                            ),
                            source="user",
                        ),
                    ],
                    cancellation_token=ctx.cancellation_token,
                )
                thought, final_answer = self._extract_solution_components(reply.content)

                await self.publish_message(
                    AgentSolution(
                        agent_id=self._scientist_id,
                        task_id=message.task.task_id,
                        thought=thought,
                        final_answer=final_answer,
                        round_number=message.round_number,
                    ),
                    topic_id=DefaultTopicId(),
                )

                span.update(
                    metadata={
                        "revision_generated": f"Scientist {self._scientist_id} generated revision for task {message.task.task_id}",
                        "final_answer": final_answer[:100],  # Truncate for logging
                    }
                )

            except Exception as e:
                error_msg = f"Error in scientist {self._scientist_id} round {message.round_number}: {str(e)}"
                log.error(error_msg)
                log.error(traceback.format_exc())
                span.update(metadata={"error": error_msg})
--- src/agentic_task_generator.py | 12 +++--- src/agentic_task_solver.py | 10 ++--- src/utils/agentic_prompts.py | 78 +++++++++++++++++++++++------------ 3 files changed, 62 insertions(+), 38 deletions(-) diff --git a/src/agentic_task_generator.py b/src/agentic_task_generator.py index ffacd99..96a221a 100644 --- a/src/agentic_task_generator.py +++ b/src/agentic_task_generator.py @@ -22,9 +22,8 @@ log = logging.getLogger("agentic_task_gen") lf = Langfuse() -openlit.init( - tracer=lf._otel_tracer, disable_batch=True, disable_metrics=True -) +openlit.init(tracer=lf._otel_tracer, disable_batch=True, disable_metrics=True) + @hydra.main(version_base=None, config_path="cfg", config_name="agentic_config") def main(cfg: DictConfig) -> None: @@ -72,12 +71,13 @@ def main(cfg: DictConfig) -> None: metadata={"capabilities_tag_missing": error_msg}, ) return - + if resume_tag: msg = f"Resuming task generation from tag: {resume_tag}" log.info(msg) - span.update(metadata={"resume_tag_found": msg, "resume_tag": resume_tag}) - + span.update( + metadata={"resume_tag_found": msg, "resume_tag": resume_tag} + ) span.update_trace( metadata={ diff --git a/src/agentic_task_solver.py b/src/agentic_task_solver.py index 355c0df..43c7935 100644 --- a/src/agentic_task_solver.py +++ b/src/agentic_task_solver.py @@ -67,7 +67,7 @@ def main(cfg: DictConfig) -> None: if not tasks_file.exists(): raise FileNotFoundError(f"Tasks file not found: {tasks_file}") - + log.info(f"Loading tasks from: {tasks_file}") tasks = load_tasks_from_file(tasks_file) log.info(f"Loaded {len(tasks)} tasks") @@ -87,11 +87,11 @@ def main(cfg: DictConfig) -> None: tasks=tasks, langfuse_client=lf, )) - + # Print summary consensus_count = sum(1 for result in results.values() if result.get("consensus_reached", False)) no_consensus_count = len(results) - consensus_count - + msg = f"Task solving completed. 
Consensus: {consensus_count}, No consensus: {no_consensus_count}" log.info(msg) span.update( @@ -102,7 +102,7 @@ def main(cfg: DictConfig) -> None: "no_consensus": no_consensus_count, } ) - + # Print detailed results if requested if cfg.task_solving.get("print_results", False): for task_id, result in results.items(): @@ -122,4 +122,4 @@ def main(cfg: DictConfig) -> None: if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/utils/agentic_prompts.py b/src/utils/agentic_prompts.py index b4a0d26..a65df8f 100644 --- a/src/utils/agentic_prompts.py +++ b/src/utils/agentic_prompts.py @@ -228,27 +228,6 @@ Sample tasks: {sample_tasks_text}""" -TASK_SCIENTIST_SOLUTION_SYSTEM_PROMPT = """You are Scientist {scientist_id}, an expert in {capability_domain}. You are solving a task related to the capability: {capability_name}. - -IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. The JSON should be directly parseable. - -Please return your solution and your thoughts and reasoning in the following format: -{{ - "thought": "Your reasoning and thought process about solving this problem", - "solutions": {{ - "solution_0": "SOLUTION_TEXT_1", - "solution_1": "SOLUTION_TEXT_2", - ... - }} -}} - -Provide clear, accurate, and complete solutions. Your solutions should be correct and well-reasoned.""" - -TASK_SCIENTIST_SOLUTION_USER_PROMPT = """Solve the following problems: - -{problems} - -Provide your solutions clearly and concisely.""" TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT = """You are the Moderator overseeing capability-based task design. Your task is to review proposed tasks from multiple scientist agents and synthesize a final, high-quality task set for the capability. @@ -256,7 +235,7 @@ - Eliminate any task that is not clearly aligned with the capability. - Merge or remove tasks that are redundant or overly similar. 
- Ensure that the final set of tasks is diverse, non-trivial, and tests different facets of the capability. -- Include a brief justification for each rejected or significantly modified task. +- Select only the highest quality tasks that best represent the capability. IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. Do not include any prefixes or prose. The JSON should be directly parseable. @@ -272,11 +251,6 @@ "task_1": "", "task_2": "", ... - }, - "rejected_tasks": { - "task_from_scientist_A": "Reason for rejection or modification", - "task_from_scientist_B": "Reason for rejection or modification", - ... } }""" @@ -289,6 +263,56 @@ Proposed Tasks: {problems_text}""" +# ============================================================================= +# TASK SOLVING DEBATE PROMPTS +# ============================================================================= + +TASK_SOLVER_SYSTEM_MESSAGE = """You are an expert problem solver participating in a collaborative debate to solve tasks. You will work with other agents to find the best solution through structured discussion and reasoning.""" + +TASK_SOLVER_ROUND_1_PROMPT = """Can you solve the following problem? + +PROBLEM: {problem_text} + +Explain your reasoning step by step. Your final answer should be clearly stated at the end of your response. + +Respond using this format: +THOUGHT: +FINAL ANSWER: """ + +TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT = """These are the reasoning and solutions to the problem from other agents: + +{other_solutions} + +Using the solutions from other agents as additional information, can you provide your answer to the problem? + +The original problem is: {problem_text} + +Explain your reasoning step by step. Your final answer should be clearly stated at the end of your response. 
+ +Respond using this format: +THOUGHT: +FINAL ANSWER: """ + +TASK_MODERATOR_SYSTEM_MESSAGE = """You are a moderator overseeing a collaborative problem-solving debate. Your role is to check for consensus among agents and determine the final solution.""" + +TASK_MODERATOR_CONSENSUS_PROMPT = """Review the following solutions from different agents for the same problem: + +PROBLEM: {problem_text} + +SOLUTIONS: +{all_solutions} + +Determine if there is consensus among the agents. Consensus is reached when: +1. All agents provide the same final answer, OR +2. The majority of agents agree on the same answer with similar reasoning + +If consensus is reached, provide the agreed-upon solution. If not, indicate that another round of debate is needed. + +Respond using this format: +CONSENSUS_REACHED: +FINAL_SOLUTION: +REASONING: """ + # ============================================================================= # SYSTEM MESSAGES # ============================================================================= From b166e4cea29bcd35b4aab1dce00db117ebd81dc7 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 5 Sep 2025 04:06:58 -0400 Subject: [PATCH 05/19] updated agentic config and readme. 
--- README.md | 9 ++++++++- src/cfg/agentic_config.yaml | 26 +++++++++++++++----------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 311eb4f..07f0bf2 100644 --- a/README.md +++ b/README.md @@ -86,5 +86,12 @@ python -m src.agentic_area_generator python -m src.agentic_capability_generator # Generate tasks for each capability -python -m src.agentic_task_generator +python -m src.agentic_task_generator + +# Generate tasks for all capabilities +python -m src.agentic_task_generator pipeline_tags.capabilities_tag=_20250902_030203 + +# Generate solutions for tasks +# python -m sr + ``` diff --git a/src/cfg/agentic_config.yaml b/src/cfg/agentic_config.yaml index aff027d..8b8bb9f 100644 --- a/src/cfg/agentic_config.yaml +++ b/src/cfg/agentic_config.yaml @@ -4,7 +4,7 @@ defaults: # Global configuration global_cfg: domain: math - output_dir: /fs01/projects/aieng/public/ace/agentic_outputs/ + output_dir: agentic_outputs/ # Debate configuration (shared across all stages) debate_cfg: @@ -12,35 +12,39 @@ debate_cfg: # Agent configurations (shared across all stages) agents: - scientist_a: - model_name: o3-mini + scientist_a: + model_name: gpt-5 seed: 8 scientist_b: - model_name: claude-3-5-sonnet-20241022 + model_name: gemini-2.5-pro seed: 88 # If using same model as scientist_a, use different seed for diversity moderator: - model_name: gpt-4o + model_name: claude-opus-4-1-20250805 seed: 888 # Stage 1: Area Generation Configuration area_generation: - num_areas: 2 # Number of top-level areas to generate + num_areas: 20 # Number of top-level areas to generate # Stage 2: Capability Generation Configuration capability_generation: - num_capabilities_per_area: 3 # Number of capabilities to generate per area + num_capabilities_per_area: 20 # Number of capabilities to generate per area # Stage 3: Task Generation Configuration task_generation: - num_final_problems_per_capability: 3 # N: Number of final problems per capability - buffer_param: 2 # 
B: Buffer parameter (extra problems each agent proposes) - agreement_threshold: 0.6 # S: Agreement threshold for solution consensus + num_final_tasks_per_capability: 10 # N: Number of final problems per capability + buffer_param: 5 # B: Buffer parameter (extra problems each agent proposes) + +# Stage 4: Task Solving Configuration +task_solving: + max_tasks: 0 # Maximum number of tasks to process (0 = all) + print_results: false # Whether to print detailed results to console # Experiment configuration exp_cfg: - exp_id: test + exp_id: r0_20x20 # Pipeline tags for chaining stages pipeline_tags: From 084b68c12d823158eb71187dca58fa593d6e6574 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 5 Sep 2025 04:40:59 -0400 Subject: [PATCH 06/19] simplified task generations. --- src/cfg/agentic_config.yaml | 8 ++++---- src/task_generation/__init__.py | 11 ----------- src/task_generation/generator.py | 25 +++++++++++++++---------- src/task_generation/moderator.py | 28 +++++++++------------------- 4 files changed, 28 insertions(+), 44 deletions(-) diff --git a/src/cfg/agentic_config.yaml b/src/cfg/agentic_config.yaml index 8b8bb9f..68a3d9a 100644 --- a/src/cfg/agentic_config.yaml +++ b/src/cfg/agentic_config.yaml @@ -8,11 +8,11 @@ global_cfg: # Debate configuration (shared across all stages) debate_cfg: - max_round: 3 + max_round: 5 # Agent configurations (shared across all stages) agents: - scientist_a: + scientist_a: model_name: gpt-5 seed: 8 @@ -34,10 +34,10 @@ capability_generation: # Stage 3: Task Generation Configuration task_generation: - num_final_tasks_per_capability: 10 # N: Number of final problems per capability + num_final_problems_per_capability: 10 # N: Number of final problems per capability buffer_param: 5 # B: Buffer parameter (extra problems each agent proposes) -# Stage 4: Task Solving Configuration +# Stage 4: Task Solving Configuration task_solving: max_tasks: 0 # Maximum number of tasks to process (0 = all) print_results: false # Whether to print 
detailed results to console diff --git a/src/task_generation/__init__.py b/src/task_generation/__init__.py index 2598dec..8d54a1b 100644 --- a/src/task_generation/__init__.py +++ b/src/task_generation/__init__.py @@ -1,12 +1 @@ """Task generation package for multi-agent debate-based task generation.""" - -from .generator import generate_tasks -from .moderator import TaskModerator -from .scientist import TaskScientist - - -__all__ = [ - "generate_tasks", - "TaskModerator", - "TaskScientist", -] diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py index 0504aa6..a0169ed 100644 --- a/src/task_generation/generator.py +++ b/src/task_generation/generator.py @@ -14,13 +14,13 @@ DefaultTopicId, SingleThreadedAgentRuntime, ) -from src.utils.model_client_utils import get_model_client from langfuse import Langfuse from omegaconf import DictConfig from src.task_generation.messages import Capability from src.task_generation.moderator import TaskModerator from src.task_generation.scientist import TaskScientist +from src.utils.model_client_utils import get_model_client log = logging.getLogger("agentic_task_gen.generator") @@ -160,15 +160,16 @@ async def generate_tasks_for_capability( async def generate_tasks( - cfg: DictConfig, - capabilities_tag: str, + cfg: DictConfig, + capabilities_tag: str, langfuse_client: Langfuse, - resume_tag: str = None, + resume_tag: str, ) -> None: """Generate tasks for all capabilities.""" domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id - + + # Use resume_tag if provided, otherwise create new tag if resume_tag: tasks_tag = resume_tag log.info(f"Resuming task generation with existing tag: {tasks_tag}") @@ -237,6 +238,7 @@ async def generate_tasks( if capabilities_file.exists(): with open(capabilities_file, "r", encoding="utf-8") as f: capabilities_data = json.load(f) + if ( isinstance(capabilities_data, dict) and "capabilities" in capabilities_data @@ -297,6 +299,9 @@ async def generate_tasks( } ) + # Print 
the timestamp for future reference + print(f"Tasks generated with tag: {tasks_tag}") + # Check for existing tasks if resuming existing_tasks = set() if resume_tag and output_dir.exists(): @@ -309,7 +314,7 @@ async def generate_tasks( log.info(msg) span.update(metadata={"existing_tasks": msg}) else: - log.info("No existing tasks found, will generate tasks all capabilities") + log.info("No existing tasks found, will generate all capabilities") processed_capabilities = 0 skipped_capabilities = 0 @@ -317,7 +322,7 @@ async def generate_tasks( # Process each capability individually for i, capability in enumerate(capabilities): capability_dir_name = capability.name.replace(" ", "_") - + # Skip if tasks already exist for this capability if resume_tag and capability_dir_name in existing_tasks: msg = f"Skipping capability {i + 1}/{len(capabilities)}: {capability.name} (already exists)" @@ -331,7 +336,7 @@ async def generate_tasks( ) skipped_capabilities += 1 continue - + msg = f"Processing capability {i + 1}/{len(capabilities)}: {capability.name}" log.info(msg) span.update( @@ -354,10 +359,10 @@ async def generate_tasks( "completed_capability": capability.name, } ) - + processed_capabilities += 1 await asyncio.sleep(1) - + # Final summary msg = f"Task generation completed. 
Processed: {processed_capabilities}, Skipped: {skipped_capabilities}, Total: {len(capabilities)}" log.info(msg) diff --git a/src/task_generation/moderator.py b/src/task_generation/moderator.py index 0238d44..711abe0 100644 --- a/src/task_generation/moderator.py +++ b/src/task_generation/moderator.py @@ -59,7 +59,6 @@ def __init__( self._domain = domain self._langfuse_client = langfuse_client - # Algorithm 1 state self._num_remaining: Dict[str, int] = {} self._final_problems: Dict[ str, Dict[str, str] @@ -71,11 +70,9 @@ def __init__( str, List[ScientistProblemProposal] ] = {} # capability -> proposals - - @message_handler async def handle_capability(self, message: Capability, ctx: MessageContext) -> None: - """Handle capability and start Algorithm 1 for problem design.""" + """Start problem design for a capability.""" with self._langfuse_client.start_as_current_span( name="task_moderator_handle_capability" ) as span: @@ -91,12 +88,9 @@ async def handle_capability(self, message: Capability, ctx: MessageContext) -> N } ) - # Initialize Algorithm 1 state self._num_remaining[message.name] = self._num_final_problems self._final_problems[message.name] = {} - self._capabilities[message.name] = ( - message # Store original capability info - ) + self._capabilities[message.name] = message await self._start_problem_iteration(message) @@ -119,14 +113,12 @@ async def handle_capability(self, message: Capability, ctx: MessageContext) -> N raise async def _start_problem_iteration(self, capability: Capability) -> None: - """Start a problem generation iteration (Algorithm 1).""" + """Start a problem generation iteration.""" try: num_remaining = self._num_remaining[capability.name] if num_remaining <= 0: - log.info( - f"Problem design completed for capability: {capability.name}, starting solution design" - ) - await self._start_solution_design(capability) + log.info(f"Problem design completed for capability: {capability.name}") + await 
self._finalize_tasks_without_solutions(capability.name) return # Calculate problems per scientist: ceil(num_remaining / M) + B @@ -265,7 +257,6 @@ async def _filter_and_select_problems( ) final_tasks = {} - # Update Algorithm 1 state num_remaining = self._num_remaining[capability_name] num_selected = min(len(final_tasks), num_remaining) @@ -286,13 +277,10 @@ async def _filter_and_select_problems( f"Task Moderator selected {selected_count} problems for {capability_name}, {self._num_remaining[capability_name]} remaining" ) - # Continue Algorithm 1 or move to solution design if self._num_remaining[capability_name] > 0: - # Need more problems, start another iteration capability = self._capabilities[capability_name] await self._start_problem_iteration(capability) else: - # Problem design complete, finalize tasks without solutions await self._finalize_tasks_without_solutions(capability_name) except Exception as e: @@ -301,7 +289,7 @@ async def _filter_and_select_problems( raise async def _finalize_tasks_without_solutions(self, capability_name: str) -> None: - """Finalize tasks with problems only (no solutions).""" + """Finalize tasks with problems only.""" try: log.info( f"Task Moderator finalizing tasks for capability: {capability_name}" @@ -324,7 +312,9 @@ async def _finalize_tasks_without_solutions(self, capability_name: str) -> None: # Save final tasks await self._save_tasks_to_file(capability_name, final_tasks) - log.info(f"Task generation completed for capability: {capability_name} ({len(final_tasks)} tasks)") + log.info( + f"Task generation completed for capability: {capability_name} ({len(final_tasks)} tasks)" + ) except Exception as e: log.error(f"Error in Task Moderator _finalize_tasks_without_solutions: {e}") From c155d7430b5bc6142fc7521f0b341f356b0f4e33 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 5 Sep 2025 16:39:38 -0400 Subject: [PATCH 07/19] simplified task generation. 
--- src/task_generation/__init__.py | 5 + src/task_generation/generator.py | 1 + src/task_generation/messages.py | 7 +- src/task_generation/moderator.py | 159 +++++++++++++++---------------- src/task_generation/scientist.py | 5 +- 5 files changed, 83 insertions(+), 94 deletions(-) diff --git a/src/task_generation/__init__.py b/src/task_generation/__init__.py index 8d54a1b..ebcd01c 100644 --- a/src/task_generation/__init__.py +++ b/src/task_generation/__init__.py @@ -1 +1,6 @@ """Task generation package for multi-agent debate-based task generation.""" + +from .generator import generate_tasks + + +__all__ = ["generate_tasks"] diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py index a0169ed..56094cf 100644 --- a/src/task_generation/generator.py +++ b/src/task_generation/generator.py @@ -95,6 +95,7 @@ async def generate_tasks_for_capability( output_dir=output_dir, domain=domain_name, langfuse_client=langfuse_client, + max_round=cfg.task_generation.max_rounds, ), ) diff --git a/src/task_generation/messages.py b/src/task_generation/messages.py index 38daaa9..0ae4692 100644 --- a/src/task_generation/messages.py +++ b/src/task_generation/messages.py @@ -24,6 +24,7 @@ class ProblemProposalRequest: capability_area: str num_problems: int sample_tasks: List[str] + iteration: int = 1 @dataclass @@ -34,9 +35,3 @@ class ScientistProblemProposal: capability_name: str problems: Dict[str, str] # task_id -> task_text iteration: int - - - - - - diff --git a/src/task_generation/moderator.py b/src/task_generation/moderator.py index 711abe0..16e6193 100644 --- a/src/task_generation/moderator.py +++ b/src/task_generation/moderator.py @@ -49,6 +49,7 @@ def __init__( output_dir: Path, domain: str, langfuse_client: Langfuse, + max_round: int = 5, ) -> None: super().__init__("Task Moderator") self._model_client = model_client @@ -58,17 +59,17 @@ def __init__( self._output_dir = output_dir self._domain = domain self._langfuse_client = langfuse_client + 
self._max_round = max_round - self._num_remaining: Dict[str, int] = {} - self._final_problems: Dict[ - str, Dict[str, str] - ] = {} # capability -> {task_id: problem_text} - self._capabilities: Dict[str, Capability] = {} # Store original capability info + self._num_remaining = self._num_final_problems + self._final_problems: Dict[str, str] = {} # {task_id: problem_text} + self._capability: ( + Capability # Store original capability info (set in first message) + ) + self._current_round = 0 # Problem design state - self._problem_proposals: Dict[ - str, List[ScientistProblemProposal] - ] = {} # capability -> proposals + self._problem_proposals: Dict[int, List[ScientistProblemProposal]] = {} @message_handler async def handle_capability(self, message: Capability, ctx: MessageContext) -> None: @@ -77,22 +78,22 @@ async def handle_capability(self, message: Capability, ctx: MessageContext) -> N name="task_moderator_handle_capability" ) as span: try: - msg = f"Task Moderator starting problem design for capability: {message.name}" + capability_name = message.name + msg = f"Task Moderator starting problem design for capability: {capability_name}" log.info(msg) span.update( metadata={ "capability_received": msg, - "capability_name": message.name, + "capability_name": capability_name, "capability_description": message.description, "capability_area": message.area, } ) - self._num_remaining[message.name] = self._num_final_problems - self._final_problems[message.name] = {} - self._capabilities[message.name] = message + self._capability = message + self._problem_proposals[self._current_round] = [] - await self._start_problem_iteration(message) + await self._start_problem_iteration() except Exception as e: error_msg = f"Error in Task Moderator handle_capability: {e}" @@ -112,38 +113,50 @@ async def handle_capability(self, message: Capability, ctx: MessageContext) -> N ) raise - async def _start_problem_iteration(self, capability: Capability) -> None: + async def 
_start_problem_iteration(self) -> None: """Start a problem generation iteration.""" try: - num_remaining = self._num_remaining[capability.name] - if num_remaining <= 0: - log.info(f"Problem design completed for capability: {capability.name}") - await self._finalize_tasks_without_solutions(capability.name) + # Check if we've reached the maximum number of rounds + if self._current_round >= self._max_round: + log.info( + f"Maximum rounds ({self._max_round}) reached for capability: {self._capability.name}.\ + Finalizing with {len(self._final_problems)} problems." + ) + await self._finalize_tasks_without_solutions() + return + + if self._num_remaining <= 0: + log.info( + f"Problem design completed for capability: {self._capability.name}" + ) + await self._finalize_tasks_without_solutions() return # Calculate problems per scientist: ceil(num_remaining / M) + B problems_per_scientist = ( - math.ceil(num_remaining / self._num_scientists) + self._buffer_param + math.ceil(self._num_remaining / self._num_scientists) + + self._buffer_param ) log.info( - f"Task Moderator requesting {problems_per_scientist} problems per scientist for capability: {capability.name} (remaining: {num_remaining})" + f"Task Moderator requesting {problems_per_scientist} problems per scientist for capability: {self._capability.name} (remaining: {self._num_remaining}, round: {self._current_round}/{self._max_round})" ) # Get sample tasks from existing final problems - sample_tasks = list(self._final_problems[capability.name].values())[ + sample_tasks = list(self._final_problems.values())[ :3 ] # Use up to 3 existing problems as samples # Send problem proposal requests to all scientists await self.publish_message( ProblemProposalRequest( - capability_name=capability.name, - capability_description=capability.description, - capability_domain=capability.domain, - capability_area=capability.area, + capability_name=self._capability.name, + capability_description=self._capability.description, + 
capability_domain=self._capability.domain, + capability_area=self._capability.area, num_problems=problems_per_scientist, sample_tasks=sample_tasks, + iteration=self._current_round, ), topic_id=DefaultTopicId(), ) @@ -163,46 +176,30 @@ async def handle_scientist_problem_proposal( f"Task Moderator received problem proposal from Scientist {message.scientist_id} for capability: {message.capability_name}" ) - capability_name = message.capability_name - if capability_name not in self._problem_proposals: - self._problem_proposals[capability_name] = [] - - self._problem_proposals[capability_name].append(message) + self._problem_proposals[self._current_round].append(message) # Check if we have all proposals for this iteration - current_proposals = [ - p - for p in self._problem_proposals[capability_name] - if p.iteration == message.iteration - ] + current_proposals = self._problem_proposals[self._current_round] if len(current_proposals) == self._num_scientists: log.info( - f"Task Moderator received all problem proposals for capability: {capability_name}, proceeding to filter" - ) - await self._filter_and_select_problems( - capability_name, message.iteration + f"Task Moderator received all problem proposals for capability: {self._capability.name}, proceeding to filter" ) + await self._filter_and_select_problems() except Exception as e: log.error(f"Error in Task Moderator handle_scientist_problem_proposal: {e}") log.error(f"Traceback: {traceback.format_exc()}") raise - async def _filter_and_select_problems( - self, capability_name: str, iteration: int - ) -> None: + async def _filter_and_select_problems(self) -> None: """Filter and select problems using moderator LLM.""" try: log.info( - f"Task Moderator filtering problems for capability: {capability_name}" + f"Task Moderator filtering problems for capability: {self._capability.name}" ) # Collect all proposed problems - current_proposals = [ - p - for p in self._problem_proposals[capability_name] - if p.iteration == iteration 
- ] + current_proposals = self._problem_proposals[self._current_round] all_problems = {} scientist_attribution = {} @@ -213,7 +210,9 @@ async def _filter_and_select_problems( scientist_attribution[unique_id] = proposal.scientist_id if not all_problems: - log.warning(f"No problems received for capability: {capability_name}") + log.warning( + f"No problems received for capability: {self._capability.name}" + ) return # Format problems for moderator @@ -226,17 +225,14 @@ async def _filter_and_select_problems( problems_text += f"- {task_name}: {problem}\n" problems_text += "\n" - system_prompt = TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT - - capability_info = self._capabilities[capability_name] user_prompt = TASK_MODERATOR_PROBLEM_USER_PROMPT.format( - capability_name=capability_info.name, - capability_description=capability_info.description, - capability_domain=capability_info.domain, + capability_name=self._capability.name, + capability_description=self._capability.description, + capability_domain=self._capability.domain, problems_text=problems_text, ) - system_message = SystemMessage(content=system_prompt) + system_message = SystemMessage(content=TASK_MODERATOR_PROBLEM_SYSTEM_PROMPT) user_message = UserMessage(content=user_prompt, source="user") model_result = await self._model_client.create( @@ -257,63 +253,60 @@ async def _filter_and_select_problems( ) final_tasks = {} - num_remaining = self._num_remaining[capability_name] - num_selected = min(len(final_tasks), num_remaining) + num_selected = min(len(final_tasks), self._num_remaining) # Add selected problems to final set selected_count = 0 for _, problem_text in final_tasks.items(): if selected_count < num_selected: - final_task_id = ( - f"task_{len(self._final_problems[capability_name]) + 1}" - ) - self._final_problems[capability_name][final_task_id] = problem_text + final_task_id = f"task_{len(self._final_problems) + 1}" + self._final_problems[final_task_id] = problem_text selected_count += 1 # Update remaining count - 
self._num_remaining[capability_name] = num_remaining - selected_count + self._num_remaining = self._num_remaining - selected_count log.info( - f"Task Moderator selected {selected_count} problems for {capability_name}, {self._num_remaining[capability_name]} remaining" + f"Task Moderator selected {selected_count} problems for {self._capability.name}, {self._num_remaining} remaining" ) - if self._num_remaining[capability_name] > 0: - capability = self._capabilities[capability_name] - await self._start_problem_iteration(capability) + if self._num_remaining > 0: + # Increment round counter before starting next iteration + self._current_round += 1 + await self._start_problem_iteration() else: - await self._finalize_tasks_without_solutions(capability_name) + await self._finalize_tasks_without_solutions() except Exception as e: log.error(f"Error in Task Moderator _filter_and_select_problems: {e}") log.error(f"Traceback: {traceback.format_exc()}") raise - async def _finalize_tasks_without_solutions(self, capability_name: str) -> None: + async def _finalize_tasks_without_solutions(self) -> None: """Finalize tasks with problems only.""" try: log.info( - f"Task Moderator finalizing tasks for capability: {capability_name}" + f"Task Moderator finalizing tasks for capability: {self._capability.name}" ) - final_problems = self._final_problems[capability_name] - if not final_problems: + if not self._final_problems: log.error( - f"No final problems available for capability: {capability_name}" + f"No final problems available for capability: {self._capability.name}" ) return # Create tasks with problems only final_tasks = {} - for task_id, problem_text in final_problems.items(): + for task_id, problem_text in self._final_problems.items(): final_tasks[task_id] = { "task": problem_text, - "capability_id": capability_name, + "capability_id": self._capability.name, } # Save final tasks - await self._save_tasks_to_file(capability_name, final_tasks) + await 
self._save_tasks_to_file(final_tasks) log.info( - f"Task generation completed for capability: {capability_name} ({len(final_tasks)} tasks)" + f"Task generation completed for capability: {self._capability.name} ({len(final_tasks)} tasks)" ) except Exception as e: @@ -321,13 +314,11 @@ async def _finalize_tasks_without_solutions(self, capability_name: str) -> None: log.error(f"Traceback: {traceback.format_exc()}") raise - async def _save_tasks_to_file( - self, capability_name: str, tasks: Dict[str, Dict[str, str]] - ) -> None: + async def _save_tasks_to_file(self, tasks: Dict[str, Dict[str, str]]) -> None: """Save final tasks to file.""" try: # Create capability directory - capability_dir = self._output_dir / capability_name + capability_dir = self._output_dir / self._capability.name capability_dir.mkdir(parents=True, exist_ok=True) # Save tasks @@ -336,9 +327,9 @@ async def _save_tasks_to_file( json.dump({"tasks": tasks}, f, indent=2, ensure_ascii=False) log.info( - f"Saved {len(tasks)} tasks for capability '{capability_name}' to {tasks_file}" + f"Saved {len(tasks)} tasks for capability '{self._capability.name}' to {tasks_file}" ) except Exception as e: - log.error(f"Error saving tasks for capability {capability_name}: {e}") + log.error(f"Error saving tasks for capability {self._capability.name}: {e}") log.error(f"Traceback: {traceback.format_exc()}") raise diff --git a/src/task_generation/scientist.py b/src/task_generation/scientist.py index 25b25d8..e66a7eb 100644 --- a/src/task_generation/scientist.py +++ b/src/task_generation/scientist.py @@ -1,6 +1,5 @@ """Task scientist agent for generating problems and solutions.""" -import json import logging import traceback @@ -125,7 +124,7 @@ async def handle_problem_proposal_request( scientist_id=self._scientist_id, capability_name=message.capability_name, problems=problems, - iteration=0, + iteration=getattr(message, "iteration", 0), ), topic_id=DefaultTopicId(), ) @@ -148,5 +147,3 @@ async def 
handle_problem_proposal_request( }, ) raise - - From 52b4d2a8c5a4ec008701a167fe6c6570ae168713 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Sun, 7 Sep 2025 02:55:50 -0400 Subject: [PATCH 08/19] fixed mypy errors. --- README.md | 7 +- src/agentic_capability_generator.py | 7 +- src/agentic_task_solver.py | 126 ++--- src/capability_generation/generator.py | 7 +- src/capability_generation/messages.py | 2 +- src/cfg/agentic_config.yaml | 9 +- src/task_solver/__init__.py | 6 + src/task_solver/generator.py | 246 ++++++++++ src/task_solver/messages.py | 81 ++++ src/task_solver/moderator.py | 442 ++++++++++++++++++ .../scientist.py | 125 ++--- src/task_solving/__init__.py | 17 - src/task_solving/generator.py | 225 --------- src/task_solving/messages.py | 64 --- src/task_solving/moderator.py | 342 -------------- src/utils/agentic_prompts.py | 57 ++- 16 files changed, 970 insertions(+), 793 deletions(-) create mode 100644 src/task_solver/__init__.py create mode 100644 src/task_solver/generator.py create mode 100644 src/task_solver/messages.py create mode 100644 src/task_solver/moderator.py rename src/{task_solving => task_solver}/scientist.py (53%) delete mode 100644 src/task_solving/__init__.py delete mode 100644 src/task_solving/generator.py delete mode 100644 src/task_solving/messages.py delete mode 100644 src/task_solving/moderator.py diff --git a/README.md b/README.md index 07f0bf2..a88c7e3 100644 --- a/README.md +++ b/README.md @@ -86,12 +86,13 @@ python -m src.agentic_area_generator python -m src.agentic_capability_generator # Generate tasks for each capability -python -m src.agentic_task_generator +python -m src.agentic_task_generator # Generate tasks for all capabilities python -m src.agentic_task_generator pipeline_tags.capabilities_tag=_20250902_030203 -# Generate solutions for tasks -# python -m sr +# Generate solutions for tasks using multi-agent debate +python -m src.agentic_task_solver pipeline_tags.tasks_tag=_20250905_153532 + ``` diff --git 
a/src/agentic_capability_generator.py b/src/agentic_capability_generator.py index e9d9d80..835813e 100644 --- a/src/agentic_capability_generator.py +++ b/src/agentic_capability_generator.py @@ -4,6 +4,7 @@ import logging import os import traceback +from typing import Optional import hydra import openlit @@ -29,7 +30,9 @@ def main(cfg: DictConfig) -> None: """Run the multi-agent debate-based capability generation system.""" areas_tag = cfg.pipeline_tags.areas_tag - resume_tag = getattr(cfg.pipeline_tags, "resume_capabilities_tag", None) + resume_tag: Optional[str] = getattr( + cfg.pipeline_tags, "resume_capabilities_tag", None + ) domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id num_capabilities_per_area = cfg.capability_generation.num_capabilities_per_area @@ -63,7 +66,7 @@ def main(cfg: DictConfig) -> None: error_msg = "No areas_tag provided. Please provide pipeline_tags.areas_tag= to specify which areas to use." log.warning(error_msg) span.update( - level="WARNING", + level="ERROR", status_message="Missing areas_tag", metadata={"areas_tag_missing": error_msg}, ) diff --git a/src/agentic_task_solver.py b/src/agentic_task_solver.py index 43c7935..49a52f2 100644 --- a/src/agentic_task_solver.py +++ b/src/agentic_task_solver.py @@ -4,14 +4,13 @@ import logging import os import traceback -from pathlib import Path import hydra import openlit from langfuse import Langfuse from omegaconf import DictConfig, OmegaConf -from src.task_solving.generator import solve_tasks_with_debate, load_tasks_from_file +from src.task_solver import solve_tasks # Suppress OpenTelemetry console output @@ -20,25 +19,27 @@ os.environ["OTEL_PYTHON_LOG_CORRELATION"] = "false" os.environ["OTEL_PYTHON_LOG_LEVEL"] = "ERROR" -log = logging.getLogger("agentic_task_solving") +log = logging.getLogger("agentic_task_solver") -lf = Langfuse() -openlit.init(tracer=lf._otel_tracer, disable_batch=True, disable_metrics=True) +langfuse_client = Langfuse() +openlit.init( + 
tracer=langfuse_client._otel_tracer, disable_batch=True, disable_metrics=True +) @hydra.main(version_base=None, config_path="cfg", config_name="agentic_config") def main(cfg: DictConfig) -> None: """Run the multi-agent debate-based task solving system.""" + tasks_tag = cfg.pipeline_tags.get("tasks_tag") + resume_tag = getattr(cfg.pipeline_tags, "resume_solutions_tag", None) domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id - output_dir = cfg.global_cfg.output_dir - max_tasks = cfg.task_solving.get("max_tasks", 0) - with lf.start_as_current_span( - name=f"ace_agentic_task_solving:{domain_name}:{exp_id}" + with langfuse_client.start_as_current_span( + name=f"ace_agentic_task_solver:{domain_name}:{exp_id}" ) as span: try: - msg = "Starting multi-agent debate-based task solving" + msg = "Starting multi-agent debate-based task solver" log.info(msg) span.update(metadata={"system_started": msg}) @@ -54,71 +55,70 @@ def main(cfg: DictConfig) -> None: } ) - # Load tasks from the specified file or use pipeline tags to find them - tasks_file = None - if cfg.pipeline_tags.get("tasks_tag"): - # Look for tasks file using the tag - tasks_dir = Path(output_dir) / domain_name / "tasks" - tasks_file = tasks_dir / f"tasks_{cfg.pipeline_tags.tasks_tag}.json" - elif cfg.task_solving.get("input_file"): - tasks_file = Path(cfg.task_solving.input_file) + if tasks_tag: + msg = f"Using tasks from tag: {tasks_tag}" + log.info(msg) + span.update( + metadata={ + "tasks_tag_found": msg, + "tasks_tag": tasks_tag, + } + ) else: - raise ValueError("Either pipeline_tags.tasks_tag or task_solving.input_file must be specified") - - if not tasks_file.exists(): - raise FileNotFoundError(f"Tasks file not found: {tasks_file}") - - log.info(f"Loading tasks from: {tasks_file}") - tasks = load_tasks_from_file(tasks_file) - log.info(f"Loaded {len(tasks)} tasks") + error_msg = "No tasks_tag provided. Please provide pipeline_tags.tasks_tag= to specify which tasks to solve." 
+ log.warning(error_msg) + span.update( + level="ERROR", + status_message="Missing tasks_tag", + metadata={"tasks_tag_missing": error_msg}, + ) + return + + if resume_tag: + msg = f"Resuming task solving from tag: {resume_tag}" + log.info(msg) + span.update( + metadata={"resume_tag_found": msg, "resume_tag": resume_tag} + ) + + span.update_trace( + metadata={ + "domain": domain_name, + "exp_id": exp_id, + "tasks_tag": tasks_tag, + "resume_tag": resume_tag, + "config": config_yaml, + }, + tags=["agentic_task_solver", exp_id], + ) - # Limit number of tasks if specified - if max_tasks > 0: - tasks = tasks[:max_tasks] - log.info(f"Limited to {len(tasks)} tasks") + asyncio.run(solve_tasks(cfg, tasks_tag, langfuse_client, resume_tag)) - # Run task solving - msg = f"Running task solving for {len(tasks)} tasks" + msg = "Multi-agent debate-based task solving completed successfully" log.info(msg) - span.update(metadata={"task_solving_started": msg}) + span.update(metadata={"system_completed": msg}) - results = asyncio.run(solve_tasks_with_debate( - cfg=cfg, - tasks=tasks, - langfuse_client=lf, - )) + except Exception as e: + error_msg = f"Task solving failed: {e}" + traceback_msg = f"Full traceback: {traceback.format_exc()}" - # Print summary - consensus_count = sum(1 for result in results.values() if result.get("consensus_reached", False)) - no_consensus_count = len(results) - consensus_count + log.error(error_msg) + log.error(traceback_msg) - msg = f"Task solving completed. 
Consensus: {consensus_count}, No consensus: {no_consensus_count}" - log.info(msg) span.update( + level="ERROR", + status_message=str(e), metadata={ - "task_solving_completed": msg, - "total_tasks": len(results), - "consensus_reached": consensus_count, - "no_consensus": no_consensus_count, - } + "system_error": error_msg, + "error": str(e), + "traceback": traceback_msg, + }, ) - # Print detailed results if requested - if cfg.task_solving.get("print_results", False): - for task_id, result in results.items(): - log.info(f"\nTask {task_id}:") - log.info(f" Solution: {result['solution'][:100]}...") - log.info(f" Consensus: {result['consensus_reached']}") - log.info(f" Rounds: {result['total_rounds']}") + raise - except Exception as e: - error_msg = f"Error in agentic task solving: {str(e)}" - log.error(error_msg) - log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) - raise - finally: - lf.flush() + finally: + langfuse_client.flush() if __name__ == "__main__": diff --git a/src/capability_generation/generator.py b/src/capability_generation/generator.py index 54d6c0f..b8ffc65 100644 --- a/src/capability_generation/generator.py +++ b/src/capability_generation/generator.py @@ -6,6 +6,7 @@ import traceback from datetime import datetime from pathlib import Path +from typing import Optional from autogen_core import ( EVENT_LOGGER_NAME, @@ -30,7 +31,7 @@ async def generate_capabilities_for_area( - cfg: DictConfig, area: Area, output_dir: Path, langfuse_client: Langfuse = None + cfg: DictConfig, area: Area, output_dir: Path, langfuse_client: Langfuse ) -> None: """Generate capabilities for a single area.""" with langfuse_client.start_as_current_span( @@ -153,8 +154,8 @@ async def generate_capabilities_for_area( async def generate_capabilities( cfg: DictConfig, areas_tag: str, - langfuse_client: Langfuse = None, - resume_tag: str = None, + langfuse_client: Langfuse, + resume_tag: Optional[str] = None, ) -> None: """Generate capabilities using 
multi-agent debate system for each area.""" domain_name = cfg.global_cfg.domain diff --git a/src/capability_generation/messages.py b/src/capability_generation/messages.py index 5118ea4..32e5bba 100644 --- a/src/capability_generation/messages.py +++ b/src/capability_generation/messages.py @@ -37,4 +37,4 @@ class CapabilityRevisionRequest: scientist_id: str moderator_proposal: str area_name: str - round: int \ No newline at end of file + round: int diff --git a/src/cfg/agentic_config.yaml b/src/cfg/agentic_config.yaml index 68a3d9a..39b7db6 100644 --- a/src/cfg/agentic_config.yaml +++ b/src/cfg/agentic_config.yaml @@ -34,13 +34,14 @@ capability_generation: # Stage 3: Task Generation Configuration task_generation: - num_final_problems_per_capability: 10 # N: Number of final problems per capability - buffer_param: 5 # B: Buffer parameter (extra problems each agent proposes) + num_final_problems_per_capability: 5 # N: Number of final problems per capability + buffer_param: 2 # B: Buffer parameter (extra problems each agent proposes) + max_rounds: 2 # Maximum number of rounds for task generation # Stage 4: Task Solving Configuration -task_solving: +task_solver: max_tasks: 0 # Maximum number of tasks to process (0 = all) - print_results: false # Whether to print detailed results to console + max_rounds: 3 # Maximum number of debate rounds for task solving # Experiment configuration exp_cfg: diff --git a/src/task_solver/__init__.py b/src/task_solver/__init__.py new file mode 100644 index 0000000..ff8672d --- /dev/null +++ b/src/task_solver/__init__.py @@ -0,0 +1,6 @@ +"""Task solving module with debate-based approach.""" + +from .generator import solve_tasks + + +__all__ = ["solve_tasks"] diff --git a/src/task_solver/generator.py b/src/task_solver/generator.py new file mode 100644 index 0000000..85d12d5 --- /dev/null +++ b/src/task_solver/generator.py @@ -0,0 +1,246 @@ +"""Main task solver orchestration function.""" + +import json +import logging +import traceback +from 
datetime import datetime +from pathlib import Path +from typing import Optional + +from autogen_core import ( + EVENT_LOGGER_NAME, + ROOT_LOGGER_NAME, + TRACE_LOGGER_NAME, + DefaultTopicId, + SingleThreadedAgentRuntime, +) +from langfuse import Langfuse +from omegaconf import DictConfig + +from src.task_solver.messages import Task +from src.task_solver.moderator import TaskSolverModerator +from src.task_solver.scientist import TaskSolverScientist +from src.utils.model_client_utils import get_model_client + + +log = logging.getLogger("task_solver.generator") +logging.getLogger(ROOT_LOGGER_NAME).setLevel(logging.WARNING) +logging.getLogger(TRACE_LOGGER_NAME).setLevel(logging.WARNING) +logging.getLogger(EVENT_LOGGER_NAME).setLevel(logging.WARNING) + + +async def solve_task( + cfg: DictConfig, task: Task, output_dir: Path, langfuse_client: Langfuse +) -> None: + """Solve a task using multi-agent debate system.""" + max_rounds = cfg.task_solver.max_rounds + task_id = task.task_id + capability_name = task.capability_name + + with langfuse_client.start_as_current_span( + name=f"task_solver_for_task:{task_id}, capability:{capability_name}" + ) as span: + try: + msg = f"Generating solutions for task: {task_id}, capability: {capability_name}" + log.info(msg) + span.update( + metadata={ + "single_task_solver_started": msg, + "task_id": task_id, + "problem": task.problem, + "capability_name": capability_name, + } + ) + + runtime = SingleThreadedAgentRuntime() + + # Register moderator + await TaskSolverModerator.register( + runtime, + "TaskSolverModerator", + lambda: TaskSolverModerator( + model_client=get_model_client( + model_name=cfg.agents.moderator.model_name, + seed=cfg.agents.moderator.get("seed"), + ), + num_solvers=2, + max_rounds=max_rounds, + output_dir=output_dir, + langfuse_client=langfuse_client, + ), + ) + + # Register scientist agents + await TaskSolverScientist.register( + runtime, + "TaskSolverScientistA", + lambda: TaskSolverScientist( + 
model_client=get_model_client( + model_name=cfg.agents.scientist_a.model_name, + seed=cfg.agents.scientist_a.get("seed"), + ), + scientist_id="A", + langfuse_client=langfuse_client, + ), + ) + + await TaskSolverScientist.register( + runtime, + "TaskSolverScientistB", + lambda: TaskSolverScientist( + model_client=get_model_client( + model_name=cfg.agents.scientist_b.model_name, + seed=cfg.agents.scientist_b.get("seed"), + ), + scientist_id="B", + langfuse_client=langfuse_client, + ), + ) + + span.update( + metadata={ + "agents_registered": "All task agents registered successfully", + "scientists": ["A", "B"], + "moderator": True, + } + ) + + # Start runtime + runtime.start() + + await runtime.publish_message(task, DefaultTopicId()) + + msg = f"Task message published: {task_id}, capability: {capability_name}" + log.info(msg) + span.update( + metadata={ + "task_published": msg, + "task_id": task_id, + "capability_name": capability_name, + } + ) + + try: + await runtime.stop_when_idle() + msg = ( + f"Completed solving task: {task_id}, capability: {capability_name}" + ) + log.info(msg) + span.update(metadata={"runtime_completed": msg}) + except Exception as e: + msg = f"Error while solving task {task_id}, capability: {capability_name}: {e}" + log.error(msg) + span.update( + level="ERROR", + status_message=str(e), + metadata={ + "runtime_error": msg, + "error": str(e), + "task_id": task_id, + "capability_name": capability_name, + }, + ) + raise + except Exception as e: + error_msg = f"Error in task solver: {str(e)}" + log.error(error_msg) + log.error(traceback.format_exc()) + span.update(metadata={"error": error_msg}) + raise + + +async def solve_tasks( + cfg: DictConfig, + tasks_tag: str, + langfuse_client: Langfuse, + resume_tag: Optional[str] = None, +) -> None: + """Solve tasks using multi-agent debate system.""" + domain_name = cfg.global_cfg.domain + exp_id = cfg.exp_cfg.exp_id + + if resume_tag: + solutions_tag = resume_tag + log.info(f"Resuming task solver with 
existing tag: {solutions_tag}") + else: + solutions_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + output_dir = ( + Path.home() + / cfg.global_cfg.output_dir + / domain_name.replace(" ", "_") + / exp_id + / "task_solutions" + / solutions_tag + ) + + with langfuse_client.start_as_current_span( + name=f"ace_task_solver:{domain_name}:{exp_id}:{solutions_tag}" + ) as span: + try: + msg = f"Solutions will be saved with tag: {solutions_tag}" + print(msg) + log.info(msg) + span.update( + metadata={ + "solver_started": msg, + "solutions_tag": solutions_tag, + "resume_tag": resume_tag, + "output_dir": output_dir, + "tasks_tag": tasks_tag, + "domain": domain_name, + "exp_id": exp_id, + }, + tags=["task_solver_process", exp_id], + ) + + tasks_dir = ( + Path.home() + / cfg.global_cfg.output_dir + / domain_name.replace(" ", "_") + / exp_id + / "tasks" + / tasks_tag + ) + + if not tasks_dir.exists(): + error_msg = f"Tasks directory not found: {tasks_dir}" + log.error(error_msg) + span.update( + level="ERROR", + status_message="Tasks directory not found", + metadata={ + "directory_not_found_error": error_msg, + "tasks_dir": str(tasks_dir), + }, + ) + raise FileNotFoundError(error_msg) + + for capability_dir in tasks_dir.iterdir(): + if capability_dir.is_dir(): + # Check if the last part of capability_dir exists in output_dir + output_solver_dir = Path(output_dir) / capability_dir.name + if output_solver_dir.exists(): + msg = f"Solutions for tasks under capability {capability_dir.name} already exist: {output_solver_dir}" + log.info(msg) + span.update(metadata={"task_solver_skipped": msg}) + continue + + tasks_file = capability_dir / "tasks.json" + if tasks_file.exists(): + with open(tasks_file, "r", encoding="utf-8") as f: + tasks = json.load(f)["tasks"] + for task_id, task_data in tasks.items(): + task = Task( + task_id=task_id, + problem=task_data["task"], + capability_name=task_data["capability_id"], + ) + await solve_task(cfg, task, output_dir,
langfuse_client) + + except Exception as e: + error_msg = f"Error in task solver: {str(e)}" + log.error(error_msg) + log.error(f"Traceback: {traceback.format_exc()}") + span.update(metadata={"error": error_msg}) + raise diff --git a/src/task_solver/messages.py b/src/task_solver/messages.py new file mode 100644 index 0000000..5187bcc --- /dev/null +++ b/src/task_solver/messages.py @@ -0,0 +1,81 @@ +"""Message types for task solving debate system.""" + +from dataclasses import dataclass +from typing import Dict, List + + +@dataclass +class Task: + """Task to be solved.""" + + task_id: str + problem: str + capability_name: str + + +@dataclass +class TaskSolutionRequest: + """Request to solve a task.""" + + task_id: str + problem: str + capability_name: str + round_number: int = 1 + + +@dataclass +class AgentSolution: + """Solution proposed by an agent.""" + + agent_id: str + task_id: str + thought: str + final_answer: str + numerical_answer: str + round_number: int + + def to_dict(self) -> Dict[str, str]: + """Convert to dictionary.""" + return { + "agent_id": self.agent_id, + "task_id": self.task_id, + "thought": self.thought, + "final_answer": self.final_answer, + "numerical_answer": self.numerical_answer, + "round_number": str(self.round_number), + } + + +@dataclass +class AgentRevisionRequest: + """Request for agent to revise solution based on other agents' solutions.""" + + task_id: str + problem: str + capability_name: str + other_solutions: List[Dict[str, str]] + round_number: int + + +@dataclass +class ConsensusCheck: + """Check if consensus has been reached.""" + + task_id: str + solutions: List[Dict[str, str]] + round_number: int + + +@dataclass +class FinalSolution: + """Final solution for a task.""" + + task_id: str + capability_name: str + problem: str + solution: str + numerical_answer: str + reasoning: str + consensus_reached: bool + total_rounds: int + all_solutions: List[Dict[str, str]] diff --git a/src/task_solver/moderator.py 
b/src/task_solver/moderator.py new file mode 100644 index 0000000..673baac --- /dev/null +++ b/src/task_solver/moderator.py @@ -0,0 +1,442 @@ +"""Task solver moderator agent for managing the debate process.""" + +import json +import logging +import re +import traceback +from pathlib import Path +from typing import Dict, List + +from autogen_core import ( + DefaultTopicId, + MessageContext, + RoutedAgent, + default_subscription, + message_handler, +) +from autogen_core.models import ( + ChatCompletionClient, + SystemMessage, + UserMessage, +) +from langfuse import Langfuse + +from src.task_solver.messages import ( + AgentRevisionRequest, + AgentSolution, + FinalSolution, + Task, + TaskSolutionRequest, +) +from src.utils.agentic_prompts import ( + TASK_MODERATOR_CONSENSUS_PROMPT, + TASK_MODERATOR_SYSTEM_MESSAGE, +) +from src.utils.json_utils import parse_llm_json_response + + +log = logging.getLogger("task_solver.moderator") + + +@default_subscription +class TaskSolverModerator(RoutedAgent): + """Moderator that manages task solver debate and checks for consensus.""" + + def __init__( + self, + model_client: ChatCompletionClient, + num_solvers: int, + max_rounds: int, + output_dir: Path, + langfuse_client: Langfuse, + ) -> None: + super().__init__("Task Solver Moderator") + self._model_client = model_client + self._num_solvers = num_solvers + self._max_rounds = max_rounds + self._output_dir = output_dir + self._langfuse_client = langfuse_client + + # Track solutions by task_id and round + self._solutions_buffer: Dict[int, List[AgentSolution]] + self._current_round = 0 + self._final_solutions: FinalSolution + self._tasks: Task # Store original tasks for consensus checking + + def _extract_consensus_components( + self, response: str + ) -> tuple[bool, str, str, str]: + """Extract consensus, solution, reasoning, and numerical answer from JSON.""" + try: + parsed = parse_llm_json_response(response) + consensus_reached = parsed.get("consensus_reached", False) + 
final_solution = parsed.get("final_solution", "NONE") + reasoning = parsed.get("reasoning", "No reasoning provided") + numerical_answer = parsed.get("numerical_answer") + + # Convert numerical_answer to string representation + if numerical_answer is not None: + numerical_answer = str(numerical_answer) + else: + numerical_answer = "null" + + return consensus_reached, final_solution, reasoning, numerical_answer + + except Exception as e: + # Fallback to old text parsing if JSON parsing fails + log.warning( + f"Failed to parse JSON response from moderator, falling back to text parsing: {e}" + ) + consensus_match = re.search( + r"CONSENSUS_REACHED:\s*(true|false)", response, re.IGNORECASE + ) + solution_match = re.search( + r"FINAL_SOLUTION:\s*(.*?)(?=REASONING:|$)", + response, + re.DOTALL | re.IGNORECASE, + ) + reasoning_match = re.search( + r"REASONING:\s*(.*?)$", response, re.DOTALL | re.IGNORECASE + ) + + consensus_reached = ( + consensus_match.group(1).lower() == "true" if consensus_match else False + ) + final_solution = ( + solution_match.group(1).strip() if solution_match else "NONE" + ) + reasoning = ( + reasoning_match.group(1).strip() + if reasoning_match + else "No reasoning provided" + ) + + return consensus_reached, final_solution, reasoning, "null" + + def _check_simple_consensus( + self, solutions: List[AgentSolution] + ) -> tuple[bool, str, str]: + """Check consensus; if all agents have the same final answer.""" + if not solutions: + return False, "", "null" + + # First check numerical answers if they exist + numerical_answers = [ + sol.numerical_answer for sol in solutions if sol.numerical_answer != "null" + ] + if ( + len(numerical_answers) == len(solutions) + and len(set(numerical_answers)) == 1 + ): + return True, solutions[0].final_answer, solutions[0].numerical_answer + + # Fallback to text-based consensus + answers = [sol.final_answer.strip().lower() for sol in solutions] + if len(set(answers)) == 1: + return True, solutions[0].final_answer, 
solutions[0].numerical_answer + + return False, "", "null" + + @message_handler + async def handle_task(self, message: Task, ctx: MessageContext) -> None: + """Handle a task and initiate the solver process.""" + with self._langfuse_client.start_as_current_span( + name=f"moderator_handle_task_{message.task_id}" + ) as span: + try: + msg = f"Moderator received task: {message.task_id}" + log.info(msg) + span.update( + metadata={ + "task_received": msg, + "task_id": message.task_id, + "capability_name": message.capability_name, + } + ) + + # Initialize tracking for this task + self._solutions_buffer = {} + self._tasks = message + + # Send initial solution request to all solvers + await self.publish_message( + TaskSolutionRequest( + task_id=message.task_id, + problem=message.problem, + capability_name=message.capability_name, + round_number=1, + ), + topic_id=DefaultTopicId(), + ) + + span.update( + metadata={ + "solution_request_sent": f"Round 1 solution request sent for task {message.task_id}" + } + ) + + except Exception as e: + error_msg = f"Error handling task {message.task_id}: {str(e)}" + log.error(error_msg) + log.error(traceback.format_exc()) + span.update(metadata={"error": error_msg}) + + @message_handler + async def handle_agent_solution( + self, message: AgentSolution, ctx: MessageContext + ) -> None: + """Handle solution from an agent.""" + with self._langfuse_client.start_as_current_span( + name=f"moderator_handle_solution_{message.task_id}_round_{message.round_number}" + ) as span: + try: + task_id = message.task_id + round_num = message.round_number + + msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, round {round_num}" + log.info(msg) + span.update( + metadata={ + "solution_received": msg, + "task_id": task_id, + "agent_id": message.agent_id, + "round": round_num, + } + ) + + # Initialize round buffer if needed + if round_num not in self._solutions_buffer: + self._solutions_buffer[round_num] = [] + + # Add solution 
to buffer + self._solutions_buffer[round_num].append(message) + + # Check if we have all solutions for this round + if len(self._solutions_buffer[round_num]) == self._num_solvers: + await self._check_consensus_and_proceed(task_id, round_num, ctx) + + span.update( + metadata={ + "solutions_collected": f"{len(self._solutions_buffer[round_num])}/{self._num_solvers} for round {round_num}" + } + ) + + except Exception as e: + error_msg = ( + f"Error handling solution from agent {message.agent_id}: {str(e)}" + ) + log.error(error_msg) + log.error(traceback.format_exc()) + span.update(metadata={"error": error_msg}) + + async def _check_consensus_and_proceed( + self, task_id: str, round_num: int, ctx: MessageContext + ) -> None: + """Check for consensus and either finalize or start next round.""" + with self._langfuse_client.start_as_current_span( + name=f"moderator_consensus_check_{task_id}_round_{round_num}" + ) as span: + try: + solutions = self._solutions_buffer[round_num] + + # First try simple consensus check + simple_consensus, simple_solution, simple_numerical = ( + self._check_simple_consensus(solutions) + ) + + if simple_consensus: + # Simple consensus reached + final_solution = FinalSolution( + task_id=task_id, + capability_name=self._tasks.capability_name, + problem=self._tasks.problem, + solution=simple_solution, + numerical_answer=simple_numerical, + reasoning="All agents provided the same answer", + consensus_reached=True, + total_rounds=round_num, + all_solutions=self._get_all_solutions(), + ) + + self._final_solutions = final_solution + await self._save_final_solution(final_solution) + + span.update( + metadata={ + "consensus_reached": True, + "method": "simple", + "final_solution": simple_solution[:100], + } + ) + return + + if round_num < self._max_rounds: + # Use LLM moderator to check for consensus + stored_task = self._tasks # Get original task + + # Format solutions for LLM + all_solutions_text = "\n\n".join( + [ + f"Agent {sol.agent_id}:\nReasoning: 
{sol.thought}\nFinal Answer: {sol.final_answer}" + for sol in solutions + ] + ) + + prompt = TASK_MODERATOR_CONSENSUS_PROMPT.format( + problem_text=stored_task.problem, + all_solutions=all_solutions_text, + ) + + system_message = SystemMessage( + content=TASK_MODERATOR_SYSTEM_MESSAGE + ) + user_message = UserMessage(content=prompt, source="user") + + response = await self._model_client.create( + messages=[system_message, user_message], + cancellation_token=ctx.cancellation_token, + ) + + ( + consensus_reached, + final_solution_text, + reasoning, + numerical_answer, + ) = self._extract_consensus_components(str(response.content)) + + if consensus_reached: + # LLM found consensus + final_solution = FinalSolution( + task_id=task_id, + capability_name=self._tasks.capability_name, + problem=self._tasks.problem, + solution=final_solution_text, + numerical_answer=numerical_answer, + reasoning=reasoning, + consensus_reached=True, + total_rounds=round_num, + all_solutions=self._get_all_solutions(), + ) + + self._final_solutions = final_solution + await self._save_final_solution(final_solution) + + span.update( + metadata={ + "consensus_reached": True, + "method": "llm_moderator", + "final_solution": final_solution_text[:100], + } + ) + return + # No consensus, start next round + next_round = round_num + 1 + self._current_round = next_round + + # Send revision request with flattened task data + stored_task = self._tasks # Get the original task + + await self.publish_message( + AgentRevisionRequest( + task_id=stored_task.task_id, + problem=stored_task.problem, + capability_name=stored_task.capability_name, + other_solutions=[ + { + "agent_id": sol.agent_id, + "task_id": sol.task_id, + "thought": sol.thought, + "final_answer": sol.final_answer, + "numerical_answer": sol.numerical_answer, + "round_number": str(sol.round_number), + } + for sol in solutions + ], + round_number=next_round, + ), + topic_id=DefaultTopicId(), + ) + + span.update( + metadata={ + "consensus_reached": 
False, + "next_round_started": next_round, + } + ) + else: + # Max rounds reached, no consensus + final_solution = FinalSolution( + task_id=task_id, + capability_name=self._tasks.capability_name, + problem=self._tasks.problem, + solution="No consensus reached", + numerical_answer="null", + reasoning=f"Maximum rounds ({self._max_rounds}) reached without consensus", + consensus_reached=False, + total_rounds=round_num, + all_solutions=self._get_all_solutions(), + ) + + self._final_solutions = final_solution + await self._save_final_solution(final_solution) + + span.update( + metadata={ + "consensus_reached": False, + "max_rounds_reached": True, + } + ) + + except Exception as e: + error_msg = f"Error checking consensus for task {task_id}: {str(e)}" + log.error(error_msg) + log.error(traceback.format_exc()) + span.update(metadata={"error": error_msg}) + + def _get_all_solutions(self) -> List[Dict[str, str]]: + return [ + sol.to_dict() for sols in self._solutions_buffer.values() for sol in sols + ] + + async def _save_final_solution(self, final_solution: FinalSolution) -> None: + """Save the final solution to a file.""" + try: + self._output_dir.mkdir(parents=True, exist_ok=True) + output_file = ( + self._output_dir / f"task_{final_solution.task_id}_solution.json" + ) + + solution_data = { + "task_id": final_solution.task_id, + "capability_name": final_solution.capability_name, + "problem": final_solution.problem, + "solution": final_solution.solution, + "numerical_answer": final_solution.numerical_answer, + "reasoning": final_solution.reasoning, + "consensus_reached": final_solution.consensus_reached, + "total_rounds": final_solution.total_rounds, + "all_solutions": [ + { + "agent_id": sol["agent_id"], + "task_id": sol["task_id"], + "thought": sol["thought"], + "final_answer": sol["final_answer"], + "numerical_answer": sol["numerical_answer"], + "round_number": sol["round_number"], + } + for sol in final_solution.all_solutions + ], + } + + with open(output_file, "w") 
as f: + json.dump(solution_data, f, indent=2) + + log.info( + f"Saved final solution for task {final_solution.task_id} to {output_file}" + ) + + except Exception as e: + log.error( + f"Error saving final solution for task {final_solution.task_id}: {str(e)}" + ) + log.error(traceback.format_exc()) diff --git a/src/task_solving/scientist.py b/src/task_solver/scientist.py similarity index 53% rename from src/task_solving/scientist.py rename to src/task_solver/scientist.py index c493625..957617f 100644 --- a/src/task_solving/scientist.py +++ b/src/task_solver/scientist.py @@ -1,7 +1,6 @@ -"""Task solver agent for solving tasks through debate.""" +"""Task solver scientist for solving tasks through debate.""" import logging -import re import traceback from autogen_core import ( @@ -18,7 +17,7 @@ ) from langfuse import Langfuse -from src.task_solving.messages import ( +from src.task_solver.messages import ( AgentRevisionRequest, AgentSolution, TaskSolutionRequest, @@ -28,76 +27,90 @@ TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT, TASK_SOLVER_SYSTEM_MESSAGE, ) +from src.utils.json_utils import parse_llm_json_response -log = logging.getLogger("task_solving.solver") +log = logging.getLogger("task_solver.scientist") @default_subscription -class TaskSolvingScientist(RoutedAgent): +class TaskSolverScientist(RoutedAgent): """A scientist that solves tasks through debate.""" def __init__( self, model_client: ChatCompletionClient, scientist_id: str, - langfuse_client: Langfuse = None, + langfuse_client: Langfuse, ) -> None: - super().__init__(f"Task Solving Scientist {scientist_id}") + super().__init__(f"Task Solver Scientist {scientist_id}") self._model_client = model_client self._scientist_id = scientist_id self._langfuse_client = langfuse_client - def _extract_solution_components(self, response: str) -> tuple[str, str]: - """Extract thought and final answer from the response.""" - thought_match = re.search(r"THOUGHT:\s*(.*?)(?=FINAL ANSWER:|$)", response, re.DOTALL | re.IGNORECASE) -
answer_match = re.search(r"FINAL ANSWER:\s*(.*?)$", response, re.DOTALL | re.IGNORECASE) - - thought = thought_match.group(1).strip() if thought_match else response.strip() - final_answer = answer_match.group(1).strip() if answer_match else "No clear answer provided" - - return thought, final_answer + def _extract_solution_components(self, response: str) -> tuple[str, str, str]: + """Extract thought, final answer, and numerical answer from JSON response.""" + try: + parsed = parse_llm_json_response(response) + thought = parsed.get("thought", response.strip()) + final_answer = parsed.get("final_answer", "No clear answer provided") + numerical_answer = parsed.get("numerical_answer") + + # Convert numerical_answer to string representation + if numerical_answer is not None: + numerical_answer = str(numerical_answer) + else: + numerical_answer = "null" + + return thought, final_answer, numerical_answer + + except Exception as e: + msg = f"Failed to parse JSON response: {e} \n Response: {response}" + log.error(msg) + log.error(traceback.format_exc()) + raise @message_handler async def handle_task_solution_request( self, message: TaskSolutionRequest, ctx: MessageContext ) -> None: - """Handle initial task solution request (Round 1).""" + """Handle initial task solution request.""" with self._langfuse_client.start_as_current_span( - name=f"scientist_{self._scientist_id}_round_1" + name=f"scientist_{self._scientist_id}_initial_solution_request" ) as span: try: - task_text = message.task.task_content.get("task", "") - - msg = f"Scientist {self._scientist_id} handling initial solution request for task: {message.task.task_id}" + msg = f"Scientist {self._scientist_id} handling initial solution request for task: {message.task_id}, capability: {message.capability_name} round: {message.round_number}" log.info(msg) span.update( metadata={ "solution_request_received": msg, "scientist_id": self._scientist_id, - "task_id": message.task.task_id, + "task_id": message.task_id, + 
"capability": message.capability_name, "round": message.round_number, } ) - prompt = TASK_SOLVER_ROUND_1_PROMPT.format(problem_text=task_text) - + prompt = TASK_SOLVER_ROUND_1_PROMPT.format(problem_text=message.problem) + system_message = SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE) user_message = UserMessage(content=prompt, source="user") response = await self._model_client.create( - messages=[system_message, user_message], - cancellation_token=ctx.cancellation_token, + [system_message, user_message] ) - response_content = response.content - thought, final_answer = self._extract_solution_components(response_content) + response_content = str(response.content) + thought, final_answer, numerical_answer = ( + self._extract_solution_components(response_content) + ) solution = AgentSolution( agent_id=self._scientist_id, - task_id=message.task.task_id, + task_id=message.task_id, thought=thought, final_answer=final_answer, + numerical_answer=numerical_answer, round_number=message.round_number, ) @@ -105,16 +118,15 @@ async def handle_task_solution_request( span.update( metadata={ - "solution_generated": f"Scientist {self._scientist_id} generated solution for task {message.task.task_id}", - "final_answer": final_answer[:100], # Truncate for logging + "solution_generated": f"Scientist {self._scientist_id} generated solution for task {message.task_id}, capability: {message.capability_name} round: {message.round_number}", } ) except Exception as e: - error_msg = f"Error in scientist {self._scientist_id} round 1: {str(e)}" - log.error(error_msg) + msg = f"Error in scientist {self._scientist_id} task solution request: {str(e)}" + log.error(msg) log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) + span.update(metadata={"error": msg}) @message_handler async def handle_agent_revision_request( @@ -125,48 +137,50 @@ async def handle_agent_revision_request( name=f"scientist_{self._scientist_id}_round_{message.round_number}" ) as span: try: - task_text 
= message.task.task_content.get("task", "") - - msg = f"Scientist {self._scientist_id} handling revision request for task: {message.task.task_id}, round: {message.round_number}" + msg = f"Scientist {self._scientist_id} handling revision request for task: {message.task_id}, capability: {message.capability_name} round: {message.round_number}" log.info(msg) span.update( metadata={ "revision_request_received": msg, "scientist_id": self._scientist_id, - "task_id": message.task.task_id, + "task_id": message.task_id, "round": message.round_number, "num_other_solutions": len(message.other_solutions), } ) # Format other scientists' solutions - other_solutions_text = "\n\n".join([ - f"Scientist {sol.agent_id}: Reasoning: {sol.thought}, Final solution: {sol.final_answer}" - for sol in message.other_solutions - if sol.agent_id != self._scientist_id # Don't include our own solution - ]) + other_solutions_text = "\n\n".join( + [ + f"Scientist {sol['agent_id']}: Reasoning: {sol['thought']}, Final solution: {sol['final_answer']}" + for sol in message.other_solutions + if sol["agent_id"] + != self._scientist_id # Don't include its own solution + ] + ) prompt = TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT.format( - other_solutions=other_solutions_text, - problem_text=task_text + other_solutions=other_solutions_text, problem_text=message.problem ) - + system_message = SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE) user_message = UserMessage(content=prompt, source="user") response = await self._model_client.create( - messages=[system_message, user_message], - cancellation_token=ctx.cancellation_token, + [system_message, user_message] ) - response_content = response.content - thought, final_answer = self._extract_solution_components(response_content) + response_content = str(response.content) + thought, final_answer, numerical_answer = ( + self._extract_solution_components(response_content) + ) solution = AgentSolution( agent_id=self._scientist_id, - task_id=message.task.task_id, + 
task_id=message.task_id, thought=thought, final_answer=final_answer, + numerical_answer=numerical_answer, round_number=message.round_number, ) @@ -174,13 +188,12 @@ async def handle_agent_revision_request( span.update( metadata={ - "revision_generated": f"Scientist {self._scientist_id} generated revision for task {message.task.task_id}", - "final_answer": final_answer[:100], # Truncate for logging + "revision_generated": f"Scientist {self._scientist_id} generated revision for task {message.task_id}, capability: {message.capability_name}, round: {message.round_number}", } ) except Exception as e: - error_msg = f"Error in scientist {self._scientist_id} round {message.round_number}: {str(e)}" - log.error(error_msg) + msg = f"Error in scientist {self._scientist_id} agent revision request: {str(e)}" + log.error(msg) log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) \ No newline at end of file + span.update(metadata={"error": msg}) diff --git a/src/task_solving/__init__.py b/src/task_solving/__init__.py deleted file mode 100644 index 51e8634..0000000 --- a/src/task_solving/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Task solving module with debate-based approach.""" - -from .generator import solve_tasks_with_debate, load_tasks_from_file -from .messages import Task, TaskSolutionRequest, AgentSolution, FinalSolution -from .moderator import TaskSolvingModerator -from .scientist import TaskSolvingScientist - -__all__ = [ - "solve_tasks_with_debate", - "load_tasks_from_file", - "Task", - "TaskSolutionRequest", - "AgentSolution", - "FinalSolution", - "TaskSolvingModerator", - "TaskSolvingScientist", -] \ No newline at end of file diff --git a/src/task_solving/generator.py b/src/task_solving/generator.py deleted file mode 100644 index 26f05b6..0000000 --- a/src/task_solving/generator.py +++ /dev/null @@ -1,225 +0,0 @@ -"""Main task solving orchestration function.""" - -import json -import logging -import traceback -from datetime import datetime 
-from pathlib import Path -from typing import Dict, List - -from autogen_core import ( - EVENT_LOGGER_NAME, - ROOT_LOGGER_NAME, - TRACE_LOGGER_NAME, - DefaultTopicId, - SingleThreadedAgentRuntime, -) -from langfuse import Langfuse -from omegaconf import DictConfig - -from src.task_solving.messages import Task -from src.task_solving.moderator import TaskSolvingModerator -from src.task_solving.scientist import TaskSolvingScientist -from src.utils.model_client_utils import get_model_client - - -log = logging.getLogger("task_solving.generator") -logging.getLogger(ROOT_LOGGER_NAME).setLevel(logging.WARNING) -logging.getLogger(TRACE_LOGGER_NAME).setLevel(logging.WARNING) -logging.getLogger(EVENT_LOGGER_NAME).setLevel(logging.WARNING) - - -async def solve_tasks_with_debate( - cfg: DictConfig, - tasks: List[Dict], - langfuse_client: Langfuse = None -) -> Dict[str, Dict]: - """ - Solve tasks using multi-agent debate system. - - Args: - cfg: Configuration containing debate and model settings - tasks: List of tasks to solve, each containing task_id, task content, and capability_id - langfuse_client: Langfuse client for tracing - - Returns: - Dictionary mapping task_id to final solution data - """ - domain_name = cfg.global_cfg.domain - exp_id = cfg.exp_cfg.exp_id - max_rounds = cfg.debate_cfg.max_round - num_solvers = 2 # scientist_a and scientist_b - solutions_tag = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}" - - with langfuse_client.start_as_current_span( - name=f"ace_task_solving:{domain_name}:{exp_id}:{solutions_tag}" - ) as span: - try: - msg = f"Solutions will be saved with tag: {solutions_tag}" - log.info(msg) - span.update( - metadata={ - "solving_started": msg, - "solutions_tag": solutions_tag, - "domain": domain_name, - "exp_id": exp_id, - "num_tasks": len(tasks), - "num_solvers": num_solvers, - "max_rounds": max_rounds, - } - ) - - # Create output directory - output_dir = Path(cfg.global_cfg.output_dir) / "task_solutions" / 
f"{domain_name}_{exp_id}{solutions_tag}" - output_dir.mkdir(parents=True, exist_ok=True) - - # Set up runtime - runtime = SingleThreadedAgentRuntime() - - # Create model clients for each agent - scientist_a_client = get_model_client( - cfg.agents.scientist_a.model_name, - seed=cfg.agents.scientist_a.get("seed") - ) - scientist_b_client = get_model_client( - cfg.agents.scientist_b.model_name, - seed=cfg.agents.scientist_b.get("seed") - ) - moderator_client = get_model_client( - cfg.agents.moderator.model_name, - seed=cfg.agents.moderator.get("seed") - ) - - # Register moderator - moderator_agent_type = await TaskSolvingModerator.register( - runtime, - "task_solving_moderator", - lambda: TaskSolvingModerator( - model_client=moderator_client, - num_solvers=num_solvers, - max_rounds=max_rounds, - output_dir=output_dir, - langfuse_client=langfuse_client, - ), - ) - - # Register scientist agents - scientist_a_type = await TaskSolvingScientist.register( - runtime, - "task_scientist_a", - lambda: TaskSolvingScientist( - model_client=scientist_a_client, - scientist_id="scientist_a", - langfuse_client=langfuse_client, - ), - ) - - scientist_b_type = await TaskSolvingScientist.register( - runtime, - "task_scientist_b", - lambda: TaskSolvingScientist( - model_client=scientist_b_client, - scientist_id="scientist_b", - langfuse_client=langfuse_client, - ), - ) - - # Start runtime - runtime.start() - - log.info(f"Starting task solving for {len(tasks)} tasks with {num_solvers} scientists") - - # Process each task - for i, (task_id, task_data) in enumerate(tasks.items()): - # Handle both old and new task formats - if isinstance(task_data, dict) and "task" in task_data: - # New format: {"task": "problem text", "capability_id": "cap_name"} - capability_id = task_data.get("capability_id", "unknown") - task_content = task_data - else: - # Old format or other formats - capability_id = task_data.get("capability_id", "unknown") if isinstance(task_data, dict) else "unknown" - task_content 
= {"task": str(task_data)} if not isinstance(task_data, dict) else task_data - - # Create task message - task = Task( - task_id=task_id, - task_content=task_content, - capability_id=capability_id, - ) - - # Send task to moderator - await runtime.publish_message( - task, - topic_id=DefaultTopicId() - ) - - log.info(f"Submitted task {task_id} for solving") - - # Wait for all tasks to complete - # Note: In a real implementation, you might want to add a timeout - # and check for completion status - await runtime.stop_when_idle() - - # Collect results - results = {} - for solution_file in output_dir.glob("task_*_solution.json"): - try: - with open(solution_file, "r") as f: - solution_data = json.load(f) - results[solution_data["task_id"]] = solution_data - except Exception as e: - log.error(f"Error loading solution from {solution_file}: {e}") - - log.info(f"Task solving completed. Processed {len(results)} tasks.") - - span.update( - metadata={ - "solving_completed": f"Processed {len(results)} tasks", - "output_dir": str(output_dir), - "results_count": len(results), - } - ) - - return results - - except Exception as e: - error_msg = f"Error in task solving: {str(e)}" - log.error(error_msg) - log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) - raise - - -def load_tasks_from_file(tasks_file: Path) -> List[Dict]: - """ - Load tasks from a JSON file. 
- - Args: - tasks_file: Path to the tasks file - - Returns: - List of task dictionaries - """ - try: - with open(tasks_file, "r") as f: - tasks_data = json.load(f) - - # Handle different task file formats - if isinstance(tasks_data, list): - # Old format: list of tasks - return {f"task_{i+1}": task for i, task in enumerate(tasks_data)} - elif isinstance(tasks_data, dict): - # If it's a dict, try to extract tasks - if "tasks" in tasks_data: - # New format: {"tasks": {"task_1": {...}, "task_2": {...}}} - return tasks_data["tasks"] - else: - # Convert dict to single task - return {"task_1": tasks_data} - else: - raise ValueError(f"Unexpected task file format: {type(tasks_data)}") - - except Exception as e: - log.error(f"Error loading tasks from {tasks_file}: {e}") - raise \ No newline at end of file diff --git a/src/task_solving/messages.py b/src/task_solving/messages.py deleted file mode 100644 index a1af9d3..0000000 --- a/src/task_solving/messages.py +++ /dev/null @@ -1,64 +0,0 @@ -"""Message types for task solving debate system.""" - -from dataclasses import dataclass -from typing import Any, Dict, List - -from autogen_core import BaseMessage - - -@dataclass -class Task(BaseMessage): - """Task to be solved.""" - - task_id: str - task_content: Dict[str, Any] - capability_id: str - - -@dataclass -class TaskSolutionRequest(BaseMessage): - """Request to solve a task.""" - - task: Task - round_number: int = 1 - - -@dataclass -class AgentSolution(BaseMessage): - """Solution proposed by an agent.""" - - agent_id: str - task_id: str - thought: str - final_answer: str - round_number: int - - -@dataclass -class AgentRevisionRequest(BaseMessage): - """Request for agent to revise solution based on other agents' solutions.""" - - task: Task - other_solutions: List[AgentSolution] - round_number: int - - -@dataclass -class ConsensusCheck(BaseMessage): - """Check if consensus has been reached.""" - - task_id: str - solutions: List[AgentSolution] - round_number: int - - -@dataclass 
-class FinalSolution(BaseMessage): - """Final solution for a task.""" - - task_id: str - solution: str - reasoning: str - consensus_reached: bool - total_rounds: int - all_solutions: List[AgentSolution] \ No newline at end of file diff --git a/src/task_solving/moderator.py b/src/task_solving/moderator.py deleted file mode 100644 index 251bfd6..0000000 --- a/src/task_solving/moderator.py +++ /dev/null @@ -1,342 +0,0 @@ -"""Task solving moderator agent for managing the debate process.""" - -import json -import logging -import re -import traceback -from pathlib import Path -from typing import Dict, List - -from autogen_core import ( - DefaultTopicId, - MessageContext, - RoutedAgent, - default_subscription, - message_handler, -) -from autogen_core.models import ( - ChatCompletionClient, - SystemMessage, - UserMessage, -) -from langfuse import Langfuse - -from src.task_solving.messages import ( - AgentRevisionRequest, - AgentSolution, - ConsensusCheck, - FinalSolution, - Task, - TaskSolutionRequest, -) -from src.utils.agentic_prompts import ( - TASK_MODERATOR_CONSENSUS_PROMPT, - TASK_MODERATOR_SYSTEM_MESSAGE, -) - - -log = logging.getLogger("task_solving.moderator") - - -@default_subscription -class TaskSolvingModerator(RoutedAgent): - """Moderator that manages task solving debate and checks for consensus.""" - - def __init__( - self, - model_client: ChatCompletionClient, - num_solvers: int, - max_rounds: int, - output_dir: Path, - langfuse_client: Langfuse = None, - ) -> None: - super().__init__("Task Solving Moderator") - self._model_client = model_client - self._num_solvers = num_solvers - self._max_rounds = max_rounds - self._output_dir = output_dir - self._langfuse_client = langfuse_client - - # Track solutions by task_id and round - self._solutions_buffer: Dict[str, Dict[int, List[AgentSolution]]] = {} - self._current_round: Dict[str, int] = {} - self._final_solutions: Dict[str, FinalSolution] = {} - - def _extract_consensus_components(self, response: str) -> 
tuple[bool, str, str]: - """Extract consensus decision, solution, and reasoning from response.""" - consensus_match = re.search(r"CONSENSUS_REACHED:\s*(true|false)", response, re.IGNORECASE) - solution_match = re.search(r"FINAL_SOLUTION:\s*(.*?)(?=REASONING:|$)", response, re.DOTALL | re.IGNORECASE) - reasoning_match = re.search(r"REASONING:\s*(.*?)$", response, re.DOTALL | re.IGNORECASE) - - consensus_reached = consensus_match.group(1).lower() == "true" if consensus_match else False - final_solution = solution_match.group(1).strip() if solution_match else "NONE" - reasoning = reasoning_match.group(1).strip() if reasoning_match else "No reasoning provided" - - return consensus_reached, final_solution, reasoning - - def _check_simple_consensus(self, solutions: List[AgentSolution]) -> tuple[bool, str]: - """Simple consensus check - if all agents have the same final answer.""" - if not solutions: - return False, "" - - # Extract final answers and normalize them - answers = [sol.final_answer.strip().lower() for sol in solutions] - - # Check if all answers are the same - if len(set(answers)) == 1: - return True, solutions[0].final_answer - - return False, "" - - @message_handler - async def handle_task(self, message: Task, ctx: MessageContext) -> None: - """Handle a task and initiate the solving process.""" - with self._langfuse_client.start_as_current_span( - name=f"moderator_handle_task_{message.task_id}" - ) as span: - try: - msg = f"Moderator received task: {message.task_id}" - log.info(msg) - span.update( - metadata={ - "task_received": msg, - "task_id": message.task_id, - "capability_id": message.capability_id, - } - ) - - # Initialize tracking for this task - self._solutions_buffer[message.task_id] = {} - self._current_round[message.task_id] = 1 - - # Send initial solution request to all solvers - await self.publish_message( - TaskSolutionRequest(task=message, round_number=1), - topic_id=DefaultTopicId(), - ) - - span.update( - metadata={"solution_request_sent": 
f"Round 1 solution request sent for task {message.task_id}"} - ) - - except Exception as e: - error_msg = f"Error handling task {message.task_id}: {str(e)}" - log.error(error_msg) - log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) - - @message_handler - async def handle_agent_solution(self, message: AgentSolution, ctx: MessageContext) -> None: - """Handle solution from an agent.""" - with self._langfuse_client.start_as_current_span( - name=f"moderator_handle_solution_{message.task_id}_round_{message.round_number}" - ) as span: - try: - task_id = message.task_id - round_num = message.round_number - - msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, round {round_num}" - log.info(msg) - span.update( - metadata={ - "solution_received": msg, - "task_id": task_id, - "agent_id": message.agent_id, - "round": round_num, - } - ) - - # Initialize round buffer if needed - if round_num not in self._solutions_buffer[task_id]: - self._solutions_buffer[task_id][round_num] = [] - - # Add solution to buffer - self._solutions_buffer[task_id][round_num].append(message) - - # Check if we have all solutions for this round - if len(self._solutions_buffer[task_id][round_num]) == self._num_solvers: - await self._check_consensus_and_proceed(task_id, round_num, ctx) - - span.update( - metadata={ - "solutions_collected": f"{len(self._solutions_buffer[task_id][round_num])}/{self._num_solvers} for round {round_num}" - } - ) - - except Exception as e: - error_msg = f"Error handling solution from agent {message.agent_id}: {str(e)}" - log.error(error_msg) - log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) - - async def _check_consensus_and_proceed(self, task_id: str, round_num: int, ctx: MessageContext) -> None: - """Check for consensus and either finalize or start next round.""" - with self._langfuse_client.start_as_current_span( - name=f"moderator_consensus_check_{task_id}_round_{round_num}" - ) as 
span: - try: - solutions = self._solutions_buffer[task_id][round_num] - - # First try simple consensus check - simple_consensus, simple_solution = self._check_simple_consensus(solutions) - - if simple_consensus: - # Simple consensus reached - final_solution = FinalSolution( - task_id=task_id, - solution=simple_solution, - reasoning="All agents provided the same answer", - consensus_reached=True, - total_rounds=round_num, - all_solutions=self._get_all_solutions_for_task(task_id), - ) - - self._final_solutions[task_id] = final_solution - await self._save_final_solution(final_solution) - - span.update( - metadata={ - "consensus_reached": True, - "method": "simple", - "final_solution": simple_solution[:100], - } - ) - return - - # If no simple consensus and we haven't reached max rounds, use LLM to check - if round_num < self._max_rounds: - # Use LLM moderator to check for consensus - task_content = "" # We need to get the original task content - # For now, let's get it from the first solution's context or we need to store it - - # Format solutions for LLM - all_solutions_text = "\n\n".join([ - f"Agent {sol.agent_id}:\nReasoning: {sol.thought}\nFinal Answer: {sol.final_answer}" - for sol in solutions - ]) - - prompt = TASK_MODERATOR_CONSENSUS_PROMPT.format( - problem_text=task_content, # We need to store this from the original task - all_solutions=all_solutions_text - ) - - system_message = SystemMessage(content=TASK_MODERATOR_SYSTEM_MESSAGE) - user_message = UserMessage(content=prompt, source="user") - - response = await self._model_client.create( - messages=[system_message, user_message], - cancellation_token=ctx.cancellation_token, - ) - - consensus_reached, final_solution_text, reasoning = self._extract_consensus_components(response.content) - - if consensus_reached: - # LLM found consensus - final_solution = FinalSolution( - task_id=task_id, - solution=final_solution_text, - reasoning=reasoning, - consensus_reached=True, - total_rounds=round_num, - 
all_solutions=self._get_all_solutions_for_task(task_id), - ) - - self._final_solutions[task_id] = final_solution - await self._save_final_solution(final_solution) - - span.update( - metadata={ - "consensus_reached": True, - "method": "llm_moderator", - "final_solution": final_solution_text[:100], - } - ) - return - else: - # No consensus, start next round - next_round = round_num + 1 - self._current_round[task_id] = next_round - - # We need the original task to send revision requests - # For now, create a placeholder task - task = Task(task_id=task_id, task_content={"task": task_content}, capability_id="") - - await self.publish_message( - AgentRevisionRequest( - task=task, - other_solutions=solutions, - round_number=next_round, - ), - topic_id=DefaultTopicId(), - ) - - span.update( - metadata={ - "consensus_reached": False, - "next_round_started": next_round, - } - ) - else: - # Max rounds reached, no consensus - final_solution = FinalSolution( - task_id=task_id, - solution="No consensus reached", - reasoning=f"Maximum rounds ({self._max_rounds}) reached without consensus", - consensus_reached=False, - total_rounds=round_num, - all_solutions=self._get_all_solutions_for_task(task_id), - ) - - self._final_solutions[task_id] = final_solution - await self._save_final_solution(final_solution) - - span.update( - metadata={ - "consensus_reached": False, - "max_rounds_reached": True, - } - ) - - except Exception as e: - error_msg = f"Error checking consensus for task {task_id}: {str(e)}" - log.error(error_msg) - log.error(traceback.format_exc()) - span.update(metadata={"error": error_msg}) - - def _get_all_solutions_for_task(self, task_id: str) -> List[AgentSolution]: - """Get all solutions for a task across all rounds.""" - all_solutions = [] - for round_solutions in self._solutions_buffer[task_id].values(): - all_solutions.extend(round_solutions) - return all_solutions - - async def _save_final_solution(self, final_solution: FinalSolution) -> None: - """Save the final 
solution to a file.""" - try: - output_file = self._output_dir / f"task_{final_solution.task_id}_solution.json" - - solution_data = { - "task_id": final_solution.task_id, - "solution": final_solution.solution, - "reasoning": final_solution.reasoning, - "consensus_reached": final_solution.consensus_reached, - "total_rounds": final_solution.total_rounds, - "all_solutions": [ - { - "agent_id": sol.agent_id, - "thought": sol.thought, - "final_answer": sol.final_answer, - "round_number": sol.round_number, - } - for sol in final_solution.all_solutions - ], - } - - with open(output_file, "w") as f: - json.dump(solution_data, f, indent=2) - - log.info(f"Saved final solution for task {final_solution.task_id} to {output_file}") - - except Exception as e: - log.error(f"Error saving final solution for task {final_solution.task_id}: {str(e)}") - log.error(traceback.format_exc()) \ No newline at end of file diff --git a/src/utils/agentic_prompts.py b/src/utils/agentic_prompts.py index a65df8f..ff3f6a7 100644 --- a/src/utils/agentic_prompts.py +++ b/src/utils/agentic_prompts.py @@ -273,25 +273,50 @@ PROBLEM: {problem_text} -Explain your reasoning step by step. Your final answer should be clearly stated at the end of your response. 
+Provide your solution in JSON format with the following structure: +- thought: Your detailed reasoning and step-by-step solution process +- final_answer: Your complete answer with explanation +- numerical_answer: The final numerical result (if applicable, otherwise null) -Respond using this format: -THOUGHT: -FINAL ANSWER: """ +Example for a math problem: +{{ + "thought": "To solve this problem, I need to...", + "final_answer": "The solution is 42 because...", + "numerical_answer": 42 +}} + +Example for a non-numerical problem: +{{ + "thought": "To approach this problem, I should consider...", + "final_answer": "The answer is that we should use method X because...", + "numerical_answer": null +}} + +Respond with valid JSON only.""" TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT = """These are the reasoning and solutions to the problem from other agents: {other_solutions} -Using the solutions from other agents as additional information, can you provide your answer to the problem? +Using the solutions from other agents as additional information, can you provide your answer to the problem? The original problem is: {problem_text} -Explain your reasoning step by step. Your final answer should be clearly stated at the end of your response. +Consider the other agents' approaches and reasoning. You may agree with them, disagree, or provide a synthesis of different approaches. -Respond using this format: -THOUGHT: -FINAL ANSWER: """ +Provide your solution in JSON format with the following structure: +- thought: Your detailed reasoning, considering other agents' solutions +- final_answer: Your complete answer with explanation +- numerical_answer: The final numerical result (if applicable, otherwise null) + +Example: +{{ + "thought": "Looking at the other solutions, Agent A used method X which is correct, but Agent B made an error in step 2. 
My approach is...", + "final_answer": "The solution is 42 because...", + "numerical_answer": 42 +}} + +Respond with valid JSON only.""" TASK_MODERATOR_SYSTEM_MESSAGE = """You are a moderator overseeing a collaborative problem-solving debate. Your role is to check for consensus among agents and determine the final solution.""" @@ -305,13 +330,19 @@ Determine if there is consensus among the agents. Consensus is reached when: 1. All agents provide the same final answer, OR 2. The majority of agents agree on the same answer with similar reasoning +3. For numerical problems, the numerical answers should match or be very close If consensus is reached, provide the agreed-upon solution. If not, indicate that another round of debate is needed. -Respond using this format: -CONSENSUS_REACHED: -FINAL_SOLUTION: -REASONING: """ +Provide your assessment in JSON format: +{{ + "consensus_reached": true/false, + "final_solution": "the agreed solution if consensus reached, otherwise null", + "numerical_answer": final_numerical_result_if_applicable_otherwise_null, + "reasoning": "explanation of your decision" +}} + +Respond with valid JSON only.""" # ============================================================================= # SYSTEM MESSAGES From d1e1812dd18baa874d08a5684e93777e3e742242 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Sun, 7 Sep 2025 02:57:53 -0400 Subject: [PATCH 09/19] ruff fix. 
--- src/agentic_capability_generator.py | 5 +---- src/task_generation/generator.py | 3 ++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/agentic_capability_generator.py b/src/agentic_capability_generator.py index 835813e..20052ff 100644 --- a/src/agentic_capability_generator.py +++ b/src/agentic_capability_generator.py @@ -4,7 +4,6 @@ import logging import os import traceback -from typing import Optional import hydra import openlit @@ -30,9 +29,7 @@ def main(cfg: DictConfig) -> None: """Run the multi-agent debate-based capability generation system.""" areas_tag = cfg.pipeline_tags.areas_tag - resume_tag: Optional[str] = getattr( - cfg.pipeline_tags, "resume_capabilities_tag", None - ) + resume_tag = getattr(cfg.pipeline_tags, "resume_capabilities_tag", None) domain_name = cfg.global_cfg.domain exp_id = cfg.exp_cfg.exp_id num_capabilities_per_area = cfg.capability_generation.num_capabilities_per_area diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py index 56094cf..9fb8aaa 100644 --- a/src/task_generation/generator.py +++ b/src/task_generation/generator.py @@ -6,6 +6,7 @@ import traceback from datetime import datetime from pathlib import Path +from typing import Optional from autogen_core import ( EVENT_LOGGER_NAME, @@ -164,7 +165,7 @@ async def generate_tasks( cfg: DictConfig, capabilities_tag: str, langfuse_client: Langfuse, - resume_tag: str, + resume_tag: Optional[str] = None, ) -> None: """Generate tasks for all capabilities.""" domain_name = cfg.global_cfg.domain From 4d237f7a3b1f059c075382e6145688416e16ca59 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Sun, 7 Sep 2025 04:09:53 -0400 Subject: [PATCH 10/19] updated saved file name for solutions. 
--- src/task_solver/moderator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/task_solver/moderator.py b/src/task_solver/moderator.py index 673baac..2789e89 100644 --- a/src/task_solver/moderator.py +++ b/src/task_solver/moderator.py @@ -403,7 +403,8 @@ async def _save_final_solution(self, final_solution: FinalSolution) -> None: try: self._output_dir.mkdir(parents=True, exist_ok=True) output_file = ( - self._output_dir / f"task_{final_solution.task_id}_solution.json" + self._output_dir + / f"{final_solution.task_id}_{final_solution.capability_name}_solution.json" ) solution_data = { From 38d825d56d38cbe002cbf85ade0c068293a0cf73 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Tue, 9 Sep 2025 00:28:13 -0400 Subject: [PATCH 11/19] added extra details to agent solution messages. --- src/task_solver/messages.py | 2 + src/task_solver/moderator.py | 95 ++++++++++++++---------------------- src/task_solver/scientist.py | 2 + src/utils/agentic_prompts.py | 14 ++++++ 4 files changed, 54 insertions(+), 59 deletions(-) diff --git a/src/task_solver/messages.py b/src/task_solver/messages.py index 5187bcc..b3c8694 100644 --- a/src/task_solver/messages.py +++ b/src/task_solver/messages.py @@ -33,6 +33,7 @@ class AgentSolution: final_answer: str numerical_answer: str round_number: int + capability_name: str def to_dict(self) -> Dict[str, str]: """Convert to dictionary.""" @@ -43,6 +44,7 @@ def to_dict(self) -> Dict[str, str]: "final_answer": self.final_answer, "numerical_answer": self.numerical_answer, "round_number": str(self.round_number), + "capability_name": self.capability_name, } diff --git a/src/task_solver/moderator.py b/src/task_solver/moderator.py index 2789e89..9f3cc62 100644 --- a/src/task_solver/moderator.py +++ b/src/task_solver/moderator.py @@ -2,7 +2,6 @@ import json import logging -import re import traceback from pathlib import Path from typing import Dict, List @@ -83,35 +82,10 @@ def _extract_consensus_components( return 
consensus_reached, final_solution, reasoning, numerical_answer except Exception as e: - # Fallback to old text parsing if JSON parsing fails - log.warning( - f"Failed to parse JSON response from moderator, falling back to text parsing: {e}" - ) - consensus_match = re.search( - r"CONSENSUS_REACHED:\s*(true|false)", response, re.IGNORECASE - ) - solution_match = re.search( - r"FINAL_SOLUTION:\s*(.*?)(?=REASONING:|$)", - response, - re.DOTALL | re.IGNORECASE, - ) - reasoning_match = re.search( - r"REASONING:\s*(.*?)$", response, re.DOTALL | re.IGNORECASE - ) - - consensus_reached = ( - consensus_match.group(1).lower() == "true" if consensus_match else False - ) - final_solution = ( - solution_match.group(1).strip() if solution_match else "NONE" - ) - reasoning = ( - reasoning_match.group(1).strip() - if reasoning_match - else "No reasoning provided" - ) - - return consensus_reached, final_solution, reasoning, "null" + msg = f"Error extracting consensus components: {e}" + log.error(msg) + log.error(traceback.format_exc()) + raise def _check_simple_consensus( self, solutions: List[AgentSolution] @@ -144,7 +118,7 @@ async def handle_task(self, message: Task, ctx: MessageContext) -> None: name=f"moderator_handle_task_{message.task_id}" ) as span: try: - msg = f"Moderator received task: {message.task_id}" + msg = f"Moderator received task: {message.task_id}, {message.capability_name} round {self._current_round}" log.info(msg) span.update( metadata={ @@ -164,14 +138,14 @@ async def handle_task(self, message: Task, ctx: MessageContext) -> None: task_id=message.task_id, problem=message.problem, capability_name=message.capability_name, - round_number=1, + round_number=self._current_round, ), topic_id=DefaultTopicId(), ) span.update( metadata={ - "solution_request_sent": f"Round 1 solution request sent for task {message.task_id}" + "solution_request_sent": f"Round {self._current_round} solution request sent for task {message.task_id}" } ) @@ -193,7 +167,7 @@ async def 
handle_agent_solution( task_id = message.task_id round_num = message.round_number - msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, round {round_num}" + msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name} round {round_num}" log.info(msg) span.update( metadata={ @@ -204,22 +178,28 @@ async def handle_agent_solution( } ) + if round_num != self._current_round: + msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name} round {round_num} but current round is {self._current_round}" + log.error(msg) + span.update(metadata={"error": msg}) + raise Exception(msg) + # Initialize round buffer if needed - if round_num not in self._solutions_buffer: - self._solutions_buffer[round_num] = [] + if self._current_round not in self._solutions_buffer: + self._solutions_buffer[self._current_round] = [] # Add solution to buffer - self._solutions_buffer[round_num].append(message) + self._solutions_buffer[self._current_round].append(message) - # Check if we have all solutions for this round - if len(self._solutions_buffer[round_num]) == self._num_solvers: - await self._check_consensus_and_proceed(task_id, round_num, ctx) + msg = f"{len(self._solutions_buffer[self._current_round])}/{self._num_solvers} solutions collected for round {self._current_round}" + log.info(msg) + span.update(metadata={"solutions_collected": msg}) - span.update( - metadata={ - "solutions_collected": f"{len(self._solutions_buffer[round_num])}/{self._num_solvers} for round {round_num}" - } - ) + if ( + len(self._solutions_buffer[self._current_round]) + == self._num_solvers + ): + await self._check_consensus_and_proceed(task_id, ctx) except Exception as e: error_msg = ( @@ -230,14 +210,14 @@ async def handle_agent_solution( span.update(metadata={"error": error_msg}) async def _check_consensus_and_proceed( - self, task_id: str, round_num: int, ctx: MessageContext + self, 
task_id: str, ctx: MessageContext ) -> None: """Check for consensus and either finalize or start next round.""" with self._langfuse_client.start_as_current_span( - name=f"moderator_consensus_check_{task_id}_round_{round_num}" + name=f"moderator_consensus_check_{task_id}_round_{self._current_round}" ) as span: try: - solutions = self._solutions_buffer[round_num] + solutions = self._solutions_buffer[self._current_round] # First try simple consensus check simple_consensus, simple_solution, simple_numerical = ( @@ -245,7 +225,6 @@ async def _check_consensus_and_proceed( ) if simple_consensus: - # Simple consensus reached final_solution = FinalSolution( task_id=task_id, capability_name=self._tasks.capability_name, @@ -254,7 +233,7 @@ async def _check_consensus_and_proceed( numerical_answer=simple_numerical, reasoning="All agents provided the same answer", consensus_reached=True, - total_rounds=round_num, + total_rounds=self._current_round, all_solutions=self._get_all_solutions(), ) @@ -270,9 +249,8 @@ async def _check_consensus_and_proceed( ) return - if round_num < self._max_rounds: - # Use LLM moderator to check for consensus - stored_task = self._tasks # Get original task + if self._current_round < self._max_rounds: + stored_task = self._tasks # Format solutions for LLM all_solutions_text = "\n\n".join( @@ -314,7 +292,7 @@ async def _check_consensus_and_proceed( numerical_answer=numerical_answer, reasoning=reasoning, consensus_reached=True, - total_rounds=round_num, + total_rounds=self._current_round, all_solutions=self._get_all_solutions(), ) @@ -330,8 +308,7 @@ async def _check_consensus_and_proceed( ) return # No consensus, start next round - next_round = round_num + 1 - self._current_round = next_round + self._current_round += 1 # Send revision request with flattened task data stored_task = self._tasks # Get the original task @@ -352,7 +329,7 @@ async def _check_consensus_and_proceed( } for sol in solutions ], - round_number=next_round, + 
round_number=self._current_round, ), topic_id=DefaultTopicId(), ) @@ -360,7 +337,7 @@ async def _check_consensus_and_proceed( span.update( metadata={ "consensus_reached": False, - "next_round_started": next_round, + "next_round_started": self._current_round, } ) else: @@ -373,7 +350,7 @@ async def _check_consensus_and_proceed( numerical_answer="null", reasoning=f"Maximum rounds ({self._max_rounds}) reached without consensus", consensus_reached=False, - total_rounds=round_num, + total_rounds=self._current_round, all_solutions=self._get_all_solutions(), ) diff --git a/src/task_solver/scientist.py b/src/task_solver/scientist.py index 957617f..8383c08 100644 --- a/src/task_solver/scientist.py +++ b/src/task_solver/scientist.py @@ -112,6 +112,7 @@ async def handle_task_solution_request( final_answer=final_answer, numerical_answer=numerical_answer, round_number=message.round_number, + capability_name=message.capability_name, ) await self.publish_message(solution, topic_id=DefaultTopicId()) @@ -182,6 +183,7 @@ async def handle_agent_revision_request( final_answer=final_answer, numerical_answer=numerical_answer, round_number=message.round_number, + capability_name=message.capability_name, ) await self.publish_message(solution, topic_id=DefaultTopicId()) diff --git a/src/utils/agentic_prompts.py b/src/utils/agentic_prompts.py index ff3f6a7..da62565 100644 --- a/src/utils/agentic_prompts.py +++ b/src/utils/agentic_prompts.py @@ -273,6 +273,13 @@ PROBLEM: {problem_text} +IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. Do not include any prefixes or prose. The JSON should be directly parseable. + +CRITICAL: When including LaTeX expressions or backslashes in your JSON strings, you must properly escape them by using double backslashes (\\\\). 
For example: +- Write \\\\(x^2\\\\) instead of \\(x^2\\) +- Write \\\\[equation\\\\] instead of \\[equation\\] +- Write \\\\times instead of \\times + Provide your solution in JSON format with the following structure: - thought: Your detailed reasoning and step-by-step solution process - final_answer: Your complete answer with explanation @@ -304,6 +311,13 @@ Consider the other agents' approaches and reasoning. You may agree with them, disagree, or provide a synthesis of different approaches. +IMPORTANT: Return your response as raw JSON only. Do not wrap it in markdown code blocks or add any formatting. Do not include any prefixes or prose. The JSON should be directly parseable. + +CRITICAL: When including LaTeX expressions or backslashes in your JSON strings, you must properly escape them by using double backslashes (\\\\). For example: +- Write \\\\(x^2\\\\) instead of \\(x^2\\) +- Write \\\\[equation\\\\] instead of \\[equation\\] +- Write \\\\times instead of \\times + Provide your solution in JSON format with the following structure: - thought: Your detailed reasoning, considering other agents' solutions - final_answer: Your complete answer with explanation From c5afb81815c26cb7d338a0385a14d9f4896ac08b Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Thu, 9 Oct 2025 12:20:13 -0400 Subject: [PATCH 12/19] fixed prompts. 
--- src/utils/agentic_prompts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/utils/agentic_prompts.py b/src/utils/agentic_prompts.py index da62565..b0dd4cd 100644 --- a/src/utils/agentic_prompts.py +++ b/src/utils/agentic_prompts.py @@ -206,10 +206,10 @@ Please return your proposal and your thoughts and reasoning in the following format: {{ - "thought": "Your reasoning and thought process about the kind of tasks you're proposing", + "thought": "Your reasoning and thought process for designing the tasks and ensuring diversity in content and difficulty of tasks", "problems": {{ - "problem_0": "TASK_TEXT_1", - "problem_1": "TASK_TEXT_2", + "problem_0": "PROBLEM_0_DESCRIPTION", + "problem_1": "PROBLEM_1_DESCRIPTION", ... }} }} @@ -285,7 +285,7 @@ - final_answer: Your complete answer with explanation - numerical_answer: The final numerical result (if applicable, otherwise null) -Example for a math problem: +Example for a numerical problem: {{ "thought": "To solve this problem, I need to...", "final_answer": "The solution is 42 because...", From 9195b93864deb282031ce977c54644f9a59ff6de Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Thu, 16 Oct 2025 18:01:58 -0400 Subject: [PATCH 13/19] fixed output dir name to include area name. 
--- src/task_generation/generator.py | 15 ++++++++++----- src/task_generation/moderator.py | 8 ++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/task_generation/generator.py b/src/task_generation/generator.py index 9fb8aaa..7ff3468 100644 --- a/src/task_generation/generator.py +++ b/src/task_generation/generator.py @@ -31,7 +31,10 @@ async def generate_tasks_for_capability( - cfg: DictConfig, capability: Capability, output_dir: Path, langfuse_client: Langfuse + cfg: DictConfig, + capability: Capability, + task_output_dir_name: Path, + langfuse_client: Langfuse, ) -> None: """Generate tasks for a single capability.""" with langfuse_client.start_as_current_span( @@ -93,7 +96,7 @@ async def generate_tasks_for_capability( num_scientists=2, num_final_problems=cfg.task_generation.num_final_problems_per_capability, buffer_param=cfg.task_generation.buffer_param, - output_dir=output_dir, + output_dir=task_output_dir_name, domain=domain_name, langfuse_client=langfuse_client, max_round=cfg.task_generation.max_rounds, @@ -324,9 +327,11 @@ async def generate_tasks( # Process each capability individually for i, capability in enumerate(capabilities): capability_dir_name = capability.name.replace(" ", "_") - + area_dir_name = capability.area.replace(" ", "_").lower() + task_output_dir_name = f"[{area_dir_name}]-[{capability_dir_name}]" + tasks_output_dir = output_dir / task_output_dir_name # Skip if tasks already exist for this capability - if resume_tag and capability_dir_name in existing_tasks: + if resume_tag and task_output_dir_name in existing_tasks: msg = f"Skipping capability {i + 1}/{len(capabilities)}: {capability.name} (already exists)" log.info(msg) span.update( @@ -350,7 +355,7 @@ async def generate_tasks( ) await generate_tasks_for_capability( - cfg, capability, output_dir, langfuse_client + cfg, capability, tasks_output_dir, langfuse_client ) msg = f"Completed capability {i + 1}/{len(capabilities)}: {capability.name}" diff --git 
a/src/task_generation/moderator.py b/src/task_generation/moderator.py index 16e6193..3d9cf6e 100644 --- a/src/task_generation/moderator.py +++ b/src/task_generation/moderator.py @@ -301,6 +301,7 @@ async def _finalize_tasks_without_solutions(self) -> None: final_tasks[task_id] = { "task": problem_text, "capability_id": self._capability.name, + "area_id": self._capability.area, } # Save final tasks @@ -317,12 +318,11 @@ async def _finalize_tasks_without_solutions(self) -> None: async def _save_tasks_to_file(self, tasks: Dict[str, Dict[str, str]]) -> None: """Save final tasks to file.""" try: - # Create capability directory - capability_dir = self._output_dir / self._capability.name - capability_dir.mkdir(parents=True, exist_ok=True) + # Create task output directory + self._output_dir.mkdir(parents=True, exist_ok=True) # Save tasks - tasks_file = capability_dir / "tasks.json" + tasks_file = self._output_dir / "tasks.json" with open(tasks_file, "w", encoding="utf-8") as f: json.dump({"tasks": tasks}, f, indent=2, ensure_ascii=False) From 57d2d2a87c2b9c16efca79ff9d950c3088fc740c Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 17 Oct 2025 14:16:38 -0400 Subject: [PATCH 14/19] fixed task solver output dir name. 
--- src/task_solver/generator.py | 69 ++++++++++-------- src/task_solver/messages.py | 6 ++ src/task_solver/moderator.py | 18 +++-- src/task_solver/scientist.py | 132 +++++++++++++++++++++++++++-------- 4 files changed, 161 insertions(+), 64 deletions(-) diff --git a/src/task_solver/generator.py b/src/task_solver/generator.py index 85d12d5..0165c8b 100644 --- a/src/task_solver/generator.py +++ b/src/task_solver/generator.py @@ -36,12 +36,13 @@ async def solve_task( max_rounds = cfg.task_solver.max_rounds task_id = task.task_id capability_name = task.capability_name + area_name = task.area_name with langfuse_client.start_as_current_span( - name=f"task_solver_for_task:{task_id}, capability:{capability_name}" + name=f"task_solver_for_task:{task_id}, capability:{capability_name}, area: {area_name}" ) as span: try: - msg = f"Generating solutions for task: {task_id}, capability: {capability_name}" + msg = f"Generating solutions for task: {task_id}, capability: {capability_name}, area: {area_name}" log.info(msg) span.update( metadata={ @@ -49,6 +50,7 @@ async def solve_task( "task_id": task_id, "problem": task.problem, "capability_name": capability_name, + "area_name": area_name, } ) @@ -110,25 +112,24 @@ async def solve_task( await runtime.publish_message(task, DefaultTopicId()) - msg = f"Task message published: {task_id}, capability: {capability_name}" + msg = f"Task message published: {task_id}, capability: {capability_name}, area: {area_name}" log.info(msg) span.update( metadata={ "task_published": msg, "task_id": task_id, "capability_name": capability_name, + "area_name": area_name, } ) try: await runtime.stop_when_idle() - msg = ( - f"Completed solving task: {task_id}, capability: {capability_name}" - ) + msg = f"Completed solving task: {task_id}, capability: {capability_name}, area: {area_name}" log.info(msg) span.update(metadata={"runtime_completed": msg}) except Exception as e: - msg = f"Error while solving task {task_id}, capability: {capability_name}: {e}" + 
msg = f"Error while solving task {task_id}, capability: {capability_name}, area: {area_name}: {e}" log.error(msg) span.update( level="ERROR", @@ -138,6 +139,7 @@ async def solve_task( "error": str(e), "task_id": task_id, "capability_name": capability_name, + "area_name": {area_name}, }, ) raise @@ -208,7 +210,6 @@ async def solve_tasks( log.error(error_msg) span.update( level="ERROR", - status_message="Capabilities directory not found", metadata={ "directory_not_found_error": error_msg, "tasks_dir": str(tasks_dir), @@ -216,27 +217,37 @@ async def solve_tasks( ) raise FileNotFoundError(error_msg) - for capability_dir in tasks_dir.iterdir(): - if capability_dir.is_dir(): - # Check if the last part of capability_dir exists in output_dir - output_solver_dir = Path(output_dir) / capability_dir.name - if output_solver_dir.exists(): - msg = f"Solutions for tasks under capability {capability_dir.name} already exist: {output_solver_dir}" - log.info(msg) - span.update(metadata={"task_solver_skipped": msg}) - continue - - tasks_file = capability_dir / "tasks.json" - if tasks_file.exists(): - with open(tasks_file, "r", encoding="utf-8") as f: - tasks = json.load(f)["tasks"] - for task_id, task_data in tasks.items(): - task = Task( - task_id=task_id, - problem=task_data["task"], - capability_name=task_data["capability_id"], - ) - await solve_task(cfg, task, output_dir, langfuse_client) + for per_area_capability_dir in tasks_dir.iterdir(): + tasks_file = per_area_capability_dir / "tasks.json" + + if not tasks_file.exists(): + msg = f"Tasks file not found: {tasks_file}" + log.error(msg) + span.update(metadata={"warning": msg}) + continue + + with open(tasks_file, "r", encoding="utf-8") as f: + tasks = json.load(f)["tasks"] + output_solver_dir = Path(output_dir) / per_area_capability_dir.name + + for task_id, task_data in tasks.items(): + if ( + output_solver_dir.exists() + and f"{task_id}_solution.json" + in list(output_solver_dir.iterdir()) + ): + msg = f"Task {task_id} already 
solved" + log.info(msg) + span.update(metadata={"task_solver_skipped": msg}) + continue + + task = Task( + task_id=task_id, + problem=task_data["task"], + capability_name=task_data["capability_id"], + area_name=task_data["area_id"], + ) + await solve_task(cfg, task, output_solver_dir, langfuse_client) except Exception as e: error_msg = f"Error in task solver: {str(e)}" diff --git a/src/task_solver/messages.py b/src/task_solver/messages.py index b3c8694..36c196e 100644 --- a/src/task_solver/messages.py +++ b/src/task_solver/messages.py @@ -11,6 +11,7 @@ class Task: task_id: str problem: str capability_name: str + area_name: str @dataclass @@ -20,6 +21,7 @@ class TaskSolutionRequest: task_id: str problem: str capability_name: str + area_name: str round_number: int = 1 @@ -34,6 +36,7 @@ class AgentSolution: numerical_answer: str round_number: int capability_name: str + area_name: str def to_dict(self) -> Dict[str, str]: """Convert to dictionary.""" @@ -45,6 +48,7 @@ def to_dict(self) -> Dict[str, str]: "numerical_answer": self.numerical_answer, "round_number": str(self.round_number), "capability_name": self.capability_name, + "area_name": self.area_name, } @@ -55,6 +59,7 @@ class AgentRevisionRequest: task_id: str problem: str capability_name: str + area_name: str other_solutions: List[Dict[str, str]] round_number: int @@ -74,6 +79,7 @@ class FinalSolution: task_id: str capability_name: str + area_name: str problem: str solution: str numerical_answer: str diff --git a/src/task_solver/moderator.py b/src/task_solver/moderator.py index 9f3cc62..c46ab1c 100644 --- a/src/task_solver/moderator.py +++ b/src/task_solver/moderator.py @@ -91,7 +91,7 @@ def _check_simple_consensus( self, solutions: List[AgentSolution] ) -> tuple[bool, str, str]: """Check consensus; if all agents have the same final answer.""" - if not solutions: + if not solutions or len(solutions) < self._num_solvers: return False, "", "null" # First check numerical answers if they exist @@ -125,6 +125,7 @@ 
async def handle_task(self, message: Task, ctx: MessageContext) -> None: "task_received": msg, "task_id": message.task_id, "capability_name": message.capability_name, + "area_name": message.area_name, } ) @@ -138,6 +139,7 @@ async def handle_task(self, message: Task, ctx: MessageContext) -> None: task_id=message.task_id, problem=message.problem, capability_name=message.capability_name, + area_name=message.area_name, round_number=self._current_round, ), topic_id=DefaultTopicId(), @@ -167,7 +169,7 @@ async def handle_agent_solution( task_id = message.task_id round_num = message.round_number - msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name} round {round_num}" + msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name}, {message.area_name} round {round_num}" log.info(msg) span.update( metadata={ @@ -179,7 +181,7 @@ async def handle_agent_solution( ) if round_num != self._current_round: - msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name} round {round_num} but current round is {self._current_round}" + msg = f"Moderator received solution from agent {message.agent_id} for task {task_id}, {message.capability_name}, {message.area_name} round {round_num} but current round is {self._current_round}" log.error(msg) span.update(metadata={"error": msg}) raise Exception(msg) @@ -228,6 +230,7 @@ async def _check_consensus_and_proceed( final_solution = FinalSolution( task_id=task_id, capability_name=self._tasks.capability_name, + area_name=self._tasks.area_name, problem=self._tasks.problem, solution=simple_solution, numerical_answer=simple_numerical, @@ -287,6 +290,7 @@ async def _check_consensus_and_proceed( final_solution = FinalSolution( task_id=task_id, capability_name=self._tasks.capability_name, + area_name=self._tasks.area_name, problem=self._tasks.problem, solution=final_solution_text, 
numerical_answer=numerical_answer, @@ -318,6 +322,7 @@ async def _check_consensus_and_proceed( task_id=stored_task.task_id, problem=stored_task.problem, capability_name=stored_task.capability_name, + area_name=stored_task.area_name, other_solutions=[ { "agent_id": sol.agent_id, @@ -345,6 +350,7 @@ async def _check_consensus_and_proceed( final_solution = FinalSolution( task_id=task_id, capability_name=self._tasks.capability_name, + area_name=self._tasks.area_name, problem=self._tasks.problem, solution="No consensus reached", numerical_answer="null", @@ -379,14 +385,12 @@ async def _save_final_solution(self, final_solution: FinalSolution) -> None: """Save the final solution to a file.""" try: self._output_dir.mkdir(parents=True, exist_ok=True) - output_file = ( - self._output_dir - / f"{final_solution.task_id}_{final_solution.capability_name}_solution.json" - ) + output_file = self._output_dir / f"{final_solution.task_id}_solution.json" solution_data = { "task_id": final_solution.task_id, "capability_name": final_solution.capability_name, + "area_name": final_solution.area_name, "problem": final_solution.problem, "solution": final_solution.solution, "numerical_answer": final_solution.numerical_answer, diff --git a/src/task_solver/scientist.py b/src/task_solver/scientist.py index 8383c08..262f255 100644 --- a/src/task_solver/scientist.py +++ b/src/task_solver/scientist.py @@ -1,5 +1,6 @@ """Task solver agent for solver tasks through debate.""" +import json import logging import traceback @@ -32,6 +33,8 @@ log = logging.getLogger("task_solver.scientist") +MAX_MODEL_ATTEMPTS = 3 + @default_subscription class TaskSolverScientist(RoutedAgent): @@ -52,11 +55,21 @@ def _extract_solution_components(self, response: str) -> tuple[str, str, str]: """Extract thought, final answer, and numerical answer from JSON response.""" try: parsed = parse_llm_json_response(response) - thought = parsed.get("thought", response.strip()) - final_answer = parsed.get("final_answer", "No clear 
answer provided") + thought_raw = parsed.get("thought", response.strip()) + final_answer_raw = parsed.get("final_answer", "No clear answer provided") numerical_answer = parsed.get("numerical_answer") - # Convert numerical_answer to string representation + thought = ( + json.dumps(thought_raw, ensure_ascii=False) + if isinstance(thought_raw, (dict, list)) + else str(thought_raw).strip() + ) + final_answer = ( + json.dumps(final_answer_raw, ensure_ascii=False, indent=2) + if isinstance(final_answer_raw, (dict, list)) + else str(final_answer_raw).strip() + ) + if numerical_answer is not None: numerical_answer = str(numerical_answer) else: @@ -70,6 +83,54 @@ def _extract_solution_components(self, response: str) -> tuple[str, str, str]: log.error(traceback.format_exc()) raise + async def _generate_solution_payload( + self, system_message: SystemMessage, user_message: UserMessage + ) -> tuple[str, str, str]: + """Call the model with retries until valid JSON is returned.""" + last_error: Exception | None = None + for attempt in range(1, MAX_MODEL_ATTEMPTS + 1): + try: + response = await self._model_client.create( + [system_message, user_message], + json_output=True, + ) + except Exception as exc: # pragma: no cover - network/SDK errors + last_error = exc + log.warning( + "Scientist %s failed to get response on attempt %d: %s", + self._scientist_id, + attempt, + exc, + ) + continue + + response_content = str(getattr(response, "content", "") or "").strip() + if not response_content: + last_error = ValueError("Empty response content") + log.warning( + "Scientist %s received empty response on attempt %d", + self._scientist_id, + attempt, + ) + continue + + try: + return self._extract_solution_components(response_content) + except Exception as exc: + last_error = exc + log.warning( + "Scientist %s failed to parse model response on attempt %d: %s", + self._scientist_id, + attempt, + exc, + ) + continue + + raise RuntimeError( + f"Scientist {self._scientist_id} could not obtain 
valid JSON " + f"after {MAX_MODEL_ATTEMPTS} attempts" + ) from last_error + @message_handler async def handle_task_solution_request( self, message: TaskSolutionRequest, ctx: MessageContext @@ -79,7 +140,11 @@ async def handle_task_solution_request( name=f"scientist_{self._scientist_id}_initial_solution_request" ) as span: try: - msg = f"Scientist {self._scientist_id} handling initial solution request for task: {message.task_id}, capability: {message.capability_name} round: {message.round_number}" + msg = ( + f"Scientist {self._scientist_id} handling initial solution request " + f"for task: {message.task_id}, capability: {message.capability_name}, area: {message.area_name}" + f"round: {message.round_number}" + ) log.info(msg) span.update( metadata={ @@ -87,6 +152,7 @@ async def handle_task_solution_request( "scientist_id": self._scientist_id, "task_id": message.task_id, "capability": message.capability_name, + "area": message.area_name, "round": message.round_number, } ) @@ -96,14 +162,11 @@ async def handle_task_solution_request( system_message = SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE) user_message = UserMessage(content=prompt, source="user") - response = await self._model_client.create( - [system_message, user_message] - ) - - response_content = str(response.content) - thought, final_answer, numerical_answer = ( - self._extract_solution_components(response_content) - ) + ( + thought, + final_answer, + numerical_answer, + ) = await self._generate_solution_payload(system_message, user_message) solution = AgentSolution( agent_id=self._scientist_id, @@ -113,13 +176,18 @@ async def handle_task_solution_request( numerical_answer=numerical_answer, round_number=message.round_number, capability_name=message.capability_name, + area_name=message.area_name, ) await self.publish_message(solution, topic_id=DefaultTopicId()) span.update( metadata={ - "solution_generated": f"Scientist {self._scientist_id} generated solution for task {message.task_id}, capability: 
{message.capability_name} round: {message.round_number}", + "solution_generated": ( + f"Scientist {self._scientist_id} generated solution for task " + f"{message.task_id}, capability: {message.capability_name}, area: {message.area_name}" + f"round: {message.round_number}" + ), } ) @@ -138,7 +206,11 @@ async def handle_agent_revision_request( name=f"scientist_{self._scientist_id}_round_{message.round_number}" ) as span: try: - msg = f"Scientist {self._scientist_id} handling revision request for task: {message.task_id}, capability: {message.capability_name} round: {message.round_number}" + msg = ( + f"Scientist {self._scientist_id} handling revision request for task: " + f"{message.task_id}, capability: {message.capability_name}, area: {message.area_name}" + f"round: {message.round_number}" + ) log.info(msg) span.update( metadata={ @@ -150,31 +222,30 @@ async def handle_agent_revision_request( } ) - # Format other scientists' solutions other_solutions_text = "\n\n".join( [ - f"Scientist {sol['agent_id']}: Reasoning: {sol['thought']}, Final solution: {sol['final_answer']}" + ( + f"Scientist {sol['agent_id']}: Reasoning: {sol['thought']}, " + f"Final solution: {sol['final_answer']}" + ) for sol in message.other_solutions - if sol["agent_id"] - != self._scientist_id # Don't include its own solution + if sol["agent_id"] != self._scientist_id ] ) prompt = TASK_SOLVER_SUBSEQUENT_ROUNDS_PROMPT.format( - other_solutions=other_solutions_text, problem_text=message.problem + other_solutions=other_solutions_text, + problem_text=message.problem, ) system_message = SystemMessage(content=TASK_SOLVER_SYSTEM_MESSAGE) user_message = UserMessage(content=prompt, source="user") - response = await self._model_client.create( - [system_message, user_message] - ) - - response_content = str(response.content) - thought, final_answer, numerical_answer = ( - self._extract_solution_components(response_content) - ) + ( + thought, + final_answer, + numerical_answer, + ) = await 
self._generate_solution_payload(system_message, user_message) solution = AgentSolution( agent_id=self._scientist_id, @@ -184,13 +255,18 @@ async def handle_agent_revision_request( numerical_answer=numerical_answer, round_number=message.round_number, capability_name=message.capability_name, + area_name=message.area_name, ) await self.publish_message(solution, topic_id=DefaultTopicId()) span.update( metadata={ - "revision_generated": f"Scientist {self._scientist_id} generated revision for task {message.task_id}, capability: {message.capability_name}, round: {message.round_number}", + "revision_generated": ( + f"Scientist {self._scientist_id} generated revision for task " + f"{message.task_id}, capability: {message.capability_name}, area: {message.area_name}" + f"round: {message.round_number}" + ), } ) From 32922994378c4143c33dcfab66ef13cbbbe0962b Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 17 Oct 2025 14:19:46 -0400 Subject: [PATCH 15/19] upgraded json handling, and model call. --- src/utils/json_utils.py | 35 ++++++++++++++++++--------------- src/utils/model_client_utils.py | 2 +- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/utils/json_utils.py b/src/utils/json_utils.py index 2a57c0a..3d2fd77 100644 --- a/src/utils/json_utils.py +++ b/src/utils/json_utils.py @@ -24,35 +24,42 @@ def extract_json_from_markdown(content: str) -> str: elif content.startswith("```") and content.endswith("```"): content = content[3:-3].strip() - return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", content) + content = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", content) + + if content and not content.lstrip().startswith(("{", "[")): + brace_start = content.find("{") + brace_end = content.rfind("}") + bracket_start = content.find("[") + bracket_end = content.rfind("]") + + if brace_start != -1 and brace_end > brace_start: + content = content[brace_start : brace_end + 1].strip() + elif bracket_start != -1 and bracket_end > bracket_start: + content = content[bracket_start : 
bracket_end + 1].strip() + + return content def fix_common_json_errors(content: str) -> str: """Fix common JSON syntax errors.""" - # Fix extra equals signs (e.g., "area":="value" -> "area":"value") content = re.sub(r':\s*=\s*"', ':"', content) - - # Fix missing quotes around keys content = re.sub(r'(\w+):\s*"', r'"\1":"', content) - - # Fix trailing commas + content = re.sub(r'\\(?!["\\/bfnrtu])', r"\\\\", content) return re.sub(r",(\s*[}\]])", r"\1", content) def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: """Parse LLM JSON response.""" try: - # Ensure content is a string if not isinstance(raw_content, str): raw_content = str(raw_content) - # Clean the content first cleaned_content = extract_json_from_markdown(raw_content) - - # Fix common JSON errors cleaned_content = fix_common_json_errors(cleaned_content) - # Parse the JSON + if not cleaned_content: + raise json.JSONDecodeError("Empty JSON content", cleaned_content or "", 0) + result = json.loads(cleaned_content) return result if isinstance(result, dict) else {} @@ -60,14 +67,10 @@ def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: log.error(f"Failed to parse JSON response: {e}") log.error(f"Content length: {len(cleaned_content)} characters") - # Try to fix common JSON issues try: - # Attempt to fix unterminated strings by finding the last complete entry if "Unterminated string" in str(e): - # Find the last complete capability entry last_complete = cleaned_content.rfind('"},') if last_complete > 0: - # Truncate to last complete entry and close the JSON fixed_content = cleaned_content[: last_complete + 2] + "\n }\n}" log.warning( "Attempting to fix unterminated JSON by truncating to last complete entry" @@ -77,9 +80,9 @@ def parse_llm_json_response(raw_content: Union[str, Any]) -> Dict[str, Any]: except Exception as fix_error: log.error(f"Failed to fix JSON: {fix_error}") - # If we can't fix it, log more details and re-raise log.error(f"Raw content (last 
500 chars): {raw_content[-500:]}") raise + except Exception as e: log.error(f"Unexpected error parsing JSON: {e}") log.error(f"Raw content: {raw_content}") diff --git a/src/utils/model_client_utils.py b/src/utils/model_client_utils.py index c1fdea4..c8c2ef6 100644 --- a/src/utils/model_client_utils.py +++ b/src/utils/model_client_utils.py @@ -20,7 +20,7 @@ ) -MAX_TOKENS = 1024 * 10 +MAX_TOKENS = 1024 * 30 logger = logging.getLogger(__name__) From df4860b597902f1916340a30808508a58778b158 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 17 Oct 2025 14:58:08 -0400 Subject: [PATCH 16/19] updated readme to include latest agentic changes. --- README.md | 73 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index a88c7e3..80f157f 100644 --- a/README.md +++ b/README.md @@ -73,26 +73,77 @@ Utilize the capability and the corresponding subject LLM score to select or gene ```bash python -m src.run_lbo ``` - ### Agentic Generation Scripts -Generate areas, capabilities, and tasks using multi-agent debate systems. Configure parameters in `src/cfg/agentic_config.yaml`. +These scripts implement the multi-agent debate workflow for automated generation of areas, capabilities, tasks, and solutions. +All configurable parameters are defined in `src/cfg/agentic_config.yaml`. + +--- +#### 1. Generate Areas +Generate domain areas using the scientist–moderator debate system: ```bash -# Generate capability areas python -m src.agentic_area_generator +``` + +Output location: +``` +~////areas//areas.json +``` +Where: +- comes from `global_cfg.output_dir` +- comes from `global_cfg.domain` (spaces replaced with underscores) +- comes from `exp_cfg.exp_id` +- is the tag used for the generated areas + +#### 2. 
Generate Capabilities +Generate capabilities for each area: +```bash +python -m src.agentic_capability_generator pipeline_tags.areas_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_capabilities_tag=_YYYYMMDD_HHMMSS +``` + +**Options:** +- `pipeline_tags.areas_tag` specifies which set of areas to use when generating capabilities. +- `pipeline_tags.resume_capabilities_tag` (optional) resumes a previous capability generation run. + +**Output location:** +``` +~////capabilities///capabilities.json +``` +Where: +- is the tag used for the generated capabilities (either resumed or auto-generated) + -# Generate capabilities for each area -python -m src.agentic_capability_generator +#### 3. Generate Tasks +Generate evaluation tasks for a specific capabilities tag: +```bash +python -m src.agentic_task_generator pipeline_tags.capabilities_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_tasks_tag=_YYYYMMDD_HHMMSS +``` -# Generate tasks for each capability -python -m src.agentic_task_generator +**Options:** +- `pipeline_tags.capabilities_tag` specifies which set of capabilities to use when generating tasks. +- `pipeline_tags.resume_tasks_tag` (optional) resumes a previous task generation run. -# Generate tasks for all capabilities -python -m src.agentic_task_generator pipeline_tags.capabilities_tag=_20250902_030203 +**Output location:** +``` +~////tasks//[]-[]/tasks.json +``` +Where: +- is the tag used for the generated tasks (either resumed or auto-generated) -# Generate solutions for tasks using multi-agent debate -python -m src.agentic_task_solver pipeline_tags.tasks_tag=_20250905_153532 +#### 4. Generate Solutions +Solve generated tasks using the multi-agent debate system: +```bash +python -m src.agentic_task_solver pipeline_tags.tasks_tag=_YYYYMMDD_HHMMSS pipeline_tags.resume_solutions_tag=_YYYYMMDD_HHMMSS +``` +**Options:** +- `pipeline_tags.tasks_tag` specifies which set of tasks to solve. +- `pipeline_tags.resume_solutions_tag` (optional) resumes a previous solution generation run. 
+**Output location:** +``` +~////task_solutions//[]-[]/_solution.json ``` +Where: +- is the tag used for the generated solutions (either resumed or auto-generated) From 653179baa5f855799f8a94b4af20c4dff2fa610a Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Sun, 2 Nov 2025 16:26:36 -0500 Subject: [PATCH 17/19] task diversity study scripts added. --- experimental/diverse_task_config.yaml | 89 +++++ experimental/diverse_task_dataclasses.py | 77 +++++ experimental/diverse_task_generator.py | 266 +++++++++++++++ experimental/diverse_task_prompts.py | 416 +++++++++++++++++++++++ experimental/extract_subtopics.py | 41 +++ experimental/find_combinations.py | 57 ++++ experimental/generate_blueprints.py | 66 ++++ experimental/generate_tasks.py | 68 ++++ experimental/model_utils.py | 41 +++ experimental/verify_tasks.py | 112 ++++++ 10 files changed, 1233 insertions(+) create mode 100644 experimental/diverse_task_config.yaml create mode 100644 experimental/diverse_task_dataclasses.py create mode 100644 experimental/diverse_task_generator.py create mode 100644 experimental/diverse_task_prompts.py create mode 100644 experimental/extract_subtopics.py create mode 100644 experimental/find_combinations.py create mode 100644 experimental/generate_blueprints.py create mode 100644 experimental/generate_tasks.py create mode 100644 experimental/model_utils.py create mode 100644 experimental/verify_tasks.py diff --git a/experimental/diverse_task_config.yaml b/experimental/diverse_task_config.yaml new file mode 100644 index 0000000..d0d1329 --- /dev/null +++ b/experimental/diverse_task_config.yaml @@ -0,0 +1,89 @@ +# Configuration for Diverse Task Generator + +# Model settings +model: + name: gpt-4o # OpenAI model to use + temperature: 1.0 # Temperature for all steps + max_tokens: 8192 # Max tokens for all steps + +# Task generation settings +generation: + tasks_per_blueprint: 3 # Number of tasks to generate per blueprint + min_subtopics: 3 # Suggested minimum number of sub-topics + 
max_subtopics: 8 # Suggested maximum number of sub-topics + +# Output settings +output: + base_dir: diverse_task_outputs + save_intermediate_steps: true # Save each step's output + pretty_print_json: true # Indent JSON files + +# Input settings +input: + capability_json_path: capability.json # Default capability JSON file path + +# Bloom's Taxonomy definitions +# Source: Revised Bloom's Taxonomy (Anderson & Krathwohl, 2001) +blooms_taxonomy: + Remember: + description: "Retrieving relevant knowledge from long-term memory. Involves recognizing and recalling facts, terms, basic concepts, or answers." + keywords: ["define", "list", "identify", "recall", "name", "state"] + + Understand: + description: "Constructing meaning from instructional messages. Involves interpreting, exemplifying, classifying, summarizing, inferring, comparing, and explaining." + keywords: ["explain", "describe", "interpret", "summarize", "compare", "contrast"] + + Apply: + description: "Carrying out or using a procedure in a given situation. Involves executing or implementing a method, technique, or process." + keywords: ["apply", "use", "implement", "execute", "solve", "demonstrate"] + + Analyze: + description: "Breaking material into constituent parts and determining how parts relate to one another and to an overall structure. Involves differentiating, organizing, and attributing." + keywords: ["analyze", "differentiate", "organize", "distinguish", "examine", "compare"] + + Evaluate: + description: "Making judgments based on criteria and standards. Involves checking for internal consistency or logical fallacies, and critiquing based on external criteria." + keywords: ["evaluate", "judge", "critique", "assess", "justify", "argue"] + + Create: + description: "Putting elements together to form a novel, coherent whole or make an original product. Involves generating, planning, and producing." 
+ keywords: ["create", "design", "construct", "develop", "formulate", "generate"] + +# Difficulty level definitions +difficulty_levels: + easy: + description: "Basic, straightforward problems requiring minimal steps and fundamental knowledge." + characteristics: + - "Single concept application" + - "Direct recall or simple calculation" + - "Clear and unambiguous" + - "Minimal prerequisite knowledge" + + medium: + description: "Moderate complexity requiring multiple steps, integration of concepts, or non-trivial reasoning." + characteristics: + - "Multiple concept integration" + - "Multi-step solution required" + - "Some prerequisite knowledge needed" + - "May involve edge cases" + + hard: + description: "Complex, challenging problems requiring deep understanding, multiple concepts, edge cases, or sophisticated reasoning." + characteristics: + - "Complex multi-concept integration" + - "Multiple challenging steps" + - "Deep domain knowledge required" + - "Edge cases and exceptions" + - "May require insight or creative approach" + +# Verification criteria +verification: + pass_threshold: 0.8 # Minimum pass rate to consider successful + strict_mode: false # If true, all alignment criteria must pass + +# Example capability for quick testing +example_capability: + name: "compound_interest_calculations" + description: "The ability to calculate compound interest for various scenarios, including different compounding frequencies (annually, semi-annually, quarterly, monthly), different time periods, and understanding how changes in principal, rate, or time affect the final amount." 
+ domain: "personal_finance" + area: "investing_and_savings" diff --git a/experimental/diverse_task_dataclasses.py b/experimental/diverse_task_dataclasses.py new file mode 100644 index 0000000..b03aa15 --- /dev/null +++ b/experimental/diverse_task_dataclasses.py @@ -0,0 +1,77 @@ +"""Dataclasses for the diverse task generation pipeline.""" + +from dataclasses import dataclass, field +from typing import Dict, List, Optional + + +@dataclass +class Capability: + """Represents a capability to be tested.""" + + name: str + description: str + domain: str + area: Optional[str] = None + example_tasks: List[Dict] = field(default_factory=list) + + +@dataclass +class SubTopic: + """Represents a sub-topic within a capability.""" + + name: str + description: Optional[str] = None + + +@dataclass +class Combination: + """Represents a valid (content, difficulty, reasoning) combination.""" + + content: str + difficulty: str + reasoning: str + rationale: Optional[str] = None + + +@dataclass +class Blueprint: + """Represents a task blueprint for a specific combination.""" + + combination_id: int + subtopic: str + difficulty: str + reasoning: str + blueprint: str + key_characteristics: List[str] = field(default_factory=list) + example_question_outline: Optional[str] = None + rationale: Optional[str] = None + + +@dataclass +class Task: + """Represents a generated multiple-choice task.""" + + task_id: str + blueprint_id: int + subtopic: str + difficulty: str + reasoning: str + question: str + choices: Dict[str, str] + correct_answer: str + explanation: Optional[str] = None + alignment_notes: Optional[str] = None + + +@dataclass +class VerificationResult: + """Represents the verification result for a task.""" + + task_id: str + subtopic_aligned: bool + difficulty_aligned: bool + reasoning_aligned: bool + choices_appropriate: bool + overall_aligned: bool + feedback: str + suggested_improvements: Optional[str] = None diff --git a/experimental/diverse_task_generator.py 
b/experimental/diverse_task_generator.py new file mode 100644 index 0000000..8518a1b --- /dev/null +++ b/experimental/diverse_task_generator.py @@ -0,0 +1,266 @@ +"""Standalone script for generating diverse tasks for a single capability.""" + +import argparse +import json +import logging +import os +from dataclasses import asdict +from datetime import datetime +from functools import partial +from pathlib import Path +from typing import Any + +import yaml +from diverse_task_dataclasses import ( + Blueprint, + Capability, + Combination, + SubTopic, + Task, + VerificationResult, +) +from extract_subtopics import extract_subtopics +from find_combinations import find_valid_combinations +from generate_blueprints import generate_blueprints +from model_utils import call_model +from openai import OpenAI +from verify_tasks import verify_tasks + +from generate_tasks import generate_tasks + + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + + +class DiverseTaskGenerator: + """Generate diverse tasks for a capability using multi-dimensional approach.""" + + def __init__( + self, + capability_dict: dict, + config: dict, + ) -> None: + """Initialize the diverse task generator.""" + # Extract example tasks from capability_data if present + example_tasks = ( + capability_dict.get("capability_data", [])[:3] + if "capability_data" in capability_dict + else [] + ) + + self.capability = Capability( + name=capability_dict["capability_name"], + description=capability_dict["capability_description"], + domain=capability_dict["capability_domain"], + area=capability_dict.get("capability_area"), + example_tasks=example_tasks, + ) + + # Store configuration + self.config = config + + # Use config values + self.model_name = self.config["model"]["name"] + self.temperature = self.config["model"]["temperature"] + self.max_tokens = self.config["model"]["max_tokens"] + 
self.output_dir = Path(self.config["output"]["base_dir"]) + + # Initialize OpenAI client + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + raise ValueError("OPENAI_API_KEY environment variable not set") + self.client = OpenAI(api_key=api_key) + + # Create output directory + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + self.run_output_dir = self.output_dir / f"{self.capability.name}_{timestamp}" + self.run_output_dir.mkdir(parents=True, exist_ok=True) + + logger.info("=" * 80) + logger.info(f"Initialized DiverseTaskGenerator for: {self.capability.name}") + logger.info(f"Model: {self.model_name}") + logger.info(f"Temperature: {self.temperature}") + logger.info(f"Max tokens: {self.max_tokens}") + logger.info(f"Output directory: {self.run_output_dir}") + logger.info("=" * 80) + + # Create API caller with pre-configured parameters + self._call_api = partial( + call_model, + self.client, + model_name=self.model_name, + temperature=self.temperature, + max_tokens=self.max_tokens, + ) + + def _save_json(self, filename: str, data_key: str, data: Any) -> Path: + """Save data to JSON file.""" + output_file = self.run_output_dir / filename + # Convert dataclass objects to dicts if needed + if data and hasattr( + data[0] if isinstance(data, list) else data, "__dataclass_fields__" + ): + data = ( + [asdict(item) for item in data] + if isinstance(data, list) + else asdict(data) + ) + + with open(output_file, "w") as f: + json.dump({data_key: data} if data_key else data, f, indent=2) + logger.info(f"Saved to: {output_file}") + return output_file + + def extract_and_save_subtopics(self) -> list[SubTopic]: + """Extract sub-topics and save results.""" + subtopics = extract_subtopics(self.capability, self._call_api) + self._save_json("subtopics.json", "sub_topics", subtopics) + return subtopics + + def find_and_save_combinations( + self, subtopics: list[SubTopic] + ) -> list[Combination]: + """Find valid combinations and save results.""" + combinations = 
find_valid_combinations( + self.capability, subtopics, self._call_api + ) + self._save_json("combinations.json", "valid_combinations", combinations) + return combinations + + def generate_and_save_blueprints( + self, combinations: list[Combination] + ) -> list[Blueprint]: + """Generate blueprints and save results.""" + blueprints = generate_blueprints( + self.capability, combinations, self._call_api, self.config + ) + self._save_json("blueprints.json", "blueprints", blueprints) + return blueprints + + def generate_and_save_tasks(self, blueprints: list[Blueprint]) -> list[Task]: + """Generate tasks and save results.""" + tasks_per_blueprint = self.config["generation"]["tasks_per_blueprint"] + tasks = generate_tasks( + self.capability, blueprints, self._call_api, tasks_per_blueprint + ) + self._save_json("tasks.json", "tasks", tasks) + return tasks + + def verify_and_save_tasks( + self, tasks: list[Task], blueprints: list[Blueprint] + ) -> VerificationResult: + """Verify tasks and save results.""" + verification = verify_tasks(self.capability, tasks, blueprints, self._call_api) + self._save_json("verification.json", None, verification) + return verification + + def run_full_pipeline(self) -> dict: + """Run the complete diverse task generation pipeline.""" + logger.info("=" * 80) + logger.info("Starting Diverse Task Generation Pipeline") + logger.info(f"Capability: {self.capability.name}") + logger.info(f"Model: {self.model_name}") + logger.info("=" * 80) + + # Extract sub-topics + subtopics = self.extract_and_save_subtopics() + + # Find valid combinations + combinations = self.find_and_save_combinations(subtopics) + + # Generate blueprints + blueprints = self.generate_and_save_blueprints(combinations) + + # Generate tasks + tasks = self.generate_and_save_tasks(blueprints) + + # Verify tasks + verification = self.verify_and_save_tasks(tasks, blueprints) + + # Compile final results + results = { + "capability_name": self.capability.name, + "capability_description": 
self.capability.description, + "capability_domain": self.capability.domain, + "model_name": self.model_name, + "timestamp": datetime.now().isoformat(), + "subtopics": [asdict(st) for st in subtopics], + "combinations": [asdict(c) for c in combinations], + "blueprints": [asdict(bp) for bp in blueprints], + "tasks": [asdict(t) for t in tasks], + "verification": verification, + } + + # Save final results + self._save_json("final_results.json", None, results) + + logger.info("=" * 80) + logger.info("Pipeline Complete!") + logger.info(f"All results saved to: {self.run_output_dir}") + logger.info("=" * 80) + + return results + + +def load_capability_from_json(capability_json_path: str) -> dict: + """Load capability information from a JSON file.""" + with open(capability_json_path, "r") as f: + return json.load(f) + + +def main() -> None: + """Generate diverse tasks for a single capability.""" + parser = argparse.ArgumentParser( + description="Generate diverse tasks for a capability from JSON file" + ) + parser.add_argument( + "--capability-json-path", + type=str, + help="Path to capability JSON file (default: from config file)", + ) + parser.add_argument( + "--model-name", + type=str, + help="OpenAI model name (default: from config file)", + ) + parser.add_argument( + "--output-dir", + type=str, + help="Output directory (default: from config file)", + ) + + args = parser.parse_args() + + # Load config + config_file = Path(__file__).parent / "diverse_task_config.yaml" + with open(config_file, "r") as f: + config = yaml.safe_load(f) + + # Override config with command-line arguments + if args.model_name: + config["model"]["name"] = args.model_name + if args.output_dir: + config["output"]["base_dir"] = args.output_dir + if args.capability_json_path: + config["input"]["capability_json_path"] = args.capability_json_path + + logger.info(f"Loading capability from: {config['input']['capability_json_path']}") + capability_dict = 
load_capability_from_json(config["input"]["capability_json_path"]) + + # Initialize and run generator + generator = DiverseTaskGenerator( + capability_dict=capability_dict, + config=config, + ) + generator.run_full_pipeline() + + logger.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/experimental/diverse_task_prompts.py b/experimental/diverse_task_prompts.py new file mode 100644 index 0000000..a2fa6fc --- /dev/null +++ b/experimental/diverse_task_prompts.py @@ -0,0 +1,416 @@ +""" +Prompts for the diverse task generation pipeline. + +Edit these prompts to customize the task generation behavior. +The main script can import these instead of using hardcoded prompts. +""" + +# ============================================================================= +# SUB-TOPIC EXTRACTION +# ============================================================================= + +SUBTOPIC_SYSTEM_PROMPT = """ +You are an expert educational scientist responsible for identifying comprehensible sub-topics for a given capability. + +The name, description, and domain/area of the capability will be provided. + +Your goal is to decompose the capability into meaningful sub-topics that together provide full and balanced coverage of testing the given capability. + +Respond precisely in the following format, including the JSON start and end markers: + +RESPONSE JSON: +{ + "sub_topics": [ + "", + "", + "" + ] +} + +List each sub-topic as a concise noun phrase (5–10 words). + +Avoid redundancy and ensure each sub-topic can be independently assessed through a test question. +""" + +SUBTOPIC_USER_PROMPT_TEMPLATE = """ +Identify the key sub-topics required to assess the following capability. + +Domain: {capability_domain} +Area: {area_text} +Capability Name: {capability_name} +Capability Description: {capability_description} + +Depending on the granularity of the capability, generate 2–10 sub-topics that comprehensively represent this capability. 
+""" + + +# ============================================================================= +# VALID COMBINATIONS +# ============================================================================= + +COMBINATION_SYSTEM_PROMPT = """ +You are an educational scientist responsible for determining which combinations of (Content, Difficulty, Reasoning) are valid and meaningful for task generation. + +The list of available sub-topics (Content dimension), difficulty levels, and reasoning categories (based on Bloom's taxonomy) will be provided. + +Your goal is to select combinations that make pedagogical sense — i.e., combinations where a valid and meaningful question could be designed for the given sub-topic, at the specified difficulty, requiring the indicated reasoning level. + +Respond precisely in the following format, including the JSON start and end markers: + +RESPONSE JSON: +{ + "valid_combinations": [ + { + "content": "", + "difficulty": "", + "reasoning": "" + }, + ... + ] +} + +For example, extremely high reasoning levels like "Create" may not apply to simple factual sub-topics, and very easy difficulties may not pair with "Evaluate" or "Analyze" levels. + +Guidelines: +- Select only combinations that would yield meaningful assessment tasks. + +- Ensure a balanced coverage across difficulties and reasoning levels if possible. + +- Avoid redundant combinations. +""" + +COMBINATION_USER_PROMPT_TEMPLATE = """ +Determine all valid and meaningful (Content, Difficulty, Reasoning) combinations for the given capability. + +Domain: {capability_domain} +Area: {capability_area} +Capability Name: {capability_name} +Capability Description: {capability_description} + +Sub-topics (Content dimension): +{subtopics_desc} + +Difficulty levels: +- Easy: Involves direct recall, recognition, or simple application of knowledge and procedures. +- Medium: Requires connecting multiple ideas, performing multi-step reasoning, or applying knowledge in new but familiar contexts. 
+- Hard: Involves complex reasoning, integration of several sub-topics, or solving non-trivial problems that demand deeper conceptual understanding. + +Reasoning types (Bloom's Taxonomy): +1. Remember – Recall or recognize facts, terms, and basic concepts. Example verbs: define, list, identify. +2. Understand – Explain ideas or concepts and interpret information in one's own words. Example verbs: summarize, describe, classify. +3. Apply – Use knowledge or methods in new but familiar situations. Example verbs: calculate, demonstrate, use, implement. +4. Analyze – Break information into parts and examine relationships or patterns. Example verbs: differentiate, compare, examine, infer. +5. Evaluate – Make judgments based on criteria and standards. Example verbs: justify, critique, assess, argue. +6. Create – Combine elements to form a new pattern, structure, or product. Example verbs: design, compose, formulate, generate. + +Your task: +Identify all combinations of (Content, Difficulty, Reasoning) that are valid and pedagogically meaningful for this capability. + +Avoid combinations that are unrealistic (e.g., "Remember" level with "Hard" difficulty) or redundant. + +Ensure each selected combination could correspond to a feasible assessment task. +""" + + +# ============================================================================= +# BLUEPRINT GENERATION +# ============================================================================= + +BLUEPRINT_SYSTEM_PROMPT = """ +You are an expert educational scientist designing task blueprints for an assessment generation framework. + +Given a (Content, Difficulty, Reasoning) combination for a specific capability, you must produce a clear and detailed blueprint describing what kind of question should be designed for that combination. + +A task blueprint is a natural-language description that specifies: +1. The core skill or concept being tested (based on the content/sub-topic). + +2. 
The expected cognitive process or reasoning level (based on Bloom's taxonomy). + +3. The intended level of challenge or complexity (based on difficulty). + +4. The type of task or question that would fit these criteria (e.g., conceptual explanation, computation, real-world application, analysis of case, critique, design, etc.). + +Respond precisely in the following format, including the JSON start and end markers: + +RESPONSE JSON: +{ + "blueprint": "" +} + +In , write a single coherent paragraph (3–5 sentences) describing how the task should look — what the student should be asked to do, what level of reasoning it should involve, and how difficulty manifests (e.g., unfamiliar data, abstract setting, multi-step reasoning, creative synthesis). + +Ensure the blueprint is descriptive, not a question itself. +""" + +BLUEPRINT_USER_PROMPT_TEMPLATE = """ +Generate a task blueprint for the following capability and combination. + +Domain: {capability_domain} +Area: {capability_area} +Capability Name: {capability_name} +Capability Description: {capability_description} + +Selected Combination: +- Content (Sub-topic): {subtopic} +- Difficulty: {difficulty} — {difficulty_description} +- Reasoning Type (Bloom's Taxonomy): {reasoning} — {reasoning_description} + +Write a detailed blueprint describing what kind of question should be generated for this combination. + +The blueprint should explain: +1. What the learner is expected to do. +2. What kind of reasoning the task requires. +3. How difficulty manifests in the structure or context of the task. +""" + + +# ============================================================================= +# TASK GENERATION +# ============================================================================= + +TASK_SYSTEM_PROMPT = """ +You are an expert educational scientist responsible for generating high-quality multiple-choice tasks. 
+ +Given a task blueprint that describes what the question should assess, your goal is to write a complete multiple-choice question that: + +1. Accurately reflects the blueprint and capability description. + +2. Includes exactly four answer options. + +3. Has ONLY one correct answer. + +4. Uses clear and unambiguous wording. + +5. Ensures that incorrect options (distractors) are plausible but clearly wrong when the concept is understood correctly. + +Respond precisely in the following format, including the JSON start and end markers: + +RESPONSE JSON: +{ + "question": "", + "options": { + "A": "