diff --git a/examples/lightning/assets/NB_Monarch_Lightning.svg b/examples/lightning/assets/NB_Monarch_Lightning.svg new file mode 100644 index 000000000..ffb0dd7fb --- /dev/null +++ b/examples/lightning/assets/NB_Monarch_Lightning.svg @@ -0,0 +1,2 @@ +Lightning.ai platformLightning StudioCompute AWS, GCP,providers: Lambda, etcMonarch 🦋 Notebook Initiate MMT JobMeshDefinitionActorDefinitionTrainer configsConda envLogAggregationRequiredpackagesInteractiveSessionWorker NodesMonarch ProcMeshNode1Node2Node3Node NActor 1Actor 2Actor 3Actor N* Remote actor call* Code/file sync* Workspace sync* Message exchange* Remote debugMMT@endpoint call \ No newline at end of file diff --git a/examples/lightning/assets/nodes_pending.png b/examples/lightning/assets/nodes_pending.png new file mode 100644 index 000000000..913e860c7 Binary files /dev/null and b/examples/lightning/assets/nodes_pending.png differ diff --git a/examples/lightning/assets/nodes_ready.png b/examples/lightning/assets/nodes_ready.png new file mode 100644 index 000000000..8d4c80ff9 Binary files /dev/null and b/examples/lightning/assets/nodes_ready.png differ diff --git a/examples/lightning/assets/process_allocator_log.png b/examples/lightning/assets/process_allocator_log.png new file mode 100644 index 000000000..ec6103c74 Binary files /dev/null and b/examples/lightning/assets/process_allocator_log.png differ diff --git a/examples/lightning/assets/setup_status.png b/examples/lightning/assets/setup_status.png new file mode 100644 index 000000000..96ffe3e0a Binary files /dev/null and b/examples/lightning/assets/setup_status.png differ diff --git a/examples/lightning/monarch_lightning.ipynb b/examples/lightning/monarch_lightning.ipynb new file mode 100644 index 000000000..3b359e3e1 --- /dev/null +++ b/examples/lightning/monarch_lightning.ipynb @@ -0,0 +1,2488 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hero Notebook: TorchTitan Multi-Node Training with Monarch & Lightning SDK\n", + "\n", + "This notebook demonstrates how to run TorchTitan training using Monarch for distributed multi-node training on Lightning AI infrastructure.\n", + "\n", + "
\n", + " \"Monarch\n", + "
\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "\n", + "This notebook provides a comprehensive guide to running distributed multi-node training using **Monarch** (Meta's distributed actor framework) with **TorchTitan** (PyTorch's large-scale LLM training library) on **Lightning AI** infrastructure. You'll learn how to set up, execute, debug, and manage distributed training workflows across multiple GPU nodes. \n", + "\n", + "While Part I & II are the core of this Notebook for setup and training; Part III is for users who are interested in Monarch's advanced features such as interactive distributed debugging, environment variable management, and code synchronization for workspaces between local node and remote nodes.\n", + "\n", + "### What You'll Learn\n", + "\n", + "**Part I: Environment Setup** *(Essential Prerequisites)*\n", + "- Install TorchTitan - Set up PyTorch and TorchTitan for LLM training\n", + "- Download Llama-3.1-8B Model Assets - Get model tokenizers from Hugging Face\n", + "- Install Monarch - Install Meta's distributed actor framework\n", + "- Setup Weights & Biases - Configure experiment tracking\n", + "- Update Lightning SDK - Get the latest Lightning SDK features\n", + "- Verify Installations - Confirm all dependencies are ready\n", + "\n", + "**Part II: Multi-Node Training** *(Core Training Workflow)*\n", + "- Import Lightning SDK Components - Import required classes for multi-machine training\n", + "- Configure Training Job Parameters - Set up nodes, GPUs, and network settings\n", + "- Launch Multi-Node Training Job - Start distributed infrastructure on Lightning AI\n", + "- Set Up Process Mesh - Initialize Monarch's distributed computing mesh\n", + "- Define TorchTitan Trainer Actor - Create distributed training actor\n", + "- Run TorchTitan Training - Execute Llama 3-8B training across nodes\n", + "\n", + "**Part III: Advanced Features** *(Distributed Development & Debugging)*\n", + "\n", + "1. **Environment Variable Management**\n", + " - Spawn Environment Variable Actor - Manage env vars across nodes\n", + " - Get/Set Environment Variables - Inspect and modify remote environments\n", + " - List Environment Variables - Query env vars by prefix\n", + "\n", + "2. **Workspace Synchronization** *(Hot-Reload Code & Configs)*\n", + " - Introduction to sync_workspace - Understanding workspace sync\n", + " - Content checker Actor for files - Define an Actor to check content\n", + " - Create Local Configuration - Set up training configs\n", + " - Sync to Remote Nodes - Propagate changes to workers\n", + " - Verify Synchronization - Confirm files are synced\n", + "\n", + "3. 
**Interactive Debugging with Breakpoints**\n", + " - Debugging Overview - Using pdb with distributed actors\n", + " - Define Debug Trainer - Create actor with breakpoints\n", + " - Spawn and Debug - Run interactive debugging session\n", + " - Debugger Commands - Learn monarch debug CLI commands\n", + "\n", + "**Part IV: Cleanup**\n", + "- Stop Process Mesh - Gracefully shutdown distributed resources\n", + "\n", + "---\n", + "\n", + "### Key Concepts\n", + "\n", + "- **Monarch Actor**: Distributed computation unit that runs on remote nodes\n", + "- **Process Mesh (ProcMesh)**: Network of processes across multiple nodes for distributed computing\n", + "- **Endpoint**: Method decorator that makes actor methods callable remotely\n", + "- **Workspace Sync**: Synchronize local code/config changes to remote worker nodes without restart\n", + "- **Lightning MMT**: Multi-Machine Training orchestration on Lightning AI\n", + "\n", + "### Prerequisites\n", + "- Lightning AI account with access to GPU machines (L40S recommended)\n", + "- Hugging Face account with Llama model access\n", + "- Basic understanding of distributed training concepts\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Part I: Environment Setup\n", + "\n", + "Before running the notebook cells, ensure all dependencies are properly installed by following the steps below." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install TorchTitan\n", + "\n", + "Clone the TorchTitan repository, install the nightly PyTorch build with CUDA 12.6 support, and install TorchTitan:\n", + "\n", + "```bash\n", + "git clone https://github.com/pytorch/torchtitan.git\n", + "cd torchtitan\n", + "pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 --force-reinstall\n", + "pip install .\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download Llama-3-8B Model Assets\n", + "\n", + "Download the Llama-3.1-8B tokenizer from Hugging Face. You'll need a Hugging Face token with access to the Llama models:\n", + "\n", + "```bash\n", + "python scripts/download_hf_assets.py \\\n", + " --repo_id meta-llama/Llama-3.1-8B \\\n", + " --assets tokenizer \\\n", + " --hf_token=YOUR_HUGGINGFACE_TOKEN_KEY\n", + "```\n", + "\n", + "Replace `YOUR_HUGGINGFACE_TOKEN_KEY` with your actual Hugging Face token." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install Monarch\n", + "\n", + "Install Monarch from the GitHub repository following the Ubuntu installation instructions:\n", + "\n", + "```bash\n", + "git clone https://github.com/meta-pytorch/monarch.git\n", + "cd monarch\n", + "# Follow the Ubuntu installation instructions from the repository\n", + "```\n", + "\n", + "For detailed installation steps, visit: https://github.com/meta-pytorch/monarch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup Weights & Biases\n", + "\n", + "Check if wandb is installed. If not, install it and login:\n", + "\n", + "```bash\n", + "pip install wandb\n", + "wandb login\n", + "```\n", + "\n", + "Follow the prompts to authenticate with your wandb account." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Update the Lightning SDK\n", + "\n", + "The latest version of lightning SDK offers IP sharing between the client host and remote nodes. 
This feature is used in this Notebook.\n", + "\n", + "```bash\n", + "pip install -U lightning_sdk\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Verify Installations\n", + "\n", + "After completing the installation steps above, verify that TorchTitan and Monarch are properly installed:\n", + "\n", + "```python\n", + "# Verify TorchTitan installation\n", + "import torchtitan\n", + "print(\"TorchTitan is installed successfully\")\n", + "\n", + "# Verify Monarch installation\n", + "import monarch\n", + "print(\"Monarch is installed successfully\")\n", + "\n", + "# Verify PyTorch and CUDA\n", + "import torch\n", + "print(f\"PyTorch version: {torch.__version__}\")\n", + "```\n", + "\n", + "If all imports succeed, you're ready to proceed with the training workflow below." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Part II: Multi-Node Training with Monarch and Lightning\n", + "\n", + "Now that the environment is set up, we can proceed with configuring and launching the distributed training job." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Lightning SDK Components\n", + "\n", + "Import the necessary classes from Lightning SDK to manage multi-machine training jobs, including `Machine` for hardware specifications, `MMT` for multi-machine training orchestration, and `Studio` for workspace management." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from lightning_sdk import Machine, MMT, Studio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure Training Job Parameters\n", + "\n", + "Set up the configuration for the multi-node training job, including the number of nodes (16), GPUs per node (8), teamspace name, username, and port range for worker node communication." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration\n", + "import os\n", + "NUM_NODES = 16\n", + "NUM_GPUS = 8\n", + "TEAMSPACE = \"general\" # Replace with your teamspace\n", + "USER = \"meta-ai\" # Replace with your username\n", + "MMT_JOB_NAME = f\"Monarch-v0-MMT-{NUM_NODES}-nodes\"\n", + "\n", + "# Remote allowed port range for worker nodes\n", + "REMOTE_ALLOWED_PORT_RANGE = \"26601..26611\"\n", + "\n", + "# To force Monarch to use V0 for this Notebook (This will be removed in the future)\n", + "os.environ[\"MONARCH_V0_WORKAROUND_DO_NOT_USE\"] = \"1\"\n", + "os.environ[\"MONARCH_FILE_LOG\"] = \"debug\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define MMT Job Launch Function\n", + "\n", + "Create a function to launch a multi-machine training (MMT) job using Lightning SDK. This function installs the MMT plugin, configures the machine type (L40S GPUs), sets environment variables for CUDA devices and Monarch configurations, and returns the job handle and studio instance."
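Once you have the handle, the attributes the notebook itself uses later (`job.status`, `job.name`, `job.stop()`, `studio.stop()`) are enough to build a small wait-and-cleanup helper. A minimal sketch follows; `wait_for_nodes` is a hypothetical helper and the exact status strings are an assumption, so check what `job.status` actually reports in your teamspace:

```python
import time

def wait_for_nodes(job, poll_seconds=15):
    """Sketch: poll the MMT job handle until it leaves the 'Pending' state.

    The concrete status values are an assumption -- print(job.status) to see
    what your Lightning teamspace reports before relying on this loop.
    """
    while str(job.status) == "Pending":
        print(f"{job.name} is still pending, waiting {poll_seconds}s ...")
        time.sleep(poll_seconds)
    print(f"{job.name} status: {job.status}")

# Typical teardown once you are done with the run:
# job.stop()
# studio.stop()
```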
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def launch_mmt_job(num_nodes=2, teamspace=\"my-teamspace\", user=\"my-user\"):\n", + " \"\"\"\n", + " Launch a multi-machine training job using Lightning SDK's MMT API.\n", + " \"\"\"\n", + "\n", + " studio = Studio()\n", + "\n", + " # Install the MMT plugin befor running the actual job\n", + " studio.install_plugin(\"multi-machine-training\")\n", + "\n", + " print(f\"Launching MMT job with {num_nodes} nodes...\")\n", + "\n", + " # Machine with T4 GPUs\n", + " # machine_type = getattr(Machine, f\"T4_X_{NUM_GPUS}\")\n", + "\n", + " # Machine with L40 GPUs\n", + " # machine_type = getattr(Machine, f\"L4_X_{NUM_GPUS}\")\n", + "\n", + " # Machine with L40S GPUs\n", + " machine_type = getattr(Machine, f\"L40S_X_{NUM_GPUS}\")\n", + "\n", + " job = MMT.run(\n", + " command=\"process_allocator\",\n", + " name=MMT_JOB_NAME,\n", + " machine=machine_type,\n", + " studio=studio,\n", + " num_machines=num_nodes,\n", + " env={\n", + " \"CUDA_VISIBLE_DEVICES\": \"0,1,2,3,4,5,6,7\", # Make all GPUs visible # TODO: Should make this one dynamic\n", + " \"MONARCH_FILE_LOG\": \"debug\",\n", + " \"HYPERACTOR_REMOTE_ALLOC_ALLOWED_PORT_RANGE\": REMOTE_ALLOWED_PORT_RANGE,\n", + " \"HYPERACTOR_REMOTE_ALLOC_BIND_TO_INADDR_ANY\": \"true\",\n", + " \"WORKSPACE_DIR\": \"/tmp\",\n", + " },\n", + " )\n", + "\n", + " print(f\"Job started with ID: {job.name}\")\n", + " print(f\"Job status: {job.status}\")\n", + "\n", + " # Monitor job status\n", + " return job, studio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch the Multi-Node Training Job\n", + "\n", + "Execute the `launch_mmt_job` function with the specified number of nodes, teamspace, and user credentials. This starts the distributed training infrastructure and provides commands for monitoring and stopping the job." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Launching MMT job with 16 nodes...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO - Multi-Machine Job was successfully launched. View it at https://lightning.ai/meta-ai/general/jobs/Multi-Node-Monarch-Titan-Scale-16_nodes-port_override?app_id=mmt\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job started with ID: Multi-Node-Monarch-Titan-Scale-16_nodes-port_override\n", + "Job status: Pending\n", + "Job launched. You can monitor it using: job.status\n", + "To stop the job: job.stop()\n", + "To clean up: studio.stop()\n" + ] + } + ], + "source": [ + "# Launch the job\n", + "job, studio = launch_mmt_job(\n", + " num_nodes=NUM_NODES, teamspace=TEAMSPACE, user=USER\n", + ")\n", + "\n", + "print(f\"Job launched. You can monitor it using: job.status\")\n", + "print(f\"To stop the job: job.stop()\")\n", + "print(f\"To clean up: studio.stop()\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Monitor jobs in the MMT Plugin\n", + "\n", + "When user initiate a job, they can monitor the status of the job through the MMT plugin.\n", + "Running the cell above initaties the requested number of nodes on the lightning cluster.\n", + "The user may see different setups for the nodes like this:\n", + "\n", + "
\n", + " \"setup\n", + "
\n", + "
\n", + " \"nodes\n", + "
\n", + "\n", + "\n", + "Once nodes are available through the lightning, the SDK will take care of snapshot-ing your environment, setup the nodes, and copy the corresponded data:\n", + "
\n", + " \"nodes\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set Up Process Mesh from Job\n", + "\n", + "Initialize the Monarch process mesh using the launched Lightning job. This creates the distributed computing mesh that connects all nodes and GPUs for coordinated training.\n", + "\n", + "Before running the cell below, please make sure that the `process_allocator` process from Monarch is running on your requested nodes! You can confirm that by taking a look at the MMT SDK:\n", + "\n", + "
\n", + " \"process_allocator_log\"\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error: File /tmp/worker_nodes.txt not found\n", + "Extracted IP addresses:\n", + "\n", + "IP set: set()\n", + "ip_addresses_list=['3.150.74.121', '3.148.30.121', '3.149.152.201', '3.139.30.73', '3.130.134.76', '3.150.253.90', '3.18.136.106', '3.149.202.134', '3.133.10.50', '3.142.183.138', '18.217.116.20', '3.134.162.180', '3.20.57.208', '3.151.0.192', '18.216.236.74', '18.223.178.27']\n", + "ip_addresses_set={'3.151.0.192', '3.18.136.106', '3.133.10.50', '18.223.178.27', '3.150.74.121', '3.149.152.201', '3.20.57.208', '3.148.30.121', '3.150.253.90', '18.217.116.20', '18.216.236.74', '3.134.162.180', '3.130.134.76', '3.139.30.73', '3.142.183.138', '3.149.202.134'}\n", + "IP addresses are available: True\n", + "private_master_host_ip_address='10.192.12.151'\n", + "public_master_host_ip_address='54.209.46.214'\n", + "tcp!3.151.0.192:26600 tcp!3.18.136.106:26600 tcp!3.133.10.50:26600 tcp!18.223.178.27:26600 tcp!3.150.74.121:26600 tcp!3.149.152.201:26600 tcp!3.20.57.208:26600 tcp!3.148.30.121:26600 tcp!3.150.253.90:26600 tcp!18.217.116.20:26600 tcp!18.216.236.74:26600 tcp!3.134.162.180:26600 tcp!3.130.134.76:26600 tcp!3.139.30.73:26600 tcp!3.142.183.138:26600 tcp!3.149.202.134:26600\n", + "AllocHandle(_hy_alloc=, _extent={'hosts': 16, 'gpus': 8}, _stream_logs=True, _allocator=, _constraints=)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "sys:1: UserWarning: The AllocSpec passed to RemoteAllocator.allocate has transport unix, but the transport from the remote process alloc initializer is tcp(Hostname). This will soon be an error unless you explicitly configure monarch's default transport to tcp(Hostname). The current default transport is unix.\n" + ] + } + ], + "source": [ + "from utils.mesh_utils import setup_proc_mesh_from_job\n", + "\n", + "proc_mesh = setup_proc_mesh_from_job(job, NUM_NODES, NUM_GPUS)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example Hero - Run TorchTitan using Monarch for Llama 3 - 8B" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate Job Name Helper\n", + "\n", + "Define a utility function to generate a unique job name based on the username, number of hosts, and GPUs per host. This helps identify and track different training runs." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "monarch-alisol-hosts16-gpus8\n" + ] + } + ], + "source": [ + "import getpass\n", + "def get_job_name(num_hosts: int, num_gpus_per_host: int):\n", + " return f\"monarch-{getpass.getuser()}-hosts{num_hosts}-gpus{num_gpus_per_host}\"\n", + "print(get_job_name(num_hosts=NUM_NODES, num_gpus_per_host=NUM_GPUS))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define TorchTitan Trainer Actor\n", + "\n", + "Create the `TitanTrainerWrapper` class, a Monarch Actor that wraps TorchTitan's training functionality. This actor handles initialization, training execution, checkpointing, and cleanup of the distributed training process across all nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import logging\n", + "from monarch.actor import ProcMesh, Actor, endpoint, current_rank\n", + "import socket\n", + "from torchtitan.tools.logging import init_logger, logger\n", + "from torchtitan.train import Trainer\n", + "from typing import Optional\n", + "import torch\n", + "from torchtitan.config import JobConfig\n", + "\n", + "\n", + "class TitanTrainerWrapper(Actor):\n", + "    def __init__(self, job_config: JobConfig):\n", + "        self.rank = current_rank().rank\n", + "        self.job_config = job_config\n", + "\n", + "    def _rprint(self, msg):\n", + "        \"\"\"Helper method to print with rank information.\"\"\"\n", + "        print(f\"{self.rank=} {msg}\")\n", + "\n", + "    @endpoint\n", + "    def init(self):\n", + "        logging.getLogger().addHandler(logging.StreamHandler(sys.stderr))\n", + "        print(f\"Initializing actor: {self.rank} {current_rank()=} {socket.gethostname()=}\")\n", + "\n", + "\n", + "    @endpoint\n", + "    def train(self):\n", + "        logger.info(\"Starting training\")\n", + "        config = self.job_config\n", + "        trainer: Optional[Trainer] = None\n", + "\n", + "        try:\n", + "            trainer = Trainer(config)\n", + "\n", + "            if config.checkpoint.create_seed_checkpoint:\n", + "                assert (\n", + "                    int(os.environ[\"WORLD_SIZE\"]) == 1\n", + "                ), \"Must create seed checkpoint using a single device, to disable sharding.\"\n", + "                assert (\n", + "                    # config.checkpoint.enable_checkpoint\n", + "                    config.checkpoint.enable\n", + "                ), \"Must enable checkpointing when creating a seed checkpoint.\"\n", + "                trainer.checkpointer.save(curr_step=0)\n", + "                logger.info(\"Created seed checkpoint\")\n", + "            else:\n", + "                trainer.train()\n", + "        finally:\n", + "            if trainer:\n", + "                trainer.close()\n", + "\n", + "            if torch.distributed.is_initialized():\n", + "                torch.distributed.destroy_process_group()\n", + "                logger.info(\"Process group destroyed.\")\n", + "            print(\"Done training\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Async Main Training Function\n", + "\n", + "Set up the main asynchronous function that orchestrates the distributed training. This function configures the environment for distributed execution, spawns trainer actors across the process mesh, and initiates the training workflow. The function is defined as async because the endpoint calls it issues must be awaited; this keeps the coordination of operations across multiple machines asynchronous rather than blocking the main thread."
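Because the endpoint calls are awaited, `async_main` (defined in the next cell) is a coroutine and must itself be awaited. A minimal sketch of how it might be driven; `run_pipeline` is a hypothetical wrapper, and in Jupyter a plain top-level `await async_main(job_config)` in a cell is sufficient:

```python
import asyncio

def run_pipeline(job_config):
    # Outside of Jupyter: drive the coroutine to completion on a fresh event loop.
    # Inside a notebook cell the equivalent is simply:  await async_main(job_config)
    asyncio.run(async_main(job_config))
```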
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from torchtitan.config import ConfigManager, JobConfig\n", + "from monarch.tools.network import AddrType\n", + "from monarch.utils import setup_env_for_distributed\n", + "\n", + "async def async_main(job_config: JobConfig):\n", + " torch.use_deterministic_algorithms(True)\n", + " job_name = get_job_name(NUM_NODES, NUM_GPUS)\n", + "\n", + " \"\"\"\n", + " # if use_ipaddr is not passed, then default is IPv6 for MASTER_ADDR\n", + " \"\"\"\n", + " await setup_env_for_distributed(proc_mesh, use_ipaddr=AddrType.IPv4)\n", + "\n", + " await proc_mesh.logging_option(stream_to_client=True, aggregate_window_sec=3)\n", + "\n", + " print(job_config)\n", + " print(f\"Spawning meshes on {job_name}\")\n", + "\n", + " trainer_actor = proc_mesh.spawn(\"trainer_actor\", TitanTrainerWrapper, job_config)\n", + "\n", + " await trainer_actor.init.call()\n", + " await trainer_actor.train.call()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Logger and Run Training\n", + "\n", + "Configure the TorchTitan logger and parse training arguments including model configuration file, tokenizer path, dataset location, number of training steps, and output directory. Then execute the asynchronous training pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[titan] 2025-10-20 05:16:23,787 - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily.\n", + "JobConfig(job=Job(config_file='/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml', dump_folder='/teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts16-gpus8', description='Llama 3 8B training', print_config=False, custom_config_module=''), profiling=Profiling(enable_profiling=True, save_traces_folder='profile_trace', profile_freq=100, profiler_active=1, profiler_warmup=3, enable_memory_snapshot=False, save_memory_snapshot_folder='memory_snapshot'), metrics=Metrics(log_freq=1, enable_tensorboard=True, disable_color_printing=False, save_tb_folder='tb', save_for_all_ranks=False, enable_wandb=True), model=Model(name='llama3', flavor='8B', hf_assets_path='/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B', tokenizer_path='/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B', converters=[], print_after_conversion=False), optimizer=Optimizer(name='AdamW', lr=0.0003, beta1=0.9, beta2=0.95, eps=1e-08, weight_decay=0.1, implementation='fused', early_step_in_backward=False), lr_scheduler=LRScheduler(warmup_steps=200, decay_ratio=None, decay_type='linear', min_lr_factor=0.0), training=Training(dataset='c4_test', dataset_path='/teamspace/studios/this_studio/torchtitan/tests/assets/c4_test', local_batch_size=1, global_batch_size=-1, seq_len=1024, max_norm=1.0, steps=25, enable_cpu_offload=False, dtype='float32', mixed_precision_param='bfloat16', mixed_precision_reduce='float32', gc_freq=50, gc_debug=False, seed=None, deterministic=False, debug_moe_force_load_balance=False), parallelism=Parallelism(data_parallel_replicate_degree=1, enable_compiled_autograd=False, data_parallel_shard_degree=-1, fsdp_reshard_after_forward='default', tensor_parallel_degree=1, disable_loss_parallel=False, enable_async_tensor_parallel=False, pipeline_parallel_degree=1, 
module_fqns_per_model_part=None, pipeline_parallel_first_stage_less_layers=1, pipeline_parallel_last_stage_less_layers=1, pipeline_parallel_layers_per_stage=None, pipeline_parallel_schedule='1F1B', pipeline_parallel_schedule_csv='', pipeline_parallel_microbatch_size=1, context_parallel_degree=1, context_parallel_rotate_method='allgather', expert_parallel_degree=1, expert_tensor_parallel_degree=1), checkpoint=Checkpoint(enable=False, enable_ft_dataloader_checkpoints=True, folder='checkpoint', interval=500, initial_load_path=None, initial_load_model_only=True, initial_load_in_hf=False, initial_load_in_hf_quantized=False, last_save_model_only=True, last_save_in_hf=False, export_dtype='float32', async_mode='disabled', keep_latest_k=10, load_step=-1, exclude_from_loading=[], enable_first_step_checkpoint=False, create_seed_checkpoint=False, load_only=False), activation_checkpoint=ActivationCheckpoint(mode='selective', selective_ac_option='op', per_op_sac_force_recompute_mm_shapes_by_fqns=['moe.router.gate'], early_stop=False, memory_budget=0.5, visualize_memory_budget_pareto=False), compile=Compile(enable=False, components=['model', 'loss'], backend='inductor'), quantize=Quantize(linear=QuantizedLinear(float8=Float8Linear(enable_fsdp_float8_all_gather=False, precompute_float8_dynamic_scale_for_fsdp=False, recipe_name=None, filter_fqns=['output'], emulate=False), mx=MXLinear(mxfp8_dim1_cast_kernel_choice='triton', recipe_name='mxfp8_cublas', filter_fqns=['output'])), grouped_mm=QuantizedGroupedMM(float8=Float8GroupedMM(fqns=[]), mx=MXGroupedMM(recipe_name='mxfp8', fqns=[]))), comm=Comm(init_timeout_seconds=300, train_timeout_seconds=100, trace_buf_size=20000, save_traces_folder='comm_traces', save_traces_file_prefix='rank_'), memory_estimation=MemoryEstimation(enable=False, disable_fake_mode=False), fault_tolerance=FaultTolerance(enable=False, process_group='gloo', process_group_timeout_ms=10000, replica_id=0, group_size=0, min_replica_size=1, semi_sync_method=None), experimental=Experimental(custom_import='', custom_args_module=''), validation=Validation(enable=False, dataset='c4_validation', dataset_path=None, local_batch_size=8, seq_len=2048, freq=500, steps=1200))\n", + "Spawning meshes on monarch-alisol-hosts16-gpus8\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:14:41) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m Initializing actor: 16 current_rank()={'hosts': 2/16, 'gpus': 0/8} socket.gethostname()='ip-10-192-11-77'\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:16:27) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:14:41) >>>\u001b[0m\n", + "\u001b[33m[128 similar log lines]\u001b[0m Starting training\n", + "\u001b[33m[128 similar log lines]\u001b[0m Starting job: Llama 3 8B training\n", + "\u001b[33m[128 similar log lines]\u001b[0m Building 1-D device mesh with ['dp_shard'], [128]\n", + "\u001b[33m[128 similar log lines]\u001b[0m /home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/torch/distributed/device_mesh.py:788: UserWarning: Slicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly. 
\n", + "\u001b[33m[128 similar log lines]\u001b[0m warnings.warn(\n", + "\u001b[33m[128 similar log lines]\u001b[0m [GC] Initial GC collection took 0.00 seconds\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:16:30) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:16:27) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m Initializing actor: 22 current_rank()={'hosts': 2/16, 'gpus': 6/8} socket.gethostname()='ip-10-192-11-77'\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:16:30) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:16:30) >>>\u001b[0m\n", + "\u001b[33m[128 similar log lines]\u001b[0m Loading tokenizer from tokenizer.json\n", + "\u001b[33m[128 similar log lines]\u001b[0m Preparing c4_test dataset from /teamspace/studios/this_studio/torchtitan/tests/assets/c4_test\n", + "Generating train split: 0 examples [00:00, ? examples/s]\n", + "Generating train split: 2000 examples [00:00, 89312.72 examples/s]\n", + "\u001b[33m[128 similar log lines]\u001b[0m Building llama3 8B with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=1024, ffn_dim_multiplier=1.3, norm_eps=1e-05, rope_theta=500000, rope_scaling_args=RoPEScalingArgs(scaling_factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_position_embeddings=8192), max_seq_len=1024, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)\n", + "\u001b[33m[127 similar log lines]\u001b[0m CUDA capacity: NVIDIA L40S with 44.64GiB memory\n", + "\u001b[33m[159 similar log lines]\u001b[0m Peak flops undefined for: NVIDIA L40S, fallback to A100\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[34mModel llama3 8B \u001b[31msize: 8,030,261,248 total parameters\u001b[39m\n", + "\u001b[33m[127 similar log lines]\u001b[0m Applied selective activation checkpointing to the model\n", + "\u001b[33m[127 similar log lines]\u001b[0m /home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/torch/distributed/device_mesh.py:788: UserWarning: Slicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly. \n", + "\u001b[33m[129 similar log lines]\u001b[0m warnings.warn(\n", + "\u001b[33m[127 similar log lines]\u001b[0m Applied FSDP to the model\n", + "\u001b[33m[2 similar log lines]\u001b[0m /home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'repr' attribute with value False was provided to the `Field()` function, which has no effect in the context it was used. 'repr' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.\n", + "\u001b[33m[104 similar log lines]\u001b[0m /home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/torch/nn/init.py:119: UserWarning: Specified kernel cache directory could not be created! This disables kernel caching. Specified directory is /home/zeus/.cache/torch/kernels. This warning will appear only once per process. 
(Triggered internally at /pytorch/aten/src/ATen/native/cuda/jit_utils.cpp:1487.)\n", + "\u001b[33m[104 similar log lines]\u001b[0m tensor.erfinv_()\n", + "\u001b[33m[29 similar log lines]\u001b[0m Peak FLOPS used for computing MFU: 3.120e+14\n", + "\u001b[33m[28 similar log lines]\u001b[0m CUDA memory usage for model: 0.25GiB(0.56%)\n", + "\u001b[33m[26 similar log lines]\u001b[0m Warmup steps (200) exceed total training steps (25). Adjusting warmup steps to 25.\n", + "\u001b[33m[26 similar log lines]\u001b[0m model.safetensors.index.json not found at hf_assets_path: /teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B/model.safetensors.index.json. Defaulting to saving a single safetensors file if checkpoint is saved in HF format\n", + "\u001b[33m[24 similar log lines]\u001b[0m Mixed precision training is handled by fully_shard\n", + "\u001b[33m[22 similar log lines]\u001b[0m Trainer is initialized with local batch size 1, global batch size 128, gradient accumulation steps 1, sequence length 1024, total steps 25 (warmup 200)\n", + "\u001b[33m[19 similar log lines]\u001b[0m Training starts at step 1\n", + "\u001b[33m[15 similar log lines]\u001b[0m Profiling active. Traces will be saved at /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts16-gpus8/profile_trace\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Currently logged in as: a-shamsoshoara (a-shamsoshoara-m) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:16:33) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:16:33) >>>\u001b[0m\n", + "\u001b[33m[105 similar log lines]\u001b[0m Trainer is initialized with local batch size 1, global batch size 128, gradient accumulation steps 1, sequence length 1024, total steps 25 (warmup 200)\n", + "\u001b[33m[112 similar log lines]\u001b[0m Profiling active. Traces will be saved at /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts16-gpus8/profile_trace\n", + "\u001b[33m[98 similar log lines]\u001b[0m Peak FLOPS used for computing MFU: 3.120e+14\n", + "\u001b[33m[95 similar log lines]\u001b[0m Peak flops undefined for: NVIDIA L40S, fallback to A100\n", + "\u001b[33m[108 similar log lines]\u001b[0m Training starts at step 1\n", + "\u001b[33m[101 similar log lines]\u001b[0m Warmup steps (200) exceed total training steps (25). Adjusting warmup steps to 25.\n", + "\u001b[33m[99 similar log lines]\u001b[0m CUDA memory usage for model: 0.25GiB(0.56%)\n", + "\u001b[33m[103 similar log lines]\u001b[0m Mixed precision training is handled by fully_shard\n", + "\u001b[33m[101 similar log lines]\u001b[0m model.safetensors.index.json not found at hf_assets_path: /teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B/model.safetensors.index.json. 
Defaulting to saving a single safetensors file if checkpoint is saved in HF format\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:16:33) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:16:33) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Tracking run with wandb version 0.22.2\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run data is saved locally in /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts16-gpus8/tb/20251020-0516/wandb/run-20251020_051633-kog9t67d\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run `wandb offline` to turn off syncing.\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Syncing run easy-waterfall-51\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: ⭐️ View project at https://wandb.ai/a-shamsoshoara-m/torchtitan\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: 🚀 View run at https://wandb.ai/a-shamsoshoara-m/torchtitan/runs/kog9t67d\n", + "\u001b[33m[1 similar log lines]\u001b[0m WandB logging enabled\n", + "\u001b[33m[1 similar log lines]\u001b[0m TensorBoard logging enabled. Logs will be saved at /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts16-gpus8/tb/20251020-0516\n", + "\u001b[33m[1 similar log lines]\u001b[0m CUDA capacity: NVIDIA L40S with 44.64GiB memory\n", + "\u001b[33m[2 similar log lines]\u001b[0m Peak flops undefined for: NVIDIA L40S, fallback to A100\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[34mModel llama3 8B \u001b[31msize: 8,030,261,248 total parameters\u001b[39m\n", + "\u001b[33m[1 similar log lines]\u001b[0m Applied selective activation checkpointing to the model\n", + "\u001b[33m[1 similar log lines]\u001b[0m /home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/torch/distributed/device_mesh.py:788: UserWarning: Slicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly. \n", + "\u001b[33m[1 similar log lines]\u001b[0m warnings.warn(\n", + "\u001b[33m[1 similar log lines]\u001b[0m Applied FSDP to the model\n", + "\u001b[33m[1 similar log lines]\u001b[0m Peak FLOPS used for computing MFU: 3.120e+14\n", + "\u001b[33m[1 similar log lines]\u001b[0m CUDA memory usage for model: 0.25GiB(0.56%)\n", + "\u001b[33m[1 similar log lines]\u001b[0m Warmup steps (200) exceed total training steps (25). Adjusting warmup steps to 25.\n", + "\u001b[33m[1 similar log lines]\u001b[0m model.safetensors.index.json not found at hf_assets_path: /teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B/model.safetensors.index.json. Defaulting to saving a single safetensors file if checkpoint is saved in HF format\n", + "\u001b[33m[1 similar log lines]\u001b[0m Mixed precision training is handled by fully_shard\n", + "\u001b[33m[1 similar log lines]\u001b[0m Trainer is initialized with local batch size 1, global batch size 128, gradient accumulation steps 1, sequence length 1024, total steps 25 (warmup 200)\n", + "\u001b[33m[1 similar log lines]\u001b[0m Training starts at step 1\n", + "\u001b[33m[1 similar log lines]\u001b[0m Profiling active. 
Traces will be saved at /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts16-gpus8/profile_trace\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:16:36) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:16:36) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m /home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/torch/distributed/device_mesh.py:788: UserWarning: Slicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly. \n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:16:53) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:16:53) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m /home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/torch/distributed/device_mesh.py:788: UserWarning: Slicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly. \n", + "\u001b[33m[128 similar log lines]\u001b[0m warnings.warn(\n", + "\u001b[33m[128 similar log lines]\u001b[0m \u001b[31mstep: 1 \u001b[32mloss: 12.2511 \u001b[38;2;180;60;0mgrad_norm: 3.7981 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 47 \u001b[36mtflops: 2.21 \u001b[35mmfu: 0.71%\u001b[39m\n", + "\u001b[33m[128 similar log lines]\u001b[0m Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:16:56) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:16:56) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 2 \u001b[32mloss: 11.3997 \u001b[38;2;180;60;0mgrad_norm: 4.5709 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.81 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:17:10) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:17:10) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 2 \u001b[32mloss: 11.3997 \u001b[38;2;180;60;0mgrad_norm: 4.5709 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.81 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:17:13) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:17:13) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 3 \u001b[32mloss: 12.1637 \u001b[38;2;180;60;0mgrad_norm: 58.1643 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:17:27) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:17:27) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 3 \u001b[32mloss: 12.1637 \u001b[38;2;180;60;0mgrad_norm: 58.1643 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[33m[9 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:17:30) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:17:30) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 4 \u001b[32mloss: 13.2350 \u001b[38;2;180;60;0mgrad_norm: 50.7540 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[36m<<< 
Aggregated Logs (2025-10-20 05:17:44) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:17:44) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 4 \u001b[32mloss: 13.2350 \u001b[38;2;180;60;0mgrad_norm: 50.7540 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[21 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:17:47) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:17:47) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 5 \u001b[32mloss: 11.5873 \u001b[38;2;180;60;0mgrad_norm: 12.7209 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:18:01) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:18:01) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 5 \u001b[32mloss: 11.5873 \u001b[38;2;180;60;0mgrad_norm: 12.7209 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[33m[21 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:18:04) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:18:04) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 6 \u001b[32mloss: 12.4519 \u001b[38;2;180;60;0mgrad_norm: 11.3216 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:18:18) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:18:18) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 6 \u001b[32mloss: 12.4519 \u001b[38;2;180;60;0mgrad_norm: 11.3216 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[21 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:18:21) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:18:21) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 7 \u001b[32mloss: 14.5130 \u001b[38;2;180;60;0mgrad_norm: 67.7407 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:18:35) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:18:35) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 7 \u001b[32mloss: 14.5130 \u001b[38;2;180;60;0mgrad_norm: 67.7407 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[22 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:18:38) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:18:38) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 8 \u001b[32mloss: 12.5781 \u001b[38;2;180;60;0mgrad_norm: 30.1054 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + 
"\u001b[36m<<< Aggregated Logs (2025-10-20 05:18:52) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:18:52) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 8 \u001b[32mloss: 12.5781 \u001b[38;2;180;60;0mgrad_norm: 30.1054 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[24 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:18:55) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:18:55) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 9 \u001b[32mloss: 11.3309 \u001b[38;2;180;60;0mgrad_norm: 14.7525 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.81 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:19:09) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:19:09) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 9 \u001b[32mloss: 11.3309 \u001b[38;2;180;60;0mgrad_norm: 14.7525 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.81 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[33m[24 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:19:12) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:19:12) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 10 \u001b[32mloss: 10.5148 \u001b[38;2;180;60;0mgrad_norm: 5.6056 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.81 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:19:26) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:19:26) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 10 \u001b[32mloss: 10.5148 \u001b[38;2;180;60;0mgrad_norm: 5.6056 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.81 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[33m[22 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:19:29) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:19:29) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 11 \u001b[32mloss: 10.0892 \u001b[38;2;180;60;0mgrad_norm: 11.5532 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:19:43) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:19:43) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 11 \u001b[32mloss: 10.0892 \u001b[38;2;180;60;0mgrad_norm: 11.5532 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[33m[18 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:19:46) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:19:46) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 12 \u001b[32mloss: 9.5798 \u001b[38;2;180;60;0mgrad_norm: 6.9032 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 
0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:20:00) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:20:00) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 12 \u001b[32mloss: 9.5798 \u001b[38;2;180;60;0mgrad_norm: 6.9032 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[33m[19 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:20:03) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:20:03) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 13 \u001b[32mloss: 8.9717 \u001b[38;2;180;60;0mgrad_norm: 4.7648 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:20:17) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:20:17) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 13 \u001b[32mloss: 8.9717 \u001b[38;2;180;60;0mgrad_norm: 4.7648 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[20 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:20:20) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:20:20) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 14 \u001b[32mloss: 8.3563 \u001b[38;2;180;60;0mgrad_norm: 11.2116 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:20:33) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:20:33) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 14 \u001b[32mloss: 8.3563 \u001b[38;2;180;60;0mgrad_norm: 11.2116 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[23 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:20:36) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:20:36) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 15 \u001b[32mloss: 8.6429 \u001b[38;2;180;60;0mgrad_norm: 18.0148 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:20:50) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:20:50) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 15 \u001b[32mloss: 8.6429 \u001b[38;2;180;60;0mgrad_norm: 18.0148 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[16 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:20:53) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:20:53) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 16 \u001b[32mloss: 8.4340 \u001b[38;2;180;60;0mgrad_norm: 14.1969 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.81 
\u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:21:07) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:21:07) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 16 \u001b[32mloss: 8.4340 \u001b[38;2;180;60;0mgrad_norm: 14.1969 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.81 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[33m[23 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:21:10) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:21:10) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 17 \u001b[32mloss: 8.0953 \u001b[38;2;180;60;0mgrad_norm: 4.8753 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:21:24) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:21:24) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 17 \u001b[32mloss: 8.0953 \u001b[38;2;180;60;0mgrad_norm: 4.8753 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[33m[20 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:21:27) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:21:27) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 18 \u001b[32mloss: 7.9582 \u001b[38;2;180;60;0mgrad_norm: 5.3509 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:21:41) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:21:41) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 18 \u001b[32mloss: 7.9582 \u001b[38;2;180;60;0mgrad_norm: 5.3509 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[33m[19 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:21:44) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:21:44) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 19 \u001b[32mloss: 7.6446 \u001b[38;2;180;60;0mgrad_norm: 15.6095 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:21:58) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:21:58) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 19 \u001b[32mloss: 7.6446 \u001b[38;2;180;60;0mgrad_norm: 15.6095 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[19 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:22:01) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:22:01) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 20 \u001b[32mloss: 7.8602 \u001b[38;2;180;60;0mgrad_norm: 11.7412 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 
2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:22:15) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:22:15) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 20 \u001b[32mloss: 7.8602 \u001b[38;2;180;60;0mgrad_norm: 11.7412 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[20 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:22:18) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:22:18) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 21 \u001b[32mloss: 7.6879 \u001b[38;2;180;60;0mgrad_norm: 5.6927 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:22:32) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:22:32) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 21 \u001b[32mloss: 7.6879 \u001b[38;2;180;60;0mgrad_norm: 5.6927 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[24 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:22:35) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:22:35) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 22 \u001b[32mloss: 7.5203 \u001b[38;2;180;60;0mgrad_norm: 4.5345 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:22:49) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:22:49) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 22 \u001b[32mloss: 7.5203 \u001b[38;2;180;60;0mgrad_norm: 4.5345 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[30 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:22:52) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:22:52) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 23 \u001b[32mloss: 7.5767 \u001b[38;2;180;60;0mgrad_norm: 6.0536 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:23:06) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:23:06) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 23 \u001b[32mloss: 7.5767 \u001b[38;2;180;60;0mgrad_norm: 6.0536 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 61 \u001b[36mtflops: 2.83 \u001b[35mmfu: 0.91%\u001b[39m\n", + "\u001b[33m[14 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:23:09) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:23:09) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 24 \u001b[32mloss: 7.4580 \u001b[38;2;180;60;0mgrad_norm: 3.3011 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 
2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:23:23) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:23:23) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 24 \u001b[32mloss: 7.4580 \u001b[38;2;180;60;0mgrad_norm: 3.3011 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.82 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[33m[16 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:23:26) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:23:26) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 25 \u001b[32mloss: 7.4618 \u001b[38;2;180;60;0mgrad_norm: 4.8066 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.81 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:23:40) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:16:30) >>>\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33m[126 similar log lines]\u001b[0m Done training\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:23:43) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-10-20 05:23:40) >>>\u001b[0m\n", + "\u001b[33m[127 similar log lines]\u001b[0m \u001b[31mstep: 25 \u001b[32mloss: 7.4618 \u001b[38;2;180;60;0mgrad_norm: 4.8066 \u001b[38;2;54;234;195mmemory: 12.80GiB(28.69%) \u001b[34mtps: 60 \u001b[36mtflops: 2.81 \u001b[35mmfu: 0.90%\u001b[39m\n", + "\u001b[33m[255 similar log lines]\u001b[0m Training completed\n", + "\u001b[33m[128 similar log lines]\u001b[0m Training starts at step 26\n", + "\u001b[33m[128 similar log lines]\u001b[0m Profiling active. 
Traces will be saved at /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts16-gpus8/profile_trace\n", + "\u001b[33m[2 similar log lines]\u001b[0m Sleeping 2 seconds for other ranks to complete\n", + "\u001b[33m[126 similar log lines]\u001b[0m Process group destroyed.\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:23:43) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:23:43) >>>\u001b[0m\n", + "\u001b[33m[2 similar log lines]\u001b[0m Done training\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:23:46) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-20 05:23:43) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m Training completed\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: updating run metadata\n", + "\u001b[33m[3 similar log lines]\u001b[0m wandb: \n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run history:\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: grad_norm ▁▁▇▆▂▂█▄▂▁▂▁▁▂▃▂▁▁▂▂▁▁▁▁▁\n", + "\u001b[33m[2 similar log lines]\u001b[0m wandb: loss_metrics/global_avg_loss ▆▅▆▇▅▆█▆▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: lr ▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇██\n", + "\u001b[33m[2 similar log lines]\u001b[0m wandb: memory/max_active(%) ▁████████████████████████\n", + "\u001b[33m[3 similar log lines]\u001b[0m wandb: memory/max_reserved(%) ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/num_ooms ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\n", + "\u001b[33m[2 similar log lines]\u001b[0m wandb: +7 ...\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run summary:\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: grad_norm 4.80659\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: loss_metrics/global_avg_loss 7.4618\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: loss_metrics/global_max_loss 10.02059\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: lr 0.0003\n", + "\u001b[33m[2 similar log lines]\u001b[0m Process group destroyed.\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_active(%) 12.94759\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_active(GiB) 5.77951\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_reserved(%) 28.68578\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_reserved(GiB) 12.80469\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/num_alloc_retries 0\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/num_ooms 0\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: 🚀 View run easy-waterfall-51 at: https://wandb.ai/a-shamsoshoara-m/torchtitan/runs/kog9t67d\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: ⭐️ View project at: https://wandb.ai/a-shamsoshoara-m/torchtitan\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Find logs at: ./torchtitan/outputs/monarch-alisol-hosts16-gpus8/tb/20251020-0516/wandb/run-20251020_051633-kog9t67d/logs\n", + "\u001b[36m<<< Aggregated Logs (2025-10-20 05:23:46) <<<\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "init_logger()\n", + "config_manager = ConfigManager()\n", + "\n", + "job_name = get_job_name(NUM_NODES, NUM_GPUS)\n", + "\n", + "manual_args = [\n", + " \"--job.config_file\",\n", + " 
os.path.expanduser(\"/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml\"),\n", + " \"--model.tokenizer-path\",\n", + " \"/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B\",\n", + " \"--training.steps\",\n", + " \"25\",\n", + " \"--training.dataset_path\",\n", + " \"/teamspace/studios/this_studio/torchtitan/tests/assets/c4_test\",\n", + " \"--job.dump_folder\",\n", + " \"/teamspace/studios/this_studio/torchtitan/outputs/\" + job_name,\n", + " \"--training.seq_len\",\n", + " \"1024\",\n", + " # \"8192\",\n", + " ]\n", + "config = config_manager.parse_args(manual_args)\n", + "await async_main(config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**🎉🎉 Congratulations!!!! 🎉🎉 You just ran the interactive distributed training for Llama-3 model in a Notebook using Monarch actors and Lightning setup!**\n", + "\n", + "This already gives the user lots of flexibilities such as changing the configurations and launching another training without iniatiating another job or set of nodes; or experiencing the logging aggregation using Monarch.\n", + "\n", + "However, a curious user can dig more into advanced features of Monarch in Part III. Monarch offers features such as interactive distributed debugging while your training is running on mutliple nodes and ranks. Another feature is the `workspace_sync` where users can update packages, environments and files and sync them with remote nodes. Without Monarch, users may need to re-initiate their launches which usually takes lots of times. \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "--- \n", + "\n", + "# Part III: Advanced Features (Distributed Development & Debugging)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Variable Management with Remote Actors\n", + "\n", + "Spawn an actor that can interact with environment variables on remote nodes. This is useful for debugging, configuration management, and runtime environment inspection across the distributed system." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from monarch.actor import Actor, endpoint, current_rank\n", + "import os\n", + "import socket\n", + "\n", + "class EnvVarActor(Actor):\n", + " \"\"\"Actor for managing environment variables on remote nodes.\"\"\"\n", + "\n", + " def __init__(self):\n", + " self.rank = current_rank().rank\n", + " self.hostname = socket.gethostname()\n", + "\n", + " @endpoint\n", + " def get_env(self, var_name: str) -> dict:\n", + " \"\"\"Get an environment variable value from the remote node.\"\"\"\n", + " value = os.environ.get(var_name)\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"var_name\": var_name,\n", + " \"value\": value\n", + " }\n", + "\n", + " @endpoint\n", + " def set_env(self, var_name: str, var_value: str) -> dict:\n", + " \"\"\"Set an environment variable on the remote node.\"\"\"\n", + " os.environ[var_name] = var_value\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"var_name\": var_name,\n", + " \"value\": var_value,\n", + " \"status\": \"set\"\n", + " }\n", + "\n", + " @endpoint\n", + " def list_env_vars(self, prefix: str = \"\") -> dict:\n", + " \"\"\"List all environment variables matching a prefix.\"\"\"\n", + " matching_vars = {k: v for k, v in os.environ.items() if k.startswith(prefix)}\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"matching_vars\": matching_vars,\n", + " \"count\": len(matching_vars)\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Spawn the Environment Variable Actor\n", + "\n", + "Spawn the `EnvVarActor` across all nodes in the process mesh. Each node will have an instance that can be used to inspect and modify its local environment." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EnvVarActor spawned across all nodes\n" + ] + } + ], + "source": [ + "# Spawn the environment variable actor across all nodes\n", + "env_actor = proc_mesh.spawn(\"env_actor\", EnvVarActor)\n", + "print(\"EnvVarActor spawned across all nodes\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get Environment Variables from Remote Nodes\n", + "\n", + "Query environment variables from all remote nodes. This example retrieves the `CUDA_VISIBLE_DEVICES` variable that was set during job initialization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "CUDA_VISIBLE_DEVICES on all nodes:\n", + " Host 0 gpus 0 Rank 0 (ip-10-192-11-251): 0,1,2,3,4,5,6,7\n", + " Host 0 gpus 1 Rank 1 (ip-10-192-11-251): 0,1,2,3,4,5,6,7\n", + " Host 0 gpus 2 Rank 2 (ip-10-192-11-251): 0,1,2,3,4,5,6,7\n", + " Host 0 gpus 3 Rank 3 (ip-10-192-11-251): 0,1,2,3,4,5,6,7\n", + " Host 0 gpus 4 Rank 4 (ip-10-192-11-251): 0,1,2,3,4,5,6,7\n", + " Host 0 gpus 5 Rank 5 (ip-10-192-11-251): 0,1,2,3,4,5,6,7\n", + " Host 0 gpus 6 Rank 6 (ip-10-192-11-251): 0,1,2,3,4,5,6,7\n", + " Host 0 gpus 7 Rank 7 (ip-10-192-11-251): 0,1,2,3,4,5,6,7\n", + " Host 1 gpus 0 Rank 8 (ip-10-192-11-128): 0,1,2,3,4,5,6,7\n", + " Host 1 gpus 1 Rank 9 (ip-10-192-11-128): 0,1,2,3,4,5,6,7\n", + " Host 1 gpus 2 Rank 10 (ip-10-192-11-128): 0,1,2,3,4,5,6,7\n", + " Host 1 gpus 3 Rank 11 (ip-10-192-11-128): 0,1,2,3,4,5,6,7\n", + " Host 1 gpus 4 Rank 12 (ip-10-192-11-128): 0,1,2,3,4,5,6,7\n", + " Host 1 gpus 5 Rank 13 (ip-10-192-11-128): 0,1,2,3,4,5,6,7\n", + " Host 1 gpus 6 Rank 14 (ip-10-192-11-128): 0,1,2,3,4,5,6,7\n", + " Host 1 gpus 7 Rank 15 (ip-10-192-11-128): 0,1,2,3,4,5,6,7\n", + " Host 2 gpus 0 Rank 16 (ip-10-192-11-77): 0,1,2,3,4,5,6,7\n", + " Host 2 gpus 1 Rank 17 (ip-10-192-11-77): 0,1,2,3,4,5,6,7\n", + " Host 2 gpus 2 Rank 18 (ip-10-192-11-77): 0,1,2,3,4,5,6,7\n", + " Host 2 gpus 3 Rank 19 (ip-10-192-11-77): 0,1,2,3,4,5,6,7\n", + " Host 2 gpus 4 Rank 20 (ip-10-192-11-77): 0,1,2,3,4,5,6,7\n", + " Host 2 gpus 5 Rank 21 (ip-10-192-11-77): 0,1,2,3,4,5,6,7\n", + " Host 2 gpus 6 Rank 22 (ip-10-192-11-77): 0,1,2,3,4,5,6,7\n", + " Host 2 gpus 7 Rank 23 (ip-10-192-11-77): 0,1,2,3,4,5,6,7\n", + " Host 3 gpus 0 Rank 24 (ip-10-192-11-28): 0,1,2,3,4,5,6,7\n", + " Host 3 gpus 1 Rank 25 (ip-10-192-11-28): 0,1,2,3,4,5,6,7\n", + " Host 3 gpus 2 Rank 26 (ip-10-192-11-28): 0,1,2,3,4,5,6,7\n", + " Host 3 gpus 3 Rank 27 (ip-10-192-11-28): 0,1,2,3,4,5,6,7\n", + " Host 3 gpus 4 Rank 28 (ip-10-192-11-28): 0,1,2,3,4,5,6,7\n", + " Host 3 gpus 5 Rank 29 (ip-10-192-11-28): 0,1,2,3,4,5,6,7\n", + " Host 3 gpus 6 Rank 30 (ip-10-192-11-28): 0,1,2,3,4,5,6,7\n", + " Host 3 gpus 7 Rank 31 (ip-10-192-11-28): 0,1,2,3,4,5,6,7\n", + " Host 4 gpus 0 Rank 32 (ip-10-192-11-124): 0,1,2,3,4,5,6,7\n", + " Host 4 gpus 1 Rank 33 (ip-10-192-11-124): 0,1,2,3,4,5,6,7\n", + " Host 4 gpus 2 Rank 34 (ip-10-192-11-124): 0,1,2,3,4,5,6,7\n", + " Host 4 gpus 3 Rank 35 (ip-10-192-11-124): 0,1,2,3,4,5,6,7\n", + " Host 4 gpus 4 Rank 36 (ip-10-192-11-124): 0,1,2,3,4,5,6,7\n", + " Host 4 gpus 5 Rank 37 (ip-10-192-11-124): 0,1,2,3,4,5,6,7\n", + " Host 4 gpus 6 Rank 38 (ip-10-192-11-124): 0,1,2,3,4,5,6,7\n", + " Host 4 gpus 7 Rank 39 (ip-10-192-11-124): 0,1,2,3,4,5,6,7\n", + " Host 5 gpus 0 Rank 40 (ip-10-192-11-35): 0,1,2,3,4,5,6,7\n", + " Host 5 gpus 1 Rank 41 (ip-10-192-11-35): 0,1,2,3,4,5,6,7\n", + " Host 5 gpus 2 Rank 42 (ip-10-192-11-35): 0,1,2,3,4,5,6,7\n", + " Host 5 gpus 3 Rank 43 (ip-10-192-11-35): 0,1,2,3,4,5,6,7\n", + " Host 5 gpus 4 Rank 44 (ip-10-192-11-35): 0,1,2,3,4,5,6,7\n", + " Host 5 gpus 5 Rank 45 (ip-10-192-11-35): 0,1,2,3,4,5,6,7\n", + " Host 5 gpus 6 Rank 46 (ip-10-192-11-35): 0,1,2,3,4,5,6,7\n", + " Host 5 gpus 7 Rank 47 (ip-10-192-11-35): 0,1,2,3,4,5,6,7\n", + " Host 6 gpus 0 Rank 48 (ip-10-192-11-103): 0,1,2,3,4,5,6,7\n", + " Host 6 gpus 1 Rank 49 (ip-10-192-11-103): 0,1,2,3,4,5,6,7\n", + " Host 6 gpus 2 Rank 50 (ip-10-192-11-103): 0,1,2,3,4,5,6,7\n", + " Host 6 gpus 
3 Rank 51 (ip-10-192-11-103): 0,1,2,3,4,5,6,7\n", + " Host 6 gpus 4 Rank 52 (ip-10-192-11-103): 0,1,2,3,4,5,6,7\n", + " Host 6 gpus 5 Rank 53 (ip-10-192-11-103): 0,1,2,3,4,5,6,7\n", + " Host 6 gpus 6 Rank 54 (ip-10-192-11-103): 0,1,2,3,4,5,6,7\n", + " Host 6 gpus 7 Rank 55 (ip-10-192-11-103): 0,1,2,3,4,5,6,7\n", + " Host 7 gpus 0 Rank 56 (ip-10-192-11-176): 0,1,2,3,4,5,6,7\n", + " Host 7 gpus 1 Rank 57 (ip-10-192-11-176): 0,1,2,3,4,5,6,7\n", + " Host 7 gpus 2 Rank 58 (ip-10-192-11-176): 0,1,2,3,4,5,6,7\n", + " Host 7 gpus 3 Rank 59 (ip-10-192-11-176): 0,1,2,3,4,5,6,7\n", + " Host 7 gpus 4 Rank 60 (ip-10-192-11-176): 0,1,2,3,4,5,6,7\n", + " Host 7 gpus 5 Rank 61 (ip-10-192-11-176): 0,1,2,3,4,5,6,7\n", + " Host 7 gpus 6 Rank 62 (ip-10-192-11-176): 0,1,2,3,4,5,6,7\n", + " Host 7 gpus 7 Rank 63 (ip-10-192-11-176): 0,1,2,3,4,5,6,7\n", + " Host 8 gpus 0 Rank 64 (ip-10-192-11-186): 0,1,2,3,4,5,6,7\n", + " Host 8 gpus 1 Rank 65 (ip-10-192-11-186): 0,1,2,3,4,5,6,7\n", + " Host 8 gpus 2 Rank 66 (ip-10-192-11-186): 0,1,2,3,4,5,6,7\n", + " Host 8 gpus 3 Rank 67 (ip-10-192-11-186): 0,1,2,3,4,5,6,7\n", + " Host 8 gpus 4 Rank 68 (ip-10-192-11-186): 0,1,2,3,4,5,6,7\n", + " Host 8 gpus 5 Rank 69 (ip-10-192-11-186): 0,1,2,3,4,5,6,7\n", + " Host 8 gpus 6 Rank 70 (ip-10-192-11-186): 0,1,2,3,4,5,6,7\n", + " Host 8 gpus 7 Rank 71 (ip-10-192-11-186): 0,1,2,3,4,5,6,7\n", + " Host 9 gpus 0 Rank 72 (ip-10-192-11-147): 0,1,2,3,4,5,6,7\n", + " Host 9 gpus 1 Rank 73 (ip-10-192-11-147): 0,1,2,3,4,5,6,7\n", + " Host 9 gpus 2 Rank 74 (ip-10-192-11-147): 0,1,2,3,4,5,6,7\n", + " Host 9 gpus 3 Rank 75 (ip-10-192-11-147): 0,1,2,3,4,5,6,7\n", + " Host 9 gpus 4 Rank 76 (ip-10-192-11-147): 0,1,2,3,4,5,6,7\n", + " Host 9 gpus 5 Rank 77 (ip-10-192-11-147): 0,1,2,3,4,5,6,7\n", + " Host 9 gpus 6 Rank 78 (ip-10-192-11-147): 0,1,2,3,4,5,6,7\n", + " Host 9 gpus 7 Rank 79 (ip-10-192-11-147): 0,1,2,3,4,5,6,7\n", + " Host 10 gpus 0 Rank 80 (ip-10-192-11-151): 0,1,2,3,4,5,6,7\n", + " Host 10 gpus 1 Rank 81 (ip-10-192-11-151): 0,1,2,3,4,5,6,7\n", + " Host 10 gpus 2 Rank 82 (ip-10-192-11-151): 0,1,2,3,4,5,6,7\n", + " Host 10 gpus 3 Rank 83 (ip-10-192-11-151): 0,1,2,3,4,5,6,7\n", + " Host 10 gpus 4 Rank 84 (ip-10-192-11-151): 0,1,2,3,4,5,6,7\n", + " Host 10 gpus 5 Rank 85 (ip-10-192-11-151): 0,1,2,3,4,5,6,7\n", + " Host 10 gpus 6 Rank 86 (ip-10-192-11-151): 0,1,2,3,4,5,6,7\n", + " Host 10 gpus 7 Rank 87 (ip-10-192-11-151): 0,1,2,3,4,5,6,7\n", + " Host 11 gpus 0 Rank 88 (ip-10-192-11-58): 0,1,2,3,4,5,6,7\n", + " Host 11 gpus 1 Rank 89 (ip-10-192-11-58): 0,1,2,3,4,5,6,7\n", + " Host 11 gpus 2 Rank 90 (ip-10-192-11-58): 0,1,2,3,4,5,6,7\n", + " Host 11 gpus 3 Rank 91 (ip-10-192-11-58): 0,1,2,3,4,5,6,7\n", + " Host 11 gpus 4 Rank 92 (ip-10-192-11-58): 0,1,2,3,4,5,6,7\n", + " Host 11 gpus 5 Rank 93 (ip-10-192-11-58): 0,1,2,3,4,5,6,7\n", + " Host 11 gpus 6 Rank 94 (ip-10-192-11-58): 0,1,2,3,4,5,6,7\n", + " Host 11 gpus 7 Rank 95 (ip-10-192-11-58): 0,1,2,3,4,5,6,7\n", + " Host 12 gpus 0 Rank 96 (ip-10-192-11-9): 0,1,2,3,4,5,6,7\n", + " Host 12 gpus 1 Rank 97 (ip-10-192-11-9): 0,1,2,3,4,5,6,7\n", + " Host 12 gpus 2 Rank 98 (ip-10-192-11-9): 0,1,2,3,4,5,6,7\n", + " Host 12 gpus 3 Rank 99 (ip-10-192-11-9): 0,1,2,3,4,5,6,7\n", + " Host 12 gpus 4 Rank 100 (ip-10-192-11-9): 0,1,2,3,4,5,6,7\n", + " Host 12 gpus 5 Rank 101 (ip-10-192-11-9): 0,1,2,3,4,5,6,7\n", + " Host 12 gpus 6 Rank 102 (ip-10-192-11-9): 0,1,2,3,4,5,6,7\n", + " Host 12 gpus 7 Rank 103 (ip-10-192-11-9): 0,1,2,3,4,5,6,7\n", + " Host 13 gpus 0 Rank 104 (ip-10-192-11-211): 0,1,2,3,4,5,6,7\n", 
+ " Host 13 gpus 1 Rank 105 (ip-10-192-11-211): 0,1,2,3,4,5,6,7\n", + " Host 13 gpus 2 Rank 106 (ip-10-192-11-211): 0,1,2,3,4,5,6,7\n", + " Host 13 gpus 3 Rank 107 (ip-10-192-11-211): 0,1,2,3,4,5,6,7\n", + " Host 13 gpus 4 Rank 108 (ip-10-192-11-211): 0,1,2,3,4,5,6,7\n", + " Host 13 gpus 5 Rank 109 (ip-10-192-11-211): 0,1,2,3,4,5,6,7\n", + " Host 13 gpus 6 Rank 110 (ip-10-192-11-211): 0,1,2,3,4,5,6,7\n", + " Host 13 gpus 7 Rank 111 (ip-10-192-11-211): 0,1,2,3,4,5,6,7\n", + " Host 14 gpus 0 Rank 112 (ip-10-192-11-89): 0,1,2,3,4,5,6,7\n", + " Host 14 gpus 1 Rank 113 (ip-10-192-11-89): 0,1,2,3,4,5,6,7\n", + " Host 14 gpus 2 Rank 114 (ip-10-192-11-89): 0,1,2,3,4,5,6,7\n", + " Host 14 gpus 3 Rank 115 (ip-10-192-11-89): 0,1,2,3,4,5,6,7\n", + " Host 14 gpus 4 Rank 116 (ip-10-192-11-89): 0,1,2,3,4,5,6,7\n", + " Host 14 gpus 5 Rank 117 (ip-10-192-11-89): 0,1,2,3,4,5,6,7\n", + " Host 14 gpus 6 Rank 118 (ip-10-192-11-89): 0,1,2,3,4,5,6,7\n", + " Host 14 gpus 7 Rank 119 (ip-10-192-11-89): 0,1,2,3,4,5,6,7\n", + " Host 15 gpus 0 Rank 120 (ip-10-192-11-207): 0,1,2,3,4,5,6,7\n", + " Host 15 gpus 1 Rank 121 (ip-10-192-11-207): 0,1,2,3,4,5,6,7\n", + " Host 15 gpus 2 Rank 122 (ip-10-192-11-207): 0,1,2,3,4,5,6,7\n", + " Host 15 gpus 3 Rank 123 (ip-10-192-11-207): 0,1,2,3,4,5,6,7\n", + " Host 15 gpus 4 Rank 124 (ip-10-192-11-207): 0,1,2,3,4,5,6,7\n", + " Host 15 gpus 5 Rank 125 (ip-10-192-11-207): 0,1,2,3,4,5,6,7\n", + " Host 15 gpus 6 Rank 126 (ip-10-192-11-207): 0,1,2,3,4,5,6,7\n", + " Host 15 gpus 7 Rank 127 (ip-10-192-11-207): 0,1,2,3,4,5,6,7\n" + ] + } + ], + "source": [ + "# Get an environment variable from all nodes\n", + "results = await env_actor.get_env.call(\"CUDA_VISIBLE_DEVICES\")\n", + "print(\"\\nCUDA_VISIBLE_DEVICES on all nodes:\")\n", + "for result in results:\n", + " if len(result) > 1:\n", + " print(f\" Host {result[0].get('hosts', '?')} gpus {result[0].get('gpus', '?')} Rank {result[1].get('rank', '?')} ({result[1].get('hostname', '?')}): {result[1].get('value', '?')}\")\n", + " else:\n", + " print(f\" Rank {result.get('rank', '?')} ({result.get('hostname', '?')}): {result.get('value', '?')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set Environment Variables on Remote Nodes\n", + "\n", + "Set a custom environment variable on all remote nodes and verify it was set correctly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set a custom environment variable on all nodes\n", + "set_results = await env_actor.set_env.call(\"CUSTOM_VAR\", \"test_value_123\")\n", + "print(\"\\nSetting CUSTOM_VAR on all nodes:\")\n", + "for result in set_results:\n", + " if len(result) > 1:\n", + " print(f\" Rank {result[1]['rank']} ({result[1]['hostname']}): {result[1]['status']} - {result[1]['value']}\")\n", + " else:\n", + " print(f\" Rank {result['rank']} ({result['hostname']}): {result['status']} - {result['value']}\")\n", + "\n", + "# Verify the variable was set by reading it back\n", + "verify_results = await env_actor.get_env.call(\"CUSTOM_VAR\")\n", + "print(\"\\nVerifying CUSTOM_VAR on all nodes:\")\n", + "for result in verify_results:\n", + " if len(result) > 1:\n", + " print(f\" Rank {result[1]['rank']} ({result[1]['hostname']}): {result[1]['value']}\")\n", + " else:\n", + " print(f\" Rank {result['rank']} ({result['hostname']}): {result['value']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List Environment Variables with Prefix\n", + "\n", + "List all environment variables that match a specific prefix (e.g., all CUDA-related or MONARCH-related variables)." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "CUDA-related environment variables on all nodes:\n", + "\n", + " Rank 0 (ip-10-192-11-251) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 1 (ip-10-192-11-251) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 2 (ip-10-192-11-251) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 3 (ip-10-192-11-251) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 4 (ip-10-192-11-251) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 5 (ip-10-192-11-251) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 6 (ip-10-192-11-251) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 7 (ip-10-192-11-251) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 8 (ip-10-192-11-128) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 9 (ip-10-192-11-128) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 10 (ip-10-192-11-128) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 11 (ip-10-192-11-128) - 3 variables:\n", + " 
CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 12 (ip-10-192-11-128) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 13 (ip-10-192-11-128) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 14 (ip-10-192-11-128) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 15 (ip-10-192-11-128) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 16 (ip-10-192-11-77) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 17 (ip-10-192-11-77) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 18 (ip-10-192-11-77) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 19 (ip-10-192-11-77) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 20 (ip-10-192-11-77) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 21 (ip-10-192-11-77) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 22 (ip-10-192-11-77) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 23 (ip-10-192-11-77) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 24 (ip-10-192-11-28) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 25 (ip-10-192-11-28) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 26 (ip-10-192-11-28) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 27 (ip-10-192-11-28) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 28 (ip-10-192-11-28) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 29 (ip-10-192-11-28) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 30 (ip-10-192-11-28) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 31 (ip-10-192-11-28) - 3 variables:\n", + " 
CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 32 (ip-10-192-11-124) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 33 (ip-10-192-11-124) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 34 (ip-10-192-11-124) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 35 (ip-10-192-11-124) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 36 (ip-10-192-11-124) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 37 (ip-10-192-11-124) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 38 (ip-10-192-11-124) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 39 (ip-10-192-11-124) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 40 (ip-10-192-11-35) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 41 (ip-10-192-11-35) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 42 (ip-10-192-11-35) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 43 (ip-10-192-11-35) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 44 (ip-10-192-11-35) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 45 (ip-10-192-11-35) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 46 (ip-10-192-11-35) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 47 (ip-10-192-11-35) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 48 (ip-10-192-11-103) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 49 (ip-10-192-11-103) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 50 (ip-10-192-11-103) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 51 (ip-10-192-11-103) - 3 variables:\n", + " 
CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 52 (ip-10-192-11-103) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 53 (ip-10-192-11-103) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 54 (ip-10-192-11-103) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 55 (ip-10-192-11-103) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 56 (ip-10-192-11-176) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 57 (ip-10-192-11-176) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 58 (ip-10-192-11-176) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 59 (ip-10-192-11-176) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 60 (ip-10-192-11-176) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 61 (ip-10-192-11-176) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 62 (ip-10-192-11-176) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 63 (ip-10-192-11-176) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 64 (ip-10-192-11-186) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 65 (ip-10-192-11-186) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 66 (ip-10-192-11-186) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 67 (ip-10-192-11-186) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 68 (ip-10-192-11-186) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 69 (ip-10-192-11-186) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 70 (ip-10-192-11-186) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 71 (ip-10-192-11-186) - 3 variables:\n", + " 
CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 72 (ip-10-192-11-147) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 73 (ip-10-192-11-147) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 74 (ip-10-192-11-147) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 75 (ip-10-192-11-147) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 76 (ip-10-192-11-147) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 77 (ip-10-192-11-147) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 78 (ip-10-192-11-147) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 79 (ip-10-192-11-147) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 80 (ip-10-192-11-151) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 81 (ip-10-192-11-151) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 82 (ip-10-192-11-151) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 83 (ip-10-192-11-151) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 84 (ip-10-192-11-151) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 85 (ip-10-192-11-151) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 86 (ip-10-192-11-151) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 87 (ip-10-192-11-151) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 88 (ip-10-192-11-58) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 89 (ip-10-192-11-58) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 90 (ip-10-192-11-58) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 91 (ip-10-192-11-58) - 3 variables:\n", + " 
CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 92 (ip-10-192-11-58) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 93 (ip-10-192-11-58) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 94 (ip-10-192-11-58) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 95 (ip-10-192-11-58) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 96 (ip-10-192-11-9) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 97 (ip-10-192-11-9) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 98 (ip-10-192-11-9) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 99 (ip-10-192-11-9) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 100 (ip-10-192-11-9) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 101 (ip-10-192-11-9) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 102 (ip-10-192-11-9) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 103 (ip-10-192-11-9) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 104 (ip-10-192-11-211) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 105 (ip-10-192-11-211) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 106 (ip-10-192-11-211) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 107 (ip-10-192-11-211) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 108 (ip-10-192-11-211) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 109 (ip-10-192-11-211) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 110 (ip-10-192-11-211) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 111 (ip-10-192-11-211) - 3 variables:\n", + " 
CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 112 (ip-10-192-11-89) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 113 (ip-10-192-11-89) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 114 (ip-10-192-11-89) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 115 (ip-10-192-11-89) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 116 (ip-10-192-11-89) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 117 (ip-10-192-11-89) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 118 (ip-10-192-11-89) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 119 (ip-10-192-11-89) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 120 (ip-10-192-11-207) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 121 (ip-10-192-11-207) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 122 (ip-10-192-11-207) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 123 (ip-10-192-11-207) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 124 (ip-10-192-11-207) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 125 (ip-10-192-11-207) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 126 (ip-10-192-11-207) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n", + "\n", + " Rank 127 (ip-10-192-11-207) - 3 variables:\n", + " CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda\n", + " CUDA_VERSION=12.6.3\n", + " CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7\n" + ] + } + ], + "source": [ + "# List all environment variables starting with \"CUDA\"\n", + "list_results = await env_actor.list_env_vars.call(\"CUDA\")\n", + "print(\"\\nCUDA-related environment variables on all nodes:\")\n", + "for result in list_results:\n", + " if len(result) > 1:\n", + " print(f\"\\n Rank {result[1]['rank']} ({result[1]['hostname']}) - {result[1]['count']} variables:\")\n", + " for var_name, var_value in result[1]['matching_vars'].items():\n", + " print(f\" {var_name}={var_value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + 
"\n", + "## Workspace Synchronization with `sync_workspace`\n", + "\n", + "When working with distributed training, you often need to modify configuration files, training scripts, or other code locally and sync those changes to remote worker nodes without restarting the entire job. Monarch's `proc_mesh.sync_workspace()` enables this workflow.\n", + "\n", + "### How it works:\n", + "\n", + "1. **Make changes locally** - Edit files in your local workspace (e.g., configuration files, training scripts)\n", + "2. **Call `sync_workspace()`** - Synchronize changes to all remote worker nodes\n", + "3. **Continue execution** - The updated files are immediately available on all nodes\n", + "\n", + "This is particularly useful for:\n", + "- Tweaking hyperparameters in configuration files\n", + "- Updating training schedules\n", + "- Modifying data processing logic\n", + "- Hot-reloading code changes without job restart\n", + "\n", + "Let's see a practical example using TorchTitan training configurations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Actor to Check File Contents\n", + "\n", + "First, create an actor that can read and verify file contents on remote nodes. This will help us confirm that files are properly synchronized across the cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "class FileCheckerActor(Actor):\n", + " \"\"\"Actor to read and verify file contents on remote nodes.\"\"\"\n", + "\n", + " def __init__(self):\n", + " self.rank = current_rank().rank\n", + " self.hostname = socket.gethostname()\n", + "\n", + " @endpoint\n", + " def read_file(self, file_path: str) -> dict:\n", + " \"\"\"Read a file and return its contents.\"\"\"\n", + " try:\n", + " with open(file_path, 'r') as f:\n", + " content = f.read()\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"file_path\": file_path,\n", + " \"content\": content,\n", + " \"exists\": True,\n", + " \"size\": len(content)\n", + " }\n", + " except FileNotFoundError:\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"file_path\": file_path,\n", + " \"exists\": False,\n", + " \"error\": \"File not found\"\n", + " }\n", + " except Exception as e:\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"file_path\": file_path,\n", + " \"exists\": False,\n", + " \"error\": str(e)\n", + " }\n", + "\n", + " @endpoint\n", + " def file_exists(self, file_path: str) -> dict:\n", + " \"\"\"Check if a file exists on the remote node.\"\"\"\n", + " exists = os.path.exists(file_path)\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"file_path\": file_path,\n", + " \"exists\": exists\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Spawn File Checker Actor\n", + "\n", + "Spawn the file checker actor across all nodes to verify file synchronization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FileCheckerActor spawned across all nodes\n" + ] + } + ], + "source": [ + "# Spawn the file checker actor\n", + "file_checker = proc_mesh.spawn(\"file_checker\", FileCheckerActor)\n", + "print(\"FileCheckerActor spawned across all nodes\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a Local Configuration File\n", + "\n", + "Create a local training configuration file that we'll later modify and sync to worker nodes. This simulates a common workflow where you want to tweak hyperparameters or training settings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a local workspace directory for our custom config\n", + "local_workspace = \"/teamspace/studios/this_studio/monarch_sync_example\"\n", + "os.makedirs(local_workspace, exist_ok=True)\n", + "\n", + "# Create a custom training configuration file\n", + "config_file_name = \"custom_training_config.toml\"\n", + "local_config_path = os.path.join(local_workspace, config_file_name)\n", + "\n", + "# Write initial configuration\n", + "with open(local_config_path, 'w') as f:\n", + " f.write(\"\"\"# TorchTitan Custom Training Configuration\n", + "# This file demonstrates workspace synchronization\n", + "\n", + "[training]\n", + "batch_size = 32\n", + "learning_rate = 0.001\n", + "max_steps = 100\n", + "warmup_steps = 10\n", + "\n", + "[model]\n", + "model_type = \"llama3_8b\"\n", + "seq_len = 1024\n", + "\n", + "[optimizer]\n", + "optimizer_type = \"AdamW\"\n", + "weight_decay = 0.01\n", + "\"\"\")\n", + "\n", + "print(f\"Created local config file: {local_config_path}\")\n", + "with open(local_config_path, 'r') as f:\n", + " print(f\"\\nInitial configuration:\\n{f.read()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup Workspace and Perform Initial Sync\n", + "\n", + "Create a Monarch `Workspace` object and perform the initial synchronization to all remote worker nodes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from monarch.tools.config.workspace import Workspace\n", + "from pathlib import Path\n", + "\n", + "# Create a Workspace object pointing to our local directory\n", + "workspace = Workspace(dirs=[Path(local_workspace)])\n", + "\n", + "print(f\"Workspace configured: {workspace.dirs}\")\n", + "print(f\"\\nSyncing workspace to remote nodes...\")\n", + "# Perform initial sync\n", + "await proc_mesh.sync_workspace(workspace=workspace, conda=False, auto_reload=False)\n", + "\n", + "print(\"Initial workspace sync completed!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Verify File on Remote Nodes\n", + "\n", + "Check that the configuration file was successfully synced to all remote worker nodes by reading it from each node." 
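, + "\n", + "Once the verification cell below confirms the file landed on every node, a typical iteration is to edit the local file and re-run the same sync call shown above. A minimal sketch (assuming the `workspace`, `proc_mesh`, and `local_config_path` objects from the earlier cells are still alive; the appended comment is just an illustrative edit):\n", + "\n", + "```python\n", + "# Tweak the local config (an illustrative edit; here we just append a comment)\n", + "with open(local_config_path, 'a') as f:\n", + "    f.write(\"\\n# tweaked locally after the initial sync\\n\")\n", + "\n", + "# Re-run the same sync call so every worker node picks up the change\n", + "await proc_mesh.sync_workspace(workspace=workspace, conda=False, auto_reload=False)\n", + "```"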
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Construct the remote file path (files are synced to WORKSPACE_DIR)\n", + "remote_workspace_root = os.environ.get(\"WORKSPACE_DIR\", \"/workspace\")\n", + "remote_config_path = os.path.join(remote_workspace_root, \"monarch_sync_example\", config_file_name)\n", + "\n", + "print(f\"Checking file on remote nodes: {remote_config_path}\\n\")\n", + "\n", + "# Check file existence on all nodes\n", + "exists_results = await file_checker.file_exists.call(remote_config_path)\n", + "for result in exists_results:\n", + "    status = \"EXISTS\" if result['exists'] else \"NOT FOUND\"\n", + "    print(f\"  Rank {result['rank']} ({result['hostname']}): {status}\")\n", + "\n", + "# Read file content from rank 0 to verify\n", + "print(f\"\\nReading config from rank 0:\")\n", + "read_results = await file_checker.read_file.call(remote_config_path)\n", + "if read_results[0]['exists']:\n", + "    print(f\"\\n{read_results[0]['content']}\")\n", + "else:\n", + "    print(f\"Error: {read_results[0].get('error', 'Unknown error')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Debugging with Breakpoints in Monarch\n", + "\n", + "Monarch supports interactive debugging of distributed actors using Python's built-in `pdb` debugger. You can set breakpoints in your actors, attach to specific ranks, and inspect their state during execution.\n", + "\n", + "### How to Debug:\n", + "\n", + "1. **Add breakpoints** to your actor endpoints using `breakpoint()`\n", + "2. **Run your training** as usual - execution will pause when breakpoints are hit\n", + "3. **Open a separate terminal** and run: `monarch debug`\n", + "4. **Use debugger commands**:\n", + "   - `list` - Show all active breakpoints across ranks\n", + "   - `attach <actor_name> <rank>` - Attach to a specific actor/rank for interactive debugging\n", + "   - `cast <actor_name> ranks(<ranks>) <pdb_command>` - Send pdb commands to multiple ranks\n", + "   - `continue` - Resume execution\n", + "\n", + "Let's create a debugging example using a TorchTitan trainer with breakpoints." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define TitanTrainerDebug Actor with Breakpoints\n", + "\n", + "Create a TorchTitan trainer actor with breakpoints at key stages. This allows you to inspect the training state, configuration, and execution flow interactively."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class TitanTrainerDebug(Actor):\n", + " \"\"\"TorchTitan Trainer Actor with debugging breakpoints.\"\"\"\n", + "\n", + " def __init__(self, job_config: JobConfig):\n", + " self.rank = current_rank().rank\n", + " self.job_config = job_config\n", + " self.trainer: Optional[Trainer] = None\n", + "\n", + " def _rprint(self, msg):\n", + " \"\"\"Helper method to print with rank information.\"\"\"\n", + " print(f\"{self.rank=} {msg}\")\n", + "\n", + " @endpoint\n", + " def init(self):\n", + " logging.getLogger().addHandler(logging.StreamHandler(sys.stderr))\n", + " self._rprint(f\"Initializing debug actor: {current_rank()=} {socket.gethostname()=}\")\n", + "\n", + " # Breakpoint 1: After initialization\n", + " breakpoint() # Debug: Inspect actor initialization state\n", + "\n", + " @endpoint\n", + " def setup_trainer(self):\n", + " \"\"\"Setup the trainer with a breakpoint to inspect configuration.\"\"\"\n", + " logger.info(f\"Setting up trainer on rank {self.rank}\")\n", + " config = self.job_config\n", + "\n", + " # Breakpoint 2: Before trainer creation\n", + " if self.rank == 0: # Only break on rank 0 for simplicity\n", + " breakpoint() # Debug: Inspect job config before trainer creation\n", + "\n", + " self.trainer = Trainer(config)\n", + " self._rprint(\"Trainer setup complete\")\n", + "\n", + " @endpoint\n", + " def train_step(self, num_steps: int = 5):\n", + " \"\"\"Run a few training steps with breakpoints.\"\"\"\n", + " if not self.trainer:\n", + " raise RuntimeError(\"Trainer not initialized. Call setup_trainer first.\")\n", + "\n", + " logger.info(f\"Starting training for {num_steps} steps on rank {self.rank}\")\n", + "\n", + " # Breakpoint 3: Before training starts\n", + " if self.rank == 0:\n", + " breakpoint() # Debug: Inspect trainer state before training\n", + "\n", + " # In a real scenario, you'd call trainer.train()\n", + " # For debugging purposes, we'll just simulate a few steps\n", + " for step in range(num_steps):\n", + " if step == 2 and self.rank == 0: # Break mid-training on rank 0\n", + " breakpoint() # Debug: Inspect mid-training state\n", + "\n", + " self._rprint(f\"Processing step {step + 1}/{num_steps}\")\n", + "\n", + " self._rprint(f\"Completed {num_steps} training steps\")\n", + "\n", + " @endpoint\n", + " def cleanup(self):\n", + " \"\"\"Cleanup resources.\"\"\"\n", + " logger.info(f\"Cleaning up trainer on rank {self.rank}\")\n", + "\n", + " if self.trainer:\n", + " self.trainer.close()\n", + "\n", + " if torch.distributed.is_initialized():\n", + " torch.distributed.destroy_process_group()\n", + " logger.info(\"Process group destroyed.\")\n", + "\n", + " self._rprint(\"Cleanup complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Spawn Debug Trainer Actor\n", + "\n", + "Spawn the debug trainer actor across the process mesh. When you run the following cells, execution will pause at breakpoints, allowing you to debug interactively." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Spawn the debug trainer actor\n", + "debug_trainer = proc_mesh.spawn(\"debug_trainer\", TitanTrainerDebug, config)\n", + "print(\"Debug trainer actor spawned across all nodes\")\n", + "print(\"When breakpoints are hit, run 'monarch debug' in a separate terminal\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run Debug Training Session\n", + "\n", + "Execute the training endpoints. When breakpoints are hit:\n", + "1. Open a separate terminal\n", + "2. Run `monarch debug`\n", + "3. Use `list` to see all active breakpoints\n", + "4. Use `attach debug_trainer 0` to attach to rank 0\n", + "5. Use standard pdb commands (`n`, `s`, `p `, `l`, etc.)\n", + "6. Use `continue` to resume execution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize actors (will hit first breakpoint)\n", + "await debug_trainer.init.call()\n", + "\n", + "# Setup trainer (will hit second breakpoint on rank 0)\n", + "await debug_trainer.setup_trainer.call()\n", + "\n", + "# Run training steps (will hit breakpoints during training)\n", + "await debug_trainer.train_step.call(num_steps=5)\n", + "\n", + "# Cleanup\n", + "await debug_trainer.cleanup.call()\n", + "\n", + "print(\"Debug training session completed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example Debugger Commands\n", + "\n", + "Once in the Monarch debugger, try these commands:\n", + "\n", + "```bash\n", + "# List all active breakpoints\n", + "monarch_dbg> list\n", + "\n", + "# Attach to rank 0 for interactive debugging\n", + "monarch_dbg> attach debug_trainer 0\n", + "\n", + "# Standard pdb commands when attached:\n", + "(Pdb) n # Next line\n", + "(Pdb) s # Step into function\n", + "(Pdb) p self.rank # Print variable\n", + "(Pdb) l # List source code\n", + "(Pdb) c # Continue execution\n", + "\n", + "# Cast commands to multiple ranks (without attaching)\n", + "monarch_dbg> cast debug_trainer ranks(0,1) n\n", + "monarch_dbg> cast debug_trainer ranks(0:4) c\n", + "\n", + "# Continue all breakpoints\n", + "monarch_dbg> continue\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Cleanup and Stop Process Mesh\n", + "\n", + "Gracefully stop the Monarch process mesh, cleaning up all distributed resources and shutting down the actors across all nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await proc_mesh.stop()" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/lightning/monarch_titan_mmt.ipynb b/examples/lightning/monarch_titan_mmt.ipynb new file mode 100644 index 000000000..09cfb574e --- /dev/null +++ b/examples/lightning/monarch_titan_mmt.ipynb @@ -0,0 +1,811 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from lightning_sdk import Machine, MMT, Studio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10.192.12.177\n" + ] + } + ], + "source": [ + "from utils.master_node import MasterNodeServer\n", + "private_master_host_ip_address = MasterNodeServer.get_master_ip()\n", + "public_master_host_ip_address = MasterNodeServer.get_master_public_ip_curl()\n", + "public_master_host_ip_address_services = MasterNodeServer.get_master_public_ip()\n", + "print(f\"private_master_host_ip_address = {private_master_host_ip_address}\")\n", + "print(f\"public_master_host_ip_address = {public_master_host_ip_address}\")\n", + "print(f\"public_master_host_ip_address = {public_master_host_ip_address_services}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration\n", + "NUM_NODES = 2\n", + "NUM_GPUS = 8\n", + "TEAMSPACE = \"general\" # Replace with your teamspace\n", + "USER = \"meta-ai\" # Replace with your username\n", + "MONARCH_DEFAULT_PORT = 26600 # Monarch default port\n", + "HTTP_SERVER_PORT = MONARCH_DEFAULT_PORT # 8080 # HTTP Server PORT for IP registration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def launch_mmt_job(num_nodes=2, teamspace=\"my-teamspace\", user=\"my-user\"):\n", + " \"\"\"\n", + " Launch a multi-machine training job using Lightning SDK's MMT API.\n", + " \"\"\"\n", + "\n", + " studio = Studio()\n", + "\n", + " # Install the MMT plugin befor running the actual job\n", + " studio.install_plugin(\"multi-machine-training\")\n", + "\n", + " print(f\"Launching MMT job with {num_nodes} nodes...\")\n", + "\n", + " # Machine with T4 GPUs\n", + " # machine_type = getattr(Machine, f\"T4_X_{NUM_GPUS}\")\n", + "\n", + " # Machine with L40S GPUs\n", + " machine_type = getattr(Machine, f\"L40S_X_{NUM_GPUS}\")\n", + "\n", + " job = MMT.run(\n", + " command=f\"python example/utils/worker_node.py {public_master_host_ip_address} {HTTP_SERVER_PORT} && sleep 10 && process_allocator\",\n", + " name=\"Multi-Node-Monarch-Titan\",\n", + " # machine=Machine.T4_X_4, # Use GPU machines for training\n", + " machine=machine_type,\n", + " studio=studio,\n", + " num_machines=num_nodes,\n", + " env={\n", + " \"CUDA_VISIBLE_DEVICES\": \"0,1,2,3,4,5,6,7\", # Make all GPUs visible # TODO: Should make this one dynamic\n", + " },\n", + " )\n", + "\n", + " print(f\"Job started with ID: {job.name}\")\n", + " print(f\"Job status: {job.status}\")\n", + "\n", + " # Monitor job status\n", + " return job, studio" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Launching MMT job with 2 nodes...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": 
[ + "INFO - Multi-Machine Job was successfully launched. View it at https://lightning.ai/meta-ai/general/jobs/Multi-Node-Monarch-Titan?app_id=mmt\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job started with ID: Multi-Node-Monarch-Titan\n", + "Job status: Pending\n", + "Job launched. You can monitor it using: job.status\n", + "To stop the job: job.stop()\n", + "To clean up: studio.stop()\n" + ] + } + ], + "source": [ + "# Launch the job\n", + "job, studio = launch_mmt_job(\n", + " num_nodes=NUM_NODES, teamspace=TEAMSPACE, user=USER\n", + ")\n", + "\n", + "print(f\"Job launched. You can monitor it using: job.status\")\n", + "print(f\"To stop the job: job.stop()\")\n", + "print(f\"To clean up: studio.stop()\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Master node IP: 10.192.12.177\n", + "Expecting 2 worker nodes to register...\n", + "Starting server on port 8080...\n", + "Waiting for workers... (0/2 registered) - Elapsed: 0s\n", + "Server started on 10.192.12.177:8080\n", + "Waiting for workers... (0/2 registered) - Elapsed: 30s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 60s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 90s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 120s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 150s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 180s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 210s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 240s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 270s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 300s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 330s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 360s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 390s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 420s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 450s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 480s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 510s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 540s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 570s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 600s\n", + "Waiting for workers... 
(0/2 registered) - Elapsed: 630s\n", + "Registered worker node: 10.192.12.52 (1/2)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "10.192.12.52 - - [19/Sep/2025 03:51:42] \"POST /register HTTP/1.1\" 200 -\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Registered worker node: 10.192.12.72 (2/2)\n", + "All worker nodes registered!\n", + "Registration server stopped\n", + "Final registered worker nodes: ['10.192.12.52', '10.192.12.72']\n", + "Worker IPs saved to /tmp/worker_nodes.txt\n", + "Cluster info saved to /tmp/cluster_info.json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "10.192.12.72 - - [19/Sep/2025 03:51:44] \"POST /register HTTP/1.1\" 200 -\n" + ] + } + ], + "source": [ + "from utils.master_node import run_master_server\n", + "cluster_info = run_master_server(expected_workers=NUM_NODES, port=HTTP_SERVER_PORT)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted IP addresses:\n", + "10.192.12.52\n", + "10.192.12.72\n", + "\n", + "IP set: {'10.192.12.72', '10.192.12.52'}\n", + "['10.192.12.72', '10.192.12.52']\n" + ] + } + ], + "source": [ + "from utils.ip_utils import extract_ips_simple\n", + "worker_nodes_ip_file_path = \"/tmp/worker_nodes.txt\"\n", + "ip_addresses_set = extract_ips_simple(worker_nodes_ip_file_path)\n", + "ip_addresses_list = list(ip_addresses_set)\n", + "print(ip_addresses_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tcp!10.192.12.72:26600 tcp!10.192.12.52:26600\n" + ] + } + ], + "source": [ + "tcp_addresses = [f\"tcp!{ip}:26600\" for ip in ip_addresses_set]\n", + "\n", + "# # Or if you want to test it locally first on the local machine uncomment line below:\n", + "# tcp_addresses = [\"tcp![::]:26600\"]\n", + "# # For the local host machine only, please make sure that NUM_NODES is equal to 1;\n", + "# NUM_NODES = 1\n", + "\n", + "print(*tcp_addresses)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example 1 - Run TorchTitan using Monarch for Llama 3 - 8B" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from monarch._src.actor.allocator import RemoteAllocator, StaticRemoteAllocInitializer\n", + "from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec\n", + "from monarch.actor import ProcMesh\n", + "\n", + "allocator = RemoteAllocator(\n", + " world_id=\"foo\",\n", + " initializer=StaticRemoteAllocInitializer(*tcp_addresses),\n", + " )\n", + "\n", + "alloc = allocator.allocate(\n", + " AllocSpec(AllocConstraints(), hosts=NUM_NODES, gpus=NUM_GPUS)\n", + " )\n", + "\n", + "proc_mesh = await ProcMesh.from_alloc(alloc)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "monarch-alisol-hosts2-gpus8\n" + ] + } + ], + "source": [ + "import getpass\n", + "def get_job_name(num_hosts: int, num_gpus_per_host: int):\n", + " return f\"monarch-{getpass.getuser()}-hosts{num_hosts}-gpus{num_gpus_per_host}\"\n", + "print(get_job_name(num_hosts=NUM_NODES, num_gpus_per_host=NUM_GPUS))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import 
sys\n", + "import logging\n", + "from monarch.actor import ProcMesh, Actor, endpoint, current_rank\n", + "import socket\n", + "from torchtitan.tools.logging import init_logger, logger\n", + "from torchtitan.train import Trainer\n", + "from typing import Optional\n", + "import torch\n", + "from torchtitan.config import JobConfig\n", + "\n", + "\n", + "class TitanTrainerWrapper(Actor):\n", + " def __init__(self, job_config: JobConfig):\n", + " self.rank = current_rank().rank\n", + " self.job_config = job_config\n", + "\n", + " def _rprint(self, msg):\n", + " \"\"\"Helper method to print with rank information.\"\"\"\n", + " print(f\"{self.rank=} {msg}\")\n", + "\n", + " @endpoint\n", + " def init(self):\n", + " logging.getLogger().addHandler(logging.StreamHandler(sys.stderr))\n", + " print(f\"Initializing actor: {self.rank} {current_rank()=} {socket.gethostname()=}\")\n", + "\n", + "\n", + " @endpoint\n", + " def train(self):\n", + " logger.info(\"Starting training\")\n", + " config = self.job_config\n", + " trainer: Optional[Trainer] = None\n", + "\n", + " try:\n", + " trainer = Trainer(config)\n", + " trainer.train()\n", + "\n", + " if config.checkpoint.create_seed_checkpoint:\n", + " assert (\n", + " int(os.environ[\"WORLD_SIZE\"]) == 1\n", + " ), \"Must create seed checkpoint using a single device, to disable sharding.\"\n", + " assert (\n", + " # config.checkpoint.enable_checkpoint\n", + " config.checkpoint.enable\n", + " ), \"Must enable checkpointing when creating a seed checkpoint.\"\n", + " trainer.checkpointer.save(curr_step=0, )\n", + " logger.info(\"Created seed checkpoint\")\n", + " else:\n", + " trainer.train()\n", + " finally:\n", + " if trainer:\n", + " trainer.close()\n", + "\n", + " if torch.distributed.is_initialized():\n", + " torch.distributed.destroy_process_group()\n", + " logger.info(\"Process group destroyed.\")\n", + " print(\"Done training\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.xpu import stream\n", + "from torchtitan.config import ConfigManager, JobConfig\n", + "from monarch.utils import setup_env_for_distributed\n", + "\n", + "async def async_main(job_config: JobConfig):\n", + " torch.use_deterministic_algorithms(True)\n", + " job_name = get_job_name(NUM_NODES, NUM_GPUS)\n", + "\n", + " await setup_env_for_distributed(proc_mesh)\n", + "\n", + " await proc_mesh.logging_option(stream_to_client=True, aggregate_window_sec=3)\n", + "\n", + " print(job_config)\n", + " print(f\"Spawning meshes on {job_name}\")\n", + "\n", + " trainer_actor = await proc_mesh.spawn(\"trainer_actor\", TitanTrainerWrapper, job_config)\n", + " await trainer_actor.init.call()\n", + " await trainer_actor.train.call()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[titan] 2025-09-19 03:54:41,922 - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. 
Setting hf_assets_path to tokenizer_path temporarily.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JobConfig(job=Job(config_file='/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml', dump_folder='/teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts2-gpus8', description='Llama 3 8B training', print_args=False), profiling=Profiling(enable_profiling=True, save_traces_folder='profile_trace', profile_freq=100, enable_memory_snapshot=False, save_memory_snapshot_folder='memory_snapshot'), metrics=Metrics(log_freq=1, enable_tensorboard=True, disable_color_printing=False, save_tb_folder='tb', save_for_all_ranks=False, enable_wandb=True), model=Model(name='llama3', flavor='8B', hf_assets_path='/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B', tokenizer_path='/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B', converters=[], print_after_conversion=False), optimizer=Optimizer(name='AdamW', lr=0.0003, beta1=0.9, beta2=0.95, eps=1e-08, weight_decay=0.1, implementation='fused', early_step_in_backward=False), lr_scheduler=LRScheduler(warmup_steps=200, decay_ratio=None, decay_type='linear', min_lr_factor=0.0), training=Training(dataset='c4_test', dataset_path='/teamspace/studios/this_studio/torchtitan/tests/assets/c4_test', local_batch_size=1, global_batch_size=-1, seq_len=2048, max_norm=1.0, steps=25, enable_cpu_offload=False, mixed_precision_param='bfloat16', mixed_precision_reduce='float32', gc_freq=50, gc_debug=False, seed=None, deterministic=False), parallelism=Parallelism(data_parallel_replicate_degree=1, enable_compiled_autograd=False, data_parallel_shard_degree=-1, fsdp_reshard_after_forward='default', tensor_parallel_degree=1, disable_loss_parallel=False, enable_async_tensor_parallel=False, pipeline_parallel_degree=1, pipeline_parallel_split_points=[], module_fqns_per_model_part=None, pipeline_parallel_first_stage_less_layers=1, pipeline_parallel_last_stage_less_layers=1, pipeline_parallel_layers_per_stage=None, pipeline_parallel_schedule='1F1B', pipeline_parallel_schedule_csv='', pipeline_parallel_microbatch_size=1, context_parallel_degree=1, context_parallel_rotate_method='allgather', expert_parallel_degree=1, expert_tensor_parallel_degree=1), checkpoint=Checkpoint(enable=False, folder='checkpoint', interval=500, initial_load_path=None, initial_load_model_only=True, initial_load_in_hf=False, last_save_model_only=True, last_save_in_hf=False, export_dtype='float32', async_mode='disabled', keep_latest_k=10, load_step=-1, exclude_from_loading=[], enable_first_step_checkpoint=False, create_seed_checkpoint=False), activation_checkpoint=ActivationCheckpoint(mode='selective', selective_ac_option='op', per_op_sac_force_recompute_mm_shapes_by_fqns=['moe.router.gate'], early_stop=False), compile=Compile(enable=False, components=['model', 'loss']), float8=Float8(enable_fsdp_float8_all_gather=False, precompute_float8_dynamic_scale_for_fsdp=False, recipe_name=None, filter_fqns=['output'], emulate=False, moe_fqns_prototype=[]), mx=MX(mxfp8_dim1_cast_kernel_choice='triton', recipe_name='mxfp8_cublas', filter_fqns=['output'], moe_fqns_prototype=[]), comm=Comm(init_timeout_seconds=300, train_timeout_seconds=100, trace_buf_size=20000, save_traces_folder='comm_traces'), memory_estimation=MemoryEstimation(enable=False, disable_fake_mode=False), fault_tolerance=FaultTolerance(enable=False, process_group='gloo', process_group_timeout_ms=10000, replica_id=0, group_size=0, min_replica_size=1, 
semi_sync_method=None), experimental=Experimental(custom_import='', custom_args_module=''), validation=Validation(enable=False, dataset='c4_validation', dataset_path=None, local_batch_size=8, seq_len=2048, freq=500, steps=1200))\n", + "Spawning meshes on monarch-alisol-hosts2-gpus8\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:54:11) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m Initializing actor: 7 current_rank()={'hosts': 0/2, 'gpus': 7/8} socket.gethostname()='ip-10-192-12-72'\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:54:46) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:54:11) >>>\u001b[0m\n", + "\u001b[33m[16 similar log lines]\u001b[0m Starting training\n", + "\u001b[33m[16 similar log lines]\u001b[0m Starting job: Llama 3 8B training\n", + "\u001b[33m[15 similar log lines]\u001b[0m [W919 03:54:48.025441173 socket.cpp:767] [c10d] The client socket has failed to connect to [ip-10-192-12-72]:50717 (errno: 22 - Invalid argument).\n", + "\u001b[33m[16 similar log lines]\u001b[0m Building 1-D device mesh with ['dp_shard'], [16]\n", + "\u001b[33m[16 similar log lines]\u001b[0m [GC] Initial GC collection 0.00 seconds\n", + "\u001b[33m[16 similar log lines]\u001b[0m Loading tokenizer from tokenizer.json\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:54:49) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:54:46) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m Initializing actor: 5 current_rank()={'hosts': 0/2, 'gpus': 5/8} socket.gethostname()='ip-10-192-12-72'\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:54:49) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:54:49) >>>\u001b[0m\n", + "\u001b[33m[16 similar log lines]\u001b[0m Preparing c4_test dataset from /teamspace/studios/this_studio/torchtitan/tests/assets/c4_test\n", + "\u001b[33m[16 similar log lines]\u001b[0m Building llama3 8B with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=1024, ffn_dim_multiplier=1.3, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)\n", + "\u001b[33m[16 similar log lines]\u001b[0m CUDA capacity: NVIDIA L40S with 44.64GiB memory\n", + "\u001b[33m[31 similar log lines]\u001b[0m Peak flops undefined for: NVIDIA L40S, fallback to A100\n", + "\u001b[33m[16 similar log lines]\u001b[0m \u001b[34mModel llama3 8B \u001b[31msize: 8,030,261,248 total parameters\u001b[39m\n", + "\u001b[33m[16 similar log lines]\u001b[0m Applied selective activation checkpointing to the model\n", + "\u001b[33m[16 similar log lines]\u001b[0m Applied FSDP to the model\n", + "\u001b[33m[15 similar log lines]\u001b[0m Peak FLOPS used for computing MFU: 3.120e+14\n", + "\u001b[33m[15 similar log lines]\u001b[0m CUDA memory usage for model: 1.90GiB(4.25%)\n", + "\u001b[33m[15 similar log lines]\u001b[0m Warmup steps (200) exceed total training steps (25). Adjusting warmup steps to 25.\n", + "\u001b[33m[15 similar log lines]\u001b[0m model.safetensors.index.json not found at hf_assets_path: /teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B/model.safetensors.index.json. 
Defaulting to saving a single safetensors file if checkpoint is saved in HF format\n", + "\u001b[33m[15 similar log lines]\u001b[0m Mixed precision training is handled by fully_shard\n", + "\u001b[33m[15 similar log lines]\u001b[0m Trainer is initialized with local batch size 1, global batch size 16, gradient accumulation steps 1, sequence length 2048, total steps 25 (warmup 200)\n", + "\u001b[33m[15 similar log lines]\u001b[0m Training starts at step 1\n", + "\u001b[33m[15 similar log lines]\u001b[0m Profiling active. Traces will be saved at /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts2-gpus8/profile_trace\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Currently logged in as: a-shamsoshoara (a-shamsoshoara-m) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Tracking run with wandb version 0.21.3\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run data is saved locally in /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts2-gpus8/tb/20250919-0354/wandb/run-20250919_035451-0p5lifho\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run `wandb offline` to turn off syncing.\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Syncing run graceful-river-27\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: ⭐️ View project at https://wandb.ai/a-shamsoshoara-m/torchtitan\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: 🚀 View run at https://wandb.ai/a-shamsoshoara-m/torchtitan/runs/0p5lifho\n", + "\u001b[33m[1 similar log lines]\u001b[0m WandB logging enabled\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:54:52) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:54:52) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m Peak flops undefined for: NVIDIA L40S, fallback to A100\n", + "\u001b[33m[1 similar log lines]\u001b[0m Peak FLOPS used for computing MFU: 3.120e+14\n", + "\u001b[33m[1 similar log lines]\u001b[0m CUDA memory usage for model: 1.90GiB(4.25%)\n", + "\u001b[33m[1 similar log lines]\u001b[0m Warmup steps (200) exceed total training steps (25). Adjusting warmup steps to 25.\n", + "\u001b[33m[1 similar log lines]\u001b[0m model.safetensors.index.json not found at hf_assets_path: /teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B/model.safetensors.index.json. Defaulting to saving a single safetensors file if checkpoint is saved in HF format\n", + "\u001b[33m[1 similar log lines]\u001b[0m Mixed precision training is handled by fully_shard\n", + "\u001b[33m[1 similar log lines]\u001b[0m Trainer is initialized with local batch size 1, global batch size 16, gradient accumulation steps 1, sequence length 2048, total steps 25 (warmup 200)\n", + "\u001b[33m[1 similar log lines]\u001b[0m Training starts at step 1\n", + "\u001b[33m[1 similar log lines]\u001b[0m Profiling active. 
Traces will be saved at /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts2-gpus8/profile_trace\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:54:55) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:54:55) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 1 \u001b[32mloss: 12.2286 \u001b[38;2;180;60;0mgrad_norm: 3.8083 \u001b[38;2;54;234;195mmemory: 17.41GiB(39.00%) \u001b[34mtps: 98 \u001b[36mtflops: 4.72 \u001b[35mmfu: 1.51%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:55:11) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:55:11) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 1 \u001b[32mloss: 12.2286 \u001b[38;2;180;60;0mgrad_norm: 3.8083 \u001b[38;2;54;234;195mmemory: 17.41GiB(39.00%) \u001b[34mtps: 98 \u001b[36mtflops: 4.72 \u001b[35mmfu: 1.51%\u001b[39m\n", + "\u001b[33m[16 similar log lines]\u001b[0m Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:55:14) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:55:14) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 2 \u001b[32mloss: 11.4166 \u001b[38;2;180;60;0mgrad_norm: 5.0591 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 118 \u001b[36mtflops: 5.71 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:55:28) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:55:28) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 2 \u001b[32mloss: 11.4166 \u001b[38;2;180;60;0mgrad_norm: 5.0591 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 118 \u001b[36mtflops: 5.71 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:55:31) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:55:31) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 3 \u001b[32mloss: 12.0842 \u001b[38;2;180;60;0mgrad_norm: 56.4885 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:55:46) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:55:46) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 3 \u001b[32mloss: 12.0842 \u001b[38;2;180;60;0mgrad_norm: 56.4885 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:55:49) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:55:49) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 4 \u001b[32mloss: 13.0378 \u001b[38;2;180;60;0mgrad_norm: 49.4604 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:56:03) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:56:03) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 4 \u001b[32mloss: 13.0378 \u001b[38;2;180;60;0mgrad_norm: 49.4604 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:56:06) <<<\u001b[0m\n", + "\n", + 
"\u001b[36m>>> Aggregated Logs (2025-09-19 03:56:06) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 5 \u001b[32mloss: 11.8098 \u001b[38;2;180;60;0mgrad_norm: 8.7387 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:56:20) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:56:20) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 5 \u001b[32mloss: 11.8098 \u001b[38;2;180;60;0mgrad_norm: 8.7387 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:56:23) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:56:23) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 6 \u001b[32mloss: 11.7958 \u001b[38;2;180;60;0mgrad_norm: 25.1134 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:56:37) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:56:37) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 6 \u001b[32mloss: 11.7958 \u001b[38;2;180;60;0mgrad_norm: 25.1134 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:56:40) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:56:40) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 7 \u001b[32mloss: 11.4801 \u001b[38;2;180;60;0mgrad_norm: 8.4318 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:56:55) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:56:55) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 7 \u001b[32mloss: 11.4801 \u001b[38;2;180;60;0mgrad_norm: 8.4318 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:56:58) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:56:58) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 8 \u001b[32mloss: 10.5124 \u001b[38;2;180;60;0mgrad_norm: 10.8082 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:57:12) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:57:12) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 8 \u001b[32mloss: 10.5124 \u001b[38;2;180;60;0mgrad_norm: 10.8082 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:57:15) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:57:15) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 9 \u001b[32mloss: 10.4442 \u001b[38;2;180;60;0mgrad_norm: 20.3616 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + 
"\u001b[36m<<< Aggregated Logs (2025-09-19 03:57:29) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:57:29) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 9 \u001b[32mloss: 10.4442 \u001b[38;2;180;60;0mgrad_norm: 20.3616 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:57:32) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:57:32) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 10 \u001b[32mloss: 9.7792 \u001b[38;2;180;60;0mgrad_norm: 7.5678 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:57:46) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:57:46) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 10 \u001b[32mloss: 9.7792 \u001b[38;2;180;60;0mgrad_norm: 7.5678 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:57:49) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:57:49) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 11 \u001b[32mloss: 9.1549 \u001b[38;2;180;60;0mgrad_norm: 4.6241 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:58:04) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:58:04) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 11 \u001b[32mloss: 9.1549 \u001b[38;2;180;60;0mgrad_norm: 4.6241 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:58:07) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:58:07) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 12 \u001b[32mloss: 9.3845 \u001b[38;2;180;60;0mgrad_norm: 32.4210 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:58:21) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:58:21) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 12 \u001b[32mloss: 9.3845 \u001b[38;2;180;60;0mgrad_norm: 32.4210 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:58:24) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:58:24) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 13 \u001b[32mloss: 10.4570 \u001b[38;2;180;60;0mgrad_norm: 40.4274 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:58:38) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:58:38) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 13 \u001b[32mloss: 10.4570 \u001b[38;2;180;60;0mgrad_norm: 40.4274 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) 
\u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:58:41) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:58:41) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 14 \u001b[32mloss: 10.1626 \u001b[38;2;180;60;0mgrad_norm: 43.6353 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:58:55) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:58:55) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 14 \u001b[32mloss: 10.1626 \u001b[38;2;180;60;0mgrad_norm: 43.6353 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:58:58) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:58:58) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 15 \u001b[32mloss: 8.8694 \u001b[38;2;180;60;0mgrad_norm: 15.3759 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:59:13) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:59:13) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 15 \u001b[32mloss: 8.8694 \u001b[38;2;180;60;0mgrad_norm: 15.3759 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:59:16) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:59:16) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 16 \u001b[32mloss: 8.5247 \u001b[38;2;180;60;0mgrad_norm: 4.8650 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:59:30) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:59:30) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 16 \u001b[32mloss: 8.5247 \u001b[38;2;180;60;0mgrad_norm: 4.8650 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:59:33) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:59:33) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 17 \u001b[32mloss: 8.6274 \u001b[38;2;180;60;0mgrad_norm: 14.6862 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:59:47) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:59:47) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 17 \u001b[32mloss: 8.6274 \u001b[38;2;180;60;0mgrad_norm: 14.6862 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 03:59:50) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:59:50) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 18 \u001b[32mloss: 8.2122 
\u001b[38;2;180;60;0mgrad_norm: 4.1870 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:00:04) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:00:04) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 18 \u001b[32mloss: 8.2122 \u001b[38;2;180;60;0mgrad_norm: 4.1870 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:00:07) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:00:07) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 19 \u001b[32mloss: 8.0256 \u001b[38;2;180;60;0mgrad_norm: 5.2243 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:00:22) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:00:22) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 19 \u001b[32mloss: 8.0256 \u001b[38;2;180;60;0mgrad_norm: 5.2243 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:00:25) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:00:25) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 20 \u001b[32mloss: 8.0121 \u001b[38;2;180;60;0mgrad_norm: 10.4728 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:00:39) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:00:39) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 20 \u001b[32mloss: 8.0121 \u001b[38;2;180;60;0mgrad_norm: 10.4728 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:00:42) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:00:42) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 21 \u001b[32mloss: 9.3447 \u001b[38;2;180;60;0mgrad_norm: 9.1810 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:00:56) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:00:56) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 21 \u001b[32mloss: 9.3447 \u001b[38;2;180;60;0mgrad_norm: 9.1810 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:00:59) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:00:59) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 22 \u001b[32mloss: 8.0301 \u001b[38;2;180;60;0mgrad_norm: 8.9817 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:01:13) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:01:13) >>>\u001b[0m\n", + "\u001b[33m[15 
similar log lines]\u001b[0m \u001b[31mstep: 22 \u001b[32mloss: 8.0301 \u001b[38;2;180;60;0mgrad_norm: 8.9817 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.73 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:01:16) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:01:16) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 23 \u001b[32mloss: 8.0246 \u001b[38;2;180;60;0mgrad_norm: 4.4800 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:01:31) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:01:31) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 23 \u001b[32mloss: 8.0246 \u001b[38;2;180;60;0mgrad_norm: 4.4800 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.72 \u001b[35mmfu: 1.83%\u001b[39m\n", + "\u001b[33m[1 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:01:34) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:01:34) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 24 \u001b[32mloss: 7.9291 \u001b[38;2;180;60;0mgrad_norm: 3.7897 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.75 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:01:48) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:01:48) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 24 \u001b[32mloss: 7.9291 \u001b[38;2;180;60;0mgrad_norm: 3.7897 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.75 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[33m[2 similar log lines]\u001b[0m Dataset c4_test is being re-looped\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:01:51) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:01:51) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 25 \u001b[32mloss: 7.8506 \u001b[38;2;180;60;0mgrad_norm: 2.9660 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:02:05) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:02:05) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 25 \u001b[32mloss: 7.8506 \u001b[38;2;180;60;0mgrad_norm: 2.9660 \u001b[38;2;54;234;195mmemory: 19.37GiB(43.40%) \u001b[34mtps: 119 \u001b[36mtflops: 5.74 \u001b[35mmfu: 1.84%\u001b[39m\n", + "\u001b[33m[31 similar log lines]\u001b[0m Training completed\n", + "\u001b[33m[2 similar log lines]\u001b[0m Sleeping 2 seconds for other ranks to complete\n", + "\u001b[33m[16 similar log lines]\u001b[0m Training starts at step 26\n", + "\u001b[33m[16 similar log lines]\u001b[0m Profiling active. 
Traces will be saved at /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts2-gpus8/profile_trace\n", + "\u001b[33m[14 similar log lines]\u001b[0m Process group destroyed.\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:02:08) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-09-19 03:54:49) >>>\u001b[0m\n", + "\u001b[33m[14 similar log lines]\u001b[0m Done training\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:02:08) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:02:08) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m Training completed\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: updating run metadata\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: \n", + "\u001b[33m[3 similar log lines]\u001b[0m wandb: \n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run history:\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: grad_norm ▁▁█▇▂▄▂▂▃▂▁▅▆▆▃▁▃▁▁▂▂▂▁▁▁\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: loss_metrics/global_avg_loss ▇▆▇█▆▆▆▅▅▄▃▃▅▄▂▂▂▁▁▁▃▁▁▁▁\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: loss_metrics/global_max_loss ▅▄▆▆▅▅▅▃▄▃▂▃▄▃▂▂▂▁▂▂█▂▂▁▁\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: lr ▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇██\n", + "\u001b[33m[4 similar log lines]\u001b[0m wandb: memory/max_active(%) ▁████████████████████████\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/num_alloc_retries ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/num_ooms ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁��\n", + "\u001b[33m[2 similar log lines]\u001b[0m wandb: +7 ...\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run summary:\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: grad_norm 2.96598\n", + "\u001b[33m[2 similar log lines]\u001b[0m Process group destroyed.\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: loss_metrics/global_avg_loss 7.85056\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: loss_metrics/global_max_loss 8.68094\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: lr 0.0003\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_active(%) 30.14646\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_active(GiB) 13.4567\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_reserved(%) 43.40058\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_reserved(GiB) 19.37305\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/num_alloc_retries 0\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/num_ooms 0\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: 🚀 View run graceful-river-27 at: https://wandb.ai/a-shamsoshoara-m/torchtitan/runs/0p5lifho\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: ⭐️ View project at: https://wandb.ai/a-shamsoshoara-m/torchtitan\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Find logs at: ./torchtitan/outputs/monarch-alisol-hosts2-gpus8/tb/20250919-0354/wandb/run-20250919_035451-0p5lifho/logs\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:02:11) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-09-19 04:02:08) >>>\u001b[0m\n", + 
"\u001b[33m[2 similar log lines]\u001b[0m Done training\n", + "\u001b[36m<<< Aggregated Logs (2025-09-19 04:02:11) <<<\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "init_logger()\n", + "config_manager = ConfigManager()\n", + "\n", + "job_name = get_job_name(NUM_NODES, NUM_GPUS)\n", + "\n", + "manual_args = [\n", + " \"--job.config_file\",\n", + " os.path.expanduser(\"/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml\"),\n", + " \"--model.tokenizer-path\",\n", + " # f\"{FUSE_DST}/Llama-3.1-8B\",\n", + " \"/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B\",\n", + " \"--training.steps\",\n", + " \"25\",\n", + " \"--training.dataset_path\",\n", + " # f\"{FUSE_DST}/c4\",\n", + " \"/teamspace/studios/this_studio/torchtitan/tests/assets/c4_test\",\n", + " \"--job.dump_folder\",\n", + " # f\"{FUSE_DST}/outputs/\" + job_name,\n", + " \"/teamspace/studios/this_studio/torchtitan/outputs/\" + job_name\n", + " ]\n", + "config = config_manager.parse_args(manual_args)\n", + "await async_main(config)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "proc_mesh.stop()" + ] + } + ], + "metadata": { + "fileHeader": "", + "fileUid": "6d5af34b-6e4e-48c6-82c4-ccf442e377c5", + "isAdHoc": false, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/lightning/monarch_v1_titan_aws.ipynb b/examples/lightning/monarch_v1_titan_aws.ipynb new file mode 100644 index 000000000..66fc6a19a --- /dev/null +++ b/examples/lightning/monarch_v1_titan_aws.ipynb @@ -0,0 +1,420 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from lightning_sdk import Machine, MMT, Studio" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "private_master_host_ip_address = 10.192.10.43\n", + "public_master_host_ip_address = 34.201.107.243\n", + "public_master_host_ip_address = 34.201.107.243\n" + ] + } + ], + "source": [ + "from utils.master_node import MasterNodeServer\n", + "private_master_host_ip_address = MasterNodeServer.get_master_ip()\n", + "public_master_host_ip_address = MasterNodeServer.get_master_public_ip_curl()\n", + "public_master_host_ip_address_services = MasterNodeServer.get_master_public_ip()\n", + "print(f\"private_master_host_ip_address = {private_master_host_ip_address}\")\n", + "print(f\"public_master_host_ip_address = {public_master_host_ip_address}\")\n", + "print(f\"public_master_host_ip_address = {public_master_host_ip_address_services}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration\n", + "import os\n", + "NUM_NODES = 2\n", + "NUM_CPUS = 2\n", + "NUM_GPUS = 8\n", + "NUM_PROCS = NUM_NODES * NUM_GPUS\n", + "TEAMSPACE = \"general\" # Replace with your teamspace\n", + "USER = \"meta-ai\" # Replace with your username\n", + "MONARCH_DEFAULT_PORT = 26600 # Monarch default port\n", + "HTTP_SERVER_PORT = MONARCH_DEFAULT_PORT # 8080 # HTTP Server PORT for IP registration\n", + "MMT_JOB_NAME = f\"Monarch-v1-Titan-{NUM_NODES}_nodes-port_override\"\n", + "os.environ[\"MONARCH_FILE_LOG\"] = \"debug\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + 
"metadata": {}, + "outputs": [], + "source": [ + "def launch_mmt_job(num_nodes=2, teamspace=\"my-teamspace\", user=\"my-user\"):\n", + " \"\"\"\n", + " Launch a multi-machine training job using Lightning SDK's MMT API.\n", + " \"\"\"\n", + "\n", + " studio = Studio()\n", + "\n", + " # Install the MMT plugin befor running the actual job\n", + " studio.install_plugin(\"multi-machine-training\")\n", + "\n", + " print(f\"Launching MMT job with {num_nodes} nodes...\")\n", + "\n", + " # Machine with CPUs\n", + " # machine_type = getattr(Machine, f\"CPU_X_{NUM_CPUS}\")\n", + "\n", + " # Machine with T4 GPUs\n", + " # machine_type = getattr(Machine, f\"T4_X_{NUM_GPUS}\")\n", + "\n", + " # Machine with L4 GPUs\n", + " # machine_type = getattr(Machine, f\"L4_X_{NUM_GPUS}\")\n", + "\n", + " # Machine with L40S GPUs\n", + " machine_type = getattr(Machine, f\"L40S_X_{NUM_GPUS}\")\n", + "\n", + " job = MMT.run(\n", + " command=\"process_allocator\",\n", + " # command=f\"tail -f /dev/null\",\n", + " name=MMT_JOB_NAME,\n", + " machine=machine_type,\n", + " studio=studio,\n", + " num_machines=num_nodes,\n", + " env={\n", + " \"CUDA_VISIBLE_DEVICES\": \"0,1,2,3,4,5,6,7\", # Make all GPUs visible # TODO: Should make this one dynamic\n", + " \"MONARCH_FILE_LOG\": \"debug\",\n", + " \"HYPERACTOR_REMOTE_ALLOC_ALLOWED_PORT_RANGE\": \"26601-26610\",\n", + " \"HYPERACTOR_REMOTE_ALLOC_BIND_TO_INADDR_ANY\": \"true\",\n", + " },\n", + " )\n", + "\n", + " print(f\"Job started with ID: {job.name}\")\n", + " print(f\"Job status: {job.status}\")\n", + "\n", + " # Monitor job status\n", + " return job, studio" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Launching MMT job with 2 nodes...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO - Multi-Machine Job was successfully launched. View it at https://lightning.ai/meta-ai/general/jobs/Monarch-v1-Titan-2_nodes-port_override-437zt?app_id=mmt\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job started with ID: Monarch-v1-Titan-2_nodes-port_override-437zt\n", + "Job status: Pending\n", + "Job launched. You can monitor it using: job.status\n", + "To stop the job: job.stop()\n", + "To clean up: studio.stop()\n" + ] + } + ], + "source": [ + "# Launch the job\n", + "job, studio = launch_mmt_job(\n", + " num_nodes=NUM_NODES, teamspace=TEAMSPACE, user=USER\n", + ")\n", + "\n", + "print(f\"Job launched. 
You can monitor it using: job.status\")\n", + "print(f\"To stop the job: job.stop()\")\n", + "print(f\"To clean up: studio.stop()\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ip_addresses_list=['3.150.40.243', '18.189.125.53']\n", + "ip_addresses_set={'18.189.125.53', '3.150.40.243'}\n", + "IP addresses are available: True\n" + ] + } + ], + "source": [ + "ip_addresses_list = [machine.public_ip for machine in job.machines]\n", + "ip_addresses_set = set(ip_addresses_list)\n", + "print(f\"{ip_addresses_list=}\")\n", + "print(f\"{ip_addresses_set=}\")\n", + "ips_available = not ip_addresses_set == {''}\n", + "print(f\"IP addresses are available: {ips_available}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['tcp!18.189.125.53:26600', 'tcp!3.150.40.243:26600']\n" + ] + } + ], + "source": [ + "if ips_available:\n", + " tcp_addresses = [f\"tcp!{ip}:{MONARCH_DEFAULT_PORT}\" for ip in ip_addresses_set]\n", + " print(tcp_addresses)\n", + "else:\n", + " raise ValueError(\"IPs are not available yet!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "tcp!34.201.107.243:0\n" + ] + } + ], + "source": [ + "import os\n", + "from monarch._src.actor.allocator import RemoteAllocator, StaticRemoteAllocInitializer\n", + "# from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec\n", + "# from monarch.actor import ProcMesh\n", + "# tcp_addresses = ['tcp!3.21.117.93:26600', 'tcp!18.220.66.230:26600']\n", + "\n", + "os.environ[\"HYPERACTOR_REMOTE_ALLOC_ALLOWED_PORT_RANGE\"] = \"26600-26610\"\n", + "os.environ[\"HYPERACTOR_REMOTE_ALLOC_BOOTSTRAP_ADDR\"] = f\"tcp!{public_master_host_ip_address}:0\"\n", + "os.environ[\"HYPERACTOR_REMOTE_ALLOC_BIND_TO_INADDR_ANY\"] = \"true\"\n", + "os.environ[\"MONARCH_HOST_MESH_V1_REMOVE_ME_BEFORE_RELEASE\"] = \"1\"\n", + "\n", + "allocator = RemoteAllocator(\n", + " world_id=\"foo\",\n", + " initializer=StaticRemoteAllocInitializer(*tcp_addresses),\n", + " )\n", + "\n", + "print(allocator)\n", + "print(os.environ[\"HYPERACTOR_REMOTE_ALLOC_BOOTSTRAP_ADDR\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "from monarch.actor import HostMesh\n", + "from monarch._rust_bindings.monarch_hyperactor.shape import Extent\n", + "\n", + "host_mesh = HostMesh.allocate_nonblocking(\n", + " \"hostmeshtest\",\n", + " extent=Extent([\"hosts\", \"procs\"], [NUM_NODES, NUM_PROCS]),\n", + " allocator=allocator,\n", + " )\n", + "proc_mesh = host_mesh.spawn_procs({\"gpus\": NUM_GPUS})" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "monarch-alisol-hosts2-gpus8\n" + ] + } + ], + "source": [ + "import getpass\n", + "def get_job_name(num_hosts: int, num_gpus_per_host: int):\n", + " return f\"monarch-{getpass.getuser()}-hosts{num_hosts}-gpus{num_gpus_per_host}\"\n", + "print(get_job_name(num_hosts=NUM_NODES, num_gpus_per_host=NUM_GPUS))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "from monarch.actor import ProcMesh, Actor, endpoint, 
current_rank\n", + "import socket\n", + "from torchtitan.tools.logging import logger\n", + "from torchtitan.train import Trainer\n", + "from typing import Optional\n", + "import torch\n", + "from torchtitan.config import JobConfig\n", + "\n", + "\n", + "class TitanTrainerWrapper(Actor):\n", + " def __init__(self, job_config: JobConfig):\n", + " self.rank = current_rank().rank\n", + " self.job_config = job_config\n", + "\n", + " def _rprint(self, msg):\n", + " \"\"\"Helper method to print with rank information.\"\"\"\n", + " print(f\"{self.rank=} {msg}\")\n", + "\n", + " @endpoint\n", + " def init(self):\n", + " logging.getLogger().addHandler(logging.StreamHandler(sys.stderr))\n", + " print(f\"Initializing actor: {self.rank} {current_rank()=} {socket.gethostname()=}\")\n", + "\n", + "\n", + " @endpoint\n", + " def train(self):\n", + " logger.info(\"Starting training\")\n", + " config = self.job_config\n", + " trainer: Optional[Trainer] = None\n", + "\n", + " try:\n", + " trainer = Trainer(config)\n", + " trainer.train()\n", + "\n", + " if config.checkpoint.create_seed_checkpoint:\n", + " assert (\n", + " int(os.environ[\"WORLD_SIZE\"]) == 1\n", + " ), \"Must create seed checkpoint using a single device, to disable sharding.\"\n", + " assert (\n", + " # config.checkpoint.enable_checkpoint\n", + " config.checkpoint.enable\n", + " ), \"Must enable checkpointing when creating a seed checkpoint.\"\n", + " trainer.checkpointer.save(curr_step=0, )\n", + " logger.info(\"Created seed checkpoint\")\n", + " else:\n", + " trainer.train()\n", + " finally:\n", + " if trainer:\n", + " trainer.close()\n", + "\n", + " if torch.distributed.is_initialized():\n", + " torch.distributed.destroy_process_group()\n", + " logger.info(\"Process group destroyed.\")\n", + " print(\"Done training\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.xpu import stream\n", + "from torchtitan.config import JobConfig\n", + "from monarch.utils import setup_env_for_distributed\n", + "\n", + "async def async_main(job_config: JobConfig):\n", + " torch.use_deterministic_algorithms(True)\n", + " job_name = get_job_name(NUM_NODES, NUM_GPUS)\n", + "\n", + " await setup_env_for_distributed(proc_mesh, )\n", + "\n", + " await proc_mesh.logging_option(stream_to_client=True, aggregate_window_sec=3)\n", + "\n", + " print(job_config)\n", + " print(f\"Spawning meshes on {job_name}\")\n", + "\n", + " # trainer_actor = await proc_mesh.spawn(\"trainer_actor\", TitanTrainerWrapper, job_config)\n", + " trainer_actor = proc_mesh.spawn_procs(\"trainer_actor\", TitanTrainerWrapper, job_config)\n", + "\n", + " await trainer_actor.init.call()\n", + " await trainer_actor.train.call()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[titan] 2025-10-15 03:18:27,751 - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. 
Setting hf_assets_path to tokenizer_path temporarily.\n" + ] + } + ], + "source": [ + "from torchtitan.config import ConfigManager\n", + "from torchtitan.tools.logging import init_logger\n", + "init_logger()\n", + "config_manager = ConfigManager()\n", + "\n", + "job_name = get_job_name(NUM_NODES, NUM_GPUS)\n", + "\n", + "manual_args = [\n", + " \"--job.config_file\",\n", + " os.path.expanduser(\"/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml\"),\n", + " \"--model.tokenizer-path\",\n", + " # f\"{FUSE_DST}/Llama-3.1-8B\",\n", + " \"/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B\",\n", + " \"--training.steps\",\n", + " \"25\",\n", + " \"--training.dataset_path\",\n", + " # f\"{FUSE_DST}/c4\",\n", + " \"/teamspace/studios/this_studio/torchtitan/tests/assets/c4_test\",\n", + " \"--job.dump_folder\",\n", + " # f\"{FUSE_DST}/outputs/\" + job_name,\n", + " \"/teamspace/studios/this_studio/torchtitan/outputs/\" + job_name,\n", + " \"--training.seq_len\",\n", + " \"1024\",\n", + " # \"8192\",\n", + " ]\n", + "config = config_manager.parse_args(manual_args)\n", + "await async_main(config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from monarch.job import SlurmJob, JobTrait" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/lightning/studio_0_monarch_basics.ipynb b/examples/lightning/studio_0_monarch_basics.ipynb new file mode 100644 index 000000000..4d7c2652b --- /dev/null +++ b/examples/lightning/studio_0_monarch_basics.ipynb @@ -0,0 +1,730 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": "# Studio 0: Monarch Basics - Ping Pong Tutorial\n\nWelcome to the Lightning Studios Monarch series! This is **Studio 0**, where you'll learn the fundamentals of Monarch's Actor API through simple, hands-on examples.\n\n## What is Monarch?\n\n**Monarch** is Meta's distributed actor framework for building scalable, distributed applications. It makes it easy to:\n- Run code across multiple processes or machines\n- Coordinate distributed computations\n- Build complex distributed systems with simple Python code\n\n## What You'll Learn\n\nIn this tutorial, you'll learn:\n1. **Core Concepts**: Actors, Endpoints, and Process Meshes\n2. **Hello World**: Creating and calling actors\n3. **Calling Patterns**: Broadcasting vs. targeting specific actors\n4. **Actor Communication**: How actors talk to each other (Ping Pong!)\n\n## Prerequisites\n\n- Basic Python knowledge\n- Understanding of `async`/`await` (we'll provide a quick refresher)\n- Monarch installed (see [installation guide](https://github.com/meta-pytorch/monarch))\n\n## Lightning Studios Learning Path\n\nThis is the **foundation** studio. After completing this, you can progress to:\n\n- **[Studio 1: Getting Started](./studio_1_getting_started.ipynb)** - Multi-node training with Lightning\n- **[Studio 2: Workspace Sync](./studio_2_workspace_sync.ipynb)** - Hot-reload configs without restarting\n- **[Studio 3: Interactive Debugging](./studio_3_interactive_debugging.ipynb)** - Debug distributed systems\n\nLet's dive in! 🚀" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Part 1: Core Concepts\n", + "\n", + "Before we write code, let's understand the key concepts." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What is an Actor?\n", + "\n", + "Think of an **Actor** as an independent worker that:\n", + "- Has its own state (variables)\n", + "- Runs in its own process (possibly on a different machine)\n", + "- Exposes **endpoints** (methods) that can be called remotely\n", + "\n", + "```\n", + "┌─────────────────┐\n", + "│ Actor Instance │\n", + "│ │\n", + "│ State: │\n", + "│ - rank: 0 │\n", + "│ - data: [...] │\n", + "│ │\n", + "│ Endpoints: │\n", + "│ - hello() │\n", + "│ - process() │\n", + "└─────────────────┘\n", + "```\n", + "\n", + "## What is an Endpoint?\n", + "\n", + "An **Endpoint** is a method on an Actor that can be called remotely. It's marked with the `@endpoint` decorator.\n", + "\n", + "```python\n", + "class MyActor(Actor):\n", + " @endpoint\n", + " async def my_method(self, arg):\n", + " # This can be called remotely!\n", + " return f\"Processed {arg}\"\n", + "```\n", + "\n", + "## What is a Process Mesh?\n", + "\n", + "A **Process Mesh** (or ProcMesh) is a collection of processes where actors can be spawned. Think of it as a cluster of workers.\n", + "\n", + "```\n", + "Process Mesh (4 GPUs)\n", + "┌────────┬────────┬────────┬────────┐\n", + "│ GPU 0 │ GPU 1 │ GPU 2 │ GPU 3 │\n", + "│ │ │ │ │\n", + "│ Actor │ Actor │ Actor │ Actor │\n", + "│ Rank 0 │ Rank 1 │ Rank 2 │ Rank 3 │\n", + "└────────┴────────┴────────┴────────┘\n", + "```\n", + "\n", + "## Async/Await Quick Refresher\n", + "\n", + "Monarch uses Python's `async`/`await` for non-blocking operations:\n", + "\n", + "```python\n", + "# Calling an endpoint\n", + "result = await actor.my_method.call(\"hello\") # Wait for result\n", + "\n", + "# Running multiple operations in parallel\n", + "results = await asyncio.gather(\n", + " actor.method_1.call(),\n", + " actor.method_2.call(),\n", + ") # Wait for both to complete\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Part 2: Hello World\n", + "\n", + "Let's create our first Monarch actor!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Monarch\n", + "\n", + "First, import the necessary components from Monarch." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "from monarch.actor import Actor, current_rank, endpoint, proc_mesh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define a Simple Actor\n", + "\n", + "Let's create a `ToyActor` that:\n", + "- Stores its rank (unique ID)\n", + "- Has a `hello_world` endpoint that prints a message" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_ACTORS = 4\n", + "\n", + "\n", + "class ToyActor(Actor):\n", + " def __init__(self):\n", + " # Get the rank (unique ID) of this actor instance\n", + " self.rank = current_rank().rank\n", + "\n", + " @endpoint\n", + " async def hello_world(self, msg):\n", + " \"\"\"A simple endpoint that prints a message.\"\"\"\n", + " print(f\"Actor {self.rank}: Received message '{msg}'\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Key Points\n", + "\n", + "- `Actor` base class: All Monarch actors inherit from this\n", + "- `current_rank()`: Returns information about this actor's position in the mesh\n", + "- `@endpoint`: Decorator that makes a method remotely callable\n", + "- `async def`: Endpoints are async functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a Process Mesh and Spawn Actors\n", + "\n", + "Now we'll:\n", + "1. Create a process mesh with 4 processes\n", + "2. Spawn 4 instances of `ToyActor` (one per process)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async def create_toy_actors():\n", + " # Create a local process mesh with 4 GPU slots\n", + " # Note: This works even without actual GPUs!\n", + " local_proc_mesh = proc_mesh(gpus=NUM_ACTORS)\n", + " \n", + " # Spawn 4 instances of ToyActor (one per GPU slot)\n", + " # This returns a \"handle\" to all instances\n", + " toy_actor = await local_proc_mesh.spawn(\"toy_actor\", ToyActor)\n", + " \n", + " print(f\"✓ Spawned {NUM_ACTORS} ToyActor instances\")\n", + " \n", + " return toy_actor, local_proc_mesh" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Understanding `proc_mesh(gpus=4)`\n", + "\n", + "This creates 4 processes. The parameter is called `gpus` because Monarch is often used for GPU computing, but it works fine without GPUs - it just means \"4 parallel processes.\"\n", + "\n", + "### Understanding `spawn()`\n", + "\n", + "When we call `spawn(\"toy_actor\", ToyActor)`:\n", + "- Monarch creates 4 instances of `ToyActor`\n", + "- Each runs in its own process\n", + "- Each gets a unique rank (0, 1, 2, 3)\n", + "- We get back a handle to communicate with all of them" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Call All Actors at Once\n", + "\n", + "The most common pattern: broadcast a call to **all** actor instances." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async def call_all_actors():\n", + " toy_actor, local_proc_mesh = await create_toy_actors()\n", + " \n", + " # Call hello_world on ALL actor instances\n", + " # .call() broadcasts to all instances\n", + " await toy_actor.hello_world.call(\"Hello from main!\")\n", + " \n", + " return toy_actor, local_proc_mesh\n", + "\n", + "# Run it!\n", + "toy_actor, toy_mesh = await call_all_actors()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Expected Output\n", + "\n", + "You should see output from all 4 actors:\n", + "```\n", + "Actor 0: Received message 'Hello from main!'\n", + "Actor 1: Received message 'Hello from main!'\n", + "Actor 2: Received message 'Hello from main!'\n", + "Actor 3: Received message 'Hello from main!'\n", + "```\n", + "\n", + "### What Just Happened?\n", + "\n", + "```\n", + " Main Process\n", + " │\n", + " ├──> toy_actor.hello_world.call(\"Hello\")\n", + " │\n", + " ┌───────┼───────┬───────┬───────┐\n", + " ▼ ▼ ▼ ▼ ▼\n", + " Actor0 Actor1 Actor2 Actor3\n", + " Rank0 Rank1 Rank2 Rank3\n", + " print print print print\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Part 3: Calling Specific Actors\n", + "\n", + "Sometimes you want to call **specific** actor instances, not all of them. This is where `.slice()` comes in!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The Slice API\n", + "\n", + "`.slice()` lets you select specific actor instances:\n", + "\n", + "```python\n", + "# Select actor at GPU 0\n", + "actor_0 = toy_actor.slice(gpus=0)\n", + "\n", + "# Select actor at GPU 2\n", + "actor_2 = toy_actor.slice(gpus=2)\n", + "\n", + "# Then call with .call_one()\n", + "await actor_0.hello_world.call_one(\"Hi from actor 0!\")\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example: Call Each Actor with a Unique Message" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async def call_specific_actors():\n", + " futures = []\n", + " \n", + " for idx in range(NUM_ACTORS):\n", + " # Select the actor at index 'idx'\n", + " actor_instance = toy_actor.slice(gpus=idx)\n", + " \n", + " # Call with a unique message for this actor\n", + " future = actor_instance.hello_world.call_one(\n", + " f\"Unique message for actor {idx}\"\n", + " )\n", + " futures.append(future)\n", + " \n", + " # Wait for all calls to complete (in parallel!)\n", + " await asyncio.gather(*futures)\n", + " \n", + " print(\"\\n✓ All specific actor calls completed\")\n", + "\n", + "# Run it!\n", + "await call_specific_actors()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Expected Output\n", + "\n", + "```\n", + "Actor 0: Received message 'Unique message for actor 0'\n", + "Actor 1: Received message 'Unique message for actor 1'\n", + "Actor 2: Received message 'Unique message for actor 2'\n", + "Actor 3: Received message 'Unique message for actor 3'\n", + "```\n", + "\n", + "### Key Insight\n", + "\n", + "We used `asyncio.gather()` to schedule all calls in parallel. 
Without `gather()`, they'd run sequentially (slower).\n", + "\n", + "```\n", + "Sequential (slow): Parallel with gather() (fast):\n", + "┌────┐ ┌────┐\n", + "│ A0 │────┐ │ A0 │────┐\n", + "└────┘ │ ├────┤ │\n", + " │ │ A1 │────┤\n", + "┌────┐ │ ├────┤ ├─> All complete!\n", + "│ A1 │────┤ │ A2 │────┤\n", + "└────┘ │ ├────┤ │\n", + " │ │ A3 │────┘\n", + "┌────┐ │ └────┘\n", + "│ A2 │────┤\n", + "└────┘ │\n", + " │\n", + "┌────┐ │\n", + "│ A3 │────┘\n", + "└────┘\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparison: `.call()` vs `.call_one()`\n", + "\n", + "| Method | Use Case | Example |\n", + "|--------|----------|----------|\n", + "| `.call()` | Broadcast to **all** instances | `actor.method.call(arg)` |\n", + "| `.call_one()` | Call a **specific** instance (after `.slice()`) | `actor.slice(gpus=0).method.call_one(arg)` |\n", + "\n", + "### When to Use Each\n", + "\n", + "- **`.call()`**: When you want all actors to do the same thing\n", + " - Example: Initialize all actors, broadcast data, synchronize state\n", + " \n", + "- **`.call_one()` with `.slice()`**: When you want specific behavior per actor\n", + " - Example: Assign different data partitions, target specific workers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Part 4: Actor-to-Actor Communication (Ping Pong!)\n", + "\n", + "So far, we've called actors from our main code. But actors can also **talk to each other**! This is powerful for building distributed systems." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The Ping Pong Example\n", + "\n", + "We'll create two groups of actors that send messages to each other:\n", + "\n", + "```\n", + "Actor Group 0 Actor Group 1\n", + "┌──────────┐ ┌──────────┐\n", + "│ Actor 0 │──── Ping ───> │ Actor 0 │\n", + "│ Actor 1 │ │ Actor 1 │\n", + "└──────────┘ └──────────┘\n", + " │\n", + " Pong!\n", + " │\n", + "┌──────────┐ ┌──────────┐\n", + "│ Actor 0 │ <─── Ping ─── │ Actor 0 │\n", + "│ Actor 1 │ │ Actor 1 │\n", + "└──────────┘ └──────────┘\n", + " │\n", + " Pong!\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define the PingPong Actor\n", + "\n", + "This actor can:\n", + "- Store a reference to another actor\n", + "- Send messages to that actor\n", + "- Receive messages from that actor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class PingPongActor(Actor):\n", + " def __init__(self, actor_name):\n", + " \"\"\"Initialize with a name to identify this actor group.\"\"\"\n", + " self.actor_name = actor_name\n", + " self.identity = None\n", + " self.other_actor = None\n", + " self.other_actor_pair = None\n", + "\n", + " @endpoint\n", + " async def init(self, other_actor):\n", + " \"\"\"\n", + " Initialize this actor with a reference to another actor.\n", + " \n", + " Key insight: We store a 'slice' of the other actor that corresponds\n", + " to our rank. 
So Actor 0 will talk to the other Actor 0, \n", + " Actor 1 to the other Actor 1, etc.\n", + " \"\"\"\n", + " self.other_actor = other_actor\n", + " \n", + " # Get my rank\n", + " self.identity = current_rank().rank\n", + " \n", + " # Slice the other actor to get my \"pair\" (same rank)\n", + " self.other_actor_pair = other_actor.slice(**current_rank())\n", + " \n", + " print(f\"[{self.actor_name}:{self.identity}] Initialized and paired with other actor\")\n", + "\n", + " @endpoint\n", + " async def send(self, msg):\n", + " \"\"\"Send a message to our paired actor in the other group.\"\"\"\n", + " await self.other_actor_pair.recv.call(\n", + " f\"Sender ({self.actor_name}:{self.identity}) says: {msg}\"\n", + " )\n", + "\n", + " @endpoint\n", + " async def recv(self, msg):\n", + " \"\"\"Receive a message from our paired actor.\"\"\"\n", + " print(f\"Pong! [{self.actor_name}:{self.identity}] received: {msg}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Understanding the Code\n", + "\n", + "**The `init` endpoint:**\n", + "- Takes a reference to another actor group\n", + "- Uses `.slice(**current_rank())` to pair actors by rank\n", + " - Actor 0 in group A pairs with Actor 0 in group B\n", + " - Actor 1 in group A pairs with Actor 1 in group B\n", + "\n", + "**The `send` endpoint:**\n", + "- Calls `recv` on the paired actor\n", + "- This demonstrates **actor-to-actor communication**!\n", + "\n", + "**The `recv` endpoint:**\n", + "- Receives and prints the message\n", + "- The \"Pong!\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Two Actor Groups" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async def create_ping_pong_actors():\n", + " # Create first mesh with 2 actors\n", + " local_mesh_0 = proc_mesh(gpus=2)\n", + " actor_0 = await local_mesh_0.spawn(\n", + " \"actor_0\",\n", + " PingPongActor,\n", + " \"GroupA\", # This argument is passed to __init__\n", + " )\n", + "\n", + " # Create second mesh with 2 actors\n", + " local_mesh_1 = proc_mesh(gpus=2)\n", + " actor_1 = await local_mesh_1.spawn(\n", + " \"actor_1\",\n", + " PingPongActor,\n", + " \"GroupB\", # This argument is passed to __init__\n", + " )\n", + " \n", + " print(\"\\n✓ Created two actor groups (2 actors each)\")\n", + "\n", + " return actor_0, actor_1, local_mesh_0, local_mesh_1\n", + "\n", + "# Create the actors\n", + "actor_group_a, actor_group_b, mesh_a, mesh_b = await create_ping_pong_actors()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What We Have Now\n", + "\n", + "```\n", + "Group A (actor_group_a) Group B (actor_group_b)\n", + "┌──────────────────┐ ┌──────────────────┐\n", + "│ GroupA Actor 0 │ │ GroupB Actor 0 │\n", + "│ GroupA Actor 1 │ │ GroupB Actor 1 │\n", + "└──────────────────┘ └──────────────────┘\n", + "\n", + "They don't know about each other yet!\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize: Pair the Actors\n", + "\n", + "Now we'll tell each actor group about the other." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async def init_ping_pong(actor_0, actor_1):\n", + " # Initialize actors with references to each other\n", + " # We do this in parallel using asyncio.gather\n", + " await asyncio.gather(\n", + " actor_0.init.call(actor_1), # Group A learns about Group B\n", + " actor_1.init.call(actor_0), # Group B learns about Group A\n", + " )\n", + " \n", + " print(\"\\n✓ Actors are now paired and ready to communicate!\")\n", + "\n", + "# Initialize the pairing\n", + "await init_ping_pong(actor_group_a, actor_group_b)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### After Initialization\n", + "\n", + "```\n", + "Group A Group B\n", + "┌──────────────────┐ ┌──────────────────┐\n", + "│ GroupA Actor 0 │ <──────> │ GroupB Actor 0 │\n", + "│ │ paired │ │\n", + "│ GroupA Actor 1 │ <──────> │ GroupB Actor 1 │\n", + "└──────────────────┘ paired └──────────────────┘\n", + "\n", + "Each actor knows its \"pair\" in the other group!\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Send Messages Between Actors\n", + "\n", + "Now for the exciting part - let's make them talk!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async def send_ping_pong(actor_0, actor_1):\n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"Starting Ping Pong Communication\")\n", + " print(\"=\"*60 + \"\\n\")\n", + " \n", + " # Group A sends \"Ping!\" to Group B\n", + " print(\"📤 Group A sending 'Ping!' to Group B...\\n\")\n", + " await actor_0.send.call(\"Ping!\")\n", + " \n", + " print(\"\\n\" + \"-\"*60 + \"\\n\")\n", + " \n", + " # Group B sends \"Ping!\" to Group A\n", + " print(\"📤 Group B sending 'Ping!' to Group A...\\n\")\n", + " await actor_1.send.call(\"Ping!\")\n", + " \n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"✓ Ping Pong Complete!\")\n", + " print(\"=\"*60)\n", + "\n", + "# Run the ping pong!\n", + "await send_ping_pong(actor_group_a, actor_group_b)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Expected Output\n", + "\n", + "```\n", + "📤 Group A sending 'Ping!' to Group B...\n", + "\n", + "Pong! [GroupB:0] received: Sender (GroupA:0) says: Ping!\n", + "Pong! [GroupB:1] received: Sender (GroupA:1) says: Ping!\n", + "\n", + "📤 Group B sending 'Ping!' to Group A...\n", + "\n", + "Pong! [GroupA:0] received: Sender (GroupB:0) says: Ping!\n", + "Pong! [GroupA:1] received: Sender (GroupB:1) says: Ping!\n", + "```\n", + "\n", + "### What Happened?\n", + "\n", + "1. **Group A's Actor 0** called `send(\"Ping!\")`\n", + "2. This invoked `recv()` on **Group B's Actor 0** (its pair)\n", + "3. Group B's Actor 0 printed \"Pong!\"\n", + "4. Same for Actor 1 in both groups\n", + "5. Then we reversed the direction!\n", + "\n", + "```\n", + " GroupA Actor 0 ──send()──> GroupB Actor 0\n", + " │\n", + " recv()\n", + " │\n", + " Pong!\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "---\n\n# 🎉 Congratulations! 
🎉\n\nYou've learned the fundamentals of Monarch!\n\n## What You Learned\n\n### Core Concepts\n- ✓ **Actors**: Independent workers with state and endpoints\n- ✓ **Endpoints**: Remotely callable methods (with `@endpoint`)\n- ✓ **Process Mesh**: Collection of processes for spawning actors\n\n### Calling Patterns\n- ✓ **`.call()`**: Broadcast to all actor instances\n- ✓ **`.slice()`**: Select specific actor instances\n- ✓ **`.call_one()`**: Call a specific sliced actor\n\n### Communication\n- ✓ **Main → Actor**: Call endpoints from your code\n- ✓ **Actor → Actor**: Actors calling other actors' endpoints\n- ✓ **Pairing**: Using `.slice(**current_rank())` to pair actors\n\n## Key Takeaways\n\n1. **Actors run independently** in separate processes\n2. **Endpoints are async** - use `await` when calling them\n3. **Use `.call()` for broadcast**, `.call_one()` for targeted calls\n4. **Actors can reference other actors** for complex distributed systems\n5. **`asyncio.gather()` runs operations in parallel** for better performance\n\n## Next Steps: Lightning Studios Series\n\nNow that you understand Monarch basics, continue your journey with the Lightning Studios:\n\n### 🚀 Studio 1: Getting Started (Recommended Next!)\n**[studio_1_getting_started.ipynb](./studio_1_getting_started.ipynb)**\n\nLearn how to run distributed multi-node training:\n- Launch multi-node jobs on Lightning AI\n- Set up distributed process meshes across machines\n- Run TorchTitan training for Llama-3-8B\n- Scale from 2 to 16+ nodes\n\n### 🔄 Studio 2: Workspace Synchronization\n**[studio_2_workspace_sync.ipynb](./studio_2_workspace_sync.ipynb)**\n\nMaster hot-reloading for faster iteration:\n- Sync local code/config changes to remote nodes\n- Update training configs without restarting jobs\n- 10x faster iteration cycles\n\n### 🐛 Studio 3: Interactive Debugging\n**[studio_3_interactive_debugging.ipynb](./studio_3_interactive_debugging.ipynb)**\n\nDebug distributed systems like a pro:\n- Set breakpoints in distributed actors\n- Inspect environment variables across nodes\n- Use `monarch debug` CLI for interactive debugging\n\n---\n\n## Additional Resources\n\n### 📚 More Examples\nCheck out these examples in the docs:\n- `getting_started.py` - More Monarch fundamentals\n- `distributed_tensors.py` - Working with tensors across actors\n- `debugging.py` - Debugging distributed actors\n- `spmd_ddp.py` - Distributed data parallel training\n\n### 📖 Documentation\n- [Monarch GitHub](https://github.com/meta-pytorch/monarch)\n- [Monarch Documentation](https://github.com/meta-pytorch/monarch/tree/main/docs)\n- [TorchTitan with Monarch](https://github.com/pytorch/torchtitan)\n\n---\n\n## Practice Exercises\n\nHere are some exercises to reinforce your learning:\n\n1. **Modify `ToyActor`** to return a value instead of printing\n2. **Create a chain** of 3 actor groups where A → B → C → A\n3. **Add a counter** to `PingPongActor` that tracks messages sent/received\n4. **Experiment with different mesh sizes** - try 8 or 16 actors\n\nReady for real-world distributed training? Head to **[Studio 1](./studio_1_getting_started.ipynb)** next!\n\nHappy coding with Monarch! 🎊" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Cleanup\n", + "\n", + "When you're done, it's good practice to stop the process meshes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Stop the meshes\n", + "await toy_mesh.stop()\n", + "await mesh_a.stop()\n", + "await mesh_b.stop()\n", + "\n", + "print(\"✓ All process meshes stopped\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/lightning/studio_1_getting_started.ipynb b/examples/lightning/studio_1_getting_started.ipynb new file mode 100644 index 000000000..f35cc6bb2 --- /dev/null +++ b/examples/lightning/studio_1_getting_started.ipynb @@ -0,0 +1,531 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": "# Studio 1: Getting Started - Multi-Node Training with Monarch & Lightning\n\nWelcome! This notebook will guide you through running **distributed multi-node training** using **Monarch** (Meta's distributed actor framework) with **TorchTitan** (PyTorch's large-scale LLM training library) on **Lightning AI** infrastructure.\n\n
    <img src=\"./assets/NB_Monarch_Lightning.svg\" alt=\"Monarch Lightning architecture\"/>\n
\n\n## What You'll Learn\n\nBy the end of this notebook, you'll:\n- Set up TorchTitan, Monarch, and Lightning SDK\n- Launch a multi-node training job on Lightning AI\n- Run distributed Llama-3-8B training across multiple GPUs\n- Monitor and manage your distributed training\n\n## Prerequisites\n\n- **Monarch Basics**: New to Monarch? Start with [Studio 0: Monarch Basics](./studio_0_monarch_basics.ipynb) to learn about Actors, Endpoints, and Process Meshes\n- Lightning AI account with access to GPU machines (L40S recommended)\n- Hugging Face account with Llama model access\n- Basic understanding of distributed training concepts\n\n## Lightning Studios Series\n\nThis is **Studio 1** of the series:\n\n- **[Studio 0: Monarch Basics](./studio_0_monarch_basics.ipynb)** - Learn Monarch fundamentals (Start here if new!)\n- **Studio 1: Getting Started** - Multi-node training (YOU ARE HERE)\n- **[Studio 2: Workspace Sync](./studio_2_workspace_sync.ipynb)** - Hot-reload configs without restarting\n- **[Studio 3: Interactive Debugging](./studio_3_interactive_debugging.ipynb)** - Debug distributed systems\n\nLet's get started!" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Part I: Environment Setup\n", + "\n", + "Before running distributed training, we need to install dependencies. Follow the steps below." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install TorchTitan\n", + "\n", + "Clone the TorchTitan repository, install the nightly PyTorch build with CUDA 12.6 support, and install TorchTitan:\n", + "\n", + "```bash\n", + "git clone https://github.com/pytorch/torchtitan.git\n", + "cd torchtitan\n", + "pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 --force-reinstall\n", + "pip install .\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download Llama-3-8B Model Assets\n", + "\n", + "Download the Llama-3.1-8B tokenizer from Hugging Face. You'll need a Hugging Face token with access to the Llama models:\n", + "\n", + "```bash\n", + "python scripts/download_hf_assets.py \\\n", + " --repo_id meta-llama/Llama-3.1-8B \\\n", + " --assets tokenizer \\\n", + " --hf_token=YOUR_HUGGINGFACE_TOKEN_KEY\n", + "```\n", + "\n", + "Replace `YOUR_HUGGINGFACE_TOKEN_KEY` with your actual Hugging Face token." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install Monarch\n", + "\n", + "Install Monarch from the GitHub repository following the Ubuntu installation instructions:\n", + "\n", + "```bash\n", + "git clone https://github.com/meta-pytorch/monarch.git\n", + "cd monarch\n", + "# Follow the Ubuntu installation instructions from the repository\n", + "```\n", + "\n", + "For detailed installation steps, visit: https://github.com/meta-pytorch/monarch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup Weights & Biases\n", + "\n", + "Install wandb for experiment tracking:\n", + "\n", + "```bash\n", + "pip install wandb\n", + "wandb login\n", + "```\n", + "\n", + "Follow the prompts to authenticate with your wandb account." 
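Optionally, you can confirm the login from Python before launching any jobs. The snippet below is a minimal sketch using the standard `wandb` API; the project name is a placeholder, not something this notebook relies on.

```python
import wandb

# Prompts for an API key if one is not configured yet; returns True once authenticated.
logged_in = wandb.login()
print(f"wandb authenticated: {logged_in}")

# Optional end-to-end check: open and immediately close a throwaway run.
run = wandb.init(project="monarch-lightning-smoke-test")  # placeholder project name
run.finish()
```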
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Update the Lightning SDK\n", + "\n", + "Install the latest version of Lightning SDK for IP sharing features:\n", + "\n", + "```bash\n", + "pip install -U lightning_sdk\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Verify Installations\n", + "\n", + "After completing the installation steps above, verify that TorchTitan and Monarch are properly installed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Verify TorchTitan installation\n", + "import torchtitan\n", + "print(\"TorchTitan is installed successfully\")\n", + "\n", + "# Verify Monarch installation\n", + "import monarch\n", + "print(\"Monarch is installed successfully\")\n", + "\n", + "# Verify PyTorch and CUDA\n", + "import torch\n", + "print(f\"PyTorch version: {torch.__version__}\")\n", + "print(f\"CUDA available: {torch.cuda.is_available()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Part II: Multi-Node Training with Monarch and Lightning\n", + "\n", + "Now that the environment is set up, we'll configure and launch distributed training across multiple nodes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Lightning SDK Components\n", + "\n", + "Import the necessary classes from Lightning SDK to manage multi-machine training jobs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lightning_sdk import Machine, MMT, Studio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure Training Job Parameters\n", + "\n", + "Set up the configuration for the multi-node training job. We'll start with **2 nodes** to keep things manageable.\n", + "\n", + "> **Note:** You can easily scale this up to 16+ nodes once you're comfortable with the workflow!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration\n", + "import os\n", + "\n", + "NUM_NODES = 2\n", + "NUM_GPUS = 8\n", + "TEAMSPACE = \"general\" # Replace with your teamspace\n", + "USER = \"your-username\" # Replace with your username\n", + "MMT_JOB_NAME = f\"Monarch-MMT-{NUM_NODES}-nodes\"\n", + "\n", + "# Remote allowed port range for worker nodes\n", + "REMOTE_ALLOWED_PORT_RANGE = \"26601..26611\"\n", + "\n", + "# To force Monarch to use V0 for this Notebook (This will be removed in the future)\n", + "os.environ[\"MONARCH_V0_WORKAROUND_DO_NOT_USE\"] = \"1\"\n", + "os.environ[\"MONARCH_FILE_LOG\"] = \"debug\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define MMT Job Launch Function\n", + "\n", + "Create a function to launch a multi-machine training (MMT) job using Lightning SDK." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def launch_mmt_job(num_nodes=2, teamspace=\"my-teamspace\", user=\"my-user\"):\n", + " \"\"\"\n", + " Launch a multi-machine training job using Lightning SDK's MMT API.\n", + " \"\"\"\n", + "\n", + " studio = Studio()\n", + "\n", + " # Install the MMT plugin before running the actual job\n", + " studio.install_plugin(\"multi-machine-training\")\n", + "\n", + " print(f\"Launching MMT job with {num_nodes} nodes...\")\n", + "\n", + " # Machine with L40S GPUs\n", + " machine_type = getattr(Machine, f\"L40S_X_{NUM_GPUS}\")\n", + "\n", + " job = MMT.run(\n", + " command=\"process_allocator\",\n", + " name=MMT_JOB_NAME,\n", + " machine=machine_type,\n", + " studio=studio,\n", + " num_machines=num_nodes,\n", + " env={\n", + " \"CUDA_VISIBLE_DEVICES\": \"0,1,2,3,4,5,6,7\",\n", + " \"MONARCH_FILE_LOG\": \"debug\",\n", + " \"HYPERACTOR_REMOTE_ALLOC_ALLOWED_PORT_RANGE\": REMOTE_ALLOWED_PORT_RANGE,\n", + " \"HYPERACTOR_REMOTE_ALLOC_BIND_TO_INADDR_ANY\": \"true\",\n", + " \"WORKSPACE_DIR\": \"/tmp\",\n", + " },\n", + " )\n", + "\n", + " print(f\"Job started with ID: {job.name}\")\n", + " print(f\"Job status: {job.status}\")\n", + "\n", + " return job, studio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch the Multi-Node Training Job\n", + "\n", + "Execute the launch function to start the distributed training infrastructure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Launch the job\n", + "job, studio = launch_mmt_job(\n", + " num_nodes=NUM_NODES, teamspace=TEAMSPACE, user=USER\n", + ")\n", + "\n", + "print(f\"\\nJob launched. You can monitor it using: job.status\")\n", + "print(f\"To stop the job: job.stop()\")\n", + "print(f\"To clean up: studio.stop()\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Monitor Job Status\n", + "\n", + "You can monitor your job through the MMT plugin in Lightning AI. The nodes will go through these stages:\n", + "\n", + "1. **Pending** - Waiting for resources\n", + "2. **Setting up** - Installing dependencies and snapshotting environment\n", + "3. **Ready** - All nodes ready with `process_allocator` running\n", + "\n", + "Wait for all nodes to show **Ready** status before proceeding to the next cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check job status\n", + "print(f\"Current job status: {job.status}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set Up Process Mesh from Job\n", + "\n", + "Initialize the Monarch process mesh using the launched Lightning job. This creates the distributed computing mesh that connects all nodes and GPUs.\n", + "\n", + "> **Important:** Make sure the `process_allocator` process is running on all nodes before running this cell!" 
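The next cell imports `setup_proc_mesh_from_job` from a local `utils/mesh_utils.py` module that is not reproduced in this notebook. As a rough sketch of what such a helper can do, the code below wraps the remote-allocator flow used in the hero notebook in this directory. The function name, the `world_id`, the mesh names, the one-process-per-host default, and the port `26600` are illustrative assumptions; the port must match whatever `process_allocator` listens on for your job.

```python
from monarch._src.actor.allocator import RemoteAllocator, StaticRemoteAllocInitializer
from monarch.actor import HostMesh
from monarch._rust_bindings.monarch_hyperactor.shape import Extent

# Assumed default port; process_allocator on each worker must be reachable on it.
MONARCH_DEFAULT_PORT = 26600


def setup_proc_mesh_from_job_sketch(job, num_nodes: int, num_gpus: int, num_procs_per_host: int = 1):
    """Hypothetical sketch: build a Monarch proc mesh from a Lightning MMT job."""
    # Worker IPs are empty strings until the nodes report Ready.
    ips = {machine.public_ip for machine in job.machines}
    if ips == {""}:
        raise ValueError("Worker IPs are not available yet - wait until all nodes show Ready")

    # Monarch channel addresses for the process_allocator running on each worker node.
    tcp_addresses = [f"tcp!{ip}:{MONARCH_DEFAULT_PORT}" for ip in ips]

    # Allocate hosts through the remote allocators, then spawn one process per GPU on every host.
    allocator = RemoteAllocator(
        world_id="lightning-mmt",
        initializer=StaticRemoteAllocInitializer(*tcp_addresses),
    )
    host_mesh = HostMesh.allocate_nonblocking(
        "lightning-hosts",
        extent=Extent(["hosts", "procs"], [num_nodes, num_procs_per_host]),
        allocator=allocator,
    )
    return host_mesh.spawn_procs({"gpus": num_gpus})
```

The hero notebook additionally sets `HYPERACTOR_REMOTE_ALLOC_BOOTSTRAP_ADDR` and related `HYPERACTOR_REMOTE_ALLOC_*` variables on the client before allocating; in this studio the corresponding settings are passed to the workers through the job's `env` in `launch_mmt_job` above.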
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from utils.mesh_utils import setup_proc_mesh_from_job\n", + "\n", + "proc_mesh = setup_proc_mesh_from_job(job, NUM_NODES, NUM_GPUS)\n", + "print(\"\\nProcess mesh initialized successfully!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Run TorchTitan Training for Llama-3-8B\n", + "\n", + "Now we'll define a Monarch Actor that wraps TorchTitan's training functionality and run distributed training." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate Job Name Helper\n", + "\n", + "Create a unique job name for tracking." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "\n", + "def get_job_name(num_hosts: int, num_gpus_per_host: int):\n", + " return f\"monarch-{getpass.getuser()}-hosts{num_hosts}-gpus{num_gpus_per_host}\"\n", + "\n", + "print(get_job_name(num_hosts=NUM_NODES, num_gpus_per_host=NUM_GPUS))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define TorchTitan Trainer Actor\n", + "\n", + "Create the `TitanTrainerWrapper` class, a Monarch Actor that wraps TorchTitan's training functionality." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import logging\n", + "from monarch.actor import ProcMesh, Actor, endpoint, current_rank\n", + "import socket\n", + "from torchtitan.tools.logging import init_logger, logger\n", + "from torchtitan.train import Trainer\n", + "from typing import Optional\n", + "import torch\n", + "from torchtitan.config import JobConfig\n", + "\n", + "\n", + "class TitanTrainerWrapper(Actor):\n", + " def __init__(self, job_config: JobConfig):\n", + " self.rank = current_rank().rank\n", + " self.job_config = job_config\n", + "\n", + " def _rprint(self, msg):\n", + " \"\"\"Helper method to print with rank information.\"\"\"\n", + " print(f\"{self.rank=} {msg}\")\n", + "\n", + " @endpoint\n", + " def init(self):\n", + " logging.getLogger().addHandler(logging.StreamHandler(sys.stderr))\n", + " print(f\"Initializing actor: {self.rank} {current_rank()=} {socket.gethostname()=}\")\n", + "\n", + " @endpoint\n", + " def train(self):\n", + " logger.info(\"Starting training\")\n", + " config = self.job_config\n", + " trainer: Optional[Trainer] = None\n", + "\n", + " try:\n", + " trainer = Trainer(config)\n", + " trainer.train()\n", + "\n", + " if config.checkpoint.create_seed_checkpoint:\n", + " assert (\n", + " int(os.environ[\"WORLD_SIZE\"]) == 1\n", + " ), \"Must create seed checkpoint using a single device, to disable sharding.\"\n", + " assert config.checkpoint.enable, \"Must enable checkpointing when creating a seed checkpoint.\"\n", + " trainer.checkpointer.save(curr_step=0)\n", + " logger.info(\"Created seed checkpoint\")\n", + " else:\n", + " trainer.train()\n", + " finally:\n", + " if trainer:\n", + " trainer.close()\n", + "\n", + " if torch.distributed.is_initialized():\n", + " torch.distributed.destroy_process_group()\n", + " logger.info(\"Process group destroyed.\")\n", + " print(\"Done training\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Async Main Training Function\n", + "\n", + "Set up the main asynchronous function that orchestrates distributed training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torchtitan.config import ConfigManager, JobConfig\n", + "from monarch.tools.network import AddrType\n", + "from monarch.utils import setup_env_for_distributed\n", + "\n", + "\n", + "async def async_main(job_config: JobConfig):\n", + " torch.use_deterministic_algorithms(True)\n", + " job_name = get_job_name(NUM_NODES, NUM_GPUS)\n", + "\n", + " # Use IPv4 for MASTER_ADDR\n", + " await setup_env_for_distributed(proc_mesh, use_ipaddr=AddrType.IPv4)\n", + "\n", + " await proc_mesh.logging_option(stream_to_client=True, aggregate_window_sec=3)\n", + "\n", + " print(job_config)\n", + " print(f\"Spawning meshes on {job_name}\")\n", + "\n", + " trainer_actor = proc_mesh.spawn(\"trainer_actor\", TitanTrainerWrapper, job_config)\n", + "\n", + " await trainer_actor.init.call()\n", + " await trainer_actor.train.call()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Logger and Run Training\n", + "\n", + "Configure the TorchTitan logger, set up training parameters, and execute the training pipeline.\n", + "\n", + "> **Note:** This will train Llama-3-8B for 25 steps. Adjust the paths below to match your setup." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "init_logger()\n", + "config_manager = ConfigManager()\n", + "\n", + "job_name = get_job_name(NUM_NODES, NUM_GPUS)\n", + "\n", + "manual_args = [\n", + " \"--job.config_file\",\n", + " os.path.expanduser(\"/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml\"),\n", + " \"--model.tokenizer-path\",\n", + " \"/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B\",\n", + " \"--training.steps\",\n", + " \"25\",\n", + " \"--training.dataset_path\",\n", + " \"/teamspace/studios/this_studio/torchtitan/tests/assets/c4_test\",\n", + " \"--job.dump_folder\",\n", + " \"/teamspace/studios/this_studio/torchtitan/outputs/\" + job_name,\n", + " \"--training.seq_len\",\n", + " \"1024\",\n", + "]\n", + "config = config_manager.parse_args(manual_args)\n", + "await async_main(config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "---\n\n# 🎉 Congratulations! 
🎉\n\nYou just ran **interactive distributed training** for a Llama-3-8B model in a Jupyter notebook using **Monarch actors** and **Lightning infrastructure**!\n\n## What You Accomplished\n\n- Launched a multi-node training job on Lightning AI\n- Set up a distributed process mesh with Monarch\n- Ran TorchTitan training across multiple GPUs and nodes\n- Monitored training with aggregated logging\n\n## Key Benefits\n\n- **Flexibility**: Change configurations and relaunch training without restarting nodes\n- **Observability**: Monarch aggregates logs from all ranks\n- **Scalability**: Easily scale from 2 to 16+ nodes by changing `NUM_NODES`\n\n## Next Steps\n\nNow that you've mastered multi-node training, continue with the Lightning Studios series:\n\n### 🔄 Studio 2: Workspace Synchronization (Recommended Next!)\n**[studio_2_workspace_sync.ipynb](./studio_2_workspace_sync.ipynb)**\n\nLearn how to:\n- Sync local code/config changes to remote nodes **without restarting**\n- Hot-reload training configurations\n- Iterate faster on distributed training (10x speedup!)\n\n### 🐛 Studio 3: Interactive Debugging\n**[studio_3_interactive_debugging.ipynb](./studio_3_interactive_debugging.ipynb)**\n\nMaster advanced debugging:\n- Set breakpoints in distributed actors\n- Debug specific ranks interactively\n- Inspect environment variables across nodes\n\n### 📚 Review Monarch Basics\n**[studio_0_monarch_basics.ipynb](./studio_0_monarch_basics.ipynb)**\n\nIf you want to review Monarch fundamentals:\n- Actors, Endpoints, and Process Meshes\n- Calling patterns (`.call()` vs `.call_one()`)\n- Actor-to-actor communication\n\n---\n\n## Cleanup\n\nWhen you're done, remember to stop the process mesh and clean up resources:" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Stop the process mesh\n", + "await proc_mesh.stop()\n", + "\n", + "# Stop the Lightning job\n", + "job.stop()\n", + "\n", + "print(\"Cleanup complete!\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/lightning/studio_2_workspace_sync.ipynb b/examples/lightning/studio_2_workspace_sync.ipynb new file mode 100644 index 000000000..39fbce5e3 --- /dev/null +++ b/examples/lightning/studio_2_workspace_sync.ipynb @@ -0,0 +1,542 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": "# Studio 2: Hot-Reloading with Workspace Synchronization\n\nWelcome to Studio 2! In this notebook, you'll learn one of Monarch's most powerful features: **workspace synchronization**.\n\n## The Problem\n\nIn traditional distributed training:\n1. You launch a multi-node job (takes 5-10 minutes)\n2. You realize you need to change a config value (e.g., learning rate)\n3. You have to **stop everything** and restart (another 5-10 minutes)\n4. Rinse and repeat...\n\nThis is incredibly frustrating and wastes valuable time and compute resources!\n\n## The Solution: Workspace Sync\n\nWith Monarch's `proc_mesh.sync_workspace()`:\n1. Launch your multi-node job once\n2. Edit configs or code **locally**\n3. Run `sync_workspace()` to propagate changes to all remote nodes\n4. 
Re-run training with updated configs - **no restart needed!**\n\n## What You'll Learn\n\n- How workspace synchronization works\n- Creating and modifying training configs locally\n- Syncing changes to remote worker nodes\n- Verifying synchronization across the cluster\n- Practical hot-reload workflows\n\n## Prerequisites\n\n**Required:** Complete [Studio 1: Getting Started](./studio_1_getting_started.ipynb) first!\n\nYou should have:\n- A running multi-node Lightning job\n- An initialized Monarch process mesh\n- Basic understanding of Monarch actors\n\n**New to Monarch?** Start with [Studio 0: Monarch Basics](./studio_0_monarch_basics.ipynb) to learn the fundamentals!\n\n## Lightning Studios Series\n\nThis is **Studio 2** of the series:\n\n- **[Studio 0: Monarch Basics](./studio_0_monarch_basics.ipynb)** - Learn Monarch fundamentals\n- **[Studio 1: Getting Started](./studio_1_getting_started.ipynb)** - Multi-node training\n- **Studio 2: Workspace Sync** - Hot-reload configs (YOU ARE HERE)\n- **[Studio 3: Interactive Debugging](./studio_3_interactive_debugging.ipynb)** - Debug distributed systems\n\n## Quick Recap from Studio 1\n\nIf you completed Studio 1, you should have:\n- `job` - Your Lightning MMT job\n- `proc_mesh` - Your Monarch process mesh\n- `NUM_NODES` and `NUM_GPUS` configured\n\nIf you need to restart, run the setup cells from Studio 1 first.\n\nLet's get started!" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Setup (If Starting Fresh)\n", + "\n", + "If you're continuing from Studio 1, **skip this section**. If you're starting fresh, run these cells to set up your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Only run if starting fresh (not continuing from Studio 1)\n", + "from lightning_sdk import Machine, MMT, Studio\n", + "import os\n", + "\n", + "NUM_NODES = 2\n", + "NUM_GPUS = 8\n", + "TEAMSPACE = \"general\"\n", + "USER = \"your-username\"\n", + "MMT_JOB_NAME = f\"Monarch-MMT-{NUM_NODES}-nodes\"\n", + "REMOTE_ALLOWED_PORT_RANGE = \"26601..26611\"\n", + "\n", + "os.environ[\"MONARCH_V0_WORKAROUND_DO_NOT_USE\"] = \"1\"\n", + "os.environ[\"MONARCH_FILE_LOG\"] = \"debug\"\n", + "\n", + "# Launch job (see Studio 1 for full details)\n", + "# job, studio = launch_mmt_job(...)\n", + "# proc_mesh = setup_proc_mesh_from_job(job, NUM_NODES, NUM_GPUS)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Workspace Synchronization Workflow\n", + "\n", + "Let's dive into workspace sync with a practical example!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define File Checker Actor\n", + "\n", + "First, we'll create an actor that can read and verify file contents on remote nodes. This helps us confirm that files are properly synchronized." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from monarch.actor import Actor, endpoint, current_rank\n", + "import os\n", + "import socket\n", + "\n", + "\n", + "class FileCheckerActor(Actor):\n", + " \"\"\"Actor to read and verify file contents on remote nodes.\"\"\"\n", + "\n", + " def __init__(self):\n", + " self.rank = current_rank().rank\n", + " self.hostname = socket.gethostname()\n", + "\n", + " @endpoint\n", + " def read_file(self, file_path: str) -> dict:\n", + " \"\"\"Read a file and return its contents.\"\"\"\n", + " try:\n", + " with open(file_path, 'r') as f:\n", + " content = f.read()\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"file_path\": file_path,\n", + " \"content\": content,\n", + " \"exists\": True,\n", + " \"size\": len(content)\n", + " }\n", + " except FileNotFoundError:\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"file_path\": file_path,\n", + " \"exists\": False,\n", + " \"error\": \"File not found\"\n", + " }\n", + " except Exception as e:\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"file_path\": file_path,\n", + " \"exists\": False,\n", + " \"error\": str(e)\n", + " }\n", + "\n", + " @endpoint\n", + " def file_exists(self, file_path: str) -> dict:\n", + " \"\"\"Check if a file exists on the remote node.\"\"\"\n", + " exists = os.path.exists(file_path)\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"file_path\": file_path,\n", + " \"exists\": exists\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spawn File Checker Actor\n", + "\n", + "Spawn the file checker actor across all nodes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Spawn the file checker actor\n", + "file_checker = proc_mesh.spawn(\"file_checker\", FileCheckerActor)\n", + "print(\"FileCheckerActor spawned across all nodes\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a Local Configuration File\n", + "\n", + "Let's create a training configuration file locally. This simulates a common workflow where you want to tweak hyperparameters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a local workspace directory for our custom config\n", + "local_workspace = \"/teamspace/studios/this_studio/monarch_sync_example\"\n", + "os.makedirs(local_workspace, exist_ok=True)\n", + "\n", + "# Create a custom training configuration file\n", + "config_file_name = \"custom_training_config.toml\"\n", + "local_config_path = os.path.join(local_workspace, config_file_name)\n", + "\n", + "# Write initial configuration\n", + "initial_config = \"\"\"# TorchTitan Custom Training Configuration\n", + "# Version 1.0 - Initial configuration\n", + "\n", + "[training]\n", + "batch_size = 32\n", + "learning_rate = 0.001\n", + "max_steps = 100\n", + "warmup_steps = 10\n", + "\n", + "[model]\n", + "model_type = \"llama3_8b\"\n", + "seq_len = 1024\n", + "\n", + "[optimizer]\n", + "optimizer_type = \"AdamW\"\n", + "weight_decay = 0.01\n", + "\"\"\"\n", + "\n", + "with open(local_config_path, 'w') as f:\n", + " f.write(initial_config)\n", + "\n", + "print(f\"✓ Created local config file: {local_config_path}\")\n", + "print(f\"\\nInitial configuration:\\n{'-'*50}\")\n", + "print(initial_config)\n", + "print(f\"{'-'*50}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Workspace and Perform Initial Sync\n", + "\n", + "Now we'll create a Monarch `Workspace` object and sync our local directory to all remote nodes.\n", + "\n", + "**This is the magic step!** 🪄" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from monarch.tools.config.workspace import Workspace\n", + "from pathlib import Path\n", + "\n", + "# Create a Workspace object pointing to our local directory\n", + "workspace = Workspace(dirs=[Path(local_workspace)])\n", + "\n", + "print(f\"Workspace configured: {workspace.dirs}\")\n", + "print(f\"\\n🔄 Syncing workspace to {NUM_NODES * NUM_GPUS} remote processes...\")\n", + "\n", + "# Perform initial sync\n", + "await proc_mesh.sync_workspace(workspace=workspace, conda=False, auto_reload=False)\n", + "\n", + "print(\"\\n✅ Initial workspace sync completed!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Verify File on Remote Nodes\n", + "\n", + "Let's verify that our config file was successfully synced to all remote worker nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Construct the remote file path (files are synced to WORKSPACE_DIR)\n", + "remote_workspace_root = os.environ.get(\"WORKSPACE_DIR\", \"/workspace\")\n", + "remote_config_path = os.path.join(remote_workspace_root, \"monarch_sync_example\", config_file_name)\n", + "\n", + "print(f\"Checking file on remote nodes: {remote_config_path}\\n\")\n", + "\n", + "# Check file existence on all nodes (just check first rank of each node)\n", + "exists_results = await file_checker.file_exists.call(remote_config_path)\n", + "\n", + "# Group by hostname to show node-level status\n", + "nodes_checked = set()\n", + "for result in exists_results:\n", + " hostname = result['hostname']\n", + " if hostname not in nodes_checked:\n", + " status = \"✓ EXISTS\" if result['exists'] else \"✗ NOT FOUND\"\n", + " print(f\" Node {hostname}: {status}\")\n", + " nodes_checked.add(hostname)\n", + "\n", + "# Read file content from rank 0 to verify\n", + "print(f\"\\n📄 Reading config from rank 0:\")\n", + "print(f\"{'-'*50}\")\n", + "read_results = await file_checker.read_file.call(remote_config_path)\n", + "if read_results[0]['exists']:\n", + " print(read_results[0]['content'])\n", + "else:\n", + " print(f\"Error: {read_results[0].get('error', 'Unknown error')}\")\n", + "print(f\"{'-'*50}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Hot-Reload: Modify and Re-Sync\n", + "\n", + "Now comes the powerful part! Let's modify our config locally and sync it again - **without restarting anything**." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modify Local Configuration\n", + "\n", + "Let's say we want to:\n", + "- Decrease the learning rate (0.001 → 0.0005)\n", + "- Increase max steps (100 → 200)\n", + "- Change sequence length (1024 → 2048)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Modify the configuration\n", + "updated_config = \"\"\"# TorchTitan Custom Training Configuration\n", + "# Version 2.0 - Updated after initial run\n", + "\n", + "[training]\n", + "batch_size = 32\n", + "learning_rate = 0.0005 # ← CHANGED: Reduced from 0.001\n", + "max_steps = 200 # ← CHANGED: Increased from 100\n", + "warmup_steps = 10\n", + "\n", + "[model]\n", + "model_type = \"llama3_8b\"\n", + "seq_len = 2048 # ← CHANGED: Increased from 1024\n", + "\n", + "[optimizer]\n", + "optimizer_type = \"AdamW\"\n", + "weight_decay = 0.01\n", + "\"\"\"\n", + "\n", + "# Write updated config locally\n", + "with open(local_config_path, 'w') as f:\n", + " f.write(updated_config)\n", + "\n", + "print(f\"✓ Updated local config file: {local_config_path}\")\n", + "print(f\"\\nUpdated configuration:\\n{'-'*50}\")\n", + "print(updated_config)\n", + "print(f\"{'-'*50}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Re-Sync to Remote Nodes\n", + "\n", + "Now sync the changes to all remote nodes. This is instant - no job restart required!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"🔄 Re-syncing updated workspace to remote nodes...\")\n", + "\n", + "# Sync again - Monarch only transfers what changed!\n", + "await proc_mesh.sync_workspace(workspace=workspace, conda=False, auto_reload=False)\n", + "\n", + "print(\"\\n✅ Workspace re-sync completed!\")\n", + "print(\"\\n💡 The updated config is now available on all remote nodes!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Verify Updated File on Remote Nodes\n", + "\n", + "Let's confirm the updated config made it to the remote nodes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"📄 Reading updated config from rank 0:\")\n", + "print(f\"{'-'*50}\")\n", + "\n", + "read_results = await file_checker.read_file.call(remote_config_path)\n", + "if read_results[0]['exists']:\n", + " remote_content = read_results[0]['content']\n", + " print(remote_content)\n", + " \n", + " # Verify it matches our local update\n", + " if \"learning_rate = 0.0005\" in remote_content and \"max_steps = 200\" in remote_content:\n", + " print(f\"{'-'*50}\")\n", + " print(\"\\n✅ SUCCESS! Remote config matches local changes:\")\n", + " print(\" ✓ Learning rate: 0.001 → 0.0005\")\n", + " print(\" ✓ Max steps: 100 → 200\")\n", + " print(\" ✓ Sequence length: 1024 → 2048\")\n", + " else:\n", + " print(f\"{'-'*50}\")\n", + " print(\"\\n⚠️ Warning: Remote config may not have updated correctly\")\n", + "else:\n", + " print(f\"Error: {read_results[0].get('error', 'Unknown error')}\")\n", + " print(f\"{'-'*50}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Real-World Workflow Example\n", + "\n", + "Here's how you'd use workspace sync in a real training scenario:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Workflow: Iterative Training with Config Changes\n", + "\n", + "```python\n", + "# 1. Initial training run\n", + "await async_main(config) # Train with initial settings\n", + "\n", + "# 2. Review results, decide to adjust learning rate\n", + "# Edit local config file...\n", + "\n", + "# 3. Sync changes (< 1 second)\n", + "await proc_mesh.sync_workspace(workspace=workspace)\n", + "\n", + "# 4. Re-run training with new config (no restart!)\n", + "config = config_manager.parse_args(manual_args) # Reload config\n", + "await async_main(config) # Train with updated settings\n", + "\n", + "# 5. Repeat as needed!\n", + "```\n", + "\n", + "### Time Savings\n", + "\n", + "**Without Monarch:**\n", + "- Change config: 1 min\n", + "- Stop job: 1 min\n", + "- Restart job: 5-10 min\n", + "- **Total per iteration: ~7-12 min**\n", + "\n", + "**With Monarch:**\n", + "- Change config: 1 min\n", + "- Sync: < 1 sec\n", + "- **Total per iteration: ~1 min**\n", + "\n", + "**10x faster iteration!** 🚀" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced: Syncing Multiple Files and Directories\n", + "\n", + "You can sync entire directory trees, not just single files!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example: Sync multiple directories\n", + "from pathlib import Path\n", + "\n", + "# Create a workspace with multiple directories\n", + "multi_dir_workspace = Workspace(dirs=[\n", + " Path(\"/teamspace/studios/this_studio/configs\"),\n", + " Path(\"/teamspace/studios/this_studio/custom_modules\"),\n", + " Path(\"/teamspace/studios/this_studio/data_processors\"),\n", + "])\n", + "\n", + "# Sync all directories at once\n", + "# await proc_mesh.sync_workspace(workspace=multi_dir_workspace)\n", + "\n", + "print(\"\\n💡 Tip: You can sync entire project directories, not just config files!\")\n", + "print(\"This enables hot-reloading of:\")\n", + "print(\" • Training scripts\")\n", + "print(\" • Model definitions\")\n", + "print(\" • Data preprocessing code\")\n", + "print(\" • Custom layers and modules\")\n", + "print(\" • And more!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# 🎉 Congratulations! 🎉\n", + "\n", + "You've mastered **workspace synchronization** with Monarch!\n", + "\n", + "## What You Learned\n", + "\n", + "- Creating a Monarch `Workspace` for local directories\n", + "- Syncing files to remote nodes with `proc_mesh.sync_workspace()`\n", + "- Verifying synchronization across the cluster\n", + "- Hot-reloading configs without job restarts\n", + "- Real-world iterative training workflows\n", + "\n", + "## Key Takeaways\n", + "\n", + "- **10x faster iteration** - No more waiting for job restarts\n", + "- **Edit locally, run remotely** - Keep your familiar dev environment\n", + "- **Sync is smart** - Only changed files are transferred\n", + "- **Works with any files** - Configs, code, data processors, etc.\n", + "\n", + "## Next Steps\n", + "\n", + "### 🐛 Studio 3: Interactive Debugging (Recommended Next)\n", + "Learn advanced debugging techniques:\n", + "- Set breakpoints in distributed actors\n", + "- Debug specific ranks with `monarch debug`\n", + "- Inspect and modify environment variables\n", + "- Troubleshoot training issues interactively\n", + "\n", + "### 📚 Back to Studio 1\n", + "Review the basics: [Studio 1: Getting Started](./studio_1_getting_started.ipynb)\n", + "\n", + "---\n", + "\n", + "## Try It Yourself!\n", + "\n", + "Before moving on, try modifying the config one more time:\n", + "1. Change the batch size to 64\n", + "2. Sync the workspace\n", + "3. Verify the changes\n", + "\n", + "This workflow will become second nature!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/lightning/studio_3_interactive_debugging.ipynb b/examples/lightning/studio_3_interactive_debugging.ipynb new file mode 100644 index 000000000..77e9595d1 --- /dev/null +++ b/examples/lightning/studio_3_interactive_debugging.ipynb @@ -0,0 +1,678 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": "# Studio 3: Interactive Debugging for Distributed Training\n\nWelcome to Studio 3! 
In this notebook, you'll master **interactive debugging** techniques for distributed systems using Monarch.\n\n## The Challenge\n\nDebugging distributed training is notoriously difficult:\n- Issues may only appear on specific ranks or nodes\n- Traditional debuggers don't work across multiple processes\n- Environment differences between nodes are hard to inspect\n- Logs from 128+ processes are overwhelming\n\n## Monarch's Solution\n\nMonarch provides powerful debugging capabilities:\n1. **Interactive breakpoints** - Use `pdb` with distributed actors\n2. **Selective debugging** - Attach to specific ranks\n3. **Environment inspection** - Query env vars across all nodes\n4. **Monarch debug CLI** - Unified interface for distributed debugging\n\n## What You'll Learn\n\n### Environment Variable Management\n- Inspect environment variables across nodes\n- Set and modify env vars remotely\n- Query variables by prefix (CUDA, NCCL, etc.)\n\n### Interactive Debugging with Breakpoints\n- Add breakpoints to actor methods\n- Use `monarch debug` CLI\n- Attach to specific ranks for interactive debugging\n- Send debugger commands to multiple ranks\n\n## Prerequisites\n\n**Recommended:** Complete [Studio 1: Getting Started](./studio_1_getting_started.ipynb) and [Studio 2: Workspace Sync](./studio_2_workspace_sync.ipynb) first!\n\nYou should have:\n- A running multi-node Lightning job\n- An initialized Monarch process mesh\n- Understanding of Monarch actors and endpoints\n\n**New to Monarch?** Start with [Studio 0: Monarch Basics](./studio_0_monarch_basics.ipynb) to learn about Actors, Endpoints, and Process Meshes!\n\n## Lightning Studios Series\n\nThis is **Studio 3** of the series:\n\n- **[Studio 0: Monarch Basics](./studio_0_monarch_basics.ipynb)** - Learn Monarch fundamentals\n- **[Studio 1: Getting Started](./studio_1_getting_started.ipynb)** - Multi-node training\n- **[Studio 2: Workspace Sync](./studio_2_workspace_sync.ipynb)** - Hot-reload configs\n- **Studio 3: Interactive Debugging** - Debug distributed systems (YOU ARE HERE)\n\nLet's dive in!" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Setup (If Starting Fresh)\n", + "\n", + "If you're continuing from Studio 1 or 2, **skip this section**. Otherwise, run these cells." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Only run if starting fresh\n", + "from lightning_sdk import Machine, MMT, Studio\n", + "import os\n", + "\n", + "NUM_NODES = 2\n", + "NUM_GPUS = 8\n", + "TEAMSPACE = \"general\"\n", + "USER = \"your-username\"\n", + "\n", + "os.environ[\"MONARCH_V0_WORKAROUND_DO_NOT_USE\"] = \"1\"\n", + "os.environ[\"MONARCH_FILE_LOG\"] = \"debug\"\n", + "\n", + "# Launch job and setup proc_mesh (see Studio 1 for details)\n", + "# job, studio = launch_mmt_job(...)\n", + "# proc_mesh = setup_proc_mesh_from_job(job, NUM_NODES, NUM_GPUS)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Part 1: Environment Variable Management\n", + "\n", + "Let's start by creating an actor that can inspect and manage environment variables across all nodes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Environment Variable Actor\n", + "\n", + "This actor provides methods to get, set, and list environment variables on remote nodes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from monarch.actor import Actor, endpoint, current_rank\n", + "import os\n", + "import socket\n", + "\n", + "\n", + "class EnvVarActor(Actor):\n", + " \"\"\"Actor for managing environment variables on remote nodes.\"\"\"\n", + "\n", + " def __init__(self):\n", + " self.rank = current_rank().rank\n", + " self.hostname = socket.gethostname()\n", + "\n", + " @endpoint\n", + " def get_env(self, var_name: str) -> dict:\n", + " \"\"\"Get an environment variable value from the remote node.\"\"\"\n", + " value = os.environ.get(var_name)\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"var_name\": var_name,\n", + " \"value\": value\n", + " }\n", + "\n", + " @endpoint\n", + " def set_env(self, var_name: str, var_value: str) -> dict:\n", + " \"\"\"Set an environment variable on the remote node.\"\"\"\n", + " os.environ[var_name] = var_value\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"var_name\": var_name,\n", + " \"value\": var_value,\n", + " \"status\": \"set\"\n", + " }\n", + "\n", + " @endpoint\n", + " def list_env_vars(self, prefix: str = \"\") -> dict:\n", + " \"\"\"List all environment variables matching a prefix.\"\"\"\n", + " matching_vars = {k: v for k, v in os.environ.items() if k.startswith(prefix)}\n", + " return {\n", + " \"rank\": self.rank,\n", + " \"hostname\": self.hostname,\n", + " \"matching_vars\": matching_vars,\n", + " \"count\": len(matching_vars)\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spawn Environment Variable Actor\n", + "\n", + "Spawn the actor across all nodes in the process mesh." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Spawn the environment variable actor across all nodes\n", + "env_actor = proc_mesh.spawn(\"env_actor\", EnvVarActor)\n", + "print(\"✓ EnvVarActor spawned across all nodes\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query Environment Variables\n", + "\n", + "Let's inspect CUDA-related environment variables across all nodes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get CUDA_VISIBLE_DEVICES from all nodes\n", + "results = await env_actor.get_env.call(\"CUDA_VISIBLE_DEVICES\")\n", + "\n", + "print(\"\\nCUDA_VISIBLE_DEVICES on all nodes:\")\n", + "print(f\"{'-'*70}\")\n", + "\n", + "# Show unique values by node\n", + "seen_nodes = set()\n", + "for result in results:\n", + " if len(result) > 1:\n", + " rank = result[1].get('rank', '?')\n", + " hostname = result[1].get('hostname', '?')\n", + " value = result[1].get('value', '?')\n", + " else:\n", + " rank = result.get('rank', '?')\n", + " hostname = result.get('hostname', '?')\n", + " value = result.get('value', '?')\n", + " \n", + " if hostname not in seen_nodes:\n", + " print(f\" Node {hostname} (Rank {rank}): {value}\")\n", + " seen_nodes.add(hostname)\n", + "\n", + "print(f\"{'-'*70}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set Custom Environment Variables\n", + "\n", + "You can set environment variables remotely for debugging purposes." 
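The next cell sets a throwaway variable (`CUSTOM_DEBUG_VAR`) on every rank. If you also want to clean such variables up once debugging is done, one option is a small companion actor with an unset endpoint. The sketch below is a hypothetical extension written in the same style as the `EnvVarActor` above; it is not part of the notebook's actor or of Monarch itself, and only relies on `os.environ.pop`.

```python
from monarch.actor import Actor, endpoint, current_rank
import os
import socket


class EnvVarCleanupActor(Actor):
    """Hypothetical companion actor that removes environment variables on remote nodes."""

    def __init__(self):
        self.rank = current_rank().rank
        self.hostname = socket.gethostname()

    @endpoint
    def unset_env(self, var_name: str) -> dict:
        """Remove an environment variable if present and report whether it existed."""
        existed = os.environ.pop(var_name, None) is not None
        return {
            "rank": self.rank,
            "hostname": self.hostname,
            "var_name": var_name,
            "removed": existed,
        }

# Usage once debugging is finished (mirrors the spawn/call pattern used above):
# env_cleanup = proc_mesh.spawn("env_cleanup", EnvVarCleanupActor)
# await env_cleanup.unset_env.call("CUSTOM_DEBUG_VAR")
```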
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set a custom environment variable on all nodes\n", + "print(\"Setting CUSTOM_DEBUG_VAR on all nodes...\")\n", + "set_results = await env_actor.set_env.call(\"CUSTOM_DEBUG_VAR\", \"debug_enabled\")\n", + "\n", + "print(f\"\\n✓ Set CUSTOM_DEBUG_VAR on {len(set_results)} ranks\")\n", + "\n", + "# Verify the variable was set\n", + "verify_results = await env_actor.get_env.call(\"CUSTOM_DEBUG_VAR\")\n", + "print(f\"\\nVerification (first 3 ranks):\")\n", + "for i, result in enumerate(verify_results[:3]):\n", + " if len(result) > 1:\n", + " rank = result[1]['rank']\n", + " value = result[1]['value']\n", + " else:\n", + " rank = result['rank']\n", + " value = result['value']\n", + " print(f\" Rank {rank}: CUSTOM_DEBUG_VAR = {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List Variables by Prefix\n", + "\n", + "Query all environment variables matching a specific prefix - useful for debugging CUDA, NCCL, or PyTorch settings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List all CUDA-related environment variables\n", + "list_results = await env_actor.list_env_vars.call(\"CUDA\")\n", + "\n", + "print(\"\\nCUDA-related environment variables (Rank 0):\")\n", + "print(f\"{'-'*70}\")\n", + "\n", + "if list_results[0]:\n", + " result = list_results[0][1] if len(list_results[0]) > 1 else list_results[0]\n", + " matching_vars = result.get('matching_vars', {})\n", + " \n", + " if matching_vars:\n", + " for var_name, var_value in matching_vars.items():\n", + " # Truncate long values\n", + " display_value = var_value if len(var_value) < 60 else var_value[:57] + \"...\"\n", + " print(f\" {var_name} = {display_value}\")\n", + " else:\n", + " print(\" No CUDA variables found\")\n", + "\n", + "print(f\"{'-'*70}\")\n", + "\n", + "# Try other prefixes\n", + "print(\"\\n💡 Tip: Try querying other prefixes like:\")\n", + "print(\" • 'NCCL' - NCCL communication settings\")\n", + "print(\" • 'TORCH' - PyTorch settings\")\n", + "print(\" • 'MONARCH' - Monarch-specific configs\")\n", + "print(\" • 'MASTER' - Distributed training master node info\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Part 2: Interactive Debugging with Breakpoints\n", + "\n", + "Now let's explore Monarch's most powerful debugging feature: **interactive breakpoints** in distributed actors!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How Monarch Debugging Works\n", + "\n", + "### The Workflow\n", + "\n", + "1. **Add `breakpoint()`** to your actor methods\n", + "2. **Run your code** - execution pauses when breakpoint is hit\n", + "3. **Open a terminal** and run `monarch debug`\n", + "4. 
**Use debugger commands**:\n", + "   - `list` - Show all active breakpoints\n", + "   - `attach <actor_name> <rank>` - Attach to a specific rank\n", + "   - Standard pdb commands: `n`, `s`, `p`, `l`, `c`\n", + "   - `cast <actor_name> ranks(<ranks>) <command>` - Send commands to multiple ranks\n", + "   - `continue` - Resume all paused processes\n", + "\n", + "### Key Features\n", + "\n", + "- Debug specific ranks (e.g., only rank 0 or only GPU 3)\n", + "- Inspect local variables and actor state\n", + "- Step through code interactively\n", + "- Send commands to multiple ranks simultaneously" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Debug Trainer Actor\n", + "\n", + "Let's create a simplified trainer with strategic breakpoints." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "import socket\n", + "from typing import Optional\n", + "import torch\n", + "from monarch.actor import Actor, endpoint, current_rank\n", + "from torchtitan.config import JobConfig\n", + "from torchtitan.train import Trainer\n", + "from torchtitan.tools.logging import logger\n", + "\n", + "\n", + "class DebugTrainerActor(Actor):\n", + "    \"\"\"TorchTitan Trainer Actor with debugging breakpoints.\"\"\"\n", + "\n", + "    def __init__(self, job_config: JobConfig):\n", + "        self.rank = current_rank().rank\n", + "        self.job_config = job_config\n", + "        self.trainer: Optional[Trainer] = None\n", + "        self.step_count = 0\n", + "\n", + "    def _rprint(self, msg):\n", + "        \"\"\"Helper method to print with rank information.\"\"\"\n", + "        print(f\"[Rank {self.rank}] {msg}\")\n", + "\n", + "    @endpoint\n", + "    def init(self):\n", + "        logging.getLogger().addHandler(logging.StreamHandler(sys.stderr))\n", + "        self._rprint(f\"Initializing debug actor: {current_rank()=} {socket.gethostname()=}\")\n", + "\n", + "        # Breakpoint 1: After initialization (only on rank 0)\n", + "        if self.rank == 0:\n", + "            self._rprint(\"🔴 Breakpoint 1: Initialization complete\")\n", + "            breakpoint() # Debug: Inspect actor initialization state\n", + "\n", + "    @endpoint\n", + "    def setup_trainer(self):\n", + "        \"\"\"Setup the trainer with a breakpoint to inspect configuration.\"\"\"\n", + "        logger.info(f\"Setting up trainer on rank {self.rank}\")\n", + "        config = self.job_config\n", + "\n", + "        # Breakpoint 2: Before trainer creation (only on rank 0)\n", + "        if self.rank == 0:\n", + "            self._rprint(\"🔴 Breakpoint 2: About to create trainer\")\n", + "            self._rprint(f\"Config: batch_size={getattr(config.training, 'batch_size', 'N/A')}\")\n", + "            breakpoint() # Debug: Inspect job config before trainer creation\n", + "\n", + "        self.trainer = Trainer(config)\n", + "        self._rprint(\"Trainer setup complete\")\n", + "\n", + "    @endpoint\n", + "    def train_step(self, num_steps: int = 5):\n", + "        \"\"\"Run a few training steps with breakpoints.\"\"\"\n", + "        if not self.trainer:\n", + "            raise RuntimeError(\"Trainer not initialized. 
Call setup_trainer first.\")\n", + "\n", + " logger.info(f\"Starting training for {num_steps} steps on rank {self.rank}\")\n", + "\n", + " # Breakpoint 3: Before training starts (only on rank 0)\n", + " if self.rank == 0:\n", + " self._rprint(\"🔴 Breakpoint 3: About to start training\")\n", + " breakpoint() # Debug: Inspect trainer state before training\n", + "\n", + " # Simulate training steps\n", + " for step in range(num_steps):\n", + " self.step_count += 1\n", + " \n", + " # Breakpoint 4: Mid-training on rank 0 at step 2\n", + " if step == 2 and self.rank == 0:\n", + " self._rprint(f\"🔴 Breakpoint 4: Mid-training (step {self.step_count})\")\n", + " breakpoint() # Debug: Inspect mid-training state\n", + "\n", + " self._rprint(f\"Processing step {step + 1}/{num_steps}\")\n", + "\n", + " self._rprint(f\"Completed {num_steps} training steps\")\n", + "\n", + " @endpoint\n", + " def cleanup(self):\n", + " \"\"\"Cleanup resources.\"\"\"\n", + " logger.info(f\"Cleaning up trainer on rank {self.rank}\")\n", + "\n", + " if self.trainer:\n", + " self.trainer.close()\n", + "\n", + " if torch.distributed.is_initialized():\n", + " torch.distributed.destroy_process_group()\n", + "\n", + " self._rprint(\"Cleanup complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spawn Debug Trainer\n", + "\n", + "Spawn the debug trainer actor. When you run the cells below, execution will pause at breakpoints." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torchtitan.config import ConfigManager\n", + "\n", + "# Parse config (using simple defaults for debugging)\n", + "config_manager = ConfigManager()\n", + "manual_args = [\n", + " \"--job.config_file\",\n", + " \"/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml\",\n", + " \"--training.steps\", \"5\",\n", + "]\n", + "debug_config = config_manager.parse_args(manual_args)\n", + "\n", + "# Spawn the debug trainer actor\n", + "debug_trainer = proc_mesh.spawn(\"debug_trainer\", DebugTrainerActor, debug_config)\n", + "print(\"✓ Debug trainer actor spawned across all nodes\")\n", + "print(\"\\n⚠️ When breakpoints are hit, execution will pause.\")\n", + "print(\"📍 Open a separate terminal and run: monarch debug\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Debug Session\n", + "\n", + "Now let's run the training methods. When breakpoints are hit:\n", + "\n", + "### In This Notebook\n", + "- Execution will pause\n", + "- You'll see `🔴 Breakpoint X: ...` messages\n", + "\n", + "### In a Separate Terminal\n", + "1. Run: `monarch debug`\n", + "2. Use `list` to see all active breakpoints\n", + "3. Use `attach debug_trainer 0` to attach to rank 0\n", + "4. Use standard pdb commands or `continue` to resume\n", + "\n", + "**Note:** For this demo, we'll skip the interactive debugging. In practice, you'd have two terminals open." 
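If you want to keep the cells below runnable even when no second terminal is attached, one pattern is to gate `breakpoint()` behind an environment variable and flip it remotely with the `EnvVarActor` from Part 1. This is a sketch of that pattern, not part of the notebook's `DebugTrainerActor`; the variable name `MONARCH_DEMO_DEBUG` is our own choice.

```python
import os


def maybe_breakpoint(rank: int, label: str, only_rank: int = 0) -> None:
    """Drop into pdb only when debugging is explicitly enabled, and only on one rank."""
    if rank == only_rank and os.environ.get("MONARCH_DEMO_DEBUG") == "1":
        print(f"[Rank {rank}] 🔴 {label}")
        breakpoint()

# Inside an actor endpoint you would call, for example:
#   maybe_breakpoint(self.rank, "Breakpoint 2: About to create trainer")
#
# Enable pausing on all ranks before running the cells below:
#   await env_actor.set_env.call("MONARCH_DEMO_DEBUG", "1")
# Leave it unset (or set it to "0") to run the same cells straight through.
```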
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize actors (will hit breakpoint 1)\n", + "print(\"📍 Step 1: Initializing actors...\")\n", + "print(\" (Breakpoint 1 will trigger on rank 0)\\n\")\n", + "\n", + "# In a real scenario, this would pause at the breakpoint\n", + "# await debug_trainer.init.call()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Setup trainer (will hit breakpoint 2)\n", + "print(\"📍 Step 2: Setting up trainer...\")\n", + "print(\" (Breakpoint 2 will trigger on rank 0)\\n\")\n", + "\n", + "# await debug_trainer.setup_trainer.call()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run training steps (will hit breakpoints 3 and 4)\n", + "print(\"📍 Step 3: Running training steps...\")\n", + "print(\" (Breakpoints 3 and 4 will trigger on rank 0)\\n\")\n", + "\n", + "# await debug_trainer.train_step.call(num_steps=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Monarch Debug CLI Commands\n", + "\n", + "Here's a quick reference for the `monarch debug` CLI:\n", + "\n", + "### Listing Breakpoints\n", + "```bash\n", + "monarch_dbg> list\n", + "# Shows all active breakpoints across ranks\n", + "# Example output:\n", + "# debug_trainer (rank 0): /path/to/file.py:42\n", + "# debug_trainer (rank 0): /path/to/file.py:58\n", + "```\n", + "\n", + "### Attaching to a Rank\n", + "```bash\n", + "monarch_dbg> attach debug_trainer 0\n", + "# Enters interactive pdb session for rank 0\n", + "\n", + "(Pdb) n # Next line\n", + "(Pdb) s # Step into function\n", + "(Pdb) p self.rank # Print variable\n", + "(Pdb) l # List source code\n", + "(Pdb) pp self.job_config # Pretty-print object\n", + "(Pdb) c # Continue execution\n", + "```\n", + "\n", + "### Casting Commands to Multiple Ranks\n", + "```bash\n", + "# Send \"next\" command to ranks 0 and 1\n", + "monarch_dbg> cast debug_trainer ranks(0,1) n\n", + "\n", + "# Send \"continue\" to ranks 0 through 7\n", + "monarch_dbg> cast debug_trainer ranks(0:8) c\n", + "\n", + "# Print a variable on multiple ranks\n", + "monarch_dbg> cast debug_trainer ranks(0,1,2,3) p self.step_count\n", + "```\n", + "\n", + "### Continuing All\n", + "```bash\n", + "monarch_dbg> continue\n", + "# Resumes execution on all paused ranks\n", + "```\n", + "\n", + "### Getting Help\n", + "```bash\n", + "monarch_dbg> help\n", + "# Shows all available commands\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Common Debugging Scenarios\n", + "\n", + "### Scenario 1: Rank-Specific Bug\n", + "```python\n", + "# Problem: Training fails on rank 5 but works on other ranks\n", + "\n", + "@endpoint\n", + "def train(self):\n", + " if self.rank == 5:\n", + " breakpoint() # Only pause rank 5\n", + " # ... 
training code\n", + "```\n", + "\n", + "Then in terminal:\n", + "```bash\n", + "monarch debug\n", + "monarch_dbg> attach trainer_actor 5\n", + "(Pdb) p self.data_batch # Inspect what's different on rank 5\n", + "```\n", + "\n", + "### Scenario 2: Collective Operation Hang\n", + "```python\n", + "# Problem: All-reduce hangs, need to check all ranks\n", + "\n", + "@endpoint\n", + "def sync_gradients(self):\n", + " breakpoint() # Pause all ranks before all-reduce\n", + " torch.distributed.all_reduce(self.gradients)\n", + "```\n", + "\n", + "Then:\n", + "```bash\n", + "monarch_dbg> list # Check which ranks hit the breakpoint\n", + "monarch_dbg> cast trainer_actor ranks(0:8) p self.gradients.shape\n", + "# Verify all ranks have same shape\n", + "```\n", + "\n", + "### Scenario 3: Environment Mismatch\n", + "```python\n", + "# Problem: Different NCCL settings causing issues\n", + "\n", + "# Use EnvVarActor to inspect\n", + "results = await env_actor.list_env_vars.call(\"NCCL\")\n", + "# Compare NCCL settings across all ranks\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# 🎉 Congratulations! 🎉\n", + "\n", + "You've mastered **interactive debugging** for distributed training with Monarch!\n", + "\n", + "## What You Learned\n", + "\n", + "### Environment Variable Management\n", + "- ✓ Query env vars across all nodes\n", + "- ✓ Set and modify env vars remotely\n", + "- ✓ List variables by prefix (CUDA, NCCL, etc.)\n", + "\n", + "### Interactive Debugging\n", + "- ✓ Add breakpoints to distributed actors\n", + "- ✓ Use `monarch debug` CLI\n", + "- ✓ Attach to specific ranks\n", + "- ✓ Send commands to multiple ranks\n", + "- ✓ Common debugging scenarios\n", + "\n", + "## Key Takeaways\n", + "\n", + "- **Debug like local code** - Use familiar pdb commands in distributed settings\n", + "- **Selective debugging** - Focus on problematic ranks without noise from others\n", + "- **Environment inspection** - Quickly identify configuration mismatches\n", + "- **No more print debugging** - Interactive inspection is much more powerful\n", + "\n", + "## The Complete Monarch Workflow\n", + "\n", + "You've now learned the three pillars of efficient distributed development:\n", + "\n", + "1. **Studio 1: Getting Started** - Launch multi-node training\n", + "2. **Studio 2: Workspace Sync** - Hot-reload configs and code\n", + "3. **Studio 3: Interactive Debugging** - Debug efficiently (YOU ARE HERE!)\n", + "\n", + "Together, these enable:\n", + "- **10x faster iteration** (no job restarts)\n", + "- **Easier debugging** (interactive breakpoints)\n", + "- **Better observability** (env var inspection, log aggregation)\n", + "\n", + "## Next Steps\n", + "\n", + "### Put It Into Practice\n", + "Try debugging your own training code:\n", + "1. Add strategic breakpoints\n", + "2. Run `monarch debug` when they're hit\n", + "3. Inspect state and identify issues\n", + "\n", + "### Explore More\n", + "- Review [Studio 1: Getting Started](./studio_1_getting_started.ipynb)\n", + "- Review [Studio 2: Workspace Sync](./studio_2_workspace_sync.ipynb)\n", + "- Check out the [Monarch documentation](https://github.com/meta-pytorch/monarch)\n", + "\n", + "---\n", + "\n", + "## Pro Tips\n", + "\n", + "### Debugging Best Practices\n", + "1. **Use conditional breakpoints** - Only pause specific ranks\n", + "2. **Check env vars first** - Many issues are configuration mismatches\n", + "3. **Use `cast` for comparison** - Check variables across multiple ranks\n", + "4. 
**Don't forget `continue`** - Resume execution when done debugging\n", + "\n", + "### Performance Tip\n", + "Remove or comment out `breakpoint()` calls for production runs - they have minimal overhead when not triggered, but it's cleaner to remove them.\n", + "\n", + "Happy debugging! 🐛🔧" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/lightning/titan_monarch_mmt_aws_portOverride.ipynb b/examples/lightning/titan_monarch_mmt_aws_portOverride.ipynb new file mode 100644 index 000000000..a0118165f --- /dev/null +++ b/examples/lightning/titan_monarch_mmt_aws_portOverride.ipynb @@ -0,0 +1,908 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from lightning_sdk import Machine, MMT, Studio" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "private_master_host_ip_address = 10.192.12.204\n", + "public_master_host_ip_address = 3.84.102.51\n", + "public_master_host_ip_address = 3.84.102.51\n" + ] + } + ], + "source": [ + "from utils.master_node import MasterNodeServer\n", + "private_master_host_ip_address = MasterNodeServer.get_master_ip()\n", + "public_master_host_ip_address = MasterNodeServer.get_master_public_ip_curl()\n", + "public_master_host_ip_address_services = MasterNodeServer.get_master_public_ip()\n", + "print(f\"private_master_host_ip_address = {private_master_host_ip_address}\")\n", + "print(f\"public_master_host_ip_address = {public_master_host_ip_address}\")\n", + "print(f\"public_master_host_ip_address = {public_master_host_ip_address_services}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration\n", + "import os\n", + "NUM_NODES = 2\n", + "NUM_GPUS = 8\n", + "TEAMSPACE = \"general\" # Replace with your teamspace\n", + "USER = \"meta-ai\" # Replace with your username\n", + "MONARCH_DEFAULT_PORT = 26600 # Monarch default port\n", + "HTTP_SERVER_PORT = MONARCH_DEFAULT_PORT # 8080 # HTTP Server PORT for IP registration\n", + "\n", + "os.environ[\"MONARCH_FILE_LOG\"] = \"debug\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "debug\n" + ] + } + ], + "source": [ + "print(os.environ.get(\"MONARCH_FILE_LOG\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def launch_mmt_job(num_nodes=2, teamspace=\"my-teamspace\", user=\"my-user\"):\n", + " \"\"\"\n", + " Launch a multi-machine training job using Lightning SDK's MMT API.\n", + " \"\"\"\n", + "\n", + " studio = Studio()\n", + "\n", + " # Install the MMT plugin befor running the actual job\n", + " studio.install_plugin(\"multi-machine-training\")\n", + "\n", + " print(f\"Launching MMT job with {num_nodes} nodes...\")\n", + "\n", + " # Machine with T4 GPUs\n", + " # machine_type = getattr(Machine, f\"T4_X_{NUM_GPUS}\")\n", + "\n", + " # Machine with L40 GPUs\n", + " # machine_type = getattr(Machine, f\"L4_X_{NUM_GPUS}\")\n", + "\n", + " # Machine with L40S GPUs\n", + " machine_type = getattr(Machine, f\"L40S_X_{NUM_GPUS}\")\n", + "\n", + " job = MMT.run(\n", + " 
command=f\"python example/utils/worker_node.py {public_master_host_ip_address} {HTTP_SERVER_PORT} && sleep 10 && process_allocator\",\n", + " name=f\"Multi-Node-Monarch-Titan-Scale-{NUM_NODES}_nodes-port_override\",\n", + " # machine=Machine.T4_X_4, # Use GPU machines for training\n", + " machine=machine_type,\n", + " studio=studio,\n", + " num_machines=num_nodes,\n", + " env={\n", + " \"CUDA_VISIBLE_DEVICES\": \"0,1,2,3,4,5,6,7\", # Make all GPUs visible # TODO: Should make this one dynamic\n", + " \"MONARCH_FILE_LOG\": \"debug\",\n", + " \"HYPERACTOR_REMOTE_ALLOC_ALLOWED_PORT_RANGE\": \"26601-26610\",\n", + " \"HYPERACTOR_REMOTE_ALLOC_BIND_TO_INADDR_ANY\": \"true\",\n", + " },\n", + " )\n", + "\n", + " print(f\"Job started with ID: {job.name}\")\n", + " print(f\"Job status: {job.status}\")\n", + "\n", + " # Monitor job status\n", + " return job, studio" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Launching MMT job with 2 nodes...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO - Multi-Machine Job was successfully launched. View it at https://lightning.ai/meta-ai/general/jobs/Multi-Node-Monarch-Titan-Scale-2_nodes-port_override-hspx9?app_id=mmt\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job started with ID: Multi-Node-Monarch-Titan-Scale-2_nodes-port_override-hspx9\n", + "Job status: Pending\n", + "Job launched. You can monitor it using: job.status\n", + "To stop the job: job.stop()\n", + "To clean up: studio.stop()\n" + ] + } + ], + "source": [ + "# Launch the job\n", + "job, studio = launch_mmt_job(\n", + " num_nodes=NUM_NODES, teamspace=TEAMSPACE, user=USER\n", + ")\n", + "\n", + "print(f\"Job launched. You can monitor it using: job.status\")\n", + "print(f\"To stop the job: job.stop()\")\n", + "print(f\"To clean up: studio.stop()\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Master node IP: 3.84.102.51\n", + "Expecting 2 worker nodes to register...\n", + "Starting server on port 26600...\n", + "Waiting for workers... (0/2 registered) - Elapsed: 0s\n", + "Server started on 3.84.102.51:26600\n", + "Waiting for workers... (0/2 registered) - Elapsed: 30s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 60s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 90s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 120s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 150s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 180s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 210s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 240s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 270s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 300s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 330s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 360s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 390s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 420s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 450s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 480s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 510s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 540s\n", + "Waiting for workers... 
(0/2 registered) - Elapsed: 570s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 600s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 630s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 660s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 690s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 720s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 750s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 780s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 810s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 840s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 870s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 900s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 930s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 960s\n", + "Waiting for workers... (0/2 registered) - Elapsed: 990s\n", + "Registered worker node: 52.14.215.20 (1/2)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "52.14.215.20 - - [14/Oct/2025 00:41:36] \"POST /register HTTP/1.1\" 200 -\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Registered worker node: 18.219.107.185 (2/2)\n", + "All worker nodes registered!\n", + "Registration server stopped\n", + "Final registered worker nodes: ['52.14.215.20', '18.219.107.185']\n", + "Worker IPs saved to /tmp/worker_nodes.txt\n", + "Cluster info saved to /tmp/cluster_info.json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "18.219.107.185 - - [14/Oct/2025 00:41:40] \"POST /register HTTP/1.1\" 200 -\n" + ] + } + ], + "source": [ + "from utils.master_node import run_master_server\n", + "cluster_info = run_master_server(expected_workers=NUM_NODES, port=HTTP_SERVER_PORT)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted IP addresses:\n", + "18.219.107.185\n", + "52.14.215.20\n", + "\n", + "IP set: {'18.219.107.185', '52.14.215.20'}\n", + "['18.219.107.185', '52.14.215.20']\n" + ] + } + ], + "source": [ + "from utils.ip_utils import extract_ips_simple\n", + "worker_nodes_ip_file_path = \"/tmp/worker_nodes.txt\"\n", + "ip_addresses_set = extract_ips_simple(worker_nodes_ip_file_path)\n", + "ip_addresses_list = list(ip_addresses_set)\n", + "print(ip_addresses_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tcp!18.219.107.185:26600 tcp!52.14.215.20:26600\n" + ] + } + ], + "source": [ + "# ip_addresses_set = {'3.143.199.198', '3.132.52.102', '3.15.95.43'}\n", + "# ip_addresses_set = {'18.219.107.185', '52.14.215.20'}\n", + "tcp_addresses = [f\"tcp!{ip}:{MONARCH_DEFAULT_PORT}\" for ip in ip_addresses_set]\n", + "\n", + "# # Or if you want to test it locally first on the local machine uncomment line below:\n", + "# tcp_addresses = [\"tcp![::]:26600\"]\n", + "# # For the local host machine only, please make sure that NUM_NODES is equal to 1;\n", + "# NUM_NODES = 1\n", + "\n", + "print(*tcp_addresses)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example 1 - Run TorchTitan using Monarch for Llama 3 - 8B" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AllocHandle(_hy_alloc=, _extent={'hosts': 2, 'gpus': 8}, 
_stream_logs=True)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "sys:1: UserWarning: The AllocSpec passed to RemoteAllocator.allocate has transport unix, but the transport from the remote process alloc initializer is tcp. This will soon be an error unless you explicitly configure monarch's default transport to tcp. The current default transport is unix.\n" + ] + } + ], + "source": [ + "from monarch._src.actor.allocator import RemoteAllocator, StaticRemoteAllocInitializer\n", + "from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec\n", + "from monarch.actor import ProcMesh\n", + "import os\n", + "\n", + "# os.environ[\"HYPERACTOR_REMOTE_PROCESS_ALLOC_PORT\"] = \"26600\"\n", + "# os.environ[\"HYPERACTOR_REMOTE_PROCESS_ALLOC_ADDR\"] = f\"tcp!{public_master_host_ip_address}:{MONARCH_DEFAULT_PORT}\"\n", + "# os.environ[\"HYPERACTOR_REMOTE_ALLOC_ALLOWED_PORT_RANGE\"] = \"26600-26610\"\n", + "os.environ[\"HYPERACTOR_REMOTE_ALLOC_ALLOWED_PORT_RANGE\"] = \"26600-26610\"\n", + "os.environ[\"HYPERACTOR_REMOTE_ALLOC_BOOTSTRAP_ADDR\"] = f\"tcp!{public_master_host_ip_address}:0\"\n", + "# os.environ[\"HYPERACTOR_REMOTE_ALLOC_BOOTSTRAP_ADDR\"] = \"tcp!127.0.0.1:0\"\n", + "# os.environ[\"HYPERACTOR_REMOTE_ALLOC_BOOTSTRAP_ADDR\"] = \"tcp!3.84.102.51:0\"\n", + "# os.environ[\"HYPERACTOR_REMOTE_ALLOC_BOOTSTRAP_ADDR\"] = \"tcp!10.192.12.204:0\"\n", + "os.environ[\"HYPERACTOR_REMOTE_ALLOC_BIND_TO_INADDR_ANY\"] = \"true\"\n", + "\n", + "\n", + "allocator = RemoteAllocator(\n", + " world_id=\"foo\",\n", + " initializer=StaticRemoteAllocInitializer(*tcp_addresses),\n", + " )\n", + "\n", + "alloc = allocator.allocate(\n", + " AllocSpec(AllocConstraints(), hosts=NUM_NODES, gpus=NUM_GPUS)\n", + " )\n", + "\n", + "print(alloc)\n", + "# proc_mesh = await ProcMesh.from_alloc(alloc)\n", + "# proc_mesh = ProcMesh.from_alloc(alloc)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "proc_mesh = ProcMesh.from_alloc(alloc)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'tcp!3.84.102.51:0'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "os.environ[\"HYPERACTOR_REMOTE_ALLOC_BOOTSTRAP_ADDR\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "monarch-alisol-hosts2-gpus8\n" + ] + } + ], + "source": [ + "import getpass\n", + "def get_job_name(num_hosts: int, num_gpus_per_host: int):\n", + " return f\"monarch-{getpass.getuser()}-hosts{num_hosts}-gpus{num_gpus_per_host}\"\n", + "print(get_job_name(num_hosts=NUM_NODES, num_gpus_per_host=NUM_GPUS))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import logging\n", + "from monarch.actor import ProcMesh, Actor, endpoint, current_rank\n", + "import socket\n", + "from torchtitan.tools.logging import init_logger, logger\n", + "from torchtitan.train import Trainer\n", + "from typing import Optional\n", + "import torch\n", + "from torchtitan.config import JobConfig\n", + "\n", + "\n", + "class TitanTrainerWrapper(Actor):\n", + " def __init__(self, job_config: JobConfig):\n", + " self.rank = current_rank().rank\n", + " self.job_config = job_config\n", + "\n", + " def _rprint(self, msg):\n", + " \"\"\"Helper 
method to print with rank information.\"\"\"\n", + "        print(f\"{self.rank=} {msg}\")\n", + "\n", + "    @endpoint\n", + "    def init(self):\n", + "        logging.getLogger().addHandler(logging.StreamHandler(sys.stderr))\n", + "        print(f\"Initializing actor: {self.rank} {current_rank()=} {socket.gethostname()=}\")\n", + "\n", + "\n", + "    @endpoint\n", + "    def train(self):\n", + "        logger.info(\"Starting training\")\n", + "        config = self.job_config\n", + "        trainer: Optional[Trainer] = None\n", + "\n", + "        try:\n", + "            trainer = Trainer(config)\n", + "\n", + "            if config.checkpoint.create_seed_checkpoint:\n", + "                assert (\n", + "                    int(os.environ[\"WORLD_SIZE\"]) == 1\n", + "                ), \"Must create seed checkpoint using a single device, to disable sharding.\"\n", + "                assert (\n", + "                    # config.checkpoint.enable_checkpoint\n", + "                    config.checkpoint.enable\n", + "                ), \"Must enable checkpointing when creating a seed checkpoint.\"\n", + "                trainer.checkpointer.save(curr_step=0)\n", + "                logger.info(\"Created seed checkpoint\")\n", + "            else:\n", + "                trainer.train()\n", + "        finally:\n", + "            if trainer:\n", + "                trainer.close()\n", + "\n", + "            if torch.distributed.is_initialized():\n", + "                torch.distributed.destroy_process_group()\n", + "                logger.info(\"Process group destroyed.\")\n", + "        print(\"Done training\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from torchtitan.config import ConfigManager, JobConfig\n", + "from monarch.utils import setup_env_for_distributed\n", + "\n", + "async def async_main(job_config: JobConfig):\n", + "    torch.use_deterministic_algorithms(True)\n", + "    job_name = get_job_name(NUM_NODES, NUM_GPUS)\n", + "\n", + "    await setup_env_for_distributed(proc_mesh)\n", + "\n", + "    await proc_mesh.logging_option(stream_to_client=True, aggregate_window_sec=3)\n", + "\n", + "    print(job_config)\n", + "    print(f\"Spawning meshes on {job_name}\")\n", + "\n", + "    # trainer_actor = await proc_mesh.spawn(\"trainer_actor\", TitanTrainerWrapper, job_config)\n", + "    trainer_actor = proc_mesh.spawn(\"trainer_actor\", TitanTrainerWrapper, job_config)\n", + "\n", + "    await trainer_actor.init.call()\n", + "    await trainer_actor.train.call()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[titan] 2025-10-14 01:07:48,206 - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. 
Setting hf_assets_path to tokenizer_path temporarily.\n", + "JobConfig(job=Job(config_file='/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml', dump_folder='/teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts2-gpus8', description='Llama 3 8B training', print_args=False), profiling=Profiling(enable_profiling=True, save_traces_folder='profile_trace', profile_freq=100, profiler_active=1, profiler_warmup=3, enable_memory_snapshot=False, save_memory_snapshot_folder='memory_snapshot'), metrics=Metrics(log_freq=1, enable_tensorboard=True, disable_color_printing=False, save_tb_folder='tb', save_for_all_ranks=False, enable_wandb=True), model=Model(name='llama3', flavor='8B', hf_assets_path='/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B', tokenizer_path='/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B', converters=[], print_after_conversion=False), optimizer=Optimizer(name='AdamW', lr=0.0003, beta1=0.9, beta2=0.95, eps=1e-08, weight_decay=0.1, implementation='fused', early_step_in_backward=False), lr_scheduler=LRScheduler(warmup_steps=200, decay_ratio=None, decay_type='linear', min_lr_factor=0.0), training=Training(dataset='c4_test', dataset_path='/teamspace/studios/this_studio/torchtitan/tests/assets/c4_test', local_batch_size=1, global_batch_size=-1, seq_len=1024, max_norm=1.0, steps=25, enable_cpu_offload=False, dtype='float32', mixed_precision_param='bfloat16', mixed_precision_reduce='float32', gc_freq=50, gc_debug=False, seed=None, deterministic=False, debug_moe_force_load_balance=False), parallelism=Parallelism(data_parallel_replicate_degree=1, enable_compiled_autograd=False, data_parallel_shard_degree=-1, fsdp_reshard_after_forward='default', tensor_parallel_degree=1, disable_loss_parallel=False, enable_async_tensor_parallel=False, pipeline_parallel_degree=1, pipeline_parallel_split_points=[], module_fqns_per_model_part=None, pipeline_parallel_first_stage_less_layers=1, pipeline_parallel_last_stage_less_layers=1, pipeline_parallel_layers_per_stage=None, pipeline_parallel_schedule='1F1B', pipeline_parallel_schedule_csv='', pipeline_parallel_microbatch_size=1, context_parallel_degree=1, context_parallel_rotate_method='allgather', expert_parallel_degree=1, expert_tensor_parallel_degree=1), checkpoint=Checkpoint(enable=False, folder='checkpoint', interval=500, initial_load_path=None, initial_load_model_only=True, initial_load_in_hf=False, initial_load_in_hf_quantized=False, last_save_model_only=True, last_save_in_hf=False, export_dtype='float32', async_mode='disabled', keep_latest_k=10, load_step=-1, exclude_from_loading=[], enable_first_step_checkpoint=False, create_seed_checkpoint=False, load_only=False), activation_checkpoint=ActivationCheckpoint(mode='selective', selective_ac_option='op', per_op_sac_force_recompute_mm_shapes_by_fqns=['moe.router.gate'], early_stop=False, memory_budget=0.5, visualize_memory_budget_pareto=False), compile=Compile(enable=False, components=['model', 'loss'], backend='inductor'), quantize=Quantize(linear=QuantizedLinear(float8=Float8Linear(enable_fsdp_float8_all_gather=False, precompute_float8_dynamic_scale_for_fsdp=False, recipe_name=None, filter_fqns=['output'], emulate=False), mx=MXLinear(mxfp8_dim1_cast_kernel_choice='triton', recipe_name='mxfp8_cublas', filter_fqns=['output'])), grouped_mm=QuantizedGroupedMM(float8=Float8GroupedMM(fqns=[]), mx=MXGroupedMM(recipe_name='mxfp8', fqns=[]))), comm=Comm(init_timeout_seconds=300, train_timeout_seconds=100, 
trace_buf_size=20000, save_traces_folder='comm_traces', save_traces_file_prefix='rank_'), memory_estimation=MemoryEstimation(enable=False, disable_fake_mode=False), fault_tolerance=FaultTolerance(enable=False, process_group='gloo', process_group_timeout_ms=10000, replica_id=0, group_size=0, min_replica_size=1, semi_sync_method=None), experimental=Experimental(custom_import='', custom_args_module=''), validation=Validation(enable=False, dataset='c4_validation', dataset_path=None, local_batch_size=8, seq_len=2048, freq=500, steps=1200))\n", + "Spawning meshes on monarch-alisol-hosts2-gpus8\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:06:30) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m Initializing actor: 3 current_rank()={'hosts': 0/2, 'gpus': 3/8} socket.gethostname()='ip-10-192-12-106'\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:07:52) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:06:30) >>>\u001b[0m\n", + "\u001b[33m[16 similar log lines]\u001b[0m Starting training\n", + "\u001b[33m[16 similar log lines]\u001b[0m Starting job: Llama 3 8B training\n", + "\u001b[33m[5 similar log lines]\u001b[0m [W1014 01:07:53.636464501 socket.cpp:767] [c10d] The client socket has failed to connect to [ip-10-192-12-106]:50173 (errno: 22 - Invalid argument).\n", + "\u001b[33m[16 similar log lines]\u001b[0m Building 1-D device mesh with ['dp_shard'], [16]\n", + "\u001b[33m[16 similar log lines]\u001b[0m [GC] Initial GC collection took 0.00 seconds\n", + "\u001b[33m[11 similar log lines]\u001b[0m Loading tokenizer from tokenizer.json\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:07:55) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:07:52) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m Initializing actor: 9 current_rank()={'hosts': 1/2, 'gpus': 1/8} socket.gethostname()='ip-10-192-12-142'\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:07:55) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:07:55) >>>\u001b[0m\n", + "\u001b[33m[5 similar log lines]\u001b[0m Loading tokenizer from tokenizer.json\n", + "\u001b[33m[16 similar log lines]\u001b[0m Preparing c4_test dataset from /teamspace/studios/this_studio/torchtitan/tests/assets/c4_test\n", + "\u001b[33m[16 similar log lines]\u001b[0m Building llama3 8B with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, vocab_size=128256, multiple_of=1024, ffn_dim_multiplier=1.3, norm_eps=1e-05, rope_theta=500000, rope_scaling_args=RoPEScalingArgs(scaling_factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_position_embeddings=8192), max_seq_len=1024, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)\n", + "\u001b[33m[16 similar log lines]\u001b[0m CUDA capacity: NVIDIA L40S with 44.64GiB memory\n", + "\u001b[33m[32 similar log lines]\u001b[0m Peak flops undefined for: NVIDIA L40S, fallback to A100\n", + "\u001b[33m[16 similar log lines]\u001b[0m \u001b[34mModel llama3 8B \u001b[31msize: 8,030,261,248 total parameters\u001b[39m\n", + "\u001b[33m[16 similar log lines]\u001b[0m Applied selective activation checkpointing to the model\n", + "\u001b[33m[16 similar log lines]\u001b[0m Applied FSDP to the model\n", + 
"\u001b[33m[16 similar log lines]\u001b[0m Peak FLOPS used for computing MFU: 3.120e+14\n", + "\u001b[33m[16 similar log lines]\u001b[0m CUDA memory usage for model: 1.90GiB(4.25%)\n", + "\u001b[33m[16 similar log lines]\u001b[0m Warmup steps (200) exceed total training steps (25). Adjusting warmup steps to 25.\n", + "\u001b[33m[16 similar log lines]\u001b[0m model.safetensors.index.json not found at hf_assets_path: /teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B/model.safetensors.index.json. Defaulting to saving a single safetensors file if checkpoint is saved in HF format\n", + "\u001b[33m[16 similar log lines]\u001b[0m Mixed precision training is handled by fully_shard\n", + "\u001b[33m[16 similar log lines]\u001b[0m Trainer is initialized with local batch size 1, global batch size 16, gradient accumulation steps 1, sequence length 1024, total steps 25 (warmup 200)\n", + "\u001b[33m[16 similar log lines]\u001b[0m Training starts at step 1\n", + "\u001b[33m[16 similar log lines]\u001b[0m Profiling active. Traces will be saved at /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts2-gpus8/profile_trace\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Currently logged in as: a-shamsoshoara (a-shamsoshoara-m) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Tracking run with wandb version 0.22.2\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run data is saved locally in /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts2-gpus8/tb/20251014-0107/wandb/run-20251014_010756-napbrhpr\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run `wandb offline` to turn off syncing.\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Syncing run magic-fire-38\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: ⭐️ View project at https://wandb.ai/a-shamsoshoara-m/torchtitan\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: 🚀 View run at https://wandb.ai/a-shamsoshoara-m/torchtitan/runs/napbrhpr\n", + "\u001b[33m[1 similar log lines]\u001b[0m WandB logging enabled\n", + "\u001b[33m[1 similar log lines]\u001b[0m TensorBoard logging enabled. 
Logs will be saved at /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts2-gpus8/tb/20251014-0107\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:07:58) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:07:58) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 1 \u001b[32mloss: 12.2378 \u001b[38;2;180;60;0mgrad_norm: 4.0878 \u001b[38;2;54;234;195mmemory: 16.50GiB(36.97%) \u001b[34mtps: 51 \u001b[36mtflops: 2.37 \u001b[35mmfu: 0.76%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:08:15) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:08:15) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 1 \u001b[32mloss: 12.2378 \u001b[38;2;180;60;0mgrad_norm: 4.0878 \u001b[38;2;54;234;195mmemory: 16.50GiB(36.97%) \u001b[34mtps: 51 \u001b[36mtflops: 2.36 \u001b[35mmfu: 0.76%\u001b[39m\n", + "\u001b[33m[16 similar log lines]\u001b[0m Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:08:18) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:08:18) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 2 \u001b[32mloss: 11.5027 \u001b[38;2;180;60;0mgrad_norm: 4.2669 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:08:33) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:08:33) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 2 \u001b[32mloss: 11.5027 \u001b[38;2;180;60;0mgrad_norm: 4.2669 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:08:36) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:08:36) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 3 \u001b[32mloss: 11.1583 \u001b[38;2;180;60;0mgrad_norm: 24.6423 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:08:50) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:08:50) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 3 \u001b[32mloss: 11.1583 \u001b[38;2;180;60;0mgrad_norm: 24.6423 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:08:53) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:08:53) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 4 \u001b[32mloss: 11.9682 \u001b[38;2;180;60;0mgrad_norm: 30.1718 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:09:07) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:09:07) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 4 \u001b[32mloss: 11.9682 \u001b[38;2;180;60;0mgrad_norm: 30.1718 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:09:10) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> 
Aggregated Logs (2025-10-14 01:09:10) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 5 \u001b[32mloss: 11.6185 \u001b[38;2;180;60;0mgrad_norm: 9.5712 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:09:25) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:09:25) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 5 \u001b[32mloss: 11.6185 \u001b[38;2;180;60;0mgrad_norm: 9.5712 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:09:28) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:09:28) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 6 \u001b[32mloss: 12.5239 \u001b[38;2;180;60;0mgrad_norm: 35.6288 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:09:42) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:09:42) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 6 \u001b[32mloss: 12.5239 \u001b[38;2;180;60;0mgrad_norm: 35.6288 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:09:45) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:09:45) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 7 \u001b[32mloss: 11.7442 \u001b[38;2;180;60;0mgrad_norm: 29.6369 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:09:59) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:09:59) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 7 \u001b[32mloss: 11.7442 \u001b[38;2;180;60;0mgrad_norm: 29.6369 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:10:02) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:10:02) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 8 \u001b[32mloss: 11.1135 \u001b[38;2;180;60;0mgrad_norm: 7.6529 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:10:17) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:10:17) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 8 \u001b[32mloss: 11.1135 \u001b[38;2;180;60;0mgrad_norm: 7.6529 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:10:20) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:10:20) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 9 \u001b[32mloss: 10.2921 \u001b[38;2;180;60;0mgrad_norm: 10.3646 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs 
(2025-10-14 01:10:34) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:10:34) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 9 \u001b[32mloss: 10.2921 \u001b[38;2;180;60;0mgrad_norm: 10.3646 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:10:37) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:10:37) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 10 \u001b[32mloss: 9.8799 \u001b[38;2;180;60;0mgrad_norm: 11.4258 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:10:51) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:10:51) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 10 \u001b[32mloss: 9.8799 \u001b[38;2;180;60;0mgrad_norm: 11.4258 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:10:54) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:10:54) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 11 \u001b[32mloss: 9.4768 \u001b[38;2;180;60;0mgrad_norm: 6.1265 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.89%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:11:09) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:11:09) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 11 \u001b[32mloss: 9.4768 \u001b[38;2;180;60;0mgrad_norm: 6.1265 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.89%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:11:12) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:11:12) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 12 \u001b[32mloss: 9.6915 \u001b[38;2;180;60;0mgrad_norm: 23.5042 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:11:26) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:11:26) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 12 \u001b[32mloss: 9.6915 \u001b[38;2;180;60;0mgrad_norm: 23.5042 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:11:29) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:11:29) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 13 \u001b[32mloss: 8.9296 \u001b[38;2;180;60;0mgrad_norm: 11.6299 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:11:43) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:11:43) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 13 \u001b[32mloss: 8.9296 \u001b[38;2;180;60;0mgrad_norm: 11.6299 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 
\u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:11:46) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:11:46) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 14 \u001b[32mloss: 8.6181 \u001b[38;2;180;60;0mgrad_norm: 6.6545 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:12:01) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:12:01) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 14 \u001b[32mloss: 8.6181 \u001b[38;2;180;60;0mgrad_norm: 6.6545 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:12:04) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:12:04) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 15 \u001b[32mloss: 8.8019 \u001b[38;2;180;60;0mgrad_norm: 13.4164 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:12:18) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:12:18) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 15 \u001b[32mloss: 8.8019 \u001b[38;2;180;60;0mgrad_norm: 13.4164 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:12:21) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:12:21) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 16 \u001b[32mloss: 8.5115 \u001b[38;2;180;60;0mgrad_norm: 6.6547 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:12:35) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:12:35) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 16 \u001b[32mloss: 8.5115 \u001b[38;2;180;60;0mgrad_norm: 6.6547 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:12:38) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:12:38) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 17 \u001b[32mloss: 8.4151 \u001b[38;2;180;60;0mgrad_norm: 9.0338 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:12:53) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:12:53) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 17 \u001b[32mloss: 8.4151 \u001b[38;2;180;60;0mgrad_norm: 9.0338 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:12:56) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:12:56) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 18 \u001b[32mloss: 8.1653 \u001b[38;2;180;60;0mgrad_norm: 4.2390 
\u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:13:10) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:13:10) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 18 \u001b[32mloss: 8.1653 \u001b[38;2;180;60;0mgrad_norm: 4.2390 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:13:13) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:13:13) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 19 \u001b[32mloss: 8.0662 \u001b[38;2;180;60;0mgrad_norm: 8.6250 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:13:27) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:13:27) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 19 \u001b[32mloss: 8.0662 \u001b[38;2;180;60;0mgrad_norm: 8.6250 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:13:30) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:13:30) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 20 \u001b[32mloss: 13.0456 \u001b[38;2;180;60;0mgrad_norm: 556.3015 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:13:45) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:13:45) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 20 \u001b[32mloss: 13.0456 \u001b[38;2;180;60;0mgrad_norm: 556.3015 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:13:48) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:13:48) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 21 \u001b[32mloss: 8.4873 \u001b[38;2;180;60;0mgrad_norm: 16.7799 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:14:02) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:14:02) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 21 \u001b[32mloss: 8.4873 \u001b[38;2;180;60;0mgrad_norm: 16.7799 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:14:05) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:14:05) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 22 \u001b[32mloss: 8.1071 \u001b[38;2;180;60;0mgrad_norm: 5.1292 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:14:20) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:14:20) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m 
\u001b[31mstep: 22 \u001b[32mloss: 8.1071 \u001b[38;2;180;60;0mgrad_norm: 5.1292 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:14:23) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:14:23) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 23 \u001b[32mloss: 8.0859 \u001b[38;2;180;60;0mgrad_norm: 28.9328 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:14:37) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:14:37) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 23 \u001b[32mloss: 8.0859 \u001b[38;2;180;60;0mgrad_norm: 28.9328 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.75 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:14:40) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:14:40) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 24 \u001b[32mloss: 8.2353 \u001b[38;2;180;60;0mgrad_norm: 6.1007 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.77 \u001b[35mmfu: 0.89%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:14:54) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:14:54) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 24 \u001b[32mloss: 8.2353 \u001b[38;2;180;60;0mgrad_norm: 6.1007 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.77 \u001b[35mmfu: 0.89%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:14:57) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:14:57) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m \u001b[31mstep: 25 \u001b[32mloss: 8.3818 \u001b[38;2;180;60;0mgrad_norm: 3.8415 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:15:11) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:07:55) >>>\u001b[0m\n", + "\u001b[33m[14 similar log lines]\u001b[0m Done training\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:15:14) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:15:11) >>>\u001b[0m\n", + "\u001b[33m[15 similar log lines]\u001b[0m \u001b[31mstep: 25 \u001b[32mloss: 8.3818 \u001b[38;2;180;60;0mgrad_norm: 3.8415 \u001b[38;2;54;234;195mmemory: 18.47GiB(41.37%) \u001b[34mtps: 59 \u001b[36mtflops: 2.76 \u001b[35mmfu: 0.88%\u001b[39m\n", + "\u001b[33m[31 similar log lines]\u001b[0m Training completed\n", + "\u001b[33m[16 similar log lines]\u001b[0m Training starts at step 26\n", + "\u001b[33m[2 similar log lines]\u001b[0m Sleeping 2 seconds for other ranks to complete\n", + "\u001b[33m[16 similar log lines]\u001b[0m Profiling active. 
Traces will be saved at /teamspace/studios/this_studio/torchtitan/outputs/monarch-alisol-hosts2-gpus8/profile_trace\n", + "\u001b[33m[14 similar log lines]\u001b[0m Process group destroyed.\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:15:14) <<<\u001b[0m\n", + "\n", + "\u001b[36m>>> Aggregated Logs (" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m>>> Aggregated Logs (2025-10-14 01:15:14) >>>\u001b[0m\n", + "\u001b[33m[2 similar log lines]\u001b[0m Done training\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:15:17) <<<\u001b[0m\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-10-14 01:15:14) >>>\u001b[0m\n", + "\u001b[33m[1 similar log lines]\u001b[0m Training completed\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: updating run metadata\n", + "\u001b[33m[3 similar log lines]\u001b[0m wandb: \n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run history:\n", + "\u001b[33m[2 similar log lines]\u001b[0m wandb: grad_norm ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: loss_metrics/global_avg_loss ▇▆▅▆▆▇▆▅▄▄▃▃▂▂▂▂▁▁▁█▂▁▁▁▁\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: loss_metrics/global_max_loss ▅▄▄▅▄▆▅▄▃▃▂▄▃▂▂▁▂▁▂█▂▂▂▁▂\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: lr ▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇██\n", + "\u001b[33m[4 similar log lines]\u001b[0m wandb: memory/max_active(%) ▁████████████████████████\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/num_alloc_retries ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\n", + "\u001b[33m[2 similar log lines]\u001b[0m wandb: +7 ...\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Run summary:\n", + "\u001b[33m[2 similar log lines]\u001b[0m Process group destroyed.\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: grad_norm 3.84154\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: loss_metrics/global_avg_loss 8.38181\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: loss_metrics/global_max_loss 10.08879\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: lr 0.0003\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_active(%) 24.73506\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_active(GiB) 11.04117\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_reserved(%) 41.37035\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/max_reserved(GiB) 18.4668\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/num_alloc_retries 0\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: memory/num_ooms 0\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: 🚀 View run magic-fire-38 at: https://wandb.ai/a-shamsoshoara-m/torchtitan/runs/napbrhpr\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: ⭐️ View project at: https://wandb.ai/a-shamsoshoara-m/torchtitan\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)\n", + "\u001b[33m[1 similar log lines]\u001b[0m wandb: Find logs at: ./torchtitan/outputs/monarch-alisol-hosts2-gpus8/tb/20251014-0107/wandb/run-20251014_010756-napbrhpr/logs\n", + "\u001b[36m<<< Aggregated Logs (2025-10-14 01:15:17) <<<\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "init_logger()\n", + "config_manager = ConfigManager()\n", + "\n", + "job_name = get_job_name(NUM_NODES, NUM_GPUS)\n", + "\n", + "manual_args = [\n", + " \"--job.config_file\",\n", + " 
os.path.expanduser(\"/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml\"),\n", + " \"--model.tokenizer-path\",\n", + " # f\"{FUSE_DST}/Llama-3.1-8B\",\n", + " \"/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B\",\n", + " \"--training.steps\",\n", + " \"25\",\n", + " \"--training.dataset_path\",\n", + " # f\"{FUSE_DST}/c4\",\n", + " \"/teamspace/studios/this_studio/torchtitan/tests/assets/c4_test\",\n", + " \"--job.dump_folder\",\n", + " # f\"{FUSE_DST}/outputs/\" + job_name,\n", + " \"/teamspace/studios/this_studio/torchtitan/outputs/\" + job_name,\n", + " \"--training.seq_len\",\n", + " \"1024\",\n", + " # \"8192\",\n", + " ]\n", + "config = config_manager.parse_args(manual_args)\n", + "await async_main(config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "proc_mesh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dir(TitanTrainerWrapper)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await proc_mesh.stop()" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/lightning/utils/ip_utils.py b/examples/lightning/utils/ip_utils.py new file mode 100644 index 000000000..85e41a6b6 --- /dev/null +++ b/examples/lightning/utils/ip_utils.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from utils.master_node import MasterNodeServer + + +def get_master_ips(): + """ + Get private and public IP addresses of the master node. + + Returns: + tuple: (private_master_host_ip_address, public_master_host_ip_address) + """ + private_master_host_ip_address = MasterNodeServer.get_master_ip() + public_master_host_ip_address = MasterNodeServer.get_master_public_ip_curl() + print(f"{private_master_host_ip_address=}") + print(f"{public_master_host_ip_address=}") + return private_master_host_ip_address, public_master_host_ip_address + + +def extract_ips_simple(file_path): + """ + Simple extraction assuming each line contains an IP address. + """ + ip_set = set() + + try: + with open(file_path, "r") as file: + for line in file: + ip = line.strip() + if ip: # Skip empty lines + ip_set.add(ip) + except FileNotFoundError: + print(f"Error: File {file_path} not found") + except Exception as e: + print(f"Error reading file: {e}") + + return ip_set + + +def check_ips_available(job, num_nodes): + """ + Extract IP addresses from job machines and check if they are available. + + Args: + job: MMT job object with machines attribute + num_nodes: Expected number of nodes + + Returns: + tuple: (ips_available flag, ip_addresses_set) + """ + ip_addresses_list = [machine.public_ip for machine in job.machines] + ip_addresses_set = set(ip_addresses_list) + print(f"{ip_addresses_list=}") + print(f"{ip_addresses_set=}") + ips_available = not ip_addresses_set == {""} and len(ip_addresses_set) == num_nodes + print(f"IP addresses are available: {ips_available}") + return ips_available, ip_addresses_set + + +def create_tcp_addresses(ip_addresses_set, port): + """ + Create TCP addresses from a set of IP addresses and a port. 
+ + Args: + ip_addresses_set: Set of IP addresses + port: Port number to use + + Returns: + list: List of TCP addresses in the format "tcp!{ip}:{port}" + """ + tcp_addresses = [f"tcp!{ip}:{port}" for ip in ip_addresses_set] + print(*tcp_addresses) + return tcp_addresses diff --git a/examples/lightning/utils/master_node.py b/examples/lightning/utils/master_node.py new file mode 100644 index 000000000..b7324ce6c --- /dev/null +++ b/examples/lightning/utils/master_node.py @@ -0,0 +1,341 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os +import signal +import socket +import subprocess +import sys +import threading +import time +import urllib.request +from http.server import BaseHTTPRequestHandler, HTTPServer + + +class MasterNodeServer: + def __init__(self, expected_workers=2, max_wait_hours=2, port=8080): + self.worker_nodes = [] + self.expected_workers = expected_workers + self.max_wait_hours = max_wait_hours + self.server_running = True + self.httpd = None + self.port = port + self.master_ip = self.get_master_public_ip_curl() + + @staticmethod + def get_master_ip(): + hostname = socket.gethostname() + return socket.gethostbyname(hostname) + + @staticmethod + def get_master_public_ip(): + """Get the public IP address of the master node by querying an external service""" + try: + # Try multiple services in case one is down + services = [ + "https://api.ipify.org", + "https://checkip.amazonaws.com", + "https://ipecho.net/plain", + ] + + for service in services: + try: + with urllib.request.urlopen(service, timeout=10) as response: + public_ip = response.read().decode("utf-8").strip() + # Basic validation that we got an IP address + if "." in public_ip and len(public_ip.split(".")) == 4: + return public_ip + except Exception: + continue + + # If all services fail, return None + return None + + except Exception as e: + print(f"Error getting public IP: {e}") + return None + + @staticmethod + def get_master_public_ip_curl(): + """Get the public IP address using curl command (simpler approach)""" + try: + result = subprocess.run( + ["curl", "-4", "ifconfig.me"], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + public_ip = result.stdout.strip() + # Basic validation that we got an IP address + if "." 
in public_ip and len(public_ip.split(".")) == 4: + return public_ip + + return None + + except Exception as e: + print(f"Error getting public IP with curl: {e}") + return None + + def create_handler_class(self): + """Create handler class with access to server instance""" + server_instance = self + + class NodeRegistrationHandler(BaseHTTPRequestHandler): + def do_POST(self): + if self.path == "/register": + try: + content_length = int(self.headers["Content-Length"]) + post_data = self.rfile.read(content_length) + node_info = json.loads(post_data.decode("utf-8")) + + # Store worker node IP (avoid duplicates) + if node_info["ip"] not in server_instance.worker_nodes: + server_instance.worker_nodes.append(node_info["ip"]) + print( + f"Registered worker node: {node_info['ip']} ({len(server_instance.worker_nodes)}/{server_instance.expected_workers})" + ) + + # Save to file immediately when each node registers + server_instance.save_worker_nodes_to_file() + + self.send_response(200) + self.end_headers() + self.wfile.write(b"OK") + + # Check if we have all workers + if ( + len(server_instance.worker_nodes) + >= server_instance.expected_workers + ): + print("All worker nodes registered!") + # Signal the server to stop + threading.Thread( + target=server_instance.stop_server_delayed + ).start() + + except Exception as e: + print(f"Error processing registration: {e}") + self.send_response(500) + self.end_headers() + + def do_GET(self): + if self.path == "/status": + status = { + "registered_workers": len(server_instance.worker_nodes), + "expected_workers": server_instance.expected_workers, + "worker_ips": server_instance.worker_nodes, + } + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(json.dumps(status).encode("utf-8")) + + return NodeRegistrationHandler + + def save_worker_nodes_to_file(self): + """Save worker node IPs to files""" + # Save IPs + with open("/tmp/worker_nodes.txt", "w") as f: + for ip in self.worker_nodes: + f.write(f"{ip}\n") + + # Save count + with open("/tmp/worker_count.txt", "w") as f: + f.write(f"{len(self.worker_nodes)}\n") + + def stop_server_delayed(self): + """Stop server after a short delay""" + time.sleep(2) # Give time for the response to be sent + self.server_running = False + if self.httpd: + self.httpd.shutdown() + + def setup_signal_handlers(self): + """Setup signal handlers for graceful shutdown""" + + def signal_handler(sig, frame): + print("\nShutting down server...") + self.server_running = False + if self.httpd: + self.httpd.shutdown() + sys.exit(0) + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + def start_registration_server(self): + """Start the HTTP registration server with safe port handling""" + print(f"Starting server on port {self.port}...") + handler_class = self.create_handler_class() + + try: + # Create server with socket reuse option + self.httpd = HTTPServer(("0.0.0.0", self.port), handler_class) + + # Enable socket reuse to handle TIME_WAIT states + self.httpd.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + + print(f"Server started on {self.master_ip}:{self.port}") + + # Handle requests until all workers register + while ( + self.server_running and len(self.worker_nodes) < self.expected_workers + ): + try: + self.httpd.timeout = 10 + self.httpd.handle_request() + except KeyboardInterrupt: + break + except Exception as e: + print(f"Server error: {e}") + time.sleep(1) + + print("Registration server stopped") + + 
except OSError as e: + if e.errno == 98: # Address already in use + print(f"Port {self.port} is busy!") + print(f"Solutions:") + print(f" 1. Wait a few minutes and try again") + print(f" 2. Restart your notebook kernel") + print(f" 3. Check: lsof -i :{self.port} (to see what's using it)") + raise RuntimeError( + f"Port {self.port} is not available. Please try again in a few minutes." + ) + else: + print(f"Server error: {e}") + raise + + finally: + # Clean shutdown + if self.httpd: + try: + self.httpd.socket.close() + except: + pass + + def wait_for_workers_with_status(self): + """Wait for workers and show periodic status updates""" + start_time = time.time() + + while len(self.worker_nodes) < self.expected_workers: + elapsed = int(time.time() - start_time) + print( + f"Waiting for workers... ({len(self.worker_nodes)}/{self.expected_workers} registered) - Elapsed: {elapsed}s" + ) + time.sleep(30) # Status update every 30 seconds + + # Optional: Add a maximum wait time if needed + if self.max_wait_hours > 0 and elapsed > self.max_wait_hours * 3600: + print(f"Timeout waiting for workers after {self.max_wait_hours} hours") + break + + def save_cluster_info(self): + """Save complete cluster information to files""" + # Save master IP + with open("/tmp/master_ip.txt", "w") as f: + f.write(self.master_ip) + + # Save worker IPs + self.save_worker_nodes_to_file() + + # Save complete cluster info + cluster_info = { + "master_ip": self.master_ip, + "worker_ips": self.worker_nodes, + "total_workers": len(self.worker_nodes), + "expected_workers": self.expected_workers, + "registration_complete": len(self.worker_nodes) >= self.expected_workers, + } + + with open("/tmp/cluster_info.json", "w") as f: + json.dump(cluster_info, f, indent=2) + + return cluster_info + + def run(self): + """Main method to run the master server""" + self.setup_signal_handlers() + + print(f"Master node IP: {self.master_ip}") + print(f"Expecting {self.expected_workers} worker nodes to register...") + + # Start registration server in background + server_thread = threading.Thread(target=self.start_registration_server) + server_thread.daemon = True + server_thread.start() + + # Wait for worker nodes with status updates + status_thread = threading.Thread(target=self.wait_for_workers_with_status) + status_thread.daemon = True + status_thread.start() + + # Keep main thread alive until all workers register + server_thread.join() + + print(f"Final registered worker nodes: {self.worker_nodes}") + + # Save all cluster information + cluster_info = self.save_cluster_info() + + print("Worker IPs saved to /tmp/worker_nodes.txt") + print("Cluster info saved to /tmp/cluster_info.json") + + return cluster_info + + +def run_master_server(expected_workers=2, max_wait_hours=0, port=8080): + """ + Notebook-friendly function to run the master server. 
+ + Args: + expected_workers: Number of worker nodes to expect + max_wait_hours: Maximum hours to wait (0 for no limit) + port: Port for the registration HTTP server (default: 8080) + + Returns: + dict: Cluster information when complete + """ + server = MasterNodeServer( + expected_workers=expected_workers, max_wait_hours=max_wait_hours, port=port + ) + return server.run() + + +def main(): + """Command line interface""" + import argparse + + # Parse command line arguments + parser = argparse.ArgumentParser(description="Master node registration server") + parser.add_argument( + "--expected-workers", + type=int, + default=2, + help="Number of worker nodes to expect (default: 2)", + ) + parser.add_argument( + "--max-wait-hours", + type=int, + default=0, + help="Maximum hours to wait for workers (default: 0 for no limit)", + ) + + args = parser.parse_args() + + # Create and run server + server = MasterNodeServer( + expected_workers=args.expected_workers, max_wait_hours=args.max_wait_hours + ) + + cluster_info = server.run() + return cluster_info + + +if __name__ == "__main__": + main() diff --git a/examples/lightning/utils/mesh_utils.py b/examples/lightning/utils/mesh_utils.py new file mode 100644 index 000000000..6a16fa4b3 --- /dev/null +++ b/examples/lightning/utils/mesh_utils.py @@ -0,0 +1,116 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import os + +# To force Monarch to use V0 for this Notebook (This will be removed in the future) +os.environ["MONARCH_V0_WORKAROUND_DO_NOT_USE"] = "1" + +from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints, AllocSpec +from monarch._src.actor.allocator import RemoteAllocator, StaticRemoteAllocInitializer +from monarch.actor import ProcMesh +from utils.ip_utils import check_ips_available, create_tcp_addresses, get_master_ips + +# Monarch default port configuration +MONARCH_DEFAULT_PORT = 26600 + +# Client allowed port range for Monarch communication +CLIENT_ALLOWED_PORT_RANGE = "26600..26610" +os.environ["MONARCH_FILE_LOG"] = "debug" + + +def setup_allocator(tcp_addresses, public_master_host_ip_address, num_nodes, num_gpus): + """ + Set up the RemoteAllocator and allocate resources. + + Args: + tcp_addresses: List of TCP addresses for remote allocation + public_master_host_ip_address: Public IP address of the master host + num_nodes: Number of nodes to allocate + num_gpus: Number of GPUs per node + + Returns: + tuple: (allocator, alloc) + """ + os.environ["HYPERACTOR_REMOTE_ALLOC_ALLOWED_PORT_RANGE"] = CLIENT_ALLOWED_PORT_RANGE + os.environ["HYPERACTOR_REMOTE_ALLOC_BOOTSTRAP_ADDR"] = ( + f"tcp!{public_master_host_ip_address}:0" + ) + os.environ["HYPERACTOR_REMOTE_ALLOC_BIND_TO_INADDR_ANY"] = "true" + os.environ["MONARCH_FILE_LOG"] = "debug" + + allocator = RemoteAllocator( + world_id="foo", + initializer=StaticRemoteAllocInitializer(*tcp_addresses), + ) + + alloc = allocator.allocate( + AllocSpec(AllocConstraints(), hosts=num_nodes, gpus=num_gpus) + ) + + print(alloc) + return allocator, alloc + + +def create_proc_mesh(alloc): + """ + Create a ProcMesh from an allocation. 
+ + Args: + alloc: Allocation object from RemoteAllocator + + Returns: + ProcMesh: Process mesh created from the allocation + """ + proc_mesh = ProcMesh.from_alloc(alloc) + return proc_mesh + + +def setup_proc_mesh_from_job(job=None, num_nodes=2, num_gpus=8, port=MONARCH_DEFAULT_PORT, ip_addresses_set=None): + """ + High-level function to set up ProcMesh from an MMT job. + + This function handles all the low-level details of: + - Getting master node IPs + - Checking IP availability + - Creating TCP addresses + - Setting up allocator + - Creating proc_mesh + + Args: + job: MMT job object with machines attribute + num_nodes: Number of nodes to allocate + num_gpus: Number of GPUs per node + port: Port number to use for TCP connections (default: 26600 - Monarch default port) + ip_addresses_set: Optional set of remote worker IP addresses; when provided, IP discovery from the job is skipped + + Returns: + ProcMesh: Process mesh ready to use for distributed training + """ + if not ip_addresses_set: + # Check IP availability and get IP addresses + ips_available, ip_addresses_set = check_ips_available(job, num_nodes) + + if not ips_available: + raise RuntimeError( + f"IPs are not available. Expected {num_nodes} nodes, got {len(ip_addresses_set)}" + ) + + # Get master IPs (internal use only) + _, public_master_host_ip_address = get_master_ips() + + # Create TCP addresses + tcp_addresses = create_tcp_addresses(ip_addresses_set, port) + + # Setup allocator and get allocation + allocator, alloc = setup_allocator( + tcp_addresses, public_master_host_ip_address, num_nodes, num_gpus + ) + + # Create and return proc_mesh + proc_mesh = create_proc_mesh(alloc) + + return proc_mesh diff --git a/examples/lightning/utils/worker_node.py b/examples/lightning/utils/worker_node.py new file mode 100644 index 000000000..eeabfdb54 --- /dev/null +++ b/examples/lightning/utils/worker_node.py @@ -0,0 +1,161 @@ +import argparse +import json +import random +import socket +import subprocess +import sys +import time + +import requests + + +def get_local_ip(): + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + sock.connect(("8.8.8.8", 80)) + local_ip = sock.getsockname()[0] + except Exception: + local_ip = "127.0.0.1" + finally: + sock.close() + return local_ip + + +def get_public_ip(): + """Get the public IP address of this worker node using external service.""" + try: + # Using ipify.org service to get public IP + response = requests.get("https://api.ipify.org", timeout=10) + if response.status_code == 200: + return response.text.strip() + except Exception as e: + print(f"Failed to get public IP from ipify.org: {e}") + + # Fallback to alternative service + try: + response = requests.get("https://icanhazip.com", timeout=10) + if response.status_code == 200: + return response.text.strip() + except Exception as e: + print(f"Failed to get public IP from icanhazip.com: {e}") + + # If all external services fail, return None + print("Unable to determine public IP address") + return None + + +def get_public_ip_with_curl(): + """Get the public IP address using curl command line tool.""" + try: + result = subprocess.run( + ["curl", "-4", "ifconfig.me"], capture_output=True, text=True, timeout=15 + ) + + if result.returncode == 0 and result.stdout.strip(): + ip = result.stdout.strip() + # Basic validation - check if it looks like an IP address + parts = ip.split(".") + if len(parts) == 4 and all( + part.isdigit() and 0 <= int(part) <= 255 for part in parts + ): + return ip + + except subprocess.TimeoutExpired: + print("Curl request to ifconfig.me timed out") + except 
Exception as e: + print(f"Failed to get public IP using curl: {e}") + + # Try ifconfig as a fallback to get network interface info + try: + result = subprocess.run( + ["ifconfig"], capture_output=True, text=True, timeout=10 + ) + + if result.returncode == 0: + # This won't give us the public IP directly, but can help debug network issues + print("Network interfaces available (for debugging):") + lines = result.stdout.split("\n") + for line in lines[:10]: # Show first 10 lines + if line.strip(): + print(f" {line}") + + except Exception as e: + print(f"Failed to run ifconfig: {e}") + + print("Unable to determine public IP address using curl") + return None + + +def register_with_master(master_ip, master_port=8080): + # worker_ip = get_local_ip() + worker_ip = get_public_ip_with_curl() + hostname = socket.gethostname() + + registration_data = { + "ip": worker_ip, + "hostname": hostname, + "timestamp": time.time(), + } + + print( + f"Worker {hostname} trying to register IP {worker_ip} with master {master_ip} and {master_port}" + ) + + # Retry indefinitely with exponential backoff + attempt = 0 + base_delay = 5 + max_delay = 300 # 5 minutes max delay + + while True: + try: + response = requests.post( + f"http://{master_ip}:{master_port}/register", + data=json.dumps(registration_data), + headers={"Content-Type": "application/json"}, + timeout=10, + ) + + if response.status_code == 200: + print(f"Successfully registered worker IP {worker_ip} with master") + return True + + except Exception as e: + attempt += 1 + # Exponential backoff with jitter + delay = min(base_delay * (2 ** min(attempt, 6)), max_delay) + jitter = random.uniform(0.5, 1.5) + actual_delay = delay * jitter + + print(f"Attempt {attempt} failed: {e}") + print(f"Retrying in {actual_delay:.1f} seconds...") + time.sleep(actual_delay) + + +def main(): + parser = argparse.ArgumentParser(description="Worker node registration") + parser.add_argument("master_ip", help="IP address of the master node") + parser.add_argument( + "master_port", + help="PORT address of the connection between the worker and master node", + ) + + args = parser.parse_args() + + print( + f"Starting worker registration with master: {args.master_ip} and PORT {args.master_port}" + ) + + # Add a small random delay to avoid all workers hitting at once + time.sleep(random.uniform(1, 10)) + + success = register_with_master(args.master_ip, args.master_port) + if success: + print("Registration completed successfully") + return 0 + else: + print("Registration failed") + return 1 + + +if __name__ == "__main__": + exit(main())
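Usage sketch: the three utility modules above are meant to be used together from the notebook. run_master_server collects worker IPs over HTTP on the Studio (master) node, worker_node.py registers each worker with that server, and setup_proc_mesh_from_job turns the collected IPs into a Monarch ProcMesh. The minimal wiring below is a sketch, not part of the patch; it assumes the modules are importable as utils.* (as mesh_utils.py itself assumes), a registration port of 8080, and a 2-node, 8-GPU-per-node job, so expected_workers, num_nodes, and num_gpus should be adjusted to match the actual MMT job.

    # On the Studio (master) node: block until every worker has registered.
    # Each worker node runs:  python utils/worker_node.py <master_public_ip> 8080
    from utils.master_node import run_master_server
    from utils.mesh_utils import setup_proc_mesh_from_job

    cluster_info = run_master_server(expected_workers=2, max_wait_hours=0, port=8080)

    # Hand the registered worker IPs straight to the mesh helper, bypassing the
    # MMT-job IP discovery path, and build the Monarch ProcMesh on those hosts.
    proc_mesh = setup_proc_mesh_from_job(
        num_nodes=2,
        num_gpus=8,
        ip_addresses_set=set(cluster_info["worker_ips"]),
    )

Because register_with_master retries with exponential backoff until the master responds, workers can be launched before or after the registration server without changing this flow.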