Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
72e605e
add the lightning dir in example
AlirezaShamsoshoara Aug 7, 2025
053f7c3
add ip utils
AlirezaShamsoshoara Aug 9, 2025
300b245
add master node to start the http server
AlirezaShamsoshoara Aug 6, 2025
b01b608
add worker node as a http client to register the ipv4
AlirezaShamsoshoara Aug 2, 2025
5136bbc
update public ip address
AlirezaShamsoshoara Sep 29, 2025
e7344ef
Merge the upstream to the local alidev_lightning_monarch
AlirezaShamsoshoara Sep 29, 2025
325644b
add public ip address to the worker nodes
AlirezaShamsoshoara Sep 29, 2025
e39603d
fix the issues of the public ip with open port
AlirezaShamsoshoara Sep 29, 2025
25aaa86
add http server port as a variable
AlirezaShamsoshoara Oct 1, 2025
3c19de8
add the new NB changes for port override for aws on v0
AlirezaShamsoshoara Oct 14, 2025
a0d51c1
add the monarch v1 example using host mesh
AlirezaShamsoshoara Oct 15, 2025
fb52856
Update the Monarch Hero Notebook
AlirezaShamsoshoara Oct 17, 2025
ca5077a
clean up the Notebook for sharing
AlirezaShamsoshoara Oct 18, 2025
b1d33ec
fix the steps and hyperlinks
AlirezaShamsoshoara Oct 18, 2025
43ab50f
testing the html anchor
AlirezaShamsoshoara Oct 18, 2025
14a54ab
test the anchor name
AlirezaShamsoshoara Oct 18, 2025
305f6f2
test the anchor name
AlirezaShamsoshoara Oct 18, 2025
d0f9c46
remove all hyperlinks as they only work in vscode
AlirezaShamsoshoara Oct 18, 2025
e12a4ea
fix a typo
AlirezaShamsoshoara Oct 18, 2025
61083f0
run 16 nodes training and add all details for master NB
AlirezaShamsoshoara Oct 20, 2025
1edb067
breakdown to three notebooks
HamidShojanazeri Oct 20, 2025
5829043
update tweaks
HamidShojanazeri Oct 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions examples/lightning/assets/NB_Monarch_Lightning.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/lightning/assets/nodes_pending.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/lightning/assets/nodes_ready.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/lightning/assets/setup_status.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2,488 changes: 2,488 additions & 0 deletions examples/lightning/monarch_lightning.ipynb

Large diffs are not rendered by default.

811 changes: 811 additions & 0 deletions examples/lightning/monarch_titan_mmt.ipynb

Large diffs are not rendered by default.

420 changes: 420 additions & 0 deletions examples/lightning/monarch_v1_titan_aws.ipynb

Large diffs are not rendered by default.

730 changes: 730 additions & 0 deletions examples/lightning/studio_0_monarch_basics.ipynb

Large diffs are not rendered by default.

531 changes: 531 additions & 0 deletions examples/lightning/studio_1_getting_started.ipynb

Large diffs are not rendered by default.

542 changes: 542 additions & 0 deletions examples/lightning/studio_2_workspace_sync.ipynb

Large diffs are not rendered by default.

678 changes: 678 additions & 0 deletions examples/lightning/studio_3_interactive_debugging.ipynb

Large diffs are not rendered by default.

908 changes: 908 additions & 0 deletions examples/lightning/titan_monarch_mmt_aws_portOverride.ipynb

Large diffs are not rendered by default.

77 changes: 77 additions & 0 deletions examples/lightning/utils/ip_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from utils.master_node import MasterNodeServer


def get_master_ips():
    """
    Look up the master node's private and public IP addresses.

    Both addresses are obtained from ``MasterNodeServer`` and echoed to
    stdout (as ``name=repr(value)``) before being returned.

    Returns:
        tuple: (private_master_host_ip_address, public_master_host_ip_address)
    """
    private_ip = MasterNodeServer.get_master_ip()
    public_ip = MasterNodeServer.get_master_public_ip_curl()
    # Prints the label followed by repr() of the value, one line per address.
    for label, address in (
        ("private_master_host_ip_address", private_ip),
        ("public_master_host_ip_address", public_ip),
    ):
        print(f"{label}={address!r}")
    return private_ip, public_ip


def extract_ips_simple(file_path):
    """
    Collect the distinct non-blank lines of a text file as IP addresses.

    Each line is stripped of surrounding whitespace and taken verbatim; the
    address format is not validated. Read problems are reported on stdout
    instead of being raised.

    Args:
        file_path: Path of the file holding one IP address per line

    Returns:
        set: Stripped, non-empty lines read from the file (empty when the
            file could not be opened)
    """
    addresses = set()

    try:
        with open(file_path, "r") as handle:
            # update() adds entries as it iterates, so anything read before a
            # mid-file error is still kept in the result.
            addresses.update(entry for entry in map(str.strip, handle) if entry)
    except FileNotFoundError:
        print(f"Error: File {file_path} not found")
    except Exception as e:
        print(f"Error reading file: {e}")

    return addresses


def check_ips_available(job, num_nodes):
    """
    Collect the public IPs of a job's machines and check that all are assigned.

    The IPs count as available only when every collected address is non-empty
    and the number of distinct addresses equals ``num_nodes``. A machine whose
    ``public_ip`` is still blank (or ``None``) therefore makes the check fail
    even if the other machines already have addresses.

    Args:
        job: MMT job object with machines attribute
        num_nodes: Expected number of nodes

    Returns:
        tuple: (ips_available flag, ip_addresses_set). The set is returned
            unfiltered, so it may contain a blank entry when the flag is False.
    """
    ip_addresses_list = [machine.public_ip for machine in job.machines]
    ip_addresses_set = set(ip_addresses_list)
    print(f"{ip_addresses_list=}")
    print(f"{ip_addresses_set=}")
    # Reject any blank entry, not only the case where every entry is blank:
    # {"", "1.2.3.4"} has two members but only one usable address.
    all_assigned = all(ip_addresses_set)
    ips_available = all_assigned and len(ip_addresses_set) == num_nodes
    print(f"IP addresses are available: {ips_available}")
    return ips_available, ip_addresses_set


def create_tcp_addresses(ip_addresses_set, port):
    """
    Build one TCP address string per IP address for the given port.

    The resulting addresses are also printed on a single space-separated
    line. Their order follows the iteration order of ``ip_addresses_set``.

    Args:
        ip_addresses_set: Set of IP addresses
        port: Port number to use

    Returns:
        list: List of TCP addresses in the format "tcp!{ip}:{port}"
    """
    tcp_addresses = []
    for ip in ip_addresses_set:
        tcp_addresses.append(f"tcp!{ip}:{port}")
    print(*tcp_addresses)
    return tcp_addresses
Loading