diff --git a/cloudci/.gitignore b/cloudci/.gitignore
new file mode 100644
index 00000000..9ebbe595
--- /dev/null
+++ b/cloudci/.gitignore
@@ -0,0 +1,3 @@
+state/*
+.terraform
+.terraform.lock.hcl
diff --git a/cloudci/CLOUDCI.md b/cloudci/CLOUDCI.md
new file mode 100644
index 00000000..b21d5ef6
--- /dev/null
+++ b/cloudci/CLOUDCI.md
@@ -0,0 +1,64 @@
+# Proposal: Cloud-Based CI for IsaacLab Arena
+
+## Motivation
+
+Looking at the current CI setup and its challenges, it seems that adding a cloud-based CI could help with:
+
+- Faster turnaround times on MRs due to the ability to scale up and down as needed
+- Testing on a variety of GPUs, including the ones users are likely to run on (Isaac Lab is often used in the cloud) - A10G, L4, L40, T4, etc.
+- Cost effectiveness (more on that below)
+
+Initially, it can be implemented on AWS (A10G, L4, L40S, T4, H100, B100, GB200), and later expanded to GCP (T4, L4, RTX PRO 6000 - GA soon, B200, GB200) and Azure (A10, H100, TBD) as needed.
+
+## Cost effectiveness analysis
+
+Here is a table outlining the cost of an hour of use on various GPU instances (on-demand, single GPU, 256GB permanent storage):
+
+| Cloud Provider | GPU Model | Instance Type | Approx. Hourly Cost |
+| --- | --- | --- | --- |
+| AWS | T4 | g4dn.xlarge | ~$0.6 |
+| AWS | A10G | g5.xlarge | ~$1.3 |
+| AWS | L4 | g6.xlarge | ~$1 |
+| AWS | L40S | g6e.xlarge | ~$2 |
+| AWS | H100 | p5.4xlarge | ~$7 |
+| AWS | B100 | TBD | TBD |
+| AWS | GB200 | TBD | TBD |
+
+(to be continued with GCP and Azure data)
+
+These costs are charged only while the instance is running. If it is stopped, only the minimal storage and IP address costs are incurred. This means that if the CI jobs are not run for some time, the costs are minimal as instances can be automatically stopped.
+
+## Architecture
+
+Cloud CI setup will be very loosely based on the Isaac Automator ([https://github.com/isaac-sim/IsaacAutomator](https://github.com/isaac-sim/IsaacAutomator)), but will be simplified to avoid the need to support any unneeded functionality.
+
+There will be 3 main components, plus an optional fourth:
+
+1. CLI tool to:
+   - Deploy an instance
+   - Connect to the instance
+   - Start, stop an instance
+   - Destroy an instance and all associated resources
+2. Terraform scripts for clouds that are implemented, starting with AWS
+3. Ansible tasks to configure the instance
+4. Optionally Packer, to be able to save AMIs for faster startup times and consistent environments.
+
+## Security considerations
+
+Isaac Arena is open source, so there should not be any concerns with running upcoming changes in the cloud, especially if the access is properly secured.
+
+Access control can be implemented by:
+
+- Limiting range of IP addresses that can connect to the instances
+- Setting up reliable key-based authentication for every new instance
+- Using least-privilege IAM (or equivalents) roles
+
+## Timeline
+
+Since this can be based on existing Isaac Automator code, the timeline can be relatively short:
+
+| Step | Duration | Outcome |
+|--------------------|----------|-----------------------------------------|
+| Prototype | 2 weeks | Cloud CI workflow working "somehow" |
+| Testing & Polishing| 2 weeks | Other members of Isaac Arena team are happy with it |
+| Final Integration | 1 week | MR is merged and CI is run (as needed or regularly) using the cloud CI tools |
diff --git a/cloudci/Dockerfile b/cloudci/Dockerfile
new file mode 100644
index 00000000..70d52b47
--- /dev/null
+++ b/cloudci/Dockerfile
@@ -0,0 +1,60 @@
+# Tooling image for the IsaacLab Arena Cloud CI workflow: OpenTofu, the AWS CLI
+# and a few helper utilities, with the cloudci sources under /cloudci.
+
+FROM ubuntu:24.04
+
+# build-time only: keeps apt non-interactive without leaking into the runtime env
+ARG DEBIAN_FRONTEND=noninteractive
+
+ENV force_color_prompt=yes \
+    PYTHONPATH=/cloudci
+
+# base utilities; update and install share one layer so the package index used
+# by the install step can never be a stale cached one
+RUN apt-get update \
+    && apt-get install -qy \
+        curl \
+        jq \
+        openssh-client \
+        python3-pip \
+        unzip \
+        wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# scratch directory for the installers below; each step removes what it downloads
+WORKDIR /tmp
+
+# opentofu (instead of terraform)
+# https://opentofu.org/docs/intro/install/deb/
+RUN curl --proto '=https' --tlsv1.2 -fsSL https://get.opentofu.org/install-opentofu.sh -o install-opentofu.sh \
+    && chmod +x install-opentofu.sh \
+    && ./install-opentofu.sh --install-method deb \
+    && rm -rf install-opentofu.sh /var/lib/apt/lists/*
+
+# aws cli
+# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
+# fails the build on an architecture with no known download, and on HTTP errors (-f)
+RUN case "$(dpkg --print-architecture)" in \
+        amd64) aws_arch="x86_64" ;; \
+        arm64) aws_arch="aarch64" ;; \
+        *) echo "unsupported architecture: $(dpkg --print-architecture)" >&2; exit 1 ;; \
+    esac \
+    && curl -fsS "https://awscli.amazonaws.com/awscli-exe-linux-${aws_arch}.zip" -o "awscliv2.zip" \
+    && unzip -q awscliv2.zip \
+    && ./aws/install \
+    && rm -rf awscliv2.zip aws
+
+# copy cloudci code into container
+# NOTE(review): no .dockerignore is visible here, so git-ignored paths such as
+# state/ and .terraform would be copied into the image as well - confirm
+COPY . /cloudci
+
+# customize bash prompt
+RUN echo "export PS1='\[\033[01;31m\][IsaacLab Arena Cloud CI]\[\033[00m\] \w $ '" >> /root/.bashrc
+
+WORKDIR /cloudci
+
+# the command is passed to "sh -c" as a single string (the run script joins its
+# arguments accordingly); default to an interactive bash when none is given
+ENTRYPOINT [ "/bin/sh", "-c" ]
+CMD [ "bash" ]
diff --git a/cloudci/README.md b/cloudci/README.md
new file mode 100644
index 00000000..5dbcad61
--- /dev/null
+++ b/cloudci/README.md
@@ -0,0 +1,28 @@
+# Cloud-Based CI for IsaacLab Arena
+
+## Usage
+
+### Building and Running the CLI
+
+```sh
+cd cloudci && ./build && ./run
+```
+
+### Deploying a GitHub Runner Instance
+
+```sh
+cd cloudci && ./deploy
+```
+
+### Stopping/Starting a Runner Instance
+
+```sh
+cd cloudci && ./stop # To stop the instance
+cd cloudci && ./start # To start the instance
+```
+
+### Destroying the Deployment
+
+```sh
+cd cloudci && ./destroy
+```
diff --git a/cloudci/build b/cloudci/build
new file mode 100755
index 00000000..06e80ef6
--- /dev/null
+++ b/cloudci/build
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+IMAGE="isaaclabarena-cloudci"
+SELF_DIR="$(realpath "$(dirname "${BASH_SOURCE}")")"
+
+# run build and passthrough all the args
+
+# check if we're on mac - use x86 emulation
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    # we're on macOS
+    docker build --platform 
linux/x86_64 -t "${IMAGE}" "${SELF_DIR}" "$@"
+else
+    # we're on Linux
+    docker build -t "${IMAGE}" "${SELF_DIR}" "$@"
+fi
diff --git a/cloudci/opentofu/aws/common/main.tf b/cloudci/opentofu/aws/common/main.tf
new file mode 100644
index 00000000..c6137f6e
--- /dev/null
+++ b/cloudci/opentofu/aws/common/main.tf
@@ -0,0 +1,50 @@
+# shared per-deployment resources: generated ssh key pair, vpc and its internet route
+
+# ssh key generated by opentofu; its public half is registered with aws below
+resource "tls_private_key" "ssh_key" {
+  algorithm = "RSA"
+  rsa_bits  = 4096
+}
+
+resource "aws_key_pair" "keypair" {
+  key_name   = "${var.prefix}.keypair"
+  public_key = tls_private_key.ssh_key.public_key_openssh
+
+  tags = {
+    Name = "${var.prefix}.keypair"
+  }
+}
+
+resource "aws_vpc" "vpc" {
+  cidr_block           = var.vpc_cidr_block
+  enable_dns_hostnames = true
+  tags = {
+    Name = "${var.prefix}.vpc"
+  }
+}
+
+resource "aws_internet_gateway" "vpc_gateway" {
+  vpc_id = aws_vpc.vpc.id
+  tags = {
+    Name = "${var.prefix}.vpc_gateway"
+  }
+}
+
+# routes all non-local traffic of the vpc's default route table through the gateway
+resource "aws_default_route_table" "vpc_route_table" {
+  default_route_table_id = aws_vpc.vpc.default_route_table_id
+
+  route {
+    cidr_block = "0.0.0.0/0"
+    gateway_id = aws_internet_gateway.vpc_gateway.id
+  }
+
+  timeouts {
+    create = "5m"
+    update = "5m"
+  }
+
+  tags = {
+    Name = "${var.prefix}.vpc_route_table"
+  }
+}
diff --git a/cloudci/opentofu/aws/common/outputs.tf b/cloudci/opentofu/aws/common/outputs.tf
new file mode 100644
index 00000000..f066dd5d
--- /dev/null
+++ b/cloudci/opentofu/aws/common/outputs.tf
@@ -0,0 +1,15 @@
+# the whole generated key resource, private key attributes included
+output "ssh_key" {
+  value = tls_private_key.ssh_key
+}
+
+output "aws_key_pair_id" {
+  value = aws_key_pair.keypair.id
+}
+
+output "vpc" {
+  value = {
+    id         = aws_vpc.vpc.id
+    cidr_block = aws_vpc.vpc.cidr_block
+  }
+}
diff --git a/cloudci/opentofu/aws/common/variables.tf b/cloudci/opentofu/aws/common/variables.tf
new file mode 100644
index 00000000..7b628096
--- /dev/null
+++ b/cloudci/opentofu/aws/common/variables.tf
@@ -0,0 +1,18 @@
+variable "prefix" {
+  type = string
+}
+
+# NOTE(review): not referenced by this module's main.tf or outputs.tf - confirm it is still needed
+variable "ssh_key" {
+  default = null
+}
+
+variable "region" {
+  type = string
+}
+
+# address range of the deployment vpc
+variable "vpc_cidr_block" {
+  default = "10.1.0.0/16"
+  type    = string
+}
diff --git a/cloudci/opentofu/aws/isaacsim_runner/ami.tf b/cloudci/opentofu/aws/isaacsim_runner/ami.tf
new file mode 100644
index 00000000..29ab947c
--- /dev/null
+++ b/cloudci/opentofu/aws/isaacsim_runner/ami.tf
@@ -0,0 +1,23 @@
+
+# finds available base image
+data "aws_ami" "ami" {
+  most_recent = true # newest image among those matching the filters below
+
+  filter {
+    name = "name"
+    values = [
+      var.base_ami_name
+    ]
+  }
+
+  filter {
+    name   = "virtualization-type"
+    values = ["hvm"]
+  }
+
+  owners = [
+    "565494100184", # NVIDIA
+    "099720109477", # Canonical
+    "self"          # Customer
+  ]
+}
diff --git a/cloudci/opentofu/aws/isaacsim_runner/main.tf b/cloudci/opentofu/aws/isaacsim_runner/main.tf
new file mode 100644
index 00000000..0288fd14
--- /dev/null
+++ b/cloudci/opentofu/aws/isaacsim_runner/main.tf
@@ -0,0 +1,58 @@
+# query availability zones where
+# we can launch instances of type required
+
+data "aws_ec2_instance_type_offerings" "zones" {
+  filter {
+    name   = "instance-type"
+    values = [var.instance_type]
+  }
+  location_type = "availability-zone"
+}
+
+# create a subnet for the isaac instance
+
+resource "aws_subnet" "subnet" {
+  # get a /24 block from vpc cidr
+  cidr_block = cidrsubnet(var.vpc.cidr_block, 8, 3)
+  # alphabetically first zone offering the instance type; "not-available" when there is none
+  availability_zone       = try(sort(data.aws_ec2_instance_type_offerings.zones.locations)[0], "not-available")
+  vpc_id                  = var.vpc.id
+  map_public_ip_on_launch = true
+
+  tags = {
+    Name = "${var.prefix}.subnet"
+  }
+}
+
+# instance
+resource "aws_instance" "instance" {
+  ami                    = data.aws_ami.ami.id
+  instance_type          = var.instance_type
+  key_name               = var.keypair_id
+  vpc_security_group_ids = [aws_security_group.sg.id]
+  subnet_id              = aws_subnet.subnet.id
+  iam_instance_profile   = var.iam_instance_profile
+
+  root_block_device {
+    volume_type           = "gp3"
+    volume_size           = 256 # GB
+    delete_on_termination = true
+
+    tags = {
+      Name       = "${var.prefix}.root_ebs"
+      Deployment = var.deployment_name
+    }
+  }
+
+  tags = {
+    Name = "${var.prefix}.vm"
+  }
+}
+
+# 
elastic ip
+resource "aws_eip" "eip" {
+  instance = aws_instance.instance.id
+  tags = {
+    Name = "${var.prefix}.eip"
+  }
+}
diff --git a/cloudci/opentofu/aws/isaacsim_runner/outputs.tf b/cloudci/opentofu/aws/isaacsim_runner/outputs.tf
new file mode 100644
index 00000000..74949332
--- /dev/null
+++ b/cloudci/opentofu/aws/isaacsim_runner/outputs.tf
@@ -0,0 +1,8 @@
+
+output "public_ip" {
+  value = aws_eip.eip.public_ip
+}
+
+output "vm_id" {
+  value = aws_instance.instance.id
+}
diff --git a/cloudci/opentofu/aws/isaacsim_runner/security.tf b/cloudci/opentofu/aws/isaacsim_runner/security.tf
new file mode 100644
index 00000000..9d5e4b54
--- /dev/null
+++ b/cloudci/opentofu/aws/isaacsim_runner/security.tf
@@ -0,0 +1,72 @@
+# security group for isaac
+#
+# all rules are declared inline: combining inline rules with separate
+# aws_security_group_rule resources on the same group makes the two overwrite
+# each other, so the optional custom ssh port is a dynamic inline block as well
+resource "aws_security_group" "sg" {
+  name   = "${var.prefix}.sg"
+  vpc_id = var.vpc.id
+
+  tags = {
+    Name = "${var.prefix}.sg"
+  }
+
+  # ssh
+  ingress {
+    from_port   = 22
+    to_port     = 22
+    protocol    = "tcp"
+    cidr_blocks = var.ingress_cidrs
+  }
+
+  # custom ssh port, only when it differs from the standard one above
+  dynamic "ingress" {
+    for_each = var.ssh_port != 22 ? [var.ssh_port] : []
+
+    content {
+      from_port   = ingress.value
+      to_port     = ingress.value
+      protocol    = "tcp"
+      cidr_blocks = var.ingress_cidrs
+    }
+  }
+
+  # nomachine
+  ingress {
+    from_port   = 4000
+    to_port     = 4000
+    protocol    = "tcp"
+    cidr_blocks = var.ingress_cidrs
+  }
+  ingress {
+    from_port   = 4000
+    to_port     = 4000
+    protocol    = "udp"
+    cidr_blocks = var.ingress_cidrs
+  }
+
+  # vnc
+  ingress {
+    from_port   = 5900
+    to_port     = 5900
+    protocol    = "tcp"
+    cidr_blocks = var.ingress_cidrs
+  }
+
+  # novnc
+  ingress {
+    from_port   = 6080
+    to_port     = 6080
+    protocol    = "tcp"
+    cidr_blocks = var.ingress_cidrs
+  }
+
+  # allow outbound traffic
+  egress {
+    from_port        = 0
+    to_port          = 0
+    protocol         = "-1"
+    cidr_blocks      = ["0.0.0.0/0"]
+    ipv6_cidr_blocks = ["::/0"]
+  }
+}
diff --git a/cloudci/opentofu/aws/isaacsim_runner/variables.tf b/cloudci/opentofu/aws/isaacsim_runner/variables.tf
new file mode 100644
index 00000000..ddfb06bd
--- /dev/null
+++ b/cloudci/opentofu/aws/isaacsim_runner/variables.tf
@@ -0,0 +1,53 @@
+variable "prefix" {
+  type = string
+}
+
+variable "keypair_id" {
+  type = string
+}
+
+variable "instance_type" {
+  type = string
+}
+
+variable "region" {
+  type = string
+}
+
+# NOTE(review): not referenced by any resource visible in this module - confirm it is still needed
+variable "from_image" {
+  default = true
+  type    = bool
+}
+
+variable "vpc" {
+  type = object({
+    id         = string,
+    cidr_block = string,
+  })
+}
+
+variable "iam_instance_profile" {
+  default = null
+  type    = string
+}
+
+variable "deployment_name" {
+  type = string
+}
+
+# name pattern of the base image looked up in ami.tf
+variable "base_ami_name" {
+  default = "ubuntu/images/hvm-ssd/ubuntu-*-22.04-amd64-server-*"
+  type    = string
+}
+
+variable "ssh_port" {
+  type = number
+}
+
+# source ranges allowed to reach the instance; for general use ["0.0.0.0/0"] is ok,
+# narrower ranges may be helpful for accounts with stricter security policies
+variable "ingress_cidrs" {
+  type = list(string)
+}
diff --git a/cloudci/opentofu/aws/main.tf b/cloudci/opentofu/aws/main.tf
new file mode 100644
index 00000000..3b55d322
--- /dev/null
+++ b/cloudci/opentofu/aws/main.tf
@@ -0,0 +1,52 @@
+terraform {
+  required_version = ">= 1.3.5"
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 4.41"
+    }
+  }
+}
+
+# credentials are supplied through variables
+provider "aws" {
+  region = var.region
+
+  access_key = var.aws_access_key_id
+  secret_key = var.aws_secret_access_key
+  token      = var.aws_session_token
+
+  # provider-level default tag carrying the deployment name
+  default_tags {
+    tags = {
+      Deployment = var.deployment_name
+    }
+  }
+}
+
+# shared key pair and network
+module "common" {
+  source = "./common"
+  prefix = "${var.prefix}.${var.deployment_name}"
+  region = var.region
+}
+
+# optional runner vm, created only when isaacsim_runner_enabled is true
+module "isaacsim_runner" {
+  source          = "./isaacsim_runner"
+  prefix          = "${var.prefix}.${var.deployment_name}.isaacsim_runner"
+  count           = var.isaacsim_runner_enabled ? 1 : 0
+  keypair_id      = module.common.aws_key_pair_id
+  instance_type   = var.isaacsim_runner_instance_type
+  region          = var.region
+  ssh_port        = var.ssh_port
+  deployment_name = var.deployment_name
+  ingress_cidrs   = var.ingress_cidrs
+
+  iam_instance_profile = null
+
+  vpc = {
+    id         = module.common.vpc.id
+    cidr_block = module.common.vpc.cidr_block
+  }
+}
diff --git a/cloudci/opentofu/aws/outputs.tf b/cloudci/opentofu/aws/outputs.tf
new file mode 100644
index 00000000..3e3b04f9
--- /dev/null
+++ b/cloudci/opentofu/aws/outputs.tf
@@ -0,0 +1,20 @@
+
+# private half of the generated ssh key, pem-encoded
+output "ssh_key" {
+  value     = module.common.ssh_key.private_key_pem
+  sensitive = true
+}
+
+output "cloud" {
+  value = "aws"
+}
+
+# both runner outputs are "NA" when the runner module is disabled
+output "isaacsim_runner_ip" {
+  value = try(module.isaacsim_runner[0].public_ip, "NA")
+}
+
+output "isaacsim_runner_vm_id" {
+  value = try(module.isaacsim_runner[0].vm_id, "NA")
+}
+
diff --git a/cloudci/opentofu/aws/variables.tf b/cloudci/opentofu/aws/variables.tf
new file mode 100644
index 00000000..a80a1af9
--- /dev/null
+++ b/cloudci/opentofu/aws/variables.tf
@@ -0,0 +1,44 @@
+# prefix for created resources and tags
+# full name looks like {prefix}.{deployment_name}.{resource}
+variable "prefix" {
+  default = "isaaclabarena-cloudci"
+  type    = string
+}
+
+variable "deployment_name" {
+  type = string
+}
+
+variable "region" {
+  type = string
+}
+
+variable "aws_access_key_id" {
+  type = string
+}
+
+variable "aws_secret_access_key" {
+  type = string
+}
+
+variable "aws_session_token" {
+  type    = string
+  default = ""
+}
+
+variable "isaacsim_runner_enabled" {
+  type = bool
+}
+
+variable "isaacsim_runner_instance_type" {
+  type = string
+}
+
+variable "ssh_port" {
+  default = 22
+  type    = number # same type the isaacsim_runner module declares for its ssh_port
+}
+
+variable "ingress_cidrs" {
+  type = list(string)
+}
diff --git a/cloudci/run b/cloudci/run
new file mode 100755
index 00000000..c2e38c2a
--- /dev/null
+++ b/cloudci/run
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# Runs a command (an interactive bash by default) in the cloudci container,
+# with this directory bind-mounted at /cloudci.
+
+IMAGE="isaaclabarena-cloudci"
+SELF_DIR="$(realpath "$(dirname "${BASH_SOURCE[0]}")")"
+
+# build image if it doesn't exist; stop here if the build fails
+if [[ -z "$(docker images -q "${IMAGE}" 2> /dev/null)" ]]; then
+    "${SELF_DIR}/build" || exit 1
+fi
+
+# on mac, enable x86 emulation
+platform_args=()
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    platform_args=(--platform linux/x86_64)
+fi
+
+# allocate a tty only when attached to one, so non-interactive callers work too
+tty_args=(-i)
+if [[ -t 0 && -t 1 ]]; then
+    tty_args+=(-t)
+fi
+
+# arguments are joined into one string: the image entrypoint is "sh -c"
+docker run "${platform_args[@]}" "${tty_args[@]}" --rm \
+    -v "${SELF_DIR}":/cloudci "${IMAGE}" "${*:-bash}"