Skip to content

feat: add support to backup nomad raft snapshot to s3 #25

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion modules/nomad-servers/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ locals {
nomad_acl_bootstrap_token = var.nomad_acl_bootstrap_token
nomad_acl_enable = var.nomad_acl_enable
nomad_file_limit = var.nomad_file_limit
nomad_dc = var.cluster_name
nomad_raft_backup_bucket = var.nomad_raft_backup_bucket

nomad_server_cfg = templatefile("${path.module}/templates/nomad.tftpl", {
nomad_dc = var.cluster_name
aws_region = var.aws_region
Expand All @@ -14,4 +17,4 @@ locals {
})
nomad_file_limit = var.nomad_file_limit
})
}
}
130 changes: 129 additions & 1 deletion modules/nomad-servers/scripts/setup_server.tftpl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ set_hostname() {

# Increase the file limit
modify_nomad_systemd_config() {
  # Raises LimitNOFILE in the packaged nomad.service unit when the configured
  # limit is above 65536. The nomad_file_limit placeholder is substituted by
  # Terraform when this template is rendered, which is why it also works inside
  # the single-quoted sed expression below.
  #
  # The comparison must use -gt: a bare ">" inside "[ ]" is parsed by the shell
  # as an output redirection (it creates a file named 65536 and the test always
  # succeeds) instead of a numeric "greater than".
  if [ "${nomad_file_limit}" -gt "65536" ]; then
    sudo sed -i '/^LimitNOFILE/s/=.*$/=${nomad_file_limit}/' /lib/systemd/system/nomad.service
  fi
}
Expand Down Expand Up @@ -148,6 +148,127 @@ bootstrap_acl() {
fi
}

# Sets up the backup script and systemd timer for Nomad state backups
setup_state_backup() {
  # Installs a leader-only Nomad Raft snapshot backup:
  #   * /usr/local/bin/nomad-backup.sh        - takes a snapshot and uploads it to S3
  #   * /etc/systemd/system/nomad-backup.*    - oneshot service + twice-daily timer
  # and then enables and starts the timer.
  #
  # This file is a Terraform template: the bucket, datacenter and ACL
  # placeholders are filled in at render time. The heredocs are unquoted, so
  # every expansion that must survive into the generated script is written with
  # a backslash-escaped dollar sign.
  log "INFO" "Setting up Nomad state backup to S3"

  # Create the script owner-only *before* writing its content: when ACLs are
  # enabled the rendered script contains the bootstrap token, so it must never
  # be readable by other local users. The redirect below keeps this mode.
  : >/usr/local/bin/nomad-backup.sh
  chmod 0700 /usr/local/bin/nomad-backup.sh

  # Create backup script
  cat <<EOF >/usr/local/bin/nomad-backup.sh
#!/usr/bin/env bash

set -euo pipefail

BACKUP_FILE="nomad-snapshot-\$(date +%Y%m%d-%H%M%S).snap"
S3_BUCKET="${nomad_raft_backup_bucket}"
CLUSTER_NAME="${nomad_dc}"
LOG_FILE="/var/log/nomad-backup.log"
TEMP_DIR=""
%{ if nomad_acl_enable }
# Exported once so that every nomad CLI call below is authenticated.
export NOMAD_TOKEN="${nomad_acl_bootstrap_token}"
%{ endif }

# Log to the file and console
log() {
  echo "\$(date +"%Y-%m-%d %H:%M:%S") [\$1] \$2" | tee -a "\$LOG_FILE"
}

# Remove the temporary working directory, if one was created.
# Also runs from the EXIT trap so a failed snapshot/upload does not leak it.
cleanup() {
  if [ -n "\$TEMP_DIR" ]; then
    rm -rf -- "\$TEMP_DIR"
  fi
}
trap cleanup EXIT

# Check if this node is the leader
is_leader() {
  # No "grep -q" here: with pipefail an early grep exit could SIGPIPE the
  # nomad process and turn a positive match into a failed pipeline.
  LEADER_CHECK=\$(nomad agent-info | grep "leader = true" || echo "")

  if [ -n "\$LEADER_CHECK" ]; then
    return 0
  else
    return 1
  fi
}

# Main backup function
perform_backup() {
  log "INFO" "Starting Nomad state backup"

  # Only the leader takes the snapshot so the cluster uploads one copy per run
  if ! is_leader; then
    log "INFO" "This node is not the leader, skipping backup"
    exit 0
  fi

  log "INFO" "This node is the leader, performing backup"

  # Create temp directory
  TEMP_DIR=\$(mktemp -d)
  cd "\$TEMP_DIR"

  # Create snapshot
  log "INFO" "Creating Nomad snapshot"
  nomad operator snapshot save "\$BACKUP_FILE"

  # Compress the snapshot
  log "INFO" "Compressing snapshot"
  gzip "\$BACKUP_FILE"

  # Upload to S3
  log "INFO" "Uploading snapshot to S3"
  aws s3 cp "\$BACKUP_FILE.gz" "s3://\$S3_BUCKET/\$CLUSTER_NAME/\$BACKUP_FILE.gz"

  # Clean up
  log "INFO" "Cleaning up temporary files"
  cleanup

  log "INFO" "Backup completed successfully"
}

# Execute the backup
perform_backup
EOF

  # Create systemd timer unit.
  # The timer deliberately has no Requires= on the service: a timer activates
  # the service of the same name on schedule by default, whereas Requires=
  # would also start the backup every time the timer itself is started
  # (i.e. during provisioning and on every boot).
  cat <<EOF >/etc/systemd/system/nomad-backup.timer
[Unit]
Description=Run Nomad backup twice daily

[Timer]
OnCalendar=*-*-* 00,12:00:00
Persistent=true

[Install]
WantedBy=timers.target
EOF

  # Create systemd service unit
  cat <<EOF >/etc/systemd/system/nomad-backup.service
[Unit]
Description=Nomad State Backup Service
After=nomad.service

[Service]
Type=oneshot
ExecStart=/usr/local/bin/nomad-backup.sh
User=root

[Install]
WantedBy=multi-user.target
EOF

  # Enable and start the timer
  systemctl daemon-reload
  systemctl enable nomad-backup.timer
  systemctl start nomad-backup.timer

  log "INFO" "Nomad state backup has been configured successfully"
}

log "INFO" "Fetching EC2 Tags from AWS"
store_tags

Expand All @@ -173,6 +294,13 @@ bootstrap_acl
log "INFO" "Skipping ACL Bootstrap for Nomad as 'nomad_acl_enable' is not set to true"
%{ endif }

# Raft snapshot backups are optional: the setup function is only rendered into
# the script when a non-empty backup bucket name was passed to the template;
# otherwise only the "skipping" log line is emitted.
%{ if nomad_raft_backup_bucket != "" }
log "INFO" "Setting up state backup to S3"
setup_state_backup
%{else}
log "INFO" "Skipping state backup setup as 'nomad_raft_backup_bucket' is not defined"
%{ endif }

log "INFO" "Restarting services"
restart_nomad

Expand Down
6 changes: 6 additions & 0 deletions modules/nomad-servers/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ variable "nomad_join_tag_value" {
nullable = false
}

# Name of the S3 bucket that receives Nomad Raft snapshots.
# The default empty string means "backups disabled": the server setup template
# only installs the backup job when this value is not "".
variable "nomad_raft_backup_bucket" {
  description = "The S3 bucket to use for backing up Nomad RAFT snapshot"
  type        = string
  default     = ""
  # An explicit null would pass the template's not-empty check and then fail
  # when interpolated; with nullable = false a null falls back to the default.
  nullable = false
}

variable "nomad_server_incoming_ips" {
description = "List of IPs to allow incoming connections from to Nomad server ALBs"
type = list(string)
Expand Down