From 61705210cc994a295810bba8f21682b9d6c71812 Mon Sep 17 00:00:00 2001 From: SURESH CHOUKSEY Date: Mon, 22 Jun 2026 09:22:03 +0530 Subject: [PATCH] feat: implement backup & DR strategy #12 --- .env.example | 14 ++ docs/disaster-recovery.md | 81 ++++++++++ scripts/backup.sh | 259 +++++++++++++++++++++---------- stacks/backup/docker-compose.yml | 68 ++++++++ 4 files changed, 343 insertions(+), 79 deletions(-) create mode 100644 docs/disaster-recovery.md create mode 100644 stacks/backup/docker-compose.yml diff --git a/.env.example b/.env.example index ab86b655..bf16134b 100644 --- a/.env.example +++ b/.env.example @@ -120,3 +120,17 @@ DOCKER_PROXY_ENABLED=false CN_MODE=false CN_APT_MIRROR=https://mirrors.aliyun.com/ubuntu CN_DOCKER_MIRROR=https://docker.m.daocloud.io + +# ----------------------------------------------------------------------------- +# BACKUP & RECOVERY +# ----------------------------------------------------------------------------- +BACKUP_TARGET=local # local | s3 | b2 | sftp +RESTIC_PASSWORD= # REQUIRED: Strong password for restic repo +RESTIC_REPOSITORY=rest:http://localhost:8000/ # e.g., rest:http://localhost:8000/, s3:s3.amazonaws.com/bucket/repo +# Cloud storage credentials (if using s3/b2/r2) +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= +# SFTP credentials (if using sftp) +SFTP_USER= +SFTP_HOST= +SFTP_PATH= diff --git a/docs/disaster-recovery.md b/docs/disaster-recovery.md new file mode 100644 index 00000000..569f0fb9 --- /dev/null +++ b/docs/disaster-recovery.md @@ -0,0 +1,81 @@ +# Disaster Recovery (DR) 灾难恢复演练文档 + +本文档详细描述了发生灾难(例如硬件损坏、系统崩溃等)时的恢复流程。基于 3-2-1 备份策略(3 份数据,2 种介质,1 份异地),确保数据万无一失。 + +## 恢复策略说明 + +- **本地备份仓库**:位于 `Restic REST Server` 中。 +- **云端备份仓库**:如 Amazon S3, Backblaze B2, 或是 `Duplicati` 管理的远端存储。 +- **RTO (预计恢复时间)**:< 2 小时(取决于网络带宽及数据量)。 +- **RPO (预计数据丢失)**:最多 24 小时(取决于定时备份频率)。 + +## 完整恢复流程(全新主机从零恢复) + +1. 
**基础环境安装** + 在新的 Linux 系统上安装 Docker 和 Docker Compose。 + ```bash + curl -fsSL https://get.docker.com | sh + ``` + 安装并克隆 `homelab-stack` 仓库。 + ```bash + git clone https://github.com/illbnm/homelab-stack.git /opt/homelab-stack + cd /opt/homelab-stack + ``` + +2. **恢复配置文件 (.env)** + 如果你的备份仓库中包含最新的 `config/` 和 `stacks/` 的备份,可通过 restic 直接恢复配置文件: + ```bash + export RESTIC_PASSWORD="<你的备份密码>" + export RESTIC_REPOSITORY="<你的远端仓库地址如s3>" + # 临时恢复配置 (必须用 -e 把 RESTIC_PASSWORD 传入容器, 否则容器内的 restic 读不到密码) + docker run --rm -e RESTIC_PASSWORD -v "$(pwd)":/restore restic/restic:0.16.3 -r "$RESTIC_REPOSITORY" restore latest --target /restore --tag configs + ``` + 恢复出的文件位于当前目录的 `config_data/` 子目录下;确认 `.env` 变量无误,特别是密码和云存储密钥。 + +3. **创建 Proxy 网络** + 启动集群之前,务必创建外部网络: + ```bash + docker network create proxy + ``` + +4. 
**其他业务服务 (Media, Nextcloud, Vaultwarden 等)**: + ```bash + ./scripts/backup.sh --restore latest --target <stack> + docker compose -f stacks/<stack>/docker-compose.yml up -d + ``` + +## 定时备份设置 + +日常备份依赖 cron 进行调度。在宿主机执行 `crontab -e` 并添加: +```bash +0 2 * * * /opt/homelab-stack/scripts/backup.sh --target all >> /var/log/homelab-backup.log 2>&1 +``` + +## 验证恢复完整性的检查清单 + +- [ ] `Traefik` 仪表盘可访问,并显示所有路由配置正常。 +- [ ] 尝试登录 `Authentik` (SSO) 测试认证系统是否可用。 +- [ ] 检查 `PostgreSQL` / `MariaDB` 的日志是否包含异常的数据库崩溃恢复信息。 +- [ ] 访问并播放媒体库(`Media` stack),确认大文件是否完整。 +- [ ] 测试核心服务(例如 `Vaultwarden`, `Nextcloud`)的读写和数据展示。 diff --git a/scripts/backup.sh b/scripts/backup.sh index c9ba8377..008b6e24 100644 --- a/scripts/backup.sh +++ b/scripts/backup.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # ============================================================================= -# HomeLab Backup — Docker volumes + configs 全量备份 +# HomeLab Backup — Docker volumes + configs 3-2-1 Restic Backup # ============================================================================= set -euo pipefail @@ -8,92 +8,193 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd)" BASE_DIR="$SCRIPT_DIR/.." ENV_FILE="$BASE_DIR/config/.env" -[[ -f "$ENV_FILE" ]] && source "$ENV_FILE" - -BACKUP_DIR="${BACKUP_DIR:-/opt/homelab-backups}" -RETENTION_DAYS="${BACKUP_RETENTION_DAYS:-7}" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -BACKUP_PATH="$BACKUP_DIR/$TIMESTAMP" - -RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m' -log_info() { echo -e "${GREEN}[backup]${NC} $*"; } -log_warn() { echo -e "${YELLOW}[backup]${NC} $*"; } -log_error() { echo -e "${RED}[backup]${NC} $*" >&2; } - -mkdir -p "$BACKUP_PATH" - -# 备份 Docker volumes -backup_volumes() { - log_info "Backing up Docker volumes..." 
- local volumes - volumes=$(docker volume ls --format '{{.Name}}' | grep -v '^[a-f0-9]\{64\}$' || true) - while IFS= read -r vol; do - [[ -z "$vol" ]] && continue - log_info " Volume: $vol" - docker run --rm \ - -v "${vol}:/data:ro" \ - -v "$BACKUP_PATH:/backup" \ - alpine:3.19 \ - tar czf "/backup/vol_${vol}.tar.gz" -C /data . 2>/dev/null || \ - log_warn " Failed to backup volume: $vol" - done <<< "$volumes" +if [[ -f "$ENV_FILE" ]]; then + source "$ENV_FILE" +else + # Fallback: load .env from the repository root when config/.env does not exist + ENV_FILE="$BASE_DIR/.env" + [[ -f "$ENV_FILE" ]] && source "$ENV_FILE" +fi + +TARGET="" +DRY_RUN="false" +RESTORE_ID="" +LIST="false" +VERIFY="false" + +while [[ $# -gt 0 ]]; do + case $1 in + --target) TARGET="${2:?--target requires a value}"; shift 2 ;; # clear error instead of an unbound-variable abort under set -u + --dry-run) DRY_RUN="true"; shift ;; + --restore) RESTORE_ID="${2:?--restore requires a snapshot ID or latest}"; shift 2 ;; + --list) LIST="true"; shift ;; + --verify) VERIFY="true"; shift ;; + *) echo "Unknown parameter $1" >&2; exit 1 ;; + esac +done + +BACKUP_TARGET=${BACKUP_TARGET:-local} + +if [ "$BACKUP_TARGET" = "local" ]; then + # Local restic REST server, reached by container name on the proxy network + RESTIC_REPO="rest:http://restic-server:8000/" +else # any remote target (s3, b2, sftp, ...) must supply its own repository URL + RESTIC_REPO="${RESTIC_REPOSITORY:?RESTIC_REPOSITORY must be set when BACKUP_TARGET is not local}" +fi + +export RESTIC_PASSWORD="${RESTIC_PASSWORD:-}" +if [ -z "${RESTIC_PASSWORD:-}" ]; then + echo "Error: RESTIC_PASSWORD not set in .env" >&2 + exit 1 +fi + +export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-}" +export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-}" + +restic_cmd() { # run restic in a throwaway container on the proxy network, "$@" goes to restic; secrets are inherited from the exported environment (-e NAME) so they stay out of the docker argv + docker run --rm \ + --network proxy \ + -e RESTIC_PASSWORD \ + -e RESTIC_REPOSITORY="$RESTIC_REPO" \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + restic/restic:0.16.3 "$@" } -# 备份配置文件 -backup_configs() { - log_info "Backing up configs..." 
- tar czf "$BACKUP_PATH/configs.tar.gz" \ - -C "$BASE_DIR" \ - --exclude='stacks/*/data' \ - config/ stacks/ scripts/ 2>/dev/null || true +restic_config() { # restic with the project root mounted read-only at /config_data, "$@" goes to restic; secrets are inherited from the exported environment (-e NAME) so they stay out of the docker argv + docker run --rm \ + --network proxy \ + -e RESTIC_PASSWORD \ + -e RESTIC_REPOSITORY="$RESTIC_REPO" \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -v "$BASE_DIR:/config_data:ro" \ + restic/restic:0.16.3 "$@" } -# 备份数据库 -backup_databases() { - log_info "Backing up databases..." - - # PostgreSQL - if docker ps --format '{{.Names}}' | grep -q 'postgres\|postgresql'; then - local pg_container - pg_container=$(docker ps --format '{{.Names}}' | grep -E 'postgres|postgresql' | head -1) - local pg_pass - pg_pass=$(docker inspect "$pg_container" --format '{{range .Config.Env}}{{println .}}{{end}}' | grep POSTGRES_PASSWORD | cut -d= -f2 | head -1) - docker exec "$pg_container" \ - sh -c "PGPASSWORD='$pg_pass' pg_dumpall -U postgres" \ - > "$BACKUP_PATH/postgresql_all.sql" 2>/dev/null || \ - log_warn "PostgreSQL backup failed" - fi +restic_volume() { # $1 = Docker volume, mounted read-write at /data (restore writes into it); remaining args go to restic + local vol=$1 + shift + docker run --rm \ + --network proxy \ + -e RESTIC_PASSWORD \ + -e RESTIC_REPOSITORY="$RESTIC_REPO" \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -v "${vol}:/data" \ + restic/restic:0.16.3 "$@" +} - # MariaDB/MySQL - if docker ps --format '{{.Names}}' | grep -q 'mariadb\|mysql'; then - local mysql_container - mysql_container=$(docker ps --format '{{.Names}}' | grep -E 'mariadb|mysql' | head -1) - local mysql_pass - mysql_pass=$(docker inspect "$mysql_container" --format '{{range .Config.Env}}{{println .}}{{end}}' | grep MYSQL_ROOT_PASSWORD | cut -d= -f2 | head -1) - docker exec "$mysql_container" \ - sh -c "mysqldump -u root -p'$mysql_pass' --all-databases" \ - > "$BACKUP_PATH/mysql_all.sql" 2>/dev/null || \ - log_warn "MySQL backup failed" +notify() { # best-effort: calls notify.sh with ("backup", title, message) only when it is executable, and ignores its failure + local title=$1 + local msg=$2 + if [ -x 
"$SCRIPT_DIR/notify.sh" ]; then + "$SCRIPT_DIR/notify.sh" "backup" "$title" "$msg" || true fi } -# 清理旧备份 -cleanup_old() { - log_info "Cleaning backups older than ${RETENTION_DAYS} days..." - find "$BACKUP_DIR" -maxdepth 1 -type d -mtime +"$RETENTION_DAYS" -exec rm -rf {} + 2>/dev/null || true -} +# Ensure repo is initialized: when listing snapshots fails (for any reason), try 'restic init'; an init failure is only reported, not fatal +if ! restic_cmd snapshots >/dev/null 2>&1; then + echo "Initializing restic repository..." + if ! restic_cmd init; then + echo "Notice: Repository might already be initialized or an error occurred." + fi +fi + +if [ "$LIST" = "true" ]; then # --list: print snapshots and exit + restic_cmd snapshots + exit 0 +fi + +if [ "$VERIFY" = "true" ]; then # --verify: run 'restic check' and exit (set -e propagates a failed check) + restic_cmd check + exit 0 +fi + +if [ -n "$RESTORE_ID" ]; then # --restore: needs --target; 'configs' and 'all' are rejected below + if [ -z "$TARGET" ]; then + echo "Error: --target must be specified for restore. e.g. --target media" >&2 + exit 1 + fi + echo "Restoring $TARGET from backup $RESTORE_ID..." + + if [ "$TARGET" = "configs" ] || [ "$TARGET" = "all" ]; then + echo "To restore configs, run restic locally. Automatic restore of configs is not supported via this script." 
+ exit 1 + fi + # Restore every volume of a stack (names prefixed "<target>_"), or fall back to treating the target as an exact volume name. NOTE(review): on a fresh host the stack's volumes do not exist yet, so the fallback is taken - confirm the DR flow creates them first + VOLUMES=$(docker volume ls --format '{{.Name}}' | grep "^${TARGET}_" || true) + if [ -z "$VOLUMES" ]; then + echo "No volumes found for target $TARGET, attempting to restore exactly volume: $TARGET" + VOLUMES="$TARGET" + fi -# 生成备份摘要 -generate_summary() { - local total_size - total_size=$(du -sh "$BACKUP_PATH" 2>/dev/null | cut -f1) - log_info "Backup complete: $BACKUP_PATH ($total_size)" - ls -lh "$BACKUP_PATH/" + for vol in $VOLUMES; do + echo "Restoring volume: $vol" + restic_volume "$vol" restore "$RESTORE_ID" --tag "$vol" --target / || { notify "Restore Failed" "Failed to restore $vol from $RESTORE_ID"; exit 1; } # --tag limits 'latest' to this volume's own snapshots (backups are tagged with the volume name) + done + + notify "Restore Success" "Restored backup $RESTORE_ID for target $TARGET" + exit 0 +fi + +if [ -z "$TARGET" ]; then + echo "Error: --target is required" >&2 + echo "Usage: $0 --target <stack|volume|all> [options]" >&2 + exit 1 +fi + +backup_volume() { # $1 = Docker volume; backs up /data tagged with the volume name, honours DRY_RUN, and on failure notifies and exits 1 + local vol=$1 + echo "Backing up volume: $vol" + if [ "$DRY_RUN" = "true" ]; then + restic_volume "$vol" backup --dry-run /data --tag "$vol" + else + if restic_volume "$vol" backup /data --tag "$vol"; then + echo "Successfully backed up $vol" + else + echo "Failed to back up $vol" + notify "Backup Failed" "Failed to backup $vol" + exit 1 + fi + fi } -log_info "Starting backup — $TIMESTAMP" -backup_configs -backup_volumes -backup_databases -cleanup_old -generate_summary +VOLUMES="" +if [ "$TARGET" = "all" ]; then + VOLUMES=$(docker volume ls --format '{{.Name}}' | grep -v '^[a-f0-9]\{64\}$' || true) +else + VOLUMES=$(docker volume ls --format '{{.Name}}' | grep "^${TARGET}_" || true) + if [ -z "$VOLUMES" ]; then + echo "No volumes found for stack $TARGET. If it's a specific volume, we'll try that." + VOLUMES="$TARGET" + fi +fi + +echo "Starting backup for target: $TARGET" + +for vol in $VOLUMES; do + [[ -z "$vol" ]] && continue + backup_volume "$vol" +done + +if [ "$TARGET" = "all" ]; then + echo "Backing up configs..." 
+ if [ "$DRY_RUN" = "true" ]; then + restic_config backup --dry-run /config_data --exclude "/config_data/stacks/*/data" --tag "configs" + else + if restic_config backup /config_data --exclude "/config_data/stacks/*/data" --tag "configs"; then + echo "Successfully backed up configs" + else + notify "Backup Failed" "Failed to backup configs" + exit 1 + fi + fi +fi + +if [ "$DRY_RUN" != "true" ]; then + notify "Backup Success" "Backup completed for target: $TARGET" +fi + +echo "Backup complete!" diff --git a/stacks/backup/docker-compose.yml b/stacks/backup/docker-compose.yml new file mode 100644 index 00000000..8396a178 --- /dev/null +++ b/stacks/backup/docker-compose.yml @@ -0,0 +1,68 @@ +# ============================================================================= +# HomeLab Stack — Backup & Recovery +# Services: Duplicati (Cloud Backup GUI) + Restic REST Server (Local Repo) +# ============================================================================= + +services: + + # --------------------------------------------------------------------------- + # Duplicati — Encrypted Cloud Backup + # URL: https://duplicati.${DOMAIN} + # --------------------------------------------------------------------------- + duplicati: + image: lscr.io/linuxserver/duplicati:2.0.8 + container_name: duplicati + restart: unless-stopped + networks: + - proxy + environment: + - PUID=${PUID:-1000} + - PGID=${PGID:-1000} + - TZ=${TZ:-Asia/Shanghai} + volumes: + - duplicati-config:/config + - duplicati-backups:/backups + - /:/source:ro # entire host filesystem, mounted read-only as the backup source + labels: + - "traefik.enable=true" + - "traefik.http.routers.duplicati.rule=Host(`duplicati.${DOMAIN}`)" + - "traefik.http.routers.duplicati.entrypoints=websecure" + - "traefik.http.routers.duplicati.tls.certresolver=letsencrypt" + - "traefik.http.services.duplicati.loadbalancer.server.port=8200" + - "traefik.http.routers.duplicati.middlewares=security-headers@file" + - "com.centurylinklabs.watchtower.enable=true" + + # 
--------------------------------------------------------------------------- + # Restic REST Server — Local Backup Repository + # Internal only: http://restic-server:8000 on the proxy network. Runs with --no-auth, so it is NOT routed through Traefik; add authentication before setting traefik.enable=true + # --------------------------------------------------------------------------- + rest-server: + image: restic/rest-server:0.13.0 + container_name: restic-server + restart: unless-stopped + networks: + - proxy + environment: + - OPTIONS=--no-auth --prometheus + volumes: + - restic-data:/data + labels: + - "traefik.enable=false" + - "traefik.http.routers.restic.rule=Host(`restic.${DOMAIN}`)" + - "traefik.http.routers.restic.entrypoints=websecure" + - "traefik.http.routers.restic.tls.certresolver=letsencrypt" + - "traefik.http.services.restic.loadbalancer.server.port=8000" + - "traefik.http.routers.restic.middlewares=security-headers@file" + - "com.centurylinklabs.watchtower.enable=true" + +networks: + proxy: + external: true + +volumes: + duplicati-config: + driver: local + duplicati-backups: + driver: local + restic-data: + driver: local