#!/bin/bash
#
# DISASTER RECOVERY TEST SCRIPT
# Full cluster backup -> destroy all databases -> restore cluster
#
# This script performs the ultimate validation test:
#   1. Backup entire PostgreSQL cluster with maximum performance
#   2. Drop all user databases (destructive!)
#   3. Restore entire cluster from backup
#   4. Verify database count and integrity
#
# Usage: run from the directory that contains the dbbackup binary, as a user
# allowed to `sudo -u postgres`.
#
# Exit status:
#   0  backup, drop and restore completed and the database count matches
#   1  a pre-flight/backup sanity check failed, a command failed, or the
#      restored database count differs from the original count
#

set -euo pipefail

# Colors (stored as real escape bytes so printf '%s' can emit them verbatim)
readonly RED=$'\033[0;31m'
readonly GREEN=$'\033[0;32m'
readonly YELLOW=$'\033[1;33m'
readonly BLUE=$'\033[0;34m'
readonly CYAN=$'\033[0;36m'
readonly NC=$'\033[0m' # No Color

# Configuration
readonly BACKUP_DIR="/var/lib/pgsql/db_backups"
readonly DBBACKUP_BIN="./dbbackup"
readonly DB_USER="postgres"
readonly DB_NAME="postgres"
# Database used for maintenance queries; it is never dropped by this script.
readonly MAINT_DB="postgres"

# Performance settings - use maximum CPU
MAX_CORES=$(nproc)                    # Use all available cores
readonly MAX_CORES
readonly COMPRESSION_LEVEL=3          # Fast compression for large DBs
readonly CPU_WORKLOAD="cpu-intensive" # Maximum CPU utilization
readonly PARALLEL_JOBS=$MAX_CORES     # Maximum parallelization

#######################################
# Print the arguments as one line on stdout, without interpreting
# backslashes (database names are printed through this helper).
#######################################
say() {
  printf '%s\n' "$*"
}

#######################################
# Print an error message on stderr and abort with status 1.
#######################################
die() {
  printf '%s\n' "${RED}ERROR: $*${NC}" >&2
  exit 1
}

#######################################
# Run psql as the PostgreSQL OS user.
# Arguments: passed through to psql unchanged.
#######################################
pg_psql() {
  sudo -u "$DB_USER" psql "$@"
}

#######################################
# Run one SQL statement against the maintenance database and print the
# result unaligned, tuples only (-At); -X skips any psqlrc so the output
# is safe to parse.
# Arguments: $1 - SQL statement
#######################################
pg_query() {
  pg_psql -X -At -d "$MAINT_DB" -c "$1"
}

#######################################
# Print the number of non-template databases in the cluster.
# Counting catalog rows avoids miscounting the wrapped access-privilege
# lines that `psql -l` emits.
#######################################
count_databases() {
  pg_query "SELECT count(*) FROM pg_database WHERE NOT datistemplate;"
}

#######################################
# Print the most recently modified cluster_*.tar.gz in a directory.
# Arguments: $1 - directory to search
# Returns:   1 (printing nothing) when no archive exists
#######################################
find_latest_backup() {
  local dir=$1
  local candidate
  local latest=""
  for candidate in "$dir"/cluster_*.tar.gz; do
    # An unmatched glob stays literal; skip anything that is not a file.
    [[ -f "$candidate" ]] || continue
    if [[ -z "$latest" || "$candidate" -nt "$latest" ]]; then
      latest=$candidate
    fi
  done
  [[ -n "$latest" ]] || return 1
  printf '%s\n' "$latest"
}

say "${CYAN}╔════════════════════════════════════════════════════════╗${NC}"
say "${CYAN}║ DISASTER RECOVERY TEST - FULL CLUSTER VALIDATION ║${NC}"
say "${CYAN}╚════════════════════════════════════════════════════════╝${NC}"
say ""
say "${BLUE}Configuration:${NC}"
say " Backup directory: ${BACKUP_DIR}"
say " Max CPU cores: ${MAX_CORES}"
say " Compression: ${COMPRESSION_LEVEL}"
say " CPU workload: ${CPU_WORKLOAD}"
say " Parallel jobs: ${PARALLEL_JOBS}"
say ""

# Step 0: Pre-flight checks
say "${BLUE}[STEP 0/5]${NC} Pre-flight checks..."

[[ -f "$DBBACKUP_BIN" ]] || die "dbbackup binary not found at $DBBACKUP_BIN"
command -v psql > /dev/null 2>&1 || die "psql not found"
command -v sudo > /dev/null 2>&1 || die "sudo not found"

say "${GREEN}✓${NC} Pre-flight checks passed"
say ""

# Step 1: Save current database list
say "${BLUE}[STEP 1/5]${NC} Documenting current cluster state..."
# mktemp avoids predictable, pre-creatable file names in the temp directory.
PRE_BACKUP_LIST=$(mktemp "${TMPDIR:-/tmp}/pre_disaster_recovery_dblist_$(date +%s)_XXXXXX")
pg_psql -l -t > "$PRE_BACKUP_LIST"
DB_COUNT=$(count_databases)
say "${GREEN}✓${NC} Documented ${DB_COUNT} databases to ${PRE_BACKUP_LIST}"
say ""

# Step 2: Full cluster backup with maximum performance
say "${BLUE}[STEP 2/5]${NC} ${YELLOW}Backing up entire cluster...${NC}"
say "${CYAN}Performance settings: ${MAX_CORES} cores, compression=${COMPRESSION_LEVEL}, workload=${CPU_WORKLOAD}${NC}"
say ""

BACKUP_START=$(date +%s)

sudo -u "$DB_USER" "$DBBACKUP_BIN" backup cluster \
  -d "$DB_NAME" \
  --insecure \
  --compression "$COMPRESSION_LEVEL" \
  --backup-dir "$BACKUP_DIR" \
  --max-cores "$MAX_CORES" \
  --cpu-workload "$CPU_WORKLOAD" \
  --dump-jobs "$PARALLEL_JOBS" \
  --jobs "$PARALLEL_JOBS"

BACKUP_END=$(date +%s)
BACKUP_DURATION=$((BACKUP_END - BACKUP_START))

# Find the most recent cluster backup and make sure it is usable BEFORE
# anything is destroyed: it must exist, be non-empty, and have been written
# by the backup that just ran (not be a stale archive from an earlier run).
BACKUP_FILE=$(find_latest_backup "$BACKUP_DIR") \
  || die "no cluster_*.tar.gz archive found in ${BACKUP_DIR}; refusing to drop databases"
[[ -s "$BACKUP_FILE" ]] \
  || die "backup archive ${BACKUP_FILE} is empty; refusing to drop databases"
BACKUP_MTIME=$(stat -c %Y -- "$BACKUP_FILE")
((BACKUP_MTIME >= BACKUP_START)) \
  || die "newest archive ${BACKUP_FILE} predates this backup run; refusing to drop databases"
BACKUP_SIZE=$(du -h -- "$BACKUP_FILE" | cut -f1)

say ""
say "${GREEN}✓${NC} Cluster backup completed in ${BACKUP_DURATION}s"
say " Archive: ${BACKUP_FILE}"
say " Size: ${BACKUP_SIZE}"
say ""

# Step 3: DESTRUCTIVE - Drop all user databases
say "${BLUE}[STEP 3/5]${NC} ${RED}DESTROYING ALL DATABASES (POINT OF NO RETURN!)${NC}"
say "${YELLOW}Waiting 3 seconds... Press Ctrl+C to abort${NC}"
sleep 3

say "${RED}🔥 DROPPING ALL USER DATABASES...${NC}"

# Get list of all databases except templates and postgres (one name per line)
USER_DBS=$(pg_query "SELECT datname FROM pg_database WHERE datistemplate = false AND datname != 'postgres';")

DROPPED_COUNT=0
FAILED_DROPS=0
DQ='"'
while IFS= read -r db; do
  [[ -n "$db" ]] || continue
  say " Dropping: ${db}"
  # Double any embedded double quote so the name is a valid SQL identifier.
  quoted_db=${db//$DQ/$DQ$DQ}
  # stdin is redirected so the command cannot consume the list being read.
  if pg_psql -X -d "$MAINT_DB" -c "DROP DATABASE IF EXISTS \"${quoted_db}\";" < /dev/null; then
    DROPPED_COUNT=$((DROPPED_COUNT + 1))
  else
    # Best effort: keep going, but do not count or hide the failure.
    FAILED_DROPS=$((FAILED_DROPS + 1))
    say "${YELLOW}⚠️ Failed to drop: ${db}${NC}" >&2
  fi
done <<< "$USER_DBS"

REMAINING_DBS=$(count_databases)
say ""
say "${GREEN}✓${NC} Dropped ${DROPPED_COUNT} databases (${REMAINING_DBS} remaining)"
if ((FAILED_DROPS > 0)); then
  say "${YELLOW}⚠️ ${FAILED_DROPS} database(s) could not be dropped${NC}"
fi
say "${CYAN}Remaining databases:${NC}"
# head may close the pipe early; that must not abort the run under pipefail.
pg_psql -l | head -10 || true
say ""

# Step 4: Restore full cluster
say "${BLUE}[STEP 4/5]${NC} ${YELLOW}RESTORING FULL CLUSTER FROM BACKUP...${NC}"
say ""

RESTORE_START=$(date +%s)

sudo -u "$DB_USER" "$DBBACKUP_BIN" restore cluster \
  "$BACKUP_FILE" \
  --confirm \
  -d "$DB_NAME" \
  --insecure \
  --jobs "$PARALLEL_JOBS"

RESTORE_END=$(date +%s)
RESTORE_DURATION=$((RESTORE_END - RESTORE_START))

say ""
say "${GREEN}✓${NC} Cluster restore completed in ${RESTORE_DURATION}s"
say ""

# Step 5: Verify restoration
say "${BLUE}[STEP 5/5]${NC} Verifying restoration..."

POST_RESTORE_LIST=$(mktemp "${TMPDIR:-/tmp}/post_disaster_recovery_dblist_$(date +%s)_XXXXXX")
pg_psql -l -t > "$POST_RESTORE_LIST"
RESTORED_DB_COUNT=$(count_databases)

say "${CYAN}Restored databases:${NC}"
pg_psql -l

say ""
say "${GREEN}✓${NC} Restored ${RESTORED_DB_COUNT} databases"
say ""

# Check if database counts match
VERIFY_OK=true
if ((RESTORED_DB_COUNT == DB_COUNT)); then
  say "${GREEN}✅ DATABASE COUNT MATCH: ${RESTORED_DB_COUNT}/${DB_COUNT}${NC}"
else
  VERIFY_OK=false
  say "${YELLOW}⚠️ DATABASE COUNT MISMATCH: ${RESTORED_DB_COUNT} restored vs ${DB_COUNT} original${NC}"
fi

# Check largest databases (ordered by on-disk size, largest first)
say ""
say "${CYAN}Largest restored databases:${NC}"
pg_psql -X -d "$MAINT_DB" -c "SELECT datname AS name, pg_size_pretty(pg_database_size(datname)) AS size FROM pg_database WHERE NOT datistemplate ORDER BY pg_database_size(datname) DESC LIMIT 5;"

# Summary
say ""
say "${CYAN}╔════════════════════════════════════════════════════════╗${NC}"
say "${CYAN}║ DISASTER RECOVERY TEST SUMMARY ║${NC}"
say "${CYAN}╚════════════════════════════════════════════════════════╝${NC}"
say ""
say " ${BLUE}Backup:${NC}"
say " - Duration: ${BACKUP_DURATION}s ($((BACKUP_DURATION / 60))m $((BACKUP_DURATION % 60))s)"
say " - File: ${BACKUP_FILE}"
say " - Size: ${BACKUP_SIZE}"
say ""
say " ${BLUE}Restore:${NC}"
say " - Duration: ${RESTORE_DURATION}s ($((RESTORE_DURATION / 60))m $((RESTORE_DURATION % 60))s)"
say " - Databases: ${RESTORED_DB_COUNT}/${DB_COUNT}"
say ""
say " ${BLUE}Performance:${NC}"
say " - CPU cores: ${MAX_CORES}"
say " - Jobs: ${PARALLEL_JOBS}"
say " - Workload: ${CPU_WORKLOAD}"
say ""
say " ${BLUE}Verification:${NC}"
say " - Pre-test: ${PRE_BACKUP_LIST}"
say " - Post-test: ${POST_RESTORE_LIST}"
say ""
TOTAL_DURATION=$((BACKUP_DURATION + RESTORE_DURATION))
if [[ "$VERIFY_OK" == true ]]; then
  say "${GREEN}✅ DISASTER RECOVERY TEST COMPLETED IN ${TOTAL_DURATION}s ($((TOTAL_DURATION / 60))m)${NC}"
  say ""
else
  # A validation run must not report success when verification failed.
  say "${RED}❌ DISASTER RECOVERY TEST FAILED VERIFICATION AFTER ${TOTAL_DURATION}s ($((TOTAL_DURATION / 60))m)${NC}"
  say ""
  exit 1
fi