diff --git a/README.md b/README.md
index 9b44b12..1f85d74 100644
--- a/README.md
+++ b/README.md
@@ -356,6 +356,12 @@ export COMPRESS_LEVEL=6
 # Cluster backup timeout in minutes (controls overall cluster operation timeout)
 # Default: 240 (4 hours)
 export CLUSTER_TIMEOUT_MIN=240
+
+# Swap file management (Linux only, requires root)
+# Automatically create temporary swap file for large backups to prevent OOM kills
+export AUTO_SWAP=false  # Enable automatic swap management
+export SWAP_FILE_SIZE_GB=8  # Swap file size in GB (default: 0 = disabled)
+export SWAP_FILE_PATH=/tmp/dbbackup_swap  # Path to temporary swap file
 ```
 
 ## 🏗️ Architecture
@@ -477,6 +483,49 @@ dbbackup cpu
 dbbackup backup single mydb --cpu-workload io-intensive
 ```
 
+#### Out of Memory (OOM) Issues
+
+If backups are being killed by the system with "signal: killed" errors:
+
+```bash
+# Check kernel logs for OOM killer events
+dmesg --ctime | grep -i -E "oom|kill|memory"
+journalctl -k --since "1 hour ago" | grep -i oom
+
+# Check current memory and swap
+free -h
+swapon --show
+
+# Option 1: Enable automatic swap file management (Linux + root only)
+export AUTO_SWAP=true
+export SWAP_FILE_SIZE_GB=8
+sudo dbbackup backup cluster
+
+# Option 2: Manually add swap before backup
+sudo fallocate -l 8G /swapfile
+sudo chmod 600 /swapfile
+sudo mkswap /swapfile
+sudo swapon /swapfile
+# Run your backup
+dbbackup backup cluster
+# Clean up afterwards
+sudo swapoff /swapfile
+sudo rm /swapfile
+
+# Option 3: Reduce memory usage
+export DUMP_JOBS=2  # Fewer parallel jobs
+export COMPRESS_LEVEL=3  # Lower compression
+dbbackup backup cluster
+```
+
+**Note**: Automatic swap management (`AUTO_SWAP=true`) requires:
+- Linux operating system
+- Root privileges (run with `sudo`)
+- Available disk space for the swap file
+- A disk-backed `SWAP_FILE_PATH` (`/tmp` is often a RAM-backed `tmpfs`, which cannot hold a swap file)
+
+The tool will automatically create, enable, and clean up the temporary swap file during cluster backups.
+
 ### Debug Mode
 
 ```bash
diff --git a/dbbackup b/dbbackup
index 8dcdcd9..5c3e809 100755
Binary files a/dbbackup and b/dbbackup differ
diff --git a/internal/backup/engine.go b/internal/backup/engine.go
index a7c0e56..061dfb0 100644
--- a/internal/backup/engine.go
+++ b/internal/backup/engine.go
@@ -18,6 +18,7 @@ import (
 	"dbbackup/internal/database"
 	"dbbackup/internal/logger"
 	"dbbackup/internal/progress"
+	"dbbackup/internal/swap"
 )
 
 // Engine handles backup operations
@@ -252,6 +253,31 @@ func (e *Engine) BackupCluster(ctx context.Context) error {
 
 	operation := e.log.StartOperation("Cluster Backup")
 
+	// Set up a temporary swap file if configured; failure is not fatal
+	var swapMgr *swap.Manager
+	if e.cfg.AutoSwap && e.cfg.SwapFileSizeGB > 0 {
+		swapMgr = swap.NewManager(e.cfg.SwapFilePath, e.cfg.SwapFileSizeGB, e.log)
+
+		if swapMgr.IsSupported() {
+			e.log.Info("Setting up temporary swap file for large backup",
+				"path", e.cfg.SwapFilePath,
+				"size_gb", e.cfg.SwapFileSizeGB)
+
+			if err := swapMgr.Setup(); err != nil {
+				e.log.Warn("Failed to setup swap file (continuing without it)", "error", err)
+			} else {
+				// Cleanup is deferred only after a successful Setup
+				defer func() {
+					if err := swapMgr.Cleanup(); err != nil {
+						e.log.Warn("Failed to cleanup swap file", "error", err)
+					}
+				}()
+			}
+		} else {
+			e.log.Warn("Swap file management not supported on this platform")
+		}
+	}
+
 	// Use a quiet progress indicator to avoid duplicate messages
 	quietProgress := progress.NewQuietLineByLine()
 	quietProgress.Start("Starting cluster backup (all databases)")
diff --git a/internal/config/config.go b/internal/config/config.go
index 0cd7c37..a714002 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -56,6 +56,11 @@ type Config struct {
 	RestoreDBName string
 	// Timeouts (in minutes)
 	ClusterTimeoutMinutes int
+
+	// Swap file management (for large backups)
+	SwapFilePath   string // Path to temporary swap file
+	SwapFileSizeGB int    // Size in GB (0 = disabled)
+	AutoSwap       bool   // Automatically manage swap for large backups
 }
 
 // New creates a new configuration with default values
@@ -134,6 +139,11 @@ func New() *Config {
 
 		// Timeouts
 		ClusterTimeoutMinutes: getEnvInt("CLUSTER_TIMEOUT_MIN", 240),
+
+		// Swap file management
+		SwapFilePath:   getEnvString("SWAP_FILE_PATH", "/tmp/dbbackup_swap"),
+		SwapFileSizeGB: getEnvInt("SWAP_FILE_SIZE_GB", 0), // 0 = disabled by default
+		AutoSwap:       getEnvBool("AUTO_SWAP", false),
 	}
 
 	// Ensure canonical defaults are enforced
diff --git a/internal/swap/swap.go b/internal/swap/swap.go
new file mode 100644
index 0000000..d737951
--- /dev/null
+++ b/internal/swap/swap.go
@@ -0,0 +1,311 @@
+// Package swap manages a temporary Linux swap file that gives
+// memory-hungry backups extra headroom against the kernel OOM killer.
+package swap
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"strings"
+
+	"dbbackup/internal/logger"
+)
+
+// Manager handles temporary swap file creation and cleanup.
+//
+// Cleanup disables the swap file whenever Setup found or made it active,
+// but deletes the file only if this Manager created it.
+type Manager struct {
+	swapPath   string        // path of the swap file, as configured
+	sizeGB     int           // requested size in GB; Setup rejects values <= 0
+	isActive   bool          // Setup found or made swap active at swapPath
+	wasCreated bool          // the file at swapPath was created by this Manager
+	log        logger.Logger // receives progress and warning messages
+}
+
+// NewManager creates a new swap file manager. It performs no I/O; call
+// Setup to create and enable the swap file.
+func NewManager(swapPath string, sizeGB int, log logger.Logger) *Manager {
+	return &Manager{
+		swapPath: swapPath,
+		sizeGB:   sizeGB,
+		log:      log,
+	}
+}
+
+// IsSupported checks if swap file management is supported on this platform.
+func (m *Manager) IsSupported() bool {
+	// Only supported on Linux
+	return runtime.GOOS == "linux"
+}
+
+// NeedsRoot reports whether swap operations would fail for lack of root
+// privileges: true on Linux when the effective UID is not 0.
+func (m *Manager) NeedsRoot() bool {
+	return runtime.GOOS == "linux" && os.Geteuid() != 0
+}
+
+// GetCurrentSwap returns the system-wide swap totals read from
+// /proc/meminfo, converted from kB to MB. A field that is missing or does
+// not parse is reported as 0.
+func (m *Manager) GetCurrentSwap() (totalMB, usedMB, freeMB int64, err error) {
+	data, err := os.ReadFile("/proc/meminfo")
+	if err != nil {
+		return 0, 0, 0, err
+	}
+
+	for _, line := range strings.Split(string(data), "\n") {
+		fields := strings.Fields(line)
+		if len(fields) < 2 {
+			continue
+		}
+
+		switch fields[0] {
+		case "SwapTotal:":
+			if val, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
+				totalMB = val / 1024
+			}
+		case "SwapFree:":
+			if val, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
+				freeMB = val / 1024
+			}
+		}
+	}
+	return totalMB, totalMB - freeMB, freeMB, nil
+}
+
+// IsSwapFileActive checks if our specific swap file is currently active,
+// i.e. listed in /proc/swaps.
+func (m *Manager) IsSwapFileActive() (bool, error) {
+	data, err := os.ReadFile("/proc/swaps")
+	if err != nil {
+		return false, err
+	}
+
+	absPath, err := filepath.Abs(m.swapPath)
+	if err != nil {
+		// Never match on an empty string; use the configured path as-is.
+		absPath = m.swapPath
+	}
+
+	for _, line := range strings.Split(string(data), "\n") {
+		fields := strings.Fields(line)
+		if len(fields) == 0 {
+			continue
+		}
+		// Compare the path column exactly: a substring test would also
+		// match other swap files such as "<path>2" or "<path>.old".
+		if fields[0] == absPath || fields[0] == m.swapPath {
+			return true, nil
+		}
+	}
+	return false, nil
+}
+
+// Setup creates and enables the swap file if needed.
+//
+// It fails when the platform is unsupported, the size is not positive,
+// the process is not root, or any step of creating, formatting or enabling
+// the file fails; a file created by a failed attempt is removed again. If
+// the swap file is already active it is used as-is.
+func (m *Manager) Setup() error {
+	if !m.IsSupported() {
+		return fmt.Errorf("swap file management not supported on %s", runtime.GOOS)
+	}
+
+	if m.sizeGB <= 0 {
+		return fmt.Errorf("swap size must be > 0 GB")
+	}
+
+	if m.NeedsRoot() {
+		// Not logged here: the caller reports the returned error.
+		return fmt.Errorf("swap file operations require root privileges (current euid: %d)", os.Geteuid())
+	}
+
+	m.log.Info("Setting up temporary swap file", "path", m.swapPath, "size_gb", m.sizeGB)
+
+	// If /proc/swaps is unreadable we cannot tell whether an existing file
+	// is live swap, so stop rather than risk deleting it below.
+	active, err := m.IsSwapFileActive()
+	if err != nil {
+		return fmt.Errorf("failed to check active swap files: %w", err)
+	}
+	if active {
+		m.log.Info("Swap file already active", "path", m.swapPath)
+		m.isActive = true
+		return nil
+	}
+
+	if err := m.removeStaleFile(); err != nil {
+		return err
+	}
+
+	// Create swap file directory if needed
+	swapDir := filepath.Dir(m.swapPath)
+	if err := os.MkdirAll(swapDir, 0755); err != nil {
+		return fmt.Errorf("failed to create swap directory: %w", err)
+	}
+
+	if err := m.allocate(); err != nil {
+		m.cleanup()
+		return err
+	}
+
+	// Set correct permissions (600 for security)
+	if err := os.Chmod(m.swapPath, 0600); err != nil {
+		m.cleanup()
+		return fmt.Errorf("failed to set swap file permissions: %w", err)
+	}
+
+	// Format as swap
+	m.log.Info("Formatting swap file")
+	cmd := exec.Command("mkswap", m.swapPath)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		m.cleanup()
+		return fmt.Errorf("failed to format swap file: %w (output: %s)", err, string(output))
+	}
+
+	// Enable swap
+	m.log.Info("Enabling swap file")
+	cmd = exec.Command("swapon", m.swapPath)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		m.cleanup()
+		return fmt.Errorf("failed to enable swap file: %w (output: %s)", err, string(output))
+	}
+
+	m.isActive = true
+
+	// Log current swap status
+	if total, used, free, err := m.GetCurrentSwap(); err == nil {
+		m.log.Info("Swap status after setup",
+			"total_mb", total,
+			"used_mb", used,
+			"free_mb", free,
+			"added_gb", m.sizeGB)
+	}
+
+	return nil
+}
+
+// removeStaleFile deletes an inactive leftover at swapPath so that a fresh
+// swap file can be created. Anything but a regular file is left alone and
+// reported as an error.
+func (m *Manager) removeStaleFile() error {
+	// Lstat, not Stat: a symlink must be seen as such, never followed.
+	info, err := os.Lstat(m.swapPath)
+	if os.IsNotExist(err) {
+		return nil
+	}
+	if err != nil {
+		return fmt.Errorf("failed to inspect existing swap file: %w", err)
+	}
+	if !info.Mode().IsRegular() {
+		return fmt.Errorf("refusing to replace %s: not a regular file", m.swapPath)
+	}
+
+	m.log.Warn("Swap file exists but is not active, removing", "path", m.swapPath)
+	if err := os.Remove(m.swapPath); err != nil {
+		return fmt.Errorf("failed to remove existing swap file: %w", err)
+	}
+	return nil
+}
+
+// allocate creates the swap file and fills it to the configured size,
+// using fallocate and falling back to dd. On error the caller must call
+// cleanup to delete whatever was written.
+func (m *Manager) allocate() error {
+	// O_EXCL refuses a path that already exists, symlinks included, and the
+	// file is owner-only from the start instead of only after a later chmod.
+	f, err := os.OpenFile(m.swapPath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0600)
+	if err != nil {
+		return fmt.Errorf("failed to create swap file: %w", err)
+	}
+	m.wasCreated = true
+	if err := f.Close(); err != nil {
+		return fmt.Errorf("failed to create swap file: %w", err)
+	}
+
+	// Calculate size in bytes
+	sizeBytes := int64(m.sizeGB) * 1024 * 1024 * 1024
+
+	m.log.Info("Creating swap file", "size_bytes", sizeBytes)
+
+	// Use fallocate for fast file creation
+	cmd := exec.Command("fallocate", "-l", strconv.FormatInt(sizeBytes, 10), m.swapPath)
+	output, err := cmd.CombinedOutput()
+	if err == nil {
+		return nil
+	}
+
+	// Fallback to dd if fallocate fails
+	m.log.Warn("fallocate failed, using dd (slower)", "error", err, "output", string(output))
+	cmd = exec.Command("dd", "if=/dev/zero", "of="+m.swapPath, "bs=1M", fmt.Sprintf("count=%d", m.sizeGB*1024))
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to create swap file with dd: %w (output: %s)", err, string(output))
+	}
+	return nil
+}
+
+// cleanup removes a swap file created by this Manager (internal helper
+// for failed Setup attempts).
+func (m *Manager) cleanup() {
+	if !m.wasCreated {
+		return
+	}
+	if err := os.Remove(m.swapPath); err != nil && !os.IsNotExist(err) {
+		m.log.Warn("Failed to remove swap file", "error", err)
+	}
+	m.wasCreated = false
+}
+
+// Cleanup disables and removes the swap file.
+//
+// It is a no-op when Setup did not leave anything behind. The file is
+// deleted only if this Manager created it, and never while it is still
+// active swap: a failed swapoff is returned as an error instead.
+func (m *Manager) Cleanup() error {
+	if !m.isActive && !m.wasCreated {
+		return nil
+	}
+
+	m.log.Info("Cleaning up swap file", "path", m.swapPath)
+
+	active, err := m.IsSwapFileActive()
+	if err != nil {
+		// State unknown: try swapoff anyway rather than leave swap enabled.
+		m.log.Warn("Failed to check swap state, attempting swapoff anyway", "error", err)
+		active = true
+	}
+	if active {
+		m.log.Info("Disabling swap file")
+		cmd := exec.Command("swapoff", m.swapPath)
+		if output, err := cmd.CombinedOutput(); err != nil {
+			return fmt.Errorf("failed to disable swap file: %w (output: %s)", err, string(output))
+		}
+	}
+	m.isActive = false
+
+	// Remove file if we created it
+	if m.wasCreated {
+		m.log.Info("Removing swap file", "path", m.swapPath)
+		if err := os.Remove(m.swapPath); err != nil {
+			return fmt.Errorf("failed to remove swap file: %w", err)
+		}
+		m.wasCreated = false
+	}
+
+	return nil
+}
+
+// IsActive returns whether the swap file is currently active.
+func (m *Manager) IsActive() bool {
+	return m.isActive
+}
+
+// WasCreated returns whether this manager created the swap file.
+func (m *Manager) WasCreated() bool {
+	return m.wasCreated
+}