feat: automatic swap file management for OOM prevention during large backups

This commit is contained in:
2025-11-05 12:10:06 +00:00
parent 28981120a0
commit 934af72940
5 changed files with 317 additions and 0 deletions

View File

@ -356,6 +356,12 @@ export COMPRESS_LEVEL=6
# Cluster backup timeout in minutes (controls overall cluster operation timeout) # Cluster backup timeout in minutes (controls overall cluster operation timeout)
# Default: 240 (4 hours) # Default: 240 (4 hours)
export CLUSTER_TIMEOUT_MIN=240 export CLUSTER_TIMEOUT_MIN=240
# Swap file management (Linux only, requires root)
# Automatically create temporary swap file for large backups to prevent OOM kills
export AUTO_SWAP=false # Enable automatic swap management
export SWAP_FILE_SIZE_GB=8 # Swap file size in GB (0 = disabled)
export SWAP_FILE_PATH=/tmp/dbbackup_swap # Path to temporary swap file
``` ```
## 🏗️ Architecture ## 🏗️ Architecture
@ -477,6 +483,48 @@ dbbackup cpu
dbbackup backup single mydb --cpu-workload io-intensive dbbackup backup single mydb --cpu-workload io-intensive
``` ```
#### Out of Memory (OOM) Issues
If backups are being killed by the system with "signal: killed" errors:
```bash
# Check kernel logs for OOM killer events
dmesg --ctime | grep -i -E "oom|kill|memory"
journalctl -k --since "1 hour ago" | grep -i oom
# Check current memory and swap
free -h
swapon --show
# Option 1: Enable automatic swap file management (Linux + root only)
export AUTO_SWAP=true
export SWAP_FILE_SIZE_GB=8
sudo dbbackup backup cluster
# Option 2: Manually add swap before backup
sudo fallocate -l 8G /swapfile
sudo chmod 600 /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile
# Run your backup
dbbackup backup cluster
# Cleanup after
sudo swapoff /swapfile
sudo rm /swapfile
# Option 3: Reduce memory usage
export DUMP_JOBS=2 # Fewer parallel jobs
export COMPRESS_LEVEL=3 # Lower compression
dbbackup backup cluster
```
**Note**: Automatic swap management (`AUTO_SWAP=true`) requires:
- Linux operating system
- Root privileges (run with `sudo`)
- Available disk space for the swap file
The tool will automatically create, enable, and cleanup the temporary swap file during cluster backups.
### Debug Mode ### Debug Mode
```bash ```bash

BIN
dbbackup

Binary file not shown.

View File

@ -18,6 +18,7 @@ import (
"dbbackup/internal/database" "dbbackup/internal/database"
"dbbackup/internal/logger" "dbbackup/internal/logger"
"dbbackup/internal/progress" "dbbackup/internal/progress"
"dbbackup/internal/swap"
) )
// Engine handles backup operations // Engine handles backup operations
@ -252,6 +253,31 @@ func (e *Engine) BackupCluster(ctx context.Context) error {
operation := e.log.StartOperation("Cluster Backup") operation := e.log.StartOperation("Cluster Backup")
// Setup swap file if configured
var swapMgr *swap.Manager
if e.cfg.AutoSwap && e.cfg.SwapFileSizeGB > 0 {
swapMgr = swap.NewManager(e.cfg.SwapFilePath, e.cfg.SwapFileSizeGB, e.log)
if swapMgr.IsSupported() {
e.log.Info("Setting up temporary swap file for large backup",
"path", e.cfg.SwapFilePath,
"size_gb", e.cfg.SwapFileSizeGB)
if err := swapMgr.Setup(); err != nil {
e.log.Warn("Failed to setup swap file (continuing without it)", "error", err)
} else {
// Ensure cleanup on exit
defer func() {
if err := swapMgr.Cleanup(); err != nil {
e.log.Warn("Failed to cleanup swap file", "error", err)
}
}()
}
} else {
e.log.Warn("Swap file management not supported on this platform", "os", swapMgr)
}
}
// Use a quiet progress indicator to avoid duplicate messages // Use a quiet progress indicator to avoid duplicate messages
quietProgress := progress.NewQuietLineByLine() quietProgress := progress.NewQuietLineByLine()
quietProgress.Start("Starting cluster backup (all databases)") quietProgress.Start("Starting cluster backup (all databases)")

View File

@ -56,6 +56,11 @@ type Config struct {
RestoreDBName string RestoreDBName string
// Timeouts (in minutes) // Timeouts (in minutes)
ClusterTimeoutMinutes int ClusterTimeoutMinutes int
// Swap file management (for large backups)
SwapFilePath string // Path to temporary swap file
SwapFileSizeGB int // Size in GB (0 = disabled)
AutoSwap bool // Automatically manage swap for large backups
} }
// New creates a new configuration with default values // New creates a new configuration with default values
@ -134,6 +139,11 @@ func New() *Config {
// Timeouts // Timeouts
ClusterTimeoutMinutes: getEnvInt("CLUSTER_TIMEOUT_MIN", 240), ClusterTimeoutMinutes: getEnvInt("CLUSTER_TIMEOUT_MIN", 240),
// Swap file management
SwapFilePath: getEnvString("SWAP_FILE_PATH", "/tmp/dbbackup_swap"),
SwapFileSizeGB: getEnvInt("SWAP_FILE_SIZE_GB", 0), // 0 = disabled by default
AutoSwap: getEnvBool("AUTO_SWAP", false),
} }
// Ensure canonical defaults are enforced // Ensure canonical defaults are enforced

233
internal/swap/swap.go Normal file
View File

@ -0,0 +1,233 @@
package swap
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"dbbackup/internal/logger"
)
// Manager handles temporary swap file creation and cleanup
type Manager struct {
swapPath string
sizeGB int
isActive bool
wasCreated bool
log logger.Logger
}
// NewManager creates a new swap file manager
func NewManager(swapPath string, sizeGB int, log logger.Logger) *Manager {
return &Manager{
swapPath: swapPath,
sizeGB: sizeGB,
log: log,
}
}
// IsSupported checks if swap file management is supported on this platform
func (m *Manager) IsSupported() bool {
// Only supported on Linux
return runtime.GOOS == "linux"
}
// NeedsRoot checks if we need root privileges for swap operations
func (m *Manager) NeedsRoot() bool {
// On Linux, swap operations require root
return runtime.GOOS == "linux" && os.Geteuid() != 0
}
// GetCurrentSwap returns current swap usage info
func (m *Manager) GetCurrentSwap() (totalMB, usedMB, freeMB int64, err error) {
// Read /proc/meminfo for swap info
data, err := os.ReadFile("/proc/meminfo")
if err != nil {
return 0, 0, 0, err
}
lines := strings.Split(string(data), "\n")
for _, line := range lines {
fields := strings.Fields(line)
if len(fields) < 2 {
continue
}
switch fields[0] {
case "SwapTotal:":
if val, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
totalMB = val / 1024
}
case "SwapFree:":
if val, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
freeMB = val / 1024
}
}
}
usedMB = totalMB - freeMB
return
}
// IsSwapFileActive checks if our specific swap file is currently active
func (m *Manager) IsSwapFileActive() (bool, error) {
// Read /proc/swaps to see active swap devices
data, err := os.ReadFile("/proc/swaps")
if err != nil {
return false, err
}
absPath, _ := filepath.Abs(m.swapPath)
lines := strings.Split(string(data), "\n")
for _, line := range lines {
if strings.Contains(line, absPath) || strings.Contains(line, m.swapPath) {
return true, nil
}
}
return false, nil
}
// Setup creates and enables the swap file if needed
func (m *Manager) Setup() error {
if !m.IsSupported() {
return fmt.Errorf("swap file management not supported on %s", runtime.GOOS)
}
if m.sizeGB <= 0 {
return fmt.Errorf("swap size must be > 0 GB")
}
if m.NeedsRoot() {
m.log.Warn("Swap file creation requires root privileges, skipping automatic swap setup")
return fmt.Errorf("swap file operations require root privileges (current euid: %d)", os.Geteuid())
}
m.log.Info("Setting up temporary swap file", "path", m.swapPath, "size_gb", m.sizeGB)
// Check if swap file already exists and is active
if active, _ := m.IsSwapFileActive(); active {
m.log.Info("Swap file already active", "path", m.swapPath)
m.isActive = true
return nil
}
// Check if file exists but is not active
if _, err := os.Stat(m.swapPath); err == nil {
m.log.Warn("Swap file exists but is not active, removing", "path", m.swapPath)
if err := os.Remove(m.swapPath); err != nil {
return fmt.Errorf("failed to remove existing swap file: %w", err)
}
}
// Create swap file directory if needed
swapDir := filepath.Dir(m.swapPath)
if err := os.MkdirAll(swapDir, 0755); err != nil {
return fmt.Errorf("failed to create swap directory: %w", err)
}
// Calculate size in bytes
sizeBytes := int64(m.sizeGB) * 1024 * 1024 * 1024
m.log.Info("Creating swap file", "size_bytes", sizeBytes)
// Use fallocate for fast file creation
cmd := exec.Command("fallocate", "-l", fmt.Sprintf("%d", sizeBytes), m.swapPath)
if output, err := cmd.CombinedOutput(); err != nil {
// Fallback to dd if fallocate fails
m.log.Warn("fallocate failed, using dd (slower)", "error", err, "output", string(output))
cmd = exec.Command("dd", "if=/dev/zero", "of="+m.swapPath, "bs=1M", fmt.Sprintf("count=%d", m.sizeGB*1024))
if output, err := cmd.CombinedOutput(); err != nil {
return fmt.Errorf("failed to create swap file with dd: %w (output: %s)", err, string(output))
}
}
m.wasCreated = true
// Set correct permissions (600 for security)
if err := os.Chmod(m.swapPath, 0600); err != nil {
m.cleanup()
return fmt.Errorf("failed to set swap file permissions: %w", err)
}
// Format as swap
m.log.Info("Formatting swap file")
cmd = exec.Command("mkswap", m.swapPath)
if output, err := cmd.CombinedOutput(); err != nil {
m.cleanup()
return fmt.Errorf("failed to format swap file: %w (output: %s)", err, string(output))
}
// Enable swap
m.log.Info("Enabling swap file")
cmd = exec.Command("swapon", m.swapPath)
if output, err := cmd.CombinedOutput(); err != nil {
m.cleanup()
return fmt.Errorf("failed to enable swap file: %w (output: %s)", err, string(output))
}
m.isActive = true
// Log current swap status
if total, used, free, err := m.GetCurrentSwap(); err == nil {
m.log.Info("Swap status after setup",
"total_mb", total,
"used_mb", used,
"free_mb", free,
"added_gb", m.sizeGB)
}
return nil
}
// cleanup removes the swap file (internal helper)
func (m *Manager) cleanup() {
if m.wasCreated {
os.Remove(m.swapPath)
m.wasCreated = false
}
}
// Cleanup disables and removes the swap file
func (m *Manager) Cleanup() error {
if !m.isActive {
return nil
}
m.log.Info("Cleaning up swap file", "path", m.swapPath)
// Check if still active
if active, _ := m.IsSwapFileActive(); active {
// Disable swap
m.log.Info("Disabling swap file")
cmd := exec.Command("swapoff", m.swapPath)
if output, err := cmd.CombinedOutput(); err != nil {
m.log.Warn("Failed to disable swap file", "error", err, "output", string(output))
// Continue anyway
}
}
// Remove file if we created it
if m.wasCreated {
m.log.Info("Removing swap file", "path", m.swapPath)
if err := os.Remove(m.swapPath); err != nil {
m.log.Warn("Failed to remove swap file", "error", err)
return fmt.Errorf("failed to remove swap file: %w", err)
}
m.wasCreated = false
}
m.isActive = false
return nil
}
// IsActive returns whether the swap file is currently active
func (m *Manager) IsActive() bool {
return m.isActive
}
// WasCreated returns whether this manager created the swap file
func (m *Manager) WasCreated() bool {
return m.wasCreated
}