feat: automatic swap file management for OOM prevention during large backups
This commit is contained in:
48
README.md
48
README.md
@ -356,6 +356,12 @@ export COMPRESS_LEVEL=6
|
|||||||
# Cluster backup timeout in minutes (controls overall cluster operation timeout)
|
# Cluster backup timeout in minutes (controls overall cluster operation timeout)
|
||||||
# Default: 240 (4 hours)
|
# Default: 240 (4 hours)
|
||||||
export CLUSTER_TIMEOUT_MIN=240
|
export CLUSTER_TIMEOUT_MIN=240
|
||||||
|
|
||||||
|
# Swap file management (Linux only, requires root)
|
||||||
|
# Automatically create temporary swap file for large backups to prevent OOM kills
|
||||||
|
export AUTO_SWAP=false # Enable automatic swap management
|
||||||
|
export SWAP_FILE_SIZE_GB=8 # Swap file size in GB (0 = disabled)
|
||||||
|
export SWAP_FILE_PATH=/tmp/dbbackup_swap # Path to temporary swap file
|
||||||
```
|
```
|
||||||
|
|
||||||
## 🏗️ Architecture
|
## 🏗️ Architecture
|
||||||
@ -477,6 +483,48 @@ dbbackup cpu
|
|||||||
dbbackup backup single mydb --cpu-workload io-intensive
|
dbbackup backup single mydb --cpu-workload io-intensive
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### Out of Memory (OOM) Issues
|
||||||
|
|
||||||
|
If backups are being killed by the system with "signal: killed" errors:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check kernel logs for OOM killer events
|
||||||
|
dmesg --ctime | grep -i -E "oom|kill|memory"
|
||||||
|
journalctl -k --since "1 hour ago" | grep -i oom
|
||||||
|
|
||||||
|
# Check current memory and swap
|
||||||
|
free -h
|
||||||
|
swapon --show
|
||||||
|
|
||||||
|
# Option 1: Enable automatic swap file management (Linux + root only)
|
||||||
|
export AUTO_SWAP=true
|
||||||
|
export SWAP_FILE_SIZE_GB=8
|
||||||
|
sudo dbbackup backup cluster
|
||||||
|
|
||||||
|
# Option 2: Manually add swap before backup
|
||||||
|
sudo fallocate -l 8G /swapfile
|
||||||
|
sudo chmod 600 /swapfile
|
||||||
|
sudo mkswap /swapfile
|
||||||
|
sudo swapon /swapfile
|
||||||
|
# Run your backup
|
||||||
|
dbbackup backup cluster
|
||||||
|
# Cleanup after
|
||||||
|
sudo swapoff /swapfile
|
||||||
|
sudo rm /swapfile
|
||||||
|
|
||||||
|
# Option 3: Reduce memory usage
|
||||||
|
export DUMP_JOBS=2 # Fewer parallel jobs
|
||||||
|
export COMPRESS_LEVEL=3 # Lower compression
|
||||||
|
dbbackup backup cluster
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note**: Automatic swap management (`AUTO_SWAP=true`) requires:
|
||||||
|
- Linux operating system
|
||||||
|
- Root privileges (run with `sudo`)
|
||||||
|
- Available disk space for the swap file
|
||||||
|
|
||||||
|
The tool will automatically create, enable, and cleanup the temporary swap file during cluster backups.
|
||||||
|
|
||||||
### Debug Mode
|
### Debug Mode
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@ -18,6 +18,7 @@ import (
|
|||||||
"dbbackup/internal/database"
|
"dbbackup/internal/database"
|
||||||
"dbbackup/internal/logger"
|
"dbbackup/internal/logger"
|
||||||
"dbbackup/internal/progress"
|
"dbbackup/internal/progress"
|
||||||
|
"dbbackup/internal/swap"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Engine handles backup operations
|
// Engine handles backup operations
|
||||||
@ -252,6 +253,31 @@ func (e *Engine) BackupCluster(ctx context.Context) error {
|
|||||||
|
|
||||||
operation := e.log.StartOperation("Cluster Backup")
|
operation := e.log.StartOperation("Cluster Backup")
|
||||||
|
|
||||||
|
// Setup swap file if configured
|
||||||
|
var swapMgr *swap.Manager
|
||||||
|
if e.cfg.AutoSwap && e.cfg.SwapFileSizeGB > 0 {
|
||||||
|
swapMgr = swap.NewManager(e.cfg.SwapFilePath, e.cfg.SwapFileSizeGB, e.log)
|
||||||
|
|
||||||
|
if swapMgr.IsSupported() {
|
||||||
|
e.log.Info("Setting up temporary swap file for large backup",
|
||||||
|
"path", e.cfg.SwapFilePath,
|
||||||
|
"size_gb", e.cfg.SwapFileSizeGB)
|
||||||
|
|
||||||
|
if err := swapMgr.Setup(); err != nil {
|
||||||
|
e.log.Warn("Failed to setup swap file (continuing without it)", "error", err)
|
||||||
|
} else {
|
||||||
|
// Ensure cleanup on exit
|
||||||
|
defer func() {
|
||||||
|
if err := swapMgr.Cleanup(); err != nil {
|
||||||
|
e.log.Warn("Failed to cleanup swap file", "error", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
e.log.Warn("Swap file management not supported on this platform", "os", swapMgr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Use a quiet progress indicator to avoid duplicate messages
|
// Use a quiet progress indicator to avoid duplicate messages
|
||||||
quietProgress := progress.NewQuietLineByLine()
|
quietProgress := progress.NewQuietLineByLine()
|
||||||
quietProgress.Start("Starting cluster backup (all databases)")
|
quietProgress.Start("Starting cluster backup (all databases)")
|
||||||
|
|||||||
@ -56,6 +56,11 @@ type Config struct {
|
|||||||
RestoreDBName string
|
RestoreDBName string
|
||||||
// Timeouts (in minutes)
|
// Timeouts (in minutes)
|
||||||
ClusterTimeoutMinutes int
|
ClusterTimeoutMinutes int
|
||||||
|
|
||||||
|
// Swap file management (for large backups)
|
||||||
|
SwapFilePath string // Path to temporary swap file
|
||||||
|
SwapFileSizeGB int // Size in GB (0 = disabled)
|
||||||
|
AutoSwap bool // Automatically manage swap for large backups
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates a new configuration with default values
|
// New creates a new configuration with default values
|
||||||
@ -134,6 +139,11 @@ func New() *Config {
|
|||||||
|
|
||||||
// Timeouts
|
// Timeouts
|
||||||
ClusterTimeoutMinutes: getEnvInt("CLUSTER_TIMEOUT_MIN", 240),
|
ClusterTimeoutMinutes: getEnvInt("CLUSTER_TIMEOUT_MIN", 240),
|
||||||
|
|
||||||
|
// Swap file management
|
||||||
|
SwapFilePath: getEnvString("SWAP_FILE_PATH", "/tmp/dbbackup_swap"),
|
||||||
|
SwapFileSizeGB: getEnvInt("SWAP_FILE_SIZE_GB", 0), // 0 = disabled by default
|
||||||
|
AutoSwap: getEnvBool("AUTO_SWAP", false),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure canonical defaults are enforced
|
// Ensure canonical defaults are enforced
|
||||||
|
|||||||
233
internal/swap/swap.go
Normal file
233
internal/swap/swap.go
Normal file
@ -0,0 +1,233 @@
|
|||||||
|
package swap
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"dbbackup/internal/logger"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Manager handles temporary swap file creation and cleanup
|
||||||
|
type Manager struct {
|
||||||
|
swapPath string
|
||||||
|
sizeGB int
|
||||||
|
isActive bool
|
||||||
|
wasCreated bool
|
||||||
|
log logger.Logger
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewManager creates a new swap file manager
|
||||||
|
func NewManager(swapPath string, sizeGB int, log logger.Logger) *Manager {
|
||||||
|
return &Manager{
|
||||||
|
swapPath: swapPath,
|
||||||
|
sizeGB: sizeGB,
|
||||||
|
log: log,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsSupported checks if swap file management is supported on this platform
|
||||||
|
func (m *Manager) IsSupported() bool {
|
||||||
|
// Only supported on Linux
|
||||||
|
return runtime.GOOS == "linux"
|
||||||
|
}
|
||||||
|
|
||||||
|
// NeedsRoot checks if we need root privileges for swap operations
|
||||||
|
func (m *Manager) NeedsRoot() bool {
|
||||||
|
// On Linux, swap operations require root
|
||||||
|
return runtime.GOOS == "linux" && os.Geteuid() != 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetCurrentSwap returns current swap usage info
|
||||||
|
func (m *Manager) GetCurrentSwap() (totalMB, usedMB, freeMB int64, err error) {
|
||||||
|
// Read /proc/meminfo for swap info
|
||||||
|
data, err := os.ReadFile("/proc/meminfo")
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
lines := strings.Split(string(data), "\n")
|
||||||
|
for _, line := range lines {
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if len(fields) < 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
switch fields[0] {
|
||||||
|
case "SwapTotal:":
|
||||||
|
if val, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
|
||||||
|
totalMB = val / 1024
|
||||||
|
}
|
||||||
|
case "SwapFree:":
|
||||||
|
if val, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
|
||||||
|
freeMB = val / 1024
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
usedMB = totalMB - freeMB
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsSwapFileActive checks if our specific swap file is currently active
|
||||||
|
func (m *Manager) IsSwapFileActive() (bool, error) {
|
||||||
|
// Read /proc/swaps to see active swap devices
|
||||||
|
data, err := os.ReadFile("/proc/swaps")
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
absPath, _ := filepath.Abs(m.swapPath)
|
||||||
|
lines := strings.Split(string(data), "\n")
|
||||||
|
for _, line := range lines {
|
||||||
|
if strings.Contains(line, absPath) || strings.Contains(line, m.swapPath) {
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Setup creates and enables the swap file if needed
|
||||||
|
func (m *Manager) Setup() error {
|
||||||
|
if !m.IsSupported() {
|
||||||
|
return fmt.Errorf("swap file management not supported on %s", runtime.GOOS)
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.sizeGB <= 0 {
|
||||||
|
return fmt.Errorf("swap size must be > 0 GB")
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.NeedsRoot() {
|
||||||
|
m.log.Warn("Swap file creation requires root privileges, skipping automatic swap setup")
|
||||||
|
return fmt.Errorf("swap file operations require root privileges (current euid: %d)", os.Geteuid())
|
||||||
|
}
|
||||||
|
|
||||||
|
m.log.Info("Setting up temporary swap file", "path", m.swapPath, "size_gb", m.sizeGB)
|
||||||
|
|
||||||
|
// Check if swap file already exists and is active
|
||||||
|
if active, _ := m.IsSwapFileActive(); active {
|
||||||
|
m.log.Info("Swap file already active", "path", m.swapPath)
|
||||||
|
m.isActive = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if file exists but is not active
|
||||||
|
if _, err := os.Stat(m.swapPath); err == nil {
|
||||||
|
m.log.Warn("Swap file exists but is not active, removing", "path", m.swapPath)
|
||||||
|
if err := os.Remove(m.swapPath); err != nil {
|
||||||
|
return fmt.Errorf("failed to remove existing swap file: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create swap file directory if needed
|
||||||
|
swapDir := filepath.Dir(m.swapPath)
|
||||||
|
if err := os.MkdirAll(swapDir, 0755); err != nil {
|
||||||
|
return fmt.Errorf("failed to create swap directory: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate size in bytes
|
||||||
|
sizeBytes := int64(m.sizeGB) * 1024 * 1024 * 1024
|
||||||
|
|
||||||
|
m.log.Info("Creating swap file", "size_bytes", sizeBytes)
|
||||||
|
|
||||||
|
// Use fallocate for fast file creation
|
||||||
|
cmd := exec.Command("fallocate", "-l", fmt.Sprintf("%d", sizeBytes), m.swapPath)
|
||||||
|
if output, err := cmd.CombinedOutput(); err != nil {
|
||||||
|
// Fallback to dd if fallocate fails
|
||||||
|
m.log.Warn("fallocate failed, using dd (slower)", "error", err, "output", string(output))
|
||||||
|
cmd = exec.Command("dd", "if=/dev/zero", "of="+m.swapPath, "bs=1M", fmt.Sprintf("count=%d", m.sizeGB*1024))
|
||||||
|
if output, err := cmd.CombinedOutput(); err != nil {
|
||||||
|
return fmt.Errorf("failed to create swap file with dd: %w (output: %s)", err, string(output))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
m.wasCreated = true
|
||||||
|
|
||||||
|
// Set correct permissions (600 for security)
|
||||||
|
if err := os.Chmod(m.swapPath, 0600); err != nil {
|
||||||
|
m.cleanup()
|
||||||
|
return fmt.Errorf("failed to set swap file permissions: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Format as swap
|
||||||
|
m.log.Info("Formatting swap file")
|
||||||
|
cmd = exec.Command("mkswap", m.swapPath)
|
||||||
|
if output, err := cmd.CombinedOutput(); err != nil {
|
||||||
|
m.cleanup()
|
||||||
|
return fmt.Errorf("failed to format swap file: %w (output: %s)", err, string(output))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Enable swap
|
||||||
|
m.log.Info("Enabling swap file")
|
||||||
|
cmd = exec.Command("swapon", m.swapPath)
|
||||||
|
if output, err := cmd.CombinedOutput(); err != nil {
|
||||||
|
m.cleanup()
|
||||||
|
return fmt.Errorf("failed to enable swap file: %w (output: %s)", err, string(output))
|
||||||
|
}
|
||||||
|
|
||||||
|
m.isActive = true
|
||||||
|
|
||||||
|
// Log current swap status
|
||||||
|
if total, used, free, err := m.GetCurrentSwap(); err == nil {
|
||||||
|
m.log.Info("Swap status after setup",
|
||||||
|
"total_mb", total,
|
||||||
|
"used_mb", used,
|
||||||
|
"free_mb", free,
|
||||||
|
"added_gb", m.sizeGB)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// cleanup removes the swap file (internal helper)
|
||||||
|
func (m *Manager) cleanup() {
|
||||||
|
if m.wasCreated {
|
||||||
|
os.Remove(m.swapPath)
|
||||||
|
m.wasCreated = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleanup disables and removes the swap file
|
||||||
|
func (m *Manager) Cleanup() error {
|
||||||
|
if !m.isActive {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
m.log.Info("Cleaning up swap file", "path", m.swapPath)
|
||||||
|
|
||||||
|
// Check if still active
|
||||||
|
if active, _ := m.IsSwapFileActive(); active {
|
||||||
|
// Disable swap
|
||||||
|
m.log.Info("Disabling swap file")
|
||||||
|
cmd := exec.Command("swapoff", m.swapPath)
|
||||||
|
if output, err := cmd.CombinedOutput(); err != nil {
|
||||||
|
m.log.Warn("Failed to disable swap file", "error", err, "output", string(output))
|
||||||
|
// Continue anyway
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove file if we created it
|
||||||
|
if m.wasCreated {
|
||||||
|
m.log.Info("Removing swap file", "path", m.swapPath)
|
||||||
|
if err := os.Remove(m.swapPath); err != nil {
|
||||||
|
m.log.Warn("Failed to remove swap file", "error", err)
|
||||||
|
return fmt.Errorf("failed to remove swap file: %w", err)
|
||||||
|
}
|
||||||
|
m.wasCreated = false
|
||||||
|
}
|
||||||
|
|
||||||
|
m.isActive = false
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsActive returns whether the swap file is currently active
|
||||||
|
func (m *Manager) IsActive() bool {
|
||||||
|
return m.isActive
|
||||||
|
}
|
||||||
|
|
||||||
|
// WasCreated returns whether this manager created the swap file
|
||||||
|
func (m *Manager) WasCreated() bool {
|
||||||
|
return m.wasCreated
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user