Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6a7cf3c11e | |||
| fd3f8770b7 | |||
| 15f10c280c | |||
| 35a9a6e837 |
@ -3,9 +3,9 @@
|
||||
This directory contains pre-compiled binaries for the DB Backup Tool across multiple platforms and architectures.
|
||||
|
||||
## Build Information
|
||||
- **Version**: 3.42.79
|
||||
- **Build Time**: 2026-01-22_07:18:57_UTC
|
||||
- **Git Commit**: 9fec2c7
|
||||
- **Version**: 3.42.81
|
||||
- **Build Time**: 2026-01-22_15:56:08_UTC
|
||||
- **Git Commit**: fd3f877
|
||||
|
||||
## Recent Updates (v1.1.0)
|
||||
- ✅ Fixed TUI progress display with line-by-line output
|
||||
|
||||
@ -1172,6 +1172,27 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string, preExtr
|
||||
e.log.Warn("Preflight checks failed", "error", preflightErr)
|
||||
}
|
||||
|
||||
// 🛡️ LARGE DATABASE GUARD - Bulletproof protection for large database restores
|
||||
e.progress.Update("Analyzing database characteristics...")
|
||||
guard := NewLargeDBGuard(e.cfg, e.log)
|
||||
|
||||
// Build list of dump files for analysis
|
||||
var dumpFilePaths []string
|
||||
for _, entry := range entries {
|
||||
if !entry.IsDir() {
|
||||
dumpFilePaths = append(dumpFilePaths, filepath.Join(dumpsDir, entry.Name()))
|
||||
}
|
||||
}
|
||||
|
||||
// Determine optimal restore strategy
|
||||
strategy := guard.DetermineStrategy(ctx, archivePath, dumpFilePaths)
|
||||
|
||||
// Apply strategy (override config if needed)
|
||||
if strategy.UseConservative {
|
||||
guard.ApplyStrategy(strategy, e.cfg)
|
||||
guard.WarnUser(strategy, e.silentMode)
|
||||
}
|
||||
|
||||
// Calculate optimal lock boost based on BLOB count
|
||||
lockBoostValue := 2048 // Default
|
||||
if preflight != nil && preflight.Archive.RecommendedLockBoost > 0 {
|
||||
@ -1182,21 +1203,49 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string, preExtr
|
||||
e.progress.Update("Tuning PostgreSQL for large restore...")
|
||||
originalSettings, tuneErr := e.boostPostgreSQLSettings(ctx, lockBoostValue)
|
||||
if tuneErr != nil {
|
||||
e.log.Warn("Could not boost PostgreSQL settings - restore may fail on BLOB-heavy databases",
|
||||
"error", tuneErr)
|
||||
} else {
|
||||
e.log.Info("Boosted PostgreSQL settings for restore",
|
||||
"max_locks_per_transaction", fmt.Sprintf("%d → %d", originalSettings.MaxLocks, lockBoostValue),
|
||||
"maintenance_work_mem", fmt.Sprintf("%s → 2GB", originalSettings.MaintenanceWorkMem))
|
||||
// Ensure we reset settings when done (even on failure)
|
||||
defer func() {
|
||||
if resetErr := e.resetPostgreSQLSettings(ctx, originalSettings); resetErr != nil {
|
||||
e.log.Warn("Could not reset PostgreSQL settings", "error", resetErr)
|
||||
} else {
|
||||
e.log.Info("Reset PostgreSQL settings to original values")
|
||||
}
|
||||
}()
|
||||
e.log.Error("Could not boost PostgreSQL settings", "error", tuneErr)
|
||||
operation.Fail("PostgreSQL tuning failed")
|
||||
return fmt.Errorf("failed to boost PostgreSQL settings: %w", tuneErr)
|
||||
}
|
||||
|
||||
// CRITICAL: Verify locks were actually increased
|
||||
// Even in conservative mode (--jobs=1), a single massive database can exhaust locks
|
||||
// If boost failed (couldn't restart PostgreSQL), we MUST abort
|
||||
if originalSettings.MaxLocks < lockBoostValue {
|
||||
e.log.Error("PostgreSQL lock boost FAILED - restart required but not possible",
|
||||
"current_locks", originalSettings.MaxLocks,
|
||||
"required_locks", lockBoostValue,
|
||||
"conservative_mode", strategy.UseConservative,
|
||||
"note", "Even single-threaded restore can fail with massive databases")
|
||||
operation.Fail(fmt.Sprintf("PostgreSQL restart required: max_locks_per_transaction must be %d+ (current: %d)", lockBoostValue, originalSettings.MaxLocks))
|
||||
|
||||
// Provide clear instructions
|
||||
e.log.Error("=" + strings.Repeat("=", 70))
|
||||
e.log.Error("RESTORE ABORTED - Action Required:")
|
||||
e.log.Error("1. ALTER SYSTEM has saved max_locks_per_transaction=%d to postgresql.auto.conf", lockBoostValue)
|
||||
e.log.Error("2. Restart PostgreSQL to activate the new setting:")
|
||||
e.log.Error(" sudo systemctl restart postgresql")
|
||||
e.log.Error("3. Retry the restore - it will then complete successfully")
|
||||
e.log.Error("=" + strings.Repeat("=", 70))
|
||||
|
||||
return fmt.Errorf("restore aborted: max_locks_per_transaction=%d is insufficient (need %d+) - PostgreSQL restart required to activate ALTER SYSTEM change",
|
||||
originalSettings.MaxLocks, lockBoostValue)
|
||||
}
|
||||
|
||||
e.log.Info("PostgreSQL tuning verified - locks sufficient for restore",
|
||||
"max_locks_per_transaction", originalSettings.MaxLocks,
|
||||
"target_locks", lockBoostValue,
|
||||
"maintenance_work_mem", "2GB",
|
||||
"conservative_mode", strategy.UseConservative)
|
||||
|
||||
// Ensure we reset settings when done (even on failure)
|
||||
defer func() {
|
||||
if resetErr := e.resetPostgreSQLSettings(ctx, originalSettings); resetErr != nil {
|
||||
e.log.Warn("Could not reset PostgreSQL settings", "error", resetErr)
|
||||
} else {
|
||||
e.log.Info("Reset PostgreSQL settings to original values")
|
||||
}
|
||||
}()
|
||||
|
||||
var restoreErrors *multierror.Error
|
||||
var restoreErrorsMu sync.Mutex
|
||||
@ -2504,21 +2553,22 @@ func (e *Engine) boostPostgreSQLSettings(ctx context.Context, lockBoostValue int
|
||||
e.log.Info("PostgreSQL restarted successfully - max_locks_per_transaction now active")
|
||||
// Wait for PostgreSQL to be ready
|
||||
time.Sleep(3 * time.Second)
|
||||
// Update original.MaxLocks to reflect the new value after restart
|
||||
var newMaxLocksStr string
|
||||
if err := db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&newMaxLocksStr); err == nil {
|
||||
original.MaxLocks, _ = strconv.Atoi(newMaxLocksStr)
|
||||
e.log.Info("Verified new max_locks_per_transaction after restart", "value", original.MaxLocks)
|
||||
}
|
||||
} else {
|
||||
// Cannot restart - warn user but continue
|
||||
// The setting is written to postgresql.auto.conf and will take effect on next restart
|
||||
e.log.Warn("=" + strings.Repeat("=", 70))
|
||||
e.log.Warn("NOTE: max_locks_per_transaction change requires PostgreSQL restart")
|
||||
e.log.Warn("Current value: " + strconv.Itoa(original.MaxLocks) + ", target: " + strconv.Itoa(lockBoostValue))
|
||||
e.log.Warn("")
|
||||
e.log.Warn("The setting has been saved to postgresql.auto.conf and will take")
|
||||
e.log.Warn("effect on the next PostgreSQL restart. If restore fails with")
|
||||
e.log.Warn("'out of shared memory' errors, ask your DBA to restart PostgreSQL.")
|
||||
e.log.Warn("")
|
||||
e.log.Warn("Continuing with restore - this may succeed if your databases")
|
||||
e.log.Warn("don't have many large objects (BLOBs).")
|
||||
e.log.Warn("=" + strings.Repeat("=", 70))
|
||||
// Continue anyway - might work for small restores or DBs without BLOBs
|
||||
// Cannot restart - this is now a CRITICAL failure
|
||||
// We tried to boost locks but can't apply them without restart
|
||||
e.log.Error("CRITICAL: max_locks_per_transaction boost requires PostgreSQL restart")
|
||||
e.log.Error("Current value: "+strconv.Itoa(original.MaxLocks)+", required: "+strconv.Itoa(lockBoostValue))
|
||||
e.log.Error("The setting has been saved to postgresql.auto.conf but is NOT ACTIVE")
|
||||
e.log.Error("Restore will ABORT to prevent 'out of shared memory' failure")
|
||||
e.log.Error("Action required: Ask DBA to restart PostgreSQL, then retry restore")
|
||||
// Return original settings so caller can check and abort
|
||||
return original, nil
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
297
internal/restore/large_db_guard.go
Normal file
297
internal/restore/large_db_guard.go
Normal file
@ -0,0 +1,297 @@
|
||||
package restore
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"dbbackup/internal/config"
|
||||
"dbbackup/internal/logger"
|
||||
)
|
||||
|
||||
// LargeDBGuard provides bulletproof protection for large database restores.
// It inspects dump files and live server settings before a restore begins and
// can force a conservative (single-threaded) strategy to avoid lock exhaustion.
type LargeDBGuard struct {
	log logger.Logger // structured logger used for guard decisions and warnings
	cfg *config.Config // connection parameters and restore tuning knobs (Jobs, ClusterParallelism)
}
|
||||
|
||||
// RestoreStrategy determines how to restore based on database characteristics.
// It is produced by DetermineStrategy and consumed by ApplyStrategy/WarnUser.
type RestoreStrategy struct {
	UseConservative bool // Force conservative (single-threaded) mode
	Reason string // Why this strategy was chosen (human-readable)
	Jobs int // Recommended --jobs value (0 = keep profile default)
	ParallelDBs int // Recommended parallel database restores (0 = keep profile default)
	ExpectedTime string // Estimated restore time; may be empty when no estimate applies
}
|
||||
|
||||
// NewLargeDBGuard creates a new guard
|
||||
func NewLargeDBGuard(cfg *config.Config, log logger.Logger) *LargeDBGuard {
|
||||
return &LargeDBGuard{
|
||||
cfg: cfg,
|
||||
log: log,
|
||||
}
|
||||
}
|
||||
|
||||
// DetermineStrategy analyzes the restore and determines the safest approach.
//
// Checks run in priority order; the first risk that trips forces conservative
// (single-threaded) mode and returns immediately:
//  1. large objects (BLOBs) present in any dump,
//  2. total dump size above 50GB,
//  3. max_locks_per_transaction below 4096 — the PRIMARY protection, since
//     lock exhaustion is the dominant failure mode of parallel restores,
//  4. any single dump above 10GB.
//
// If no check trips, the returned strategy leaves Jobs/ParallelDBs at zero,
// which callers treat as "keep the profile defaults".
//
// NOTE(review): archivePath is currently unused by the body — confirm whether
// it is reserved for future archive-level heuristics.
func (g *LargeDBGuard) DetermineStrategy(ctx context.Context, archivePath string, dumpFiles []string) *RestoreStrategy {
	strategy := &RestoreStrategy{
		UseConservative: false,
		Jobs: 0, // Will use profile default
		ParallelDBs: 0, // Will use profile default
	}

	// 1. Check for large objects (BLOBs)
	hasLargeObjects, blobCount := g.detectLargeObjects(ctx, dumpFiles)
	if hasLargeObjects {
		strategy.UseConservative = true
		strategy.Reason = fmt.Sprintf("Database contains %d large objects (BLOBs)", blobCount)
		strategy.Jobs = 1
		strategy.ParallelDBs = 1

		// Rough wall-clock guidance only; BLOB restore is inherently serial.
		if blobCount > 10000 {
			strategy.ExpectedTime = "8-12 hours for very large BLOB database"
		} else if blobCount > 1000 {
			strategy.ExpectedTime = "4-8 hours for large BLOB database"
		} else {
			strategy.ExpectedTime = "2-4 hours"
		}

		g.log.Warn("🛡️ Large DB Guard: Forcing conservative mode",
			"blob_count", blobCount,
			"reason", strategy.Reason)
		return strategy
	}

	// 2. Check total database size
	totalSize := g.estimateTotalSize(dumpFiles)
	if totalSize > 50*1024*1024*1024 { // > 50GB
		strategy.UseConservative = true
		strategy.Reason = fmt.Sprintf("Total database size: %s (>50GB)", FormatBytes(totalSize))
		strategy.Jobs = 1
		strategy.ParallelDBs = 1
		strategy.ExpectedTime = "6-10 hours for very large database"

		g.log.Warn("🛡️ Large DB Guard: Forcing conservative mode",
			"total_size_gb", totalSize/(1024*1024*1024),
			"reason", strategy.Reason)
		return strategy
	}

	// 3. Check PostgreSQL lock configuration
	// CRITICAL: ALWAYS force conservative mode unless locks are 4096+
	// Parallel restore exhausts locks even with 2048 and high connection count
	// This is the PRIMARY protection - lock exhaustion is the #1 failure mode
	maxLocks, maxConns := g.checkLockConfiguration(ctx)
	lockCapacity := maxLocks * maxConns

	if maxLocks < 4096 {
		strategy.UseConservative = true
		strategy.Reason = fmt.Sprintf("PostgreSQL max_locks_per_transaction=%d (need 4096+ for parallel restore)", maxLocks)
		strategy.Jobs = 1
		strategy.ParallelDBs = 1

		g.log.Warn("🛡️ Large DB Guard: FORCING conservative mode - lock protection",
			"max_locks_per_transaction", maxLocks,
			"max_connections", maxConns,
			"total_capacity", lockCapacity,
			"required_locks", 4096,
			"reason", strategy.Reason)
		return strategy
	}

	g.log.Info("✅ Large DB Guard: Lock configuration OK for parallel restore",
		"max_locks_per_transaction", maxLocks,
		"max_connections", maxConns,
		"total_capacity", lockCapacity)

	// 4. Check individual dump file sizes
	largestDump := g.findLargestDump(dumpFiles)
	if largestDump.size > 10*1024*1024*1024 { // > 10GB single dump
		strategy.UseConservative = true
		strategy.Reason = fmt.Sprintf("Largest database: %s (%s)", largestDump.name, FormatBytes(largestDump.size))
		strategy.Jobs = 1
		strategy.ParallelDBs = 1

		g.log.Warn("🛡️ Large DB Guard: Forcing conservative mode",
			"largest_db", largestDump.name,
			"size_gb", largestDump.size/(1024*1024*1024),
			"reason", strategy.Reason)
		return strategy
	}

	// All checks passed - safe to use default profile
	strategy.Reason = "No large database risks detected"
	g.log.Info("✅ Large DB Guard: Safe to use default profile")
	return strategy
}
|
||||
|
||||
// detectLargeObjects checks dump files for BLOBs/large objects
|
||||
func (g *LargeDBGuard) detectLargeObjects(ctx context.Context, dumpFiles []string) (bool, int) {
|
||||
totalBlobCount := 0
|
||||
|
||||
for _, dumpFile := range dumpFiles {
|
||||
// Skip if not a custom format dump
|
||||
if !strings.HasSuffix(dumpFile, ".dump") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Use pg_restore -l to list contents (fast)
|
||||
listCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
cmd := exec.CommandContext(listCtx, "pg_restore", "-l", dumpFile)
|
||||
output, err := cmd.Output()
|
||||
cancel()
|
||||
|
||||
if err != nil {
|
||||
continue // Skip on error
|
||||
}
|
||||
|
||||
// Count BLOB entries
|
||||
for _, line := range strings.Split(string(output), "\n") {
|
||||
if strings.Contains(line, "BLOB") ||
|
||||
strings.Contains(line, "LARGE OBJECT") ||
|
||||
strings.Contains(line, " BLOBS ") {
|
||||
totalBlobCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return totalBlobCount > 0, totalBlobCount
|
||||
}
|
||||
|
||||
// estimateTotalSize calculates total size of all dump files
|
||||
func (g *LargeDBGuard) estimateTotalSize(dumpFiles []string) int64 {
|
||||
var total int64
|
||||
for _, file := range dumpFiles {
|
||||
if info, err := os.Stat(file); err == nil {
|
||||
total += info.Size()
|
||||
}
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
// checkLockCapacity gets PostgreSQL lock table capacity
|
||||
func (g *LargeDBGuard) checkLockCapacity(ctx context.Context) int {
|
||||
maxLocks, maxConns := g.checkLockConfiguration(ctx)
|
||||
maxPrepared := 0 // We don't use prepared transactions in restore
|
||||
|
||||
// Calculate total lock capacity
|
||||
capacity := maxLocks * (maxConns + maxPrepared)
|
||||
return capacity
|
||||
}
|
||||
|
||||
// checkLockConfiguration returns max_locks_per_transaction and max_connections
|
||||
func (g *LargeDBGuard) checkLockConfiguration(ctx context.Context) (int, int) {
|
||||
// Build connection string
|
||||
connStr := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=postgres sslmode=disable",
|
||||
g.cfg.Host, g.cfg.Port, g.cfg.User, g.cfg.Password)
|
||||
|
||||
db, err := sql.Open("pgx", connStr)
|
||||
if err != nil {
|
||||
return 64, 100 // PostgreSQL defaults
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
var maxLocks, maxConns int
|
||||
|
||||
// Get max_locks_per_transaction
|
||||
err = db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&maxLocks)
|
||||
if err != nil {
|
||||
maxLocks = 64 // PostgreSQL default
|
||||
}
|
||||
|
||||
// Get max_connections
|
||||
err = db.QueryRowContext(ctx, "SHOW max_connections").Scan(&maxConns)
|
||||
if err != nil {
|
||||
maxConns = 100 // PostgreSQL default
|
||||
}
|
||||
|
||||
return maxLocks, maxConns
|
||||
}
|
||||
|
||||
// findLargestDump finds the largest individual dump file
|
||||
func (g *LargeDBGuard) findLargestDump(dumpFiles []string) struct {
|
||||
name string
|
||||
size int64
|
||||
} {
|
||||
var largest struct {
|
||||
name string
|
||||
size int64
|
||||
}
|
||||
|
||||
for _, file := range dumpFiles {
|
||||
if info, err := os.Stat(file); err == nil {
|
||||
if info.Size() > largest.size {
|
||||
largest.name = filepath.Base(file)
|
||||
largest.size = info.Size()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return largest
|
||||
}
|
||||
|
||||
// ApplyStrategy enforces the recommended strategy
|
||||
func (g *LargeDBGuard) ApplyStrategy(strategy *RestoreStrategy, cfg *config.Config) {
|
||||
if !strategy.UseConservative {
|
||||
return
|
||||
}
|
||||
|
||||
// Override configuration to force conservative settings
|
||||
if strategy.Jobs > 0 {
|
||||
cfg.Jobs = strategy.Jobs
|
||||
}
|
||||
if strategy.ParallelDBs > 0 {
|
||||
cfg.ClusterParallelism = strategy.ParallelDBs
|
||||
}
|
||||
|
||||
g.log.Warn("🛡️ Large DB Guard ACTIVE",
|
||||
"reason", strategy.Reason,
|
||||
"jobs", cfg.Jobs,
|
||||
"parallel_dbs", cfg.ClusterParallelism,
|
||||
"expected_time", strategy.ExpectedTime)
|
||||
}
|
||||
|
||||
// WarnUser displays a prominent warning about the single-threaded restore.
// In silent mode (TUI), stdout printing is skipped to prevent scrambled
// output; the same information is logged instead.
func (g *LargeDBGuard) WarnUser(strategy *RestoreStrategy, silentMode bool) {
	// Nothing to warn about when the default profile is in use.
	if !strategy.UseConservative {
		return
	}

	// In TUI/silent mode, don't print to stdout - it causes scrambled output
	if silentMode {
		// Log the warning instead for debugging
		g.log.Info("Large Database Protection Active",
			"reason", strategy.Reason,
			"jobs", strategy.Jobs,
			"parallel_dbs", strategy.ParallelDBs,
			"expected_time", strategy.ExpectedTime)
		return
	}

	// Interactive mode: draw a banner box so the mode change is unmissable.
	fmt.Println()
	fmt.Println("╔══════════════════════════════════════════════════════════════╗")
	fmt.Println("║ 🛡️ LARGE DATABASE PROTECTION ACTIVE 🛡️ ║")
	fmt.Println("╚══════════════════════════════════════════════════════════════╝")
	fmt.Println()
	fmt.Printf(" Reason: %s\n", strategy.Reason)
	fmt.Println()
	fmt.Println(" Strategy: SINGLE-THREADED RESTORE (Conservative Mode)")
	fmt.Println(" • Prevents PostgreSQL lock exhaustion")
	fmt.Println(" • Guarantees completion without 'out of shared memory' errors")
	fmt.Println(" • Slower but 100% reliable")
	fmt.Println()
	// ExpectedTime may be empty (e.g. lock-config trigger); skip the line then.
	if strategy.ExpectedTime != "" {
		fmt.Printf(" Estimated Time: %s\n", strategy.ExpectedTime)
		fmt.Println()
	}
	fmt.Println(" This restore will complete successfully. Please be patient.")
	fmt.Println()
	fmt.Println("═══════════════════════════════════════════════════════════════")
	fmt.Println()
}
|
||||
Reference in New Issue
Block a user