Compare commits

4 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 527435a3b8 | |
| | 6a7cf3c11e | |
| | fd3f8770b7 | |
| | 15f10c280c | |
@@ -3,9 +3,9 @@
This directory contains pre-compiled binaries for the DB Backup Tool across multiple platforms and architectures.

## Build Information

- **Version**: 3.42.80
- **Build Time**: 2026-01-22_07:26:07_UTC
- **Git Commit**: 82378be
- **Version**: 3.42.81
- **Build Time**: 2026-01-22_17:13:41_UTC
- **Git Commit**: 6a7cf3c

## Recent Updates (v1.1.0)

- ✅ Fixed TUI progress display with line-by-line output
@@ -39,6 +39,7 @@ var (
    restoreCleanCluster bool
    restoreDiagnose     bool   // Run diagnosis before restore
    restoreSaveDebugLog string // Path to save debug log on failure
    restoreDebugLocks   bool   // Enable detailed lock debugging

    // Single database extraction from cluster flags
    restoreDatabase string // Single database to extract/restore from cluster
@@ -322,6 +323,7 @@ func init() {
    restoreSingleCmd.Flags().StringVar(&restoreEncryptionKeyEnv, "encryption-key-env", "DBBACKUP_ENCRYPTION_KEY", "Environment variable containing encryption key")
    restoreSingleCmd.Flags().BoolVar(&restoreDiagnose, "diagnose", false, "Run deep diagnosis before restore to detect corruption/truncation")
    restoreSingleCmd.Flags().StringVar(&restoreSaveDebugLog, "save-debug-log", "", "Save detailed error report to file on failure (e.g., /tmp/restore-debug.json)")
    restoreSingleCmd.Flags().BoolVar(&restoreDebugLocks, "debug-locks", false, "Enable detailed lock debugging (captures PostgreSQL config, Guard decisions, boost attempts)")

    // Cluster restore flags
    restoreClusterCmd.Flags().BoolVar(&restoreListDBs, "list-databases", false, "List databases in cluster backup and exit")
@@ -342,6 +344,7 @@ func init() {
    restoreClusterCmd.Flags().StringVar(&restoreEncryptionKeyEnv, "encryption-key-env", "DBBACKUP_ENCRYPTION_KEY", "Environment variable containing encryption key")
    restoreClusterCmd.Flags().BoolVar(&restoreDiagnose, "diagnose", false, "Run deep diagnosis on all dumps before restore")
    restoreClusterCmd.Flags().StringVar(&restoreSaveDebugLog, "save-debug-log", "", "Save detailed error report to file on failure (e.g., /tmp/restore-debug.json)")
    restoreClusterCmd.Flags().BoolVar(&restoreDebugLocks, "debug-locks", false, "Enable detailed lock debugging (captures PostgreSQL config, Guard decisions, boost attempts)")
    restoreClusterCmd.Flags().BoolVar(&restoreClean, "clean", false, "Drop and recreate target database (for single DB restore)")
    restoreClusterCmd.Flags().BoolVar(&restoreCreate, "create", false, "Create target database if it doesn't exist (for single DB restore)")
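Both restore subcommands now expose the same trio of debugging flags (`--diagnose`, `--save-debug-log`, `--debug-locks`). The sketch below shows, in isolation, how such a cobra bool flag typically gets wired into a shared config before the command body runs; the command, type, and variable names here are illustrative, not the project's actual ones.

```go
package main

import (
	"fmt"

	"github.com/spf13/cobra"
)

// demoConfig stands in for the project's config struct; only the field
// relevant to this example is shown.
type demoConfig struct {
	DebugLocks bool
}

func main() {
	cfg := &demoConfig{}
	var debugLocks bool // package-level var in the real code (restoreDebugLocks)

	cmd := &cobra.Command{
		Use: "restore",
		RunE: func(cmd *cobra.Command, args []string) error {
			// Mirror the pattern in runRestoreSingle/runFullClusterRestore:
			// copy the flag value onto the config before doing any work.
			if debugLocks {
				cfg.DebugLocks = true
			}
			fmt.Println("lock debugging:", cfg.DebugLocks)
			return nil
		},
	}
	cmd.Flags().BoolVar(&debugLocks, "debug-locks", false,
		"Enable detailed lock debugging")

	_ = cmd.Execute()
}
```

Invoked with `--debug-locks`, the sketch prints `lock debugging: true`; without it, `false`.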
@@ -630,6 +633,12 @@ func runRestoreSingle(cmd *cobra.Command, args []string) error {
        log.Info("Debug logging enabled", "output", restoreSaveDebugLog)
    }

    // Enable lock debugging if requested (single restore)
    if restoreDebugLocks {
        cfg.DebugLocks = true
        log.Info("🔍 Lock debugging enabled - will capture PostgreSQL lock config, Guard decisions, boost attempts")
    }

    // Setup signal handling
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()
@@ -1058,6 +1067,12 @@ func runFullClusterRestore(archivePath string) error {
        log.Info("Debug logging enabled", "output", restoreSaveDebugLog)
    }

    // Enable lock debugging if requested (cluster restore)
    if restoreDebugLocks {
        cfg.DebugLocks = true
        log.Info("🔍 Lock debugging enabled - will capture PostgreSQL lock config, Guard decisions, boost attempts")
    }

    // Setup signal handling
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()
@@ -134,6 +134,7 @@ func Execute(ctx context.Context, config *config.Config, logger logger.Logger) e
    rootCmd.PersistentFlags().StringVar(&cfg.BackupDir, "backup-dir", cfg.BackupDir, "Backup directory")
    rootCmd.PersistentFlags().BoolVar(&cfg.NoColor, "no-color", cfg.NoColor, "Disable colored output")
    rootCmd.PersistentFlags().BoolVar(&cfg.Debug, "debug", cfg.Debug, "Enable debug logging")
    rootCmd.PersistentFlags().BoolVar(&cfg.DebugLocks, "debug-locks", cfg.DebugLocks, "Enable detailed lock debugging (captures PostgreSQL lock configuration, Large DB Guard decisions, boost attempts)")
    rootCmd.PersistentFlags().IntVar(&cfg.Jobs, "jobs", cfg.Jobs, "Number of parallel jobs")
    rootCmd.PersistentFlags().IntVar(&cfg.DumpJobs, "dump-jobs", cfg.DumpJobs, "Number of parallel dump jobs")
    rootCmd.PersistentFlags().IntVar(&cfg.MaxCores, "max-cores", cfg.MaxCores, "Maximum CPU cores to use")
@@ -50,10 +50,11 @@ type Config struct {
    SampleValue int

    // Output options
    NoColor   bool
    Debug     bool
    LogLevel  string
    LogFormat string
    NoColor    bool
    Debug      bool
    DebugLocks bool // Extended lock debugging (captures lock detection, Guard decisions, boost attempts)
    LogLevel   string
    LogFormat  string

    // Config persistence
    NoSaveConfig bool
@@ -1172,6 +1172,27 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string, preExtr
        e.log.Warn("Preflight checks failed", "error", preflightErr)
    }

    // 🛡️ LARGE DATABASE GUARD - Bulletproof protection for large database restores
    e.progress.Update("Analyzing database characteristics...")
    guard := NewLargeDBGuard(e.cfg, e.log)

    // Build list of dump files for analysis
    var dumpFilePaths []string
    for _, entry := range entries {
        if !entry.IsDir() {
            dumpFilePaths = append(dumpFilePaths, filepath.Join(dumpsDir, entry.Name()))
        }
    }

    // Determine optimal restore strategy
    strategy := guard.DetermineStrategy(ctx, archivePath, dumpFilePaths)

    // Apply strategy (override config if needed)
    if strategy.UseConservative {
        guard.ApplyStrategy(strategy, e.cfg)
        guard.WarnUser(strategy, e.silentMode)
    }

    // Calculate optimal lock boost based on BLOB count
    lockBoostValue := 2048 // Default
    if preflight != nil && preflight.Archive.RecommendedLockBoost > 0 {
@@ -1180,23 +1201,88 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string, preExtr

    // AUTO-TUNE: Boost PostgreSQL settings for large restores
    e.progress.Update("Tuning PostgreSQL for large restore...")

    if e.cfg.DebugLocks {
        e.log.Info("🔍 [LOCK-DEBUG] Attempting to boost PostgreSQL lock settings",
            "target_max_locks", lockBoostValue,
            "conservative_mode", strategy.UseConservative)
    }

    originalSettings, tuneErr := e.boostPostgreSQLSettings(ctx, lockBoostValue)
    if tuneErr != nil {
        e.log.Warn("Could not boost PostgreSQL settings - restore may fail on BLOB-heavy databases",
            "error", tuneErr)
    } else {
        e.log.Info("Boosted PostgreSQL settings for restore",
            "max_locks_per_transaction", fmt.Sprintf("%d → %d", originalSettings.MaxLocks, lockBoostValue),
            "maintenance_work_mem", fmt.Sprintf("%s → 2GB", originalSettings.MaintenanceWorkMem))
        // Ensure we reset settings when done (even on failure)
        defer func() {
            if resetErr := e.resetPostgreSQLSettings(ctx, originalSettings); resetErr != nil {
                e.log.Warn("Could not reset PostgreSQL settings", "error", resetErr)
            } else {
                e.log.Info("Reset PostgreSQL settings to original values")
            }
        }()
        e.log.Error("Could not boost PostgreSQL settings", "error", tuneErr)

        if e.cfg.DebugLocks {
            e.log.Error("🔍 [LOCK-DEBUG] Lock boost attempt FAILED",
                "error", tuneErr,
                "phase", "boostPostgreSQLSettings")
        }

        operation.Fail("PostgreSQL tuning failed")
        return fmt.Errorf("failed to boost PostgreSQL settings: %w", tuneErr)
    }

    if e.cfg.DebugLocks {
        e.log.Info("🔍 [LOCK-DEBUG] Lock boost function returned",
            "original_max_locks", originalSettings.MaxLocks,
            "target_max_locks", lockBoostValue,
            "boost_successful", originalSettings.MaxLocks >= lockBoostValue)
    }

    // CRITICAL: Verify locks were actually increased
    // Even in conservative mode (--jobs=1), a single massive database can exhaust locks
    // If boost failed (couldn't restart PostgreSQL), we MUST abort
    if originalSettings.MaxLocks < lockBoostValue {
        e.log.Error("PostgreSQL lock boost FAILED - restart required but not possible",
            "current_locks", originalSettings.MaxLocks,
            "required_locks", lockBoostValue,
            "conservative_mode", strategy.UseConservative,
            "note", "Even single-threaded restore can fail with massive databases")

        if e.cfg.DebugLocks {
            e.log.Error("🔍 [LOCK-DEBUG] CRITICAL: Lock verification FAILED",
                "actual_locks", originalSettings.MaxLocks,
                "required_locks", lockBoostValue,
                "delta", lockBoostValue-originalSettings.MaxLocks,
                "verdict", "ABORT RESTORE")
        }

        operation.Fail(fmt.Sprintf("PostgreSQL restart required: max_locks_per_transaction must be %d+ (current: %d)", lockBoostValue, originalSettings.MaxLocks))

        // Provide clear instructions
        e.log.Error("=" + strings.Repeat("=", 70))
        e.log.Error("RESTORE ABORTED - Action Required:")
        e.log.Error("1. ALTER SYSTEM has saved max_locks_per_transaction=%d to postgresql.auto.conf", lockBoostValue)
        e.log.Error("2. Restart PostgreSQL to activate the new setting:")
        e.log.Error(" sudo systemctl restart postgresql")
        e.log.Error("3. Retry the restore - it will then complete successfully")
        e.log.Error("=" + strings.Repeat("=", 70))

        return fmt.Errorf("restore aborted: max_locks_per_transaction=%d is insufficient (need %d+) - PostgreSQL restart required to activate ALTER SYSTEM change",
            originalSettings.MaxLocks, lockBoostValue)
    }

    e.log.Info("PostgreSQL tuning verified - locks sufficient for restore",
        "max_locks_per_transaction", originalSettings.MaxLocks,
        "target_locks", lockBoostValue,
        "maintenance_work_mem", "2GB",
        "conservative_mode", strategy.UseConservative)

    if e.cfg.DebugLocks {
        e.log.Info("🔍 [LOCK-DEBUG] Lock verification PASSED",
            "actual_locks", originalSettings.MaxLocks,
            "required_locks", lockBoostValue,
            "verdict", "PROCEED WITH RESTORE")
    }

    // Ensure we reset settings when done (even on failure)
    defer func() {
        if resetErr := e.resetPostgreSQLSettings(ctx, originalSettings); resetErr != nil {
            e.log.Warn("Could not reset PostgreSQL settings", "error", resetErr)
        } else {
            e.log.Info("Reset PostgreSQL settings to original values")
        }
    }()

    var restoreErrors *multierror.Error
    var restoreErrorsMu sync.Mutex
@@ -2452,9 +2538,18 @@ type OriginalSettings struct {

// NOTE: max_locks_per_transaction requires a PostgreSQL RESTART to take effect!
// maintenance_work_mem can be changed with pg_reload_conf().
func (e *Engine) boostPostgreSQLSettings(ctx context.Context, lockBoostValue int) (*OriginalSettings, error) {
    if e.cfg.DebugLocks {
        e.log.Info("🔍 [LOCK-DEBUG] boostPostgreSQLSettings: Starting lock boost procedure",
            "target_lock_value", lockBoostValue)
    }

    connStr := e.buildConnString()
    db, err := sql.Open("pgx", connStr)
    if err != nil {
        if e.cfg.DebugLocks {
            e.log.Error("🔍 [LOCK-DEBUG] Failed to connect to PostgreSQL",
                "error", err)
        }
        return nil, fmt.Errorf("failed to connect: %w", err)
    }
    defer db.Close()

@@ -2466,6 +2561,13 @@ func (e *Engine) boostPostgreSQLSettings(ctx context.Context, lockBoostValue int
    if err := db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&maxLocksStr); err == nil {
        original.MaxLocks, _ = strconv.Atoi(maxLocksStr)
    }

    if e.cfg.DebugLocks {
        e.log.Info("🔍 [LOCK-DEBUG] Current PostgreSQL lock configuration",
            "current_max_locks", original.MaxLocks,
            "target_max_locks", lockBoostValue,
            "boost_required", original.MaxLocks < lockBoostValue)
    }

    // Get current maintenance_work_mem
    db.QueryRowContext(ctx, "SHOW maintenance_work_mem").Scan(&original.MaintenanceWorkMem)

@@ -2474,14 +2576,31 @@ func (e *Engine) boostPostgreSQLSettings(ctx context.Context, lockBoostValue int
    // pg_reload_conf() is NOT sufficient for this parameter.
    needsRestart := false
    if original.MaxLocks < lockBoostValue {
        if e.cfg.DebugLocks {
            e.log.Info("🔍 [LOCK-DEBUG] Executing ALTER SYSTEM to boost locks",
                "from", original.MaxLocks,
                "to", lockBoostValue)
        }

        _, err = db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET max_locks_per_transaction = %d", lockBoostValue))
        if err != nil {
            e.log.Warn("Could not set max_locks_per_transaction", "error", err)

            if e.cfg.DebugLocks {
                e.log.Error("🔍 [LOCK-DEBUG] ALTER SYSTEM failed",
                    "error", err)
            }
        } else {
            needsRestart = true
            e.log.Warn("max_locks_per_transaction requires PostgreSQL restart to take effect",
                "current", original.MaxLocks,
                "target", lockBoostValue)

            if e.cfg.DebugLocks {
                e.log.Info("🔍 [LOCK-DEBUG] ALTER SYSTEM succeeded - restart required",
                    "setting_saved_to", "postgresql.auto.conf",
                    "active_after", "PostgreSQL restart")
            }
        }
    }

@@ -2500,28 +2619,62 @@ func (e *Engine) boostPostgreSQLSettings(ctx context.Context, lockBoostValue int

    // If max_locks_per_transaction needs a restart, try to do it
    if needsRestart {
        if e.cfg.DebugLocks {
            e.log.Info("🔍 [LOCK-DEBUG] Attempting PostgreSQL restart to activate new lock setting")
        }

        if restarted := e.tryRestartPostgreSQL(ctx); restarted {
            e.log.Info("PostgreSQL restarted successfully - max_locks_per_transaction now active")

            if e.cfg.DebugLocks {
                e.log.Info("🔍 [LOCK-DEBUG] PostgreSQL restart SUCCEEDED")
            }

            // Wait for PostgreSQL to be ready
            time.Sleep(3 * time.Second)
            // Update original.MaxLocks to reflect the new value after restart
            var newMaxLocksStr string
            if err := db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&newMaxLocksStr); err == nil {
                original.MaxLocks, _ = strconv.Atoi(newMaxLocksStr)
                e.log.Info("Verified new max_locks_per_transaction after restart", "value", original.MaxLocks)

                if e.cfg.DebugLocks {
                    e.log.Info("🔍 [LOCK-DEBUG] Post-restart verification",
                        "new_max_locks", original.MaxLocks,
                        "target_was", lockBoostValue,
                        "verification", "PASS")
                }
            }
        } else {
            // Cannot restart - warn user but continue
            // The setting is written to postgresql.auto.conf and will take effect on next restart
            e.log.Warn("=" + strings.Repeat("=", 70))
            e.log.Warn("NOTE: max_locks_per_transaction change requires PostgreSQL restart")
            e.log.Warn("Current value: " + strconv.Itoa(original.MaxLocks) + ", target: " + strconv.Itoa(lockBoostValue))
            e.log.Warn("")
            e.log.Warn("The setting has been saved to postgresql.auto.conf and will take")
            e.log.Warn("effect on the next PostgreSQL restart. If restore fails with")
            e.log.Warn("'out of shared memory' errors, ask your DBA to restart PostgreSQL.")
            e.log.Warn("")
            e.log.Warn("Continuing with restore - this may succeed if your databases")
            e.log.Warn("don't have many large objects (BLOBs).")
            e.log.Warn("=" + strings.Repeat("=", 70))
            // Continue anyway - might work for small restores or DBs without BLOBs
            // Cannot restart - this is now a CRITICAL failure
            // We tried to boost locks but can't apply them without restart
            e.log.Error("CRITICAL: max_locks_per_transaction boost requires PostgreSQL restart")
            e.log.Error("Current value: " + strconv.Itoa(original.MaxLocks) + ", required: " + strconv.Itoa(lockBoostValue))
            e.log.Error("The setting has been saved to postgresql.auto.conf but is NOT ACTIVE")
            e.log.Error("Restore will ABORT to prevent 'out of shared memory' failure")
            e.log.Error("Action required: Ask DBA to restart PostgreSQL, then retry restore")

            if e.cfg.DebugLocks {
                e.log.Error("🔍 [LOCK-DEBUG] PostgreSQL restart FAILED",
                    "current_locks", original.MaxLocks,
                    "required_locks", lockBoostValue,
                    "setting_saved", true,
                    "setting_active", false,
                    "verdict", "ABORT - Manual restart required")
            }

            // Return original settings so caller can check and abort
            return original, nil
        }
    }

    if e.cfg.DebugLocks {
        e.log.Info("🔍 [LOCK-DEBUG] boostPostgreSQLSettings: Complete",
            "final_max_locks", original.MaxLocks,
            "target_was", lockBoostValue,
            "boost_successful", original.MaxLocks >= lockBoostValue)
    }

    return original, nil
}
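The check-then-boost sequence above boils down to two statements: `SHOW max_locks_per_transaction` to read the active value, and `ALTER SYSTEM SET max_locks_per_transaction = N` to persist a higher one, which only becomes active after a server restart. A minimal standalone sketch of that sequence, assuming the pgx stdlib driver (v5 import path shown) and a locally reachable server; the DSN and target value are placeholders:

```go
package main

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"strconv"

	_ "github.com/jackc/pgx/v5/stdlib" // registers the "pgx" driver name
)

func main() {
	ctx := context.Background()

	// Placeholder DSN - adjust host/user/password for your environment.
	db, err := sql.Open("pgx", "host=localhost port=5432 user=postgres dbname=postgres sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	var current string
	if err := db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&current); err != nil {
		log.Fatal(err)
	}
	locks, _ := strconv.Atoi(current)

	const target = 2048 // same default lock boost the restore engine uses
	if locks >= target {
		fmt.Println("max_locks_per_transaction already sufficient:", locks)
		return
	}

	// ALTER SYSTEM cannot take bind parameters, hence the Sprintf.
	// The value lands in postgresql.auto.conf and is NOT active until restart.
	if _, err := db.ExecContext(ctx,
		fmt.Sprintf("ALTER SYSTEM SET max_locks_per_transaction = %d", target)); err != nil {
		log.Fatal(err)
	}
	fmt.Println("setting saved - restart PostgreSQL to activate it")
}
```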
internal/restore/large_db_guard.go (new file, 363 lines)

@@ -0,0 +1,363 @@
package restore

import (
    "context"
    "database/sql"
    "fmt"
    "os"
    "os/exec"
    "path/filepath"
    "strings"
    "time"

    "dbbackup/internal/config"
    "dbbackup/internal/logger"
)

// LargeDBGuard provides bulletproof protection for large database restores
type LargeDBGuard struct {
    log logger.Logger
    cfg *config.Config
}

// RestoreStrategy determines how to restore based on database characteristics
type RestoreStrategy struct {
    UseConservative bool   // Force conservative (single-threaded) mode
    Reason          string // Why this strategy was chosen
    Jobs            int    // Recommended --jobs value
    ParallelDBs     int    // Recommended parallel database restores
    ExpectedTime    string // Estimated restore time
}

// NewLargeDBGuard creates a new guard
func NewLargeDBGuard(cfg *config.Config, log logger.Logger) *LargeDBGuard {
    return &LargeDBGuard{
        cfg: cfg,
        log: log,
    }
}

// DetermineStrategy analyzes the restore and determines the safest approach
func (g *LargeDBGuard) DetermineStrategy(ctx context.Context, archivePath string, dumpFiles []string) *RestoreStrategy {
    strategy := &RestoreStrategy{
        UseConservative: false,
        Jobs:            0, // Will use profile default
        ParallelDBs:     0, // Will use profile default
    }

    if g.cfg.DebugLocks {
        g.log.Info("🔍 [LOCK-DEBUG] Large DB Guard: Starting strategy analysis",
            "archive", archivePath,
            "dump_count", len(dumpFiles))
    }

    // 1. Check for large objects (BLOBs)
    hasLargeObjects, blobCount := g.detectLargeObjects(ctx, dumpFiles)
    if hasLargeObjects {
        strategy.UseConservative = true
        strategy.Reason = fmt.Sprintf("Database contains %d large objects (BLOBs)", blobCount)
        strategy.Jobs = 1
        strategy.ParallelDBs = 1

        if blobCount > 10000 {
            strategy.ExpectedTime = "8-12 hours for very large BLOB database"
        } else if blobCount > 1000 {
            strategy.ExpectedTime = "4-8 hours for large BLOB database"
        } else {
            strategy.ExpectedTime = "2-4 hours"
        }

        g.log.Warn("🛡️ Large DB Guard: Forcing conservative mode",
            "blob_count", blobCount,
            "reason", strategy.Reason)
        return strategy
    }

    // 2. Check total database size
    totalSize := g.estimateTotalSize(dumpFiles)
    if totalSize > 50*1024*1024*1024 { // > 50GB
        strategy.UseConservative = true
        strategy.Reason = fmt.Sprintf("Total database size: %s (>50GB)", FormatBytes(totalSize))
        strategy.Jobs = 1
        strategy.ParallelDBs = 1
        strategy.ExpectedTime = "6-10 hours for very large database"

        g.log.Warn("🛡️ Large DB Guard: Forcing conservative mode",
            "total_size_gb", totalSize/(1024*1024*1024),
            "reason", strategy.Reason)
        return strategy
    }

    // 3. Check PostgreSQL lock configuration
    // CRITICAL: ALWAYS force conservative mode unless locks are 4096+
    // Parallel restore exhausts locks even with 2048 and high connection count
    // This is the PRIMARY protection - lock exhaustion is the #1 failure mode
    maxLocks, maxConns := g.checkLockConfiguration(ctx)
    lockCapacity := maxLocks * maxConns

    if g.cfg.DebugLocks {
        g.log.Info("🔍 [LOCK-DEBUG] PostgreSQL lock configuration detected",
            "max_locks_per_transaction", maxLocks,
            "max_connections", maxConns,
            "calculated_capacity", lockCapacity,
            "threshold_required", 4096,
            "below_threshold", maxLocks < 4096)
    }

    if maxLocks < 4096 {
        strategy.UseConservative = true
        strategy.Reason = fmt.Sprintf("PostgreSQL max_locks_per_transaction=%d (need 4096+ for parallel restore)", maxLocks)
        strategy.Jobs = 1
        strategy.ParallelDBs = 1

        g.log.Warn("🛡️ Large DB Guard: FORCING conservative mode - lock protection",
            "max_locks_per_transaction", maxLocks,
            "max_connections", maxConns,
            "total_capacity", lockCapacity,
            "required_locks", 4096,
            "reason", strategy.Reason)

        if g.cfg.DebugLocks {
            g.log.Info("🔍 [LOCK-DEBUG] Guard decision: CONSERVATIVE mode",
                "jobs", 1,
                "parallel_dbs", 1,
                "reason", "Lock threshold not met (max_locks < 4096)")
        }
        return strategy
    }

    g.log.Info("✅ Large DB Guard: Lock configuration OK for parallel restore",
        "max_locks_per_transaction", maxLocks,
        "max_connections", maxConns,
        "total_capacity", lockCapacity)

    if g.cfg.DebugLocks {
        g.log.Info("🔍 [LOCK-DEBUG] Lock check PASSED - parallel restore allowed",
            "max_locks", maxLocks,
            "threshold", 4096,
            "verdict", "PASS")
    }

    // 4. Check individual dump file sizes
    largestDump := g.findLargestDump(dumpFiles)
    if largestDump.size > 10*1024*1024*1024 { // > 10GB single dump
        strategy.UseConservative = true
        strategy.Reason = fmt.Sprintf("Largest database: %s (%s)", largestDump.name, FormatBytes(largestDump.size))
        strategy.Jobs = 1
        strategy.ParallelDBs = 1

        g.log.Warn("🛡️ Large DB Guard: Forcing conservative mode",
            "largest_db", largestDump.name,
            "size_gb", largestDump.size/(1024*1024*1024),
            "reason", strategy.Reason)
        return strategy
    }

    // All checks passed - safe to use default profile
    strategy.Reason = "No large database risks detected"
    g.log.Info("✅ Large DB Guard: Safe to use default profile")

    if g.cfg.DebugLocks {
        g.log.Info("🔍 [LOCK-DEBUG] Final strategy: Default profile (no restrictions)",
            "use_conservative", false,
            "reason", strategy.Reason)
    }

    return strategy
}
// detectLargeObjects checks dump files for BLOBs/large objects
func (g *LargeDBGuard) detectLargeObjects(ctx context.Context, dumpFiles []string) (bool, int) {
    totalBlobCount := 0

    for _, dumpFile := range dumpFiles {
        // Skip if not a custom format dump
        if !strings.HasSuffix(dumpFile, ".dump") {
            continue
        }

        // Use pg_restore -l to list contents (fast)
        listCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
        cmd := exec.CommandContext(listCtx, "pg_restore", "-l", dumpFile)
        output, err := cmd.Output()
        cancel()

        if err != nil {
            continue // Skip on error
        }

        // Count BLOB entries
        for _, line := range strings.Split(string(output), "\n") {
            if strings.Contains(line, "BLOB") ||
                strings.Contains(line, "LARGE OBJECT") ||
                strings.Contains(line, " BLOBS ") {
                totalBlobCount++
            }
        }
    }

    return totalBlobCount > 0, totalBlobCount
}
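To make the counting loop above concrete: `pg_restore -l` emits one table-of-contents line per archive entry, and the guard simply counts the lines mentioning BLOBs or large objects. A tiny self-contained illustration of the same matching logic on a fabricated listing (the listing text is made up for the example; real output varies by PostgreSQL version):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Fabricated pg_restore -l style lines, for illustration only.
	listing := `215; 2612 16398 LARGE OBJECT - 16398 owner
3500; 0 0 BLOBS - BLOBS
2613; 0 16399 BLOB 16399 owner
187; 1259 16400 TABLE public invoices owner`

	count := 0
	for _, line := range strings.Split(listing, "\n") {
		// Same substrings the guard looks for.
		if strings.Contains(line, "BLOB") || strings.Contains(line, "LARGE OBJECT") {
			count++
		}
	}
	fmt.Println("blob-related entries:", count) // 3
}
```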
// estimateTotalSize calculates total size of all dump files
func (g *LargeDBGuard) estimateTotalSize(dumpFiles []string) int64 {
    var total int64
    for _, file := range dumpFiles {
        if info, err := os.Stat(file); err == nil {
            total += info.Size()
        }
    }
    return total
}

// checkLockCapacity gets PostgreSQL lock table capacity
func (g *LargeDBGuard) checkLockCapacity(ctx context.Context) int {
    maxLocks, maxConns := g.checkLockConfiguration(ctx)
    maxPrepared := 0 // We don't use prepared transactions in restore

    // Calculate total lock capacity
    capacity := maxLocks * (maxConns + maxPrepared)
    return capacity
}
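As a quick sanity check of the formula in checkLockCapacity: with PostgreSQL's defaults of 64 locks per transaction, 100 connections, and no prepared transactions, the shared lock table holds roughly 64 × (100 + 0) = 6,400 entries, while the 2048 boost the engine targets raises that to about 204,800. A trivial sketch of the same arithmetic:

```go
package main

import "fmt"

// capacity mirrors checkLockCapacity's formula:
// max_locks_per_transaction * (max_connections + max_prepared_transactions).
func capacity(maxLocks, maxConns, maxPrepared int) int {
	return maxLocks * (maxConns + maxPrepared)
}

func main() {
	fmt.Println(capacity(64, 100, 0))   // 6400   - PostgreSQL defaults
	fmt.Println(capacity(2048, 100, 0)) // 204800 - after the engine's boost
}
```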
// checkLockConfiguration returns max_locks_per_transaction and max_connections
func (g *LargeDBGuard) checkLockConfiguration(ctx context.Context) (int, int) {
    if g.cfg.DebugLocks {
        g.log.Info("🔍 [LOCK-DEBUG] Querying PostgreSQL for lock configuration",
            "host", g.cfg.Host,
            "port", g.cfg.Port,
            "user", g.cfg.User)
    }

    // Build connection string
    connStr := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=postgres sslmode=disable",
        g.cfg.Host, g.cfg.Port, g.cfg.User, g.cfg.Password)

    db, err := sql.Open("pgx", connStr)
    if err != nil {
        if g.cfg.DebugLocks {
            g.log.Warn("🔍 [LOCK-DEBUG] Failed to connect to PostgreSQL, using defaults",
                "error", err,
                "default_max_locks", 64,
                "default_max_connections", 100)
        }
        return 64, 100 // PostgreSQL defaults
    }
    defer db.Close()

    var maxLocks, maxConns int

    // Get max_locks_per_transaction
    err = db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&maxLocks)
    if err != nil {
        if g.cfg.DebugLocks {
            g.log.Warn("🔍 [LOCK-DEBUG] Failed to query max_locks_per_transaction",
                "error", err,
                "using_default", 64)
        }
        maxLocks = 64 // PostgreSQL default
    }

    // Get max_connections
    err = db.QueryRowContext(ctx, "SHOW max_connections").Scan(&maxConns)
    if err != nil {
        if g.cfg.DebugLocks {
            g.log.Warn("🔍 [LOCK-DEBUG] Failed to query max_connections",
                "error", err,
                "using_default", 100)
        }
        maxConns = 100 // PostgreSQL default
    }

    if g.cfg.DebugLocks {
        g.log.Info("🔍 [LOCK-DEBUG] Successfully retrieved PostgreSQL lock settings",
            "max_locks_per_transaction", maxLocks,
            "max_connections", maxConns,
            "total_capacity", maxLocks*maxConns)
    }

    return maxLocks, maxConns
}

// findLargestDump finds the largest individual dump file
func (g *LargeDBGuard) findLargestDump(dumpFiles []string) struct {
    name string
    size int64
} {
    var largest struct {
        name string
        size int64
    }

    for _, file := range dumpFiles {
        if info, err := os.Stat(file); err == nil {
            if info.Size() > largest.size {
                largest.name = filepath.Base(file)
                largest.size = info.Size()
            }
        }
    }

    return largest
}

// ApplyStrategy enforces the recommended strategy
func (g *LargeDBGuard) ApplyStrategy(strategy *RestoreStrategy, cfg *config.Config) {
    if !strategy.UseConservative {
        return
    }

    // Override configuration to force conservative settings
    if strategy.Jobs > 0 {
        cfg.Jobs = strategy.Jobs
    }
    if strategy.ParallelDBs > 0 {
        cfg.ClusterParallelism = strategy.ParallelDBs
    }

    g.log.Warn("🛡️ Large DB Guard ACTIVE",
        "reason", strategy.Reason,
        "jobs", cfg.Jobs,
        "parallel_dbs", cfg.ClusterParallelism,
        "expected_time", strategy.ExpectedTime)
}

// WarnUser displays prominent warning about single-threaded restore
// In silent mode (TUI), this is skipped to prevent scrambled output
func (g *LargeDBGuard) WarnUser(strategy *RestoreStrategy, silentMode bool) {
    if !strategy.UseConservative {
        return
    }

    // In TUI/silent mode, don't print to stdout - it causes scrambled output
    if silentMode {
        // Log the warning instead for debugging
        g.log.Info("Large Database Protection Active",
            "reason", strategy.Reason,
            "jobs", strategy.Jobs,
            "parallel_dbs", strategy.ParallelDBs,
            "expected_time", strategy.ExpectedTime)
        return
    }

    fmt.Println()
    fmt.Println("╔══════════════════════════════════════════════════════════════╗")
    fmt.Println("║ 🛡️ LARGE DATABASE PROTECTION ACTIVE 🛡️ ║")
    fmt.Println("╚══════════════════════════════════════════════════════════════╝")
    fmt.Println()
    fmt.Printf(" Reason: %s\n", strategy.Reason)
    fmt.Println()
    fmt.Println(" Strategy: SINGLE-THREADED RESTORE (Conservative Mode)")
    fmt.Println(" • Prevents PostgreSQL lock exhaustion")
    fmt.Println(" • Guarantees completion without 'out of shared memory' errors")
    fmt.Println(" • Slower but 100% reliable")
    fmt.Println()
    if strategy.ExpectedTime != "" {
        fmt.Printf(" Estimated Time: %s\n", strategy.ExpectedTime)
        fmt.Println()
    }
    fmt.Println(" This restore will complete successfully. Please be patient.")
    fmt.Println()
    fmt.Println("═══════════════════════════════════════════════════════════════")
    fmt.Println()
}
@@ -61,6 +61,7 @@ type RestorePreviewModel struct {
    canProceed   bool
    message      string
    saveDebugLog bool   // Save detailed error report on failure
    debugLocks   bool   // Enable detailed lock debugging
    workDir      string // Custom work directory for extraction
}

@@ -317,6 +318,15 @@ func (m RestorePreviewModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
            m.message = "Debug log: disabled"
        }

    case "l":
        // Toggle lock debugging
        m.debugLocks = !m.debugLocks
        if m.debugLocks {
            m.message = infoStyle.Render("🔍 [LOCK-DEBUG] Lock debugging: ENABLED (captures PostgreSQL lock config, Guard decisions, boost attempts)")
        } else {
            m.message = "Lock debugging: disabled"
        }

    case "w":
        // Toggle/set work directory
        if m.workDir == "" {

@@ -346,7 +356,10 @@ func (m RestorePreviewModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
            return m, nil
        }

        // Proceed to restore execution
        // Proceed to restore execution (enable lock debugging in Config)
        if m.debugLocks {
            m.config.DebugLocks = true
        }
        exec := NewRestoreExecution(m.config, m.logger, m.parent, m.ctx, m.archive, m.targetDB, m.cleanFirst, m.createIfMissing, m.mode, m.cleanClusterFirst, m.existingDBs, m.saveDebugLog, m.workDir)
        return exec, exec.Init()
    }

@@ -546,6 +559,20 @@ func (m RestorePreviewModel) View() string {
        s.WriteString(infoStyle.Render(fmt.Sprintf(" Saves detailed error report to %s on failure", m.config.GetEffectiveWorkDir())))
        s.WriteString("\n")
    }

    // Lock debugging option
    lockDebugIcon := "[-]"
    lockDebugStyle := infoStyle
    if m.debugLocks {
        lockDebugIcon = "[🔍]"
        lockDebugStyle = checkPassedStyle
    }
    s.WriteString(lockDebugStyle.Render(fmt.Sprintf(" %s Lock Debug: %v (press 'l' to toggle)", lockDebugIcon, m.debugLocks)))
    s.WriteString("\n")
    if m.debugLocks {
        s.WriteString(infoStyle.Render(" Captures PostgreSQL lock config, Guard decisions, boost attempts"))
        s.WriteString("\n")
    }
    s.WriteString("\n")

    // Message

@@ -561,10 +588,10 @@ func (m RestorePreviewModel) View() string {
    s.WriteString(successStyle.Render("[OK] Ready to restore"))
    s.WriteString("\n")
    if m.mode == "restore-single" {
        s.WriteString(infoStyle.Render("t: Clean-first | c: Create | w: WorkDir | d: Debug | Enter: Proceed | Esc: Cancel"))
        s.WriteString(infoStyle.Render("t: Clean-first | c: Create | w: WorkDir | d: Debug | l: LockDebug | Enter: Proceed | Esc: Cancel"))
    } else if m.mode == "restore-cluster" {
        if m.existingDBCount > 0 {
            s.WriteString(infoStyle.Render("c: Cleanup | w: WorkDir | d: Debug | Enter: Proceed | Esc: Cancel"))
            s.WriteString(infoStyle.Render("c: Cleanup | w: WorkDir | d: Debug | l: LockDebug | Enter: Proceed | Esc: Cancel"))
        } else {
            s.WriteString(infoStyle.Render("w: WorkDir | d: Debug | Enter: Proceed | Esc: Cancel"))
        }
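The 'l' key handler and the `[🔍]` status line above follow Bubble Tea's usual toggle pattern: flip a bool in `Update` on a `tea.KeyMsg` and render it in `View`. A stripped-down, runnable sketch of that pattern (assumes a Bubble Tea version that provides `Program.Run`; the model here is illustrative, not the project's `RestorePreviewModel`):

```go
package main

import (
	"fmt"
	"os"

	tea "github.com/charmbracelet/bubbletea"
)

type model struct {
	debugLocks bool
}

func (m model) Init() tea.Cmd { return nil }

func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
	if key, ok := msg.(tea.KeyMsg); ok {
		switch key.String() {
		case "l": // same key the restore preview uses
			m.debugLocks = !m.debugLocks
		case "q", "ctrl+c":
			return m, tea.Quit
		}
	}
	return m, nil
}

func (m model) View() string {
	icon := "[-]"
	if m.debugLocks {
		icon = "[🔍]"
	}
	return fmt.Sprintf("%s Lock Debug: %v (press 'l' to toggle, 'q' to quit)\n", icon, m.debugLocks)
}

func main() {
	if _, err := tea.NewProgram(model{}).Run(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```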