Compare commits

...

10 Commits

Author SHA1 Message Date
e2cf9adc62 fix: improve cleanup toggle UX when database detection fails
All checks were successful
CI/CD / Test (push) Successful in 1m17s
CI/CD / Lint (push) Successful in 1m28s
CI/CD / Build & Release (push) Successful in 3m13s
- Allow cleanup toggle even when preview detection failed
- Show 'detection pending' message instead of blocking the toggle
- Will re-detect databases at restore execution time
- Always show cleanup toggle option for cluster restores
- Better messaging: 'enabled/disabled' instead of showing 0 count
2026-01-17 17:07:26 +01:00
29e089fe3b fix: re-detect databases at execution time for cluster cleanup
All checks were successful
CI/CD / Test (push) Successful in 1m16s
CI/CD / Lint (push) Successful in 1m25s
CI/CD / Build & Release (push) Successful in 3m10s
- Detection in preview may fail or return stale results
- Re-detect user databases when cleanup is enabled at execution time
- Fall back to preview list if re-detection fails
- Ensures actual databases are dropped, not just what was detected earlier
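Editor's sketch of the fallback logic this commit adds (the ListUserDatabases helper matches the TUI diff further down; the wrapper function and package name are illustrative):

    package restorecleanup

    import "context"

    // dbLister stands in for the restore Safety helper used by the TUI code.
    type dbLister interface {
    	ListUserDatabases(ctx context.Context) ([]string, error)
    }

    // databasesToClean prefers a fresh listing taken at restore execution time and
    // only falls back to the (possibly stale) preview list when re-detection fails.
    func databasesToClean(ctx context.Context, lister dbLister, previewDBs []string) []string {
    	current, err := lister.ListUserDatabases(ctx)
    	if err != nil {
    		return previewDBs // re-detection failed: use what the preview saw
    	}
    	if len(current) > 0 {
    		return current // fresh list wins over the preview snapshot
    	}
    	return previewDBs
    }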
2026-01-17 17:00:28 +01:00
9396c8e605 fix: add debug logging for database detection
All checks were successful
CI/CD / Test (push) Successful in 1m19s
CI/CD / Lint (push) Successful in 1m34s
CI/CD / Build & Release (push) Successful in 3m24s
- Always set cmd.Env to preserve PGPASSWORD from environment
- Add debug logging for connection parameters and results
- Helps diagnose cluster restore database detection issues
2026-01-17 16:54:20 +01:00
e363e1937f fix: cluster restore database detection and TUI error display
All checks were successful
CI/CD / Test (push) Successful in 1m14s
CI/CD / Lint (push) Successful in 1m25s
CI/CD / Build & Release (push) Successful in 3m16s
- Fixed psql connection for database detection (always use -h flag)
- Use CombinedOutput() to capture stderr for better diagnostics
- Added existingDBError tracking in restore preview
- Show 'Unable to detect' instead of misleading 'None' when listing fails
- Disable cleanup toggle when database detection failed
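Condensed, this is the invocation pattern the fix moves to (a sketch; the flag set and parameter names are abbreviated from the safety.go diff further down):

    package pgdetect

    import (
    	"context"
    	"fmt"
    	"os"
    	"os/exec"
    	"strings"
    )

    // runPSQL always passes an explicit -h (password auth needs a host connection,
    // not the peer-auth Unix socket) and captures stderr together with stdout so
    // connection failures surface in the returned error, not an opaque exit status.
    func runPSQL(ctx context.Context, host, user, password, query string) (string, error) {
    	if host == "" {
    		host = "localhost"
    	}
    	args := []string{"-h", host, "-U", user, "-d", "postgres", "-tAc", query}
    	cmd := exec.CommandContext(ctx, "psql", args...)
    	env := os.Environ() // keep an externally exported PGPASSWORD
    	if password != "" {
    		env = append(env, "PGPASSWORD="+password)
    	}
    	cmd.Env = env
    	out, err := cmd.CombinedOutput()
    	if err != nil {
    		return "", fmt.Errorf("psql failed: %w (output: %s)", err, strings.TrimSpace(string(out)))
    	}
    	return strings.TrimSpace(string(out)), nil
    }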
2026-01-17 16:44:44 +01:00
df1ab2f55b feat: TUI improvements and consistency fixes
All checks were successful
CI/CD / Test (push) Successful in 1m14s
CI/CD / Lint (push) Successful in 1m23s
CI/CD / Build & Release (push) Successful in 3m10s
- Add product branding header to main menu (version + tagline)
- Fix backup success/error report formatting consistency
- Remove extra newline before error box in backup_exec
- Align backup and restore completion screens
2026-01-17 16:26:00 +01:00
0e050b2def fix: cluster backup TUI success report formatting consistency
All checks were successful
CI/CD / Test (push) Successful in 1m15s
CI/CD / Lint (push) Successful in 1m24s
CI/CD / Build & Release (push) Successful in 3m10s
- Aligned box width and indentation with restore success screen
- Removed inconsistent 2-space prefix from success/error boxes
- Standardized content indentation to 4 spaces
- Moved timing section outside else block (always shown)
- Updated footer style to match restore screen
2026-01-17 16:15:16 +01:00
62d58c77af feat(restore): add --parallel-dbs=-1 auto-detection based on CPU/RAM
All checks were successful
CI/CD / Test (push) Successful in 1m16s
CI/CD / Lint (push) Successful in 1m25s
CI/CD / Build & Release (push) Successful in 3m14s
- Add CalculateOptimalParallel() function to preflight.go
- Calculates optimal workers: min(RAM/3GB, CPU cores), capped at 16
- Reduces parallelism by 50% if memory pressure >80%
- Add -1 flag value for auto-detection mode
- Preflight summary now shows CPU cores and recommended parallel
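A sketch of the sizing heuristic (parameter names are illustrative; the real implementation is CalculateOptimalParallel in the preflight.go diff below):

    package sizing

    // optimalParallel budgets roughly 3 GB of available RAM per pg_restore worker,
    // never exceeds the CPU core count, caps at 16, and halves the result when
    // memory pressure is already above 80%.
    func optimalParallel(cpuCores int, memAvailableGB, memUsedPercent float64) int {
    	n := int(memAvailableGB / 3.0)
    	if cpuCores < n {
    		n = cpuCores
    	}
    	if n < 1 {
    		n = 1
    	}
    	if n > 16 {
    		n = 16
    	}
    	if memUsedPercent > 80 && n > 1 {
    		n /= 2
    	}
    	return n
    }

    // Example: an 8-core VM with 12 GB available and 85% memory used gives
    // min(12/3, 8) = 4 workers, halved under pressure to 2.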
2026-01-17 13:41:28 +01:00
c5be9bcd2b fix(grafana): update dashboard queries and thresholds
All checks were successful
CI/CD / Test (push) Successful in 1m15s
CI/CD / Lint (push) Successful in 1m26s
CI/CD / Build & Release (push) Successful in 3m13s
- Fix Last Backup Status panel to use bool modifier for proper 1/0 values
- Change RPO threshold from 24h to 7 days (604800s) for status check
- Clean up table transformations to exclude duplicate fields
- Update variable refresh to trigger on time range change
2026-01-17 13:24:54 +01:00
b120f1507e style: format struct field alignment
All checks were successful
CI/CD / Test (push) Successful in 1m18s
CI/CD / Lint (push) Successful in 1m26s
CI/CD / Build & Release (push) Has been skipped
2026-01-17 11:44:05 +01:00
dd1db844ce fix: improve lock capacity calculation for smaller VMs
All checks were successful
CI/CD / Test (push) Successful in 1m16s
CI/CD / Lint (push) Successful in 1m25s
CI/CD / Build & Release (push) Successful in 3m13s
- Fix boostLockCapacity: max_locks_per_transaction requires RESTART, not reload
- Calculate total lock capacity: max_locks × (max_connections + max_prepared_txns)
- Add TotalLockCapacity to preflight checks with warning if < 200,000
- Update error hints to explain capacity formula and recommend 4096+ for small VMs
- Show max_connections and total capacity in preflight summary

Fixes OOM 'out of shared memory' errors on VMs with reduced resources
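For scale (a hypothetical but typical small VM on PostgreSQL's stock defaults of max_locks_per_transaction = 64, max_connections = 100, max_prepared_transactions = 0): total capacity is 64 × (100 + 0) = 6,400 lock slots, well under the 200,000 floor this warning uses; raising max_locks_per_transaction to 4096 lifts it to 4096 × 100 = 409,600.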
2026-01-17 07:48:17 +01:00
11 changed files with 403 additions and 140 deletions

View File

@@ -4,8 +4,8 @@ This directory contains pre-compiled binaries for the DB Backup Tool across mult
## Build Information
- **Version**: 3.42.50
- **Build Time**: 2026-01-16_18:37:32_UTC
- **Git Commit**: 9200024
- **Build Time**: 2026-01-17_16:00:43_UTC
- **Git Commit**: 29e089f
## Recent Updates (v1.1.0)
- ✅ Fixed TUI progress display with line-by-line output

View File

@@ -290,7 +290,7 @@ func init() {
restoreClusterCmd.Flags().BoolVar(&restoreForce, "force", false, "Skip safety checks and confirmations")
restoreClusterCmd.Flags().BoolVar(&restoreCleanCluster, "clean-cluster", false, "Drop all existing user databases before restore (disaster recovery)")
restoreClusterCmd.Flags().IntVar(&restoreJobs, "jobs", 0, "Number of parallel decompression jobs (0 = auto)")
restoreClusterCmd.Flags().IntVar(&restoreParallelDBs, "parallel-dbs", 0, "Number of databases to restore in parallel (0 = use config default, 1 = sequential)")
restoreClusterCmd.Flags().IntVar(&restoreParallelDBs, "parallel-dbs", 0, "Number of databases to restore in parallel (0 = use config default, 1 = sequential, -1 = auto-detect based on CPU/RAM)")
restoreClusterCmd.Flags().StringVar(&restoreWorkdir, "workdir", "", "Working directory for extraction (use when system disk is small, e.g. /mnt/storage/restore_tmp)")
restoreClusterCmd.Flags().BoolVar(&restoreVerbose, "verbose", false, "Show detailed restore progress")
restoreClusterCmd.Flags().BoolVar(&restoreNoProgress, "no-progress", false, "Disable progress indicators")
@@ -786,7 +786,12 @@ func runRestoreCluster(cmd *cobra.Command, args []string) error {
}
// Override cluster parallelism if --parallel-dbs is specified
if restoreParallelDBs > 0 {
if restoreParallelDBs == -1 {
// Auto-detect optimal parallelism based on system resources
autoParallel := restore.CalculateOptimalParallel()
cfg.ClusterParallelism = autoParallel
log.Info("Auto-detected optimal parallelism for database restores", "parallel_dbs", autoParallel, "mode", "auto")
} else if restoreParallelDBs > 0 {
cfg.ClusterParallelism = restoreParallelDBs
log.Info("Using custom parallelism for database restores", "parallel_dbs", restoreParallelDBs)
}

View File

@@ -94,7 +94,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "dbbackup_rpo_seconds{instance=~\"$instance\"} < 86400",
"expr": "dbbackup_rpo_seconds{instance=~\"$instance\"} < bool 604800",
"legendFormat": "{{database}}",
"range": true,
"refId": "A"
@@ -711,19 +711,6 @@
},
"pluginVersion": "10.2.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "dbbackup_rpo_seconds{instance=~\"$instance\"} < 86400",
"format": "table",
"instant": true,
"legendFormat": "__auto",
"range": false,
"refId": "Status"
},
{
"datasource": {
"type": "prometheus",
@@ -769,26 +756,30 @@
"Time": true,
"Time 1": true,
"Time 2": true,
"Time 3": true,
"__name__": true,
"__name__ 1": true,
"__name__ 2": true,
"__name__ 3": true,
"instance 1": true,
"instance 2": true,
"instance 3": true,
"job": true,
"job 1": true,
"job 2": true,
"job 3": true
"engine 1": true,
"engine 2": true
},
"indexByName": {
"Database": 0,
"Instance": 1,
"Engine": 2,
"RPO": 3,
"Size": 4
},
"indexByName": {},
"renameByName": {
"Value #RPO": "RPO",
"Value #Size": "Size",
"Value #Status": "Status",
"database": "Database",
"instance": "Instance"
"instance": "Instance",
"engine": "Engine"
}
}
}
@@ -1275,7 +1266,7 @@
"query": "label_values(dbbackup_rpo_seconds, instance)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,

View File

@@ -68,8 +68,8 @@ func ClassifyError(errorMsg string) *ErrorClassification {
Type: "critical",
Category: "locks",
Message: errorMsg,
Hint: "Lock table exhausted - typically caused by large objects (BLOBs) during restore",
Action: "Option 1: Increase max_locks_per_transaction to 1024+ in postgresql.conf (requires restart). Option 2: Update dbbackup and retry - phased restore now auto-enabled for BLOB databases",
Hint: "Lock table exhausted. Total capacity = max_locks_per_transaction × (max_connections + max_prepared_transactions). If you reduced VM size or max_connections, you need higher max_locks_per_transaction to compensate.",
Action: "Fix: ALTER SYSTEM SET max_locks_per_transaction = 4096; then RESTART PostgreSQL. For smaller VMs with fewer connections, you need higher max_locks_per_transaction values.",
Severity: 2,
}
case "permission_denied":
@@ -142,8 +142,8 @@ func ClassifyError(errorMsg string) *ErrorClassification {
Type: "critical",
Category: "locks",
Message: errorMsg,
Hint: "Lock table exhausted - typically caused by large objects (BLOBs) during restore",
Action: "Option 1: Increase max_locks_per_transaction to 1024+ in postgresql.conf (requires restart). Option 2: Update dbbackup and retry - phased restore now auto-enabled for BLOB databases",
Hint: "Lock table exhausted. Total capacity = max_locks_per_transaction × (max_connections + max_prepared_transactions). If you reduced VM size or max_connections, you need higher max_locks_per_transaction to compensate.",
Action: "Fix: ALTER SYSTEM SET max_locks_per_transaction = 4096; then RESTART PostgreSQL. For smaller VMs with fewer connections, you need higher max_locks_per_transaction values.",
Severity: 2,
}
}

View File

@@ -2125,9 +2125,10 @@ func (e *Engine) quickValidateSQLDump(archivePath string, compressed bool) error
return nil
}
// boostLockCapacity temporarily increases max_locks_per_transaction to prevent OOM
// during large restores with many BLOBs. Returns the original value for later reset.
// Uses ALTER SYSTEM + pg_reload_conf() so no restart is needed.
// boostLockCapacity checks and reports on max_locks_per_transaction capacity.
// IMPORTANT: max_locks_per_transaction requires a PostgreSQL RESTART to change!
// This function now calculates total lock capacity based on max_connections and
// warns the user if capacity is insufficient for the restore.
func (e *Engine) boostLockCapacity(ctx context.Context) (int, error) {
// Connect to PostgreSQL to run system commands
connStr := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=postgres sslmode=disable",
@@ -2145,7 +2146,7 @@ func (e *Engine) boostLockCapacity(ctx context.Context) (int, error) {
}
defer db.Close()
// Get current value
// Get current max_locks_per_transaction
var currentValue int
err = db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&currentValue)
if err != nil {
@@ -2158,22 +2159,56 @@ func (e *Engine) boostLockCapacity(ctx context.Context) (int, error) {
fmt.Sscanf(currentValueStr, "%d", &currentValue)
}
// Skip if already high enough
if currentValue >= 2048 {
e.log.Info("max_locks_per_transaction already sufficient", "value", currentValue)
return currentValue, nil
// Get max_connections to calculate total lock capacity
var maxConns int
if err := db.QueryRowContext(ctx, "SHOW max_connections").Scan(&maxConns); err != nil {
maxConns = 100 // default
}
// Boost to 2048 (enough for most BLOB-heavy databases)
_, err = db.ExecContext(ctx, "ALTER SYSTEM SET max_locks_per_transaction = 2048")
if err != nil {
return currentValue, fmt.Errorf("failed to set max_locks_per_transaction: %w", err)
// Get max_prepared_transactions
var maxPreparedTxns int
if err := db.QueryRowContext(ctx, "SHOW max_prepared_transactions").Scan(&maxPreparedTxns); err != nil {
maxPreparedTxns = 0
}
// Reload config without restart
_, err = db.ExecContext(ctx, "SELECT pg_reload_conf()")
if err != nil {
return currentValue, fmt.Errorf("failed to reload config: %w", err)
// Calculate total lock table capacity:
// Total locks = max_locks_per_transaction × (max_connections + max_prepared_transactions)
totalLockCapacity := currentValue * (maxConns + maxPreparedTxns)
e.log.Info("PostgreSQL lock table capacity",
"max_locks_per_transaction", currentValue,
"max_connections", maxConns,
"max_prepared_transactions", maxPreparedTxns,
"total_lock_capacity", totalLockCapacity)
// Minimum recommended total capacity for BLOB-heavy restores: 200,000 locks
minRecommendedCapacity := 200000
if totalLockCapacity < minRecommendedCapacity {
recommendedMaxLocks := minRecommendedCapacity / (maxConns + maxPreparedTxns)
if recommendedMaxLocks < 4096 {
recommendedMaxLocks = 4096
}
e.log.Warn("Lock table capacity may be insufficient for BLOB-heavy restores",
"current_total_capacity", totalLockCapacity,
"recommended_capacity", minRecommendedCapacity,
"current_max_locks", currentValue,
"recommended_max_locks", recommendedMaxLocks,
"note", "max_locks_per_transaction requires PostgreSQL RESTART to change")
// Write suggested fix to ALTER SYSTEM but warn about restart
_, err = db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET max_locks_per_transaction = %d", recommendedMaxLocks))
if err != nil {
e.log.Warn("Could not set recommended max_locks_per_transaction (needs superuser)", "error", err)
} else {
e.log.Warn("Wrote recommended max_locks_per_transaction to postgresql.auto.conf",
"value", recommendedMaxLocks,
"action", "RESTART PostgreSQL to apply: sudo systemctl restart postgresql")
}
} else {
e.log.Info("Lock table capacity is sufficient",
"total_capacity", totalLockCapacity,
"max_locks_per_transaction", currentValue)
}
return currentValue, nil

View File

@@ -16,6 +16,57 @@ import (
"github.com/shirou/gopsutil/v3/mem"
)
// CalculateOptimalParallel returns the recommended number of parallel workers
// based on available system resources (CPU cores and RAM).
// This is a standalone function that can be called from anywhere.
// Returns 0 if resources cannot be detected.
func CalculateOptimalParallel() int {
cpuCores := runtime.NumCPU()
vmem, err := mem.VirtualMemory()
if err != nil {
// Fallback: use half of CPU cores if memory detection fails
if cpuCores > 1 {
return cpuCores / 2
}
return 1
}
memAvailableGB := float64(vmem.Available) / (1024 * 1024 * 1024)
// Each pg_restore worker needs approximately 2-4GB of RAM
// Use conservative 3GB per worker to avoid OOM
const memPerWorkerGB = 3.0
// Calculate limits
maxByMem := int(memAvailableGB / memPerWorkerGB)
maxByCPU := cpuCores
// Use the minimum of memory and CPU limits
recommended := maxByMem
if maxByCPU < recommended {
recommended = maxByCPU
}
// Apply sensible bounds
if recommended < 1 {
recommended = 1
}
if recommended > 16 {
recommended = 16 // Cap at 16 to avoid diminishing returns
}
// If memory pressure is high (>80%), reduce parallelism
if vmem.UsedPercent > 80 && recommended > 1 {
recommended = recommended / 2
if recommended < 1 {
recommended = 1
}
}
return recommended
}
// PreflightResult contains all preflight check results
type PreflightResult struct {
// Linux system checks
@@ -35,25 +86,29 @@ type PreflightResult struct {
// LinuxChecks contains Linux kernel/system checks
type LinuxChecks struct {
ShmMax int64 // /proc/sys/kernel/shmmax
ShmAll int64 // /proc/sys/kernel/shmall
MemTotal uint64 // Total RAM in bytes
MemAvailable uint64 // Available RAM in bytes
MemUsedPercent float64 // Memory usage percentage
ShmMaxOK bool // Is shmmax sufficient?
ShmAllOK bool // Is shmall sufficient?
MemAvailableOK bool // Is available RAM sufficient?
IsLinux bool // Are we running on Linux?
ShmMax int64 // /proc/sys/kernel/shmmax
ShmAll int64 // /proc/sys/kernel/shmall
MemTotal uint64 // Total RAM in bytes
MemAvailable uint64 // Available RAM in bytes
MemUsedPercent float64 // Memory usage percentage
CPUCores int // Number of CPU cores
RecommendedParallel int // Auto-calculated optimal parallel count
ShmMaxOK bool // Is shmmax sufficient?
ShmAllOK bool // Is shmall sufficient?
MemAvailableOK bool // Is available RAM sufficient?
IsLinux bool // Are we running on Linux?
}
// PostgreSQLChecks contains PostgreSQL configuration checks
type PostgreSQLChecks struct {
MaxLocksPerTransaction int // Current setting
MaintenanceWorkMem string // Current setting
SharedBuffers string // Current setting (info only)
MaxConnections int // Current setting
Version string // PostgreSQL version
IsSuperuser bool // Can we modify settings?
MaxLocksPerTransaction int // Current setting
MaxPreparedTransactions int // Current setting (affects lock capacity)
TotalLockCapacity int // Calculated: max_locks × (max_connections + max_prepared)
MaintenanceWorkMem string // Current setting
SharedBuffers string // Current setting (info only)
MaxConnections int // Current setting
Version string // PostgreSQL version
IsSuperuser bool // Can we modify settings?
}
// ArchiveChecks contains analysis of the backup archive
@@ -98,6 +153,7 @@ func (e *Engine) RunPreflightChecks(ctx context.Context, dumpsDir string, entrie
// checkSystemResources uses gopsutil for cross-platform system checks
func (e *Engine) checkSystemResources(result *PreflightResult) {
result.Linux.IsLinux = runtime.GOOS == "linux"
result.Linux.CPUCores = runtime.NumCPU()
// Get memory info (works on Linux, macOS, Windows, BSD)
if vmem, err := mem.VirtualMemory(); err == nil {
@@ -116,6 +172,9 @@ func (e *Engine) checkSystemResources(result *PreflightResult) {
e.log.Warn("Could not detect system memory", "error", err)
}
// Calculate recommended parallel based on resources
result.Linux.RecommendedParallel = e.calculateRecommendedParallel(result)
// Linux-specific kernel checks (shmmax, shmall)
if result.Linux.IsLinux {
e.checkLinuxKernel(result)
@@ -201,6 +260,29 @@ func (e *Engine) checkPostgreSQL(ctx context.Context, result *PreflightResult) {
result.PostgreSQL.IsSuperuser = isSuperuser
}
// Check max_prepared_transactions for lock capacity calculation
var maxPreparedTxns string
if err := db.QueryRowContext(ctx, "SHOW max_prepared_transactions").Scan(&maxPreparedTxns); err == nil {
result.PostgreSQL.MaxPreparedTransactions, _ = strconv.Atoi(maxPreparedTxns)
}
// CRITICAL: Calculate TOTAL lock table capacity
// Formula: max_locks_per_transaction × (max_connections + max_prepared_transactions)
// This is THE key capacity metric for BLOB-heavy restores
maxConns := result.PostgreSQL.MaxConnections
if maxConns == 0 {
maxConns = 100 // default
}
maxPrepared := result.PostgreSQL.MaxPreparedTransactions
totalLockCapacity := result.PostgreSQL.MaxLocksPerTransaction * (maxConns + maxPrepared)
result.PostgreSQL.TotalLockCapacity = totalLockCapacity
e.log.Info("PostgreSQL lock table capacity",
"max_locks_per_transaction", result.PostgreSQL.MaxLocksPerTransaction,
"max_connections", maxConns,
"max_prepared_transactions", maxPrepared,
"total_lock_capacity", totalLockCapacity)
// CRITICAL: max_locks_per_transaction requires PostgreSQL RESTART to change!
// Warn users loudly about this - it's the #1 cause of "out of shared memory" errors
if result.PostgreSQL.MaxLocksPerTransaction < 256 {
@@ -217,6 +299,33 @@ func (e *Engine) checkPostgreSQL(ctx context.Context, result *PreflightResult) {
result.PostgreSQL.MaxLocksPerTransaction))
}
// NEW: Check total lock capacity is sufficient for typical BLOB operations
// Minimum recommended: 200,000 for moderate BLOB databases
minRecommendedCapacity := 200000
if totalLockCapacity < minRecommendedCapacity {
recommendedMaxLocks := minRecommendedCapacity / (maxConns + maxPrepared)
if recommendedMaxLocks < 4096 {
recommendedMaxLocks = 4096
}
e.log.Warn("Total lock table capacity is LOW for BLOB-heavy restores",
"current_capacity", totalLockCapacity,
"recommended", minRecommendedCapacity,
"current_max_locks", result.PostgreSQL.MaxLocksPerTransaction,
"current_max_connections", maxConns,
"recommended_max_locks", recommendedMaxLocks,
"note", "VMs with fewer connections need higher max_locks_per_transaction")
result.Warnings = append(result.Warnings,
fmt.Sprintf("Total lock capacity=%d is low (recommend %d+). "+
"Capacity = max_locks_per_transaction(%d) × max_connections(%d). "+
"If you reduced VM size/connections, increase max_locks_per_transaction to %d. "+
"Fix: ALTER SYSTEM SET max_locks_per_transaction = %d; then RESTART PostgreSQL.",
totalLockCapacity, minRecommendedCapacity,
result.PostgreSQL.MaxLocksPerTransaction, maxConns,
recommendedMaxLocks, recommendedMaxLocks))
}
// Parse shared_buffers and warn if very low
sharedBuffersMB := parseMemoryToMB(result.PostgreSQL.SharedBuffers)
if sharedBuffersMB > 0 && sharedBuffersMB < 256 {
@@ -382,6 +491,56 @@ func (e *Engine) calculateRecommendations(result *PreflightResult) {
"recommended_locks", lockBoost)
}
// calculateRecommendedParallel determines optimal parallelism based on system resources
// Returns the recommended number of parallel workers for pg_restore
func (e *Engine) calculateRecommendedParallel(result *PreflightResult) int {
cpuCores := result.Linux.CPUCores
if cpuCores == 0 {
cpuCores = runtime.NumCPU()
}
memAvailableGB := float64(result.Linux.MemAvailable) / (1024 * 1024 * 1024)
// Each pg_restore worker needs approximately 2-4GB of RAM
// Use conservative 3GB per worker to avoid OOM
const memPerWorkerGB = 3.0
// Calculate limits
maxByMem := int(memAvailableGB / memPerWorkerGB)
maxByCPU := cpuCores
// Use the minimum of memory and CPU limits
recommended := maxByMem
if maxByCPU < recommended {
recommended = maxByCPU
}
// Apply sensible bounds
if recommended < 1 {
recommended = 1
}
if recommended > 16 {
recommended = 16 // Cap at 16 to avoid diminishing returns
}
// If memory pressure is high (>80%), reduce parallelism
if result.Linux.MemUsedPercent > 80 && recommended > 1 {
recommended = recommended / 2
if recommended < 1 {
recommended = 1
}
}
e.log.Info("Calculated recommended parallel",
"cpu_cores", cpuCores,
"mem_available_gb", fmt.Sprintf("%.1f", memAvailableGB),
"max_by_mem", maxByMem,
"max_by_cpu", maxByCPU,
"recommended", recommended)
return recommended
}
// printPreflightSummary prints a nice summary of all checks
func (e *Engine) printPreflightSummary(result *PreflightResult) {
fmt.Println()
@@ -394,6 +553,8 @@ func (e *Engine) printPreflightSummary(result *PreflightResult) {
printCheck("Total RAM", humanize.Bytes(result.Linux.MemTotal), true)
printCheck("Available RAM", humanize.Bytes(result.Linux.MemAvailable), result.Linux.MemAvailableOK || result.Linux.MemAvailable == 0)
printCheck("Memory Usage", fmt.Sprintf("%.1f%%", result.Linux.MemUsedPercent), result.Linux.MemUsedPercent < 85)
printCheck("CPU Cores", fmt.Sprintf("%d", result.Linux.CPUCores), true)
printCheck("Recommended Parallel", fmt.Sprintf("%d (auto-calculated)", result.Linux.RecommendedParallel), true)
// Linux-specific kernel checks
if result.Linux.IsLinux && result.Linux.ShmMax > 0 {
@@ -409,6 +570,13 @@ func (e *Engine) printPreflightSummary(result *PreflightResult) {
humanize.Comma(int64(result.PostgreSQL.MaxLocksPerTransaction)),
humanize.Comma(int64(result.Archive.RecommendedLockBoost))),
true)
printCheck("max_connections", humanize.Comma(int64(result.PostgreSQL.MaxConnections)), true)
// Show total lock capacity with warning if low
totalCapacityOK := result.PostgreSQL.TotalLockCapacity >= 200000
printCheck("Total Lock Capacity",
fmt.Sprintf("%s (max_locks × max_conns)",
humanize.Comma(int64(result.PostgreSQL.TotalLockCapacity))),
totalCapacityOK)
printCheck("maintenance_work_mem", fmt.Sprintf("%s → 2GB (auto-boost)",
result.PostgreSQL.MaintenanceWorkMem), true)
printInfo("shared_buffers", result.PostgreSQL.SharedBuffers)

View File

@@ -334,10 +334,12 @@ func (s *Safety) checkPostgresDatabaseExists(ctx context.Context, dbName string)
"-tAc", fmt.Sprintf("SELECT 1 FROM pg_database WHERE datname='%s'", dbName),
}
// Only add -h flag if host is not localhost (to use Unix socket for peer auth)
if s.cfg.Host != "localhost" && s.cfg.Host != "127.0.0.1" && s.cfg.Host != "" {
args = append([]string{"-h", s.cfg.Host}, args...)
// Always add -h flag for explicit host connection (required for password auth)
host := s.cfg.Host
if host == "" {
host = "localhost"
}
args = append([]string{"-h", host}, args...)
cmd := exec.CommandContext(ctx, "psql", args...)
@@ -346,9 +348,9 @@ func (s *Safety) checkPostgresDatabaseExists(ctx context.Context, dbName string)
cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", s.cfg.Password))
}
output, err := cmd.Output()
output, err := cmd.CombinedOutput()
if err != nil {
return false, fmt.Errorf("failed to check database existence: %w", err)
return false, fmt.Errorf("failed to check database existence: %w (output: %s)", err, strings.TrimSpace(string(output)))
}
return strings.TrimSpace(string(output)) == "1", nil
@@ -405,21 +407,29 @@ func (s *Safety) listPostgresUserDatabases(ctx context.Context) ([]string, error
"-c", query,
}
// Only add -h flag if host is not localhost (to use Unix socket for peer auth)
if s.cfg.Host != "localhost" && s.cfg.Host != "127.0.0.1" && s.cfg.Host != "" {
args = append([]string{"-h", s.cfg.Host}, args...)
// Always add -h flag for explicit host connection (required for password auth)
// Empty or unset host defaults to localhost
host := s.cfg.Host
if host == "" {
host = "localhost"
}
args = append([]string{"-h", host}, args...)
cmd := exec.CommandContext(ctx, "psql", args...)
// Set password if provided
// Set password - check config first, then environment
env := os.Environ()
if s.cfg.Password != "" {
cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", s.cfg.Password))
env = append(env, fmt.Sprintf("PGPASSWORD=%s", s.cfg.Password))
}
cmd.Env = env
output, err := cmd.Output()
s.log.Debug("Listing PostgreSQL databases", "host", host, "port", s.cfg.Port, "user", s.cfg.User)
output, err := cmd.CombinedOutput()
if err != nil {
return nil, fmt.Errorf("failed to list databases: %w", err)
// Include psql output in error for debugging
return nil, fmt.Errorf("failed to list databases: %w (output: %s)", err, strings.TrimSpace(string(output)))
}
// Parse output
@@ -432,6 +442,8 @@ func (s *Safety) listPostgresUserDatabases(ctx context.Context) ([]string, error
}
}
s.log.Debug("Found user databases", "count", len(databases), "databases", databases, "raw_output", string(output))
return databases, nil
}

View File

@@ -454,65 +454,61 @@ func (m BackupExecutionModel) View() string {
} else {
// Show completion summary with detailed stats
if m.err != nil {
s.WriteString(errorStyle.Render("╔══════════════════════════════════════════════════════════════╗"))
s.WriteString("\n")
s.WriteString(errorStyle.Render(" ╔══════════════════════════════════════════════════════════╗"))
s.WriteString(errorStyle.Render(" [FAIL] BACKUP FAILED ║"))
s.WriteString("\n")
s.WriteString(errorStyle.Render(" ║ [FAIL] BACKUP FAILED ║"))
s.WriteString("\n")
s.WriteString(errorStyle.Render(" ╚══════════════════════════════════════════════════════════╝"))
s.WriteString(errorStyle.Render("╚══════════════════════════════════════════════════════════════╝"))
s.WriteString("\n\n")
s.WriteString(errorStyle.Render(fmt.Sprintf(" Error: %v", m.err)))
s.WriteString(errorStyle.Render(fmt.Sprintf(" Error: %v", m.err)))
s.WriteString("\n")
} else {
s.WriteString(successStyle.Render("╔══════════════════════════════════════════════════════════════╗"))
s.WriteString("\n")
s.WriteString(successStyle.Render(" ╔══════════════════════════════════════════════════════════╗"))
s.WriteString(successStyle.Render(" [OK] BACKUP COMPLETED SUCCESSFULLY ║"))
s.WriteString("\n")
s.WriteString(successStyle.Render(" ║ [OK] BACKUP COMPLETED SUCCESSFULLY ║"))
s.WriteString("\n")
s.WriteString(successStyle.Render(" ╚══════════════════════════════════════════════════════════╝"))
s.WriteString(successStyle.Render("╚══════════════════════════════════════════════════════════════╝"))
s.WriteString("\n\n")
// Summary section
s.WriteString(infoStyle.Render(" ─── Summary ─────────────────────────────────────────────"))
s.WriteString(infoStyle.Render(" ─── Summary ───────────────────────────────────────────────"))
s.WriteString("\n\n")
// Backup type specific info
switch m.backupType {
case "cluster":
s.WriteString(" Type: Cluster Backup\n")
s.WriteString(" Type: Cluster Backup\n")
if m.dbTotal > 0 {
s.WriteString(fmt.Sprintf(" Databases: %d backed up\n", m.dbTotal))
s.WriteString(fmt.Sprintf(" Databases: %d backed up\n", m.dbTotal))
}
case "single":
s.WriteString(" Type: Single Database Backup\n")
s.WriteString(fmt.Sprintf(" Database: %s\n", m.databaseName))
s.WriteString(" Type: Single Database Backup\n")
s.WriteString(fmt.Sprintf(" Database: %s\n", m.databaseName))
case "sample":
s.WriteString(" Type: Sample Backup\n")
s.WriteString(fmt.Sprintf(" Database: %s\n", m.databaseName))
s.WriteString(fmt.Sprintf(" Sample Ratio: %d\n", m.ratio))
s.WriteString(" Type: Sample Backup\n")
s.WriteString(fmt.Sprintf(" Database: %s\n", m.databaseName))
s.WriteString(fmt.Sprintf(" Sample Ratio: %d\n", m.ratio))
}
s.WriteString("\n")
// Timing section
s.WriteString(infoStyle.Render(" ─── Timing ──────────────────────────────────────────────"))
s.WriteString("\n\n")
elapsed := time.Since(m.startTime)
s.WriteString(fmt.Sprintf(" Total Time: %s\n", formatBackupDuration(elapsed)))
if m.backupType == "cluster" && m.dbTotal > 0 {
avgPerDB := elapsed / time.Duration(m.dbTotal)
s.WriteString(fmt.Sprintf(" Avg per DB: %s\n", formatBackupDuration(avgPerDB)))
}
s.WriteString("\n")
s.WriteString(infoStyle.Render(" ─────────────────────────────────────────────────────────"))
s.WriteString("\n")
}
// Timing section (always shown, consistent with restore)
s.WriteString(infoStyle.Render(" ─── Timing ────────────────────────────────────────────────"))
s.WriteString("\n\n")
elapsed := time.Since(m.startTime)
s.WriteString(fmt.Sprintf(" Total Time: %s\n", formatBackupDuration(elapsed)))
if m.backupType == "cluster" && m.dbTotal > 0 && m.err == nil {
avgPerDB := elapsed / time.Duration(m.dbTotal)
s.WriteString(fmt.Sprintf(" Avg per DB: %s\n", formatBackupDuration(avgPerDB)))
}
s.WriteString("\n")
s.WriteString(" [KEY] Press Enter or ESC to return to menu\n")
s.WriteString(infoStyle.Render(" ───────────────────────────────────────────────────────────"))
s.WriteString("\n\n")
s.WriteString(infoStyle.Render(" [KEYS] Press Enter to continue"))
}
return s.String()

View File

@@ -299,9 +299,13 @@ func (m *MenuModel) View() string {
var s string
// Product branding header
brandLine := fmt.Sprintf("dbbackup v%s • Enterprise Database Backup & Recovery", m.config.Version)
s += "\n" + infoStyle.Render(brandLine) + "\n"
// Header
header := titleStyle.Render("Database Backup Tool - Interactive Menu")
s += fmt.Sprintf("\n%s\n\n", header)
header := titleStyle.Render("Interactive Menu")
s += fmt.Sprintf("%s\n\n", header)
if len(m.dbTypes) > 0 {
options := make([]string, len(m.dbTypes))

View File

@@ -273,26 +273,42 @@ func executeRestoreWithTUIProgress(parentCtx context.Context, cfg *config.Config
defer dbClient.Close()
// STEP 1: Clean cluster if requested (drop all existing user databases)
if restoreType == "restore-cluster" && cleanClusterFirst && len(existingDBs) > 0 {
log.Info("Dropping existing user databases before cluster restore", "count", len(existingDBs))
// Drop databases using command-line psql (no connection required)
// This matches how cluster restore works - uses CLI tools, not database connections
droppedCount := 0
for _, dbName := range existingDBs {
// Create timeout context for each database drop (5 minutes per DB - large DBs take time)
dropCtx, dropCancel := context.WithTimeout(ctx, 5*time.Minute)
if err := dropDatabaseCLI(dropCtx, cfg, dbName); err != nil {
log.Warn("Failed to drop database", "name", dbName, "error", err)
// Continue with other databases
} else {
droppedCount++
log.Info("Dropped database", "name", dbName)
}
dropCancel() // Clean up context
if restoreType == "restore-cluster" && cleanClusterFirst {
// Re-detect databases at execution time to get current state
// The preview list may be stale or detection may have failed earlier
safety := restore.NewSafety(cfg, log)
currentDBs, err := safety.ListUserDatabases(ctx)
if err != nil {
log.Warn("Failed to list databases for cleanup, using preview list", "error", err)
currentDBs = existingDBs // Fall back to preview list
} else if len(currentDBs) > 0 {
log.Info("Re-detected user databases for cleanup", "count", len(currentDBs), "databases", currentDBs)
existingDBs = currentDBs // Update with fresh list
}
log.Info("Cluster cleanup completed", "dropped", droppedCount, "total", len(existingDBs))
if len(existingDBs) > 0 {
log.Info("Dropping existing user databases before cluster restore", "count", len(existingDBs))
// Drop databases using command-line psql (no connection required)
// This matches how cluster restore works - uses CLI tools, not database connections
droppedCount := 0
for _, dbName := range existingDBs {
// Create timeout context for each database drop (5 minutes per DB - large DBs take time)
dropCtx, dropCancel := context.WithTimeout(ctx, 5*time.Minute)
if err := dropDatabaseCLI(dropCtx, cfg, dbName); err != nil {
log.Warn("Failed to drop database", "name", dbName, "error", err)
// Continue with other databases
} else {
droppedCount++
log.Info("Dropped database", "name", dbName)
}
dropCancel() // Clean up context
}
log.Info("Cluster cleanup completed", "dropped", droppedCount, "total", len(existingDBs))
} else {
log.Info("No user databases to clean up")
}
}
// STEP 2: Create restore engine with silent progress (no stdout interference with TUI)

View File

@@ -55,6 +55,7 @@ type RestorePreviewModel struct {
cleanClusterFirst bool // For cluster restore: drop all user databases first
existingDBCount int // Number of existing user databases
existingDBs []string // List of existing user databases
existingDBError string // Error message if database listing failed
safetyChecks []SafetyCheck
checking bool
canProceed bool
@@ -102,6 +103,7 @@ type safetyCheckCompleteMsg struct {
canProceed bool
existingDBCount int
existingDBs []string
existingDBError string
}
func runSafetyChecks(cfg *config.Config, log logger.Logger, archive ArchiveInfo, targetDB string) tea.Cmd {
@@ -221,10 +223,12 @@ func runSafetyChecks(cfg *config.Config, log logger.Logger, archive ArchiveInfo,
check = SafetyCheck{Name: "Existing databases", Status: "checking", Critical: false}
// Get list of existing user databases (exclude templates and system DBs)
var existingDBError string
dbList, err := safety.ListUserDatabases(ctx)
if err != nil {
check.Status = "warning"
check.Message = fmt.Sprintf("Cannot list databases: %v", err)
existingDBError = err.Error()
} else {
existingDBCount = len(dbList)
existingDBs = dbList
@@ -238,6 +242,14 @@ func runSafetyChecks(cfg *config.Config, log logger.Logger, archive ArchiveInfo,
}
}
checks = append(checks, check)
return safetyCheckCompleteMsg{
checks: checks,
canProceed: canProceed,
existingDBCount: existingDBCount,
existingDBs: existingDBs,
existingDBError: existingDBError,
}
}
return safetyCheckCompleteMsg{
@@ -257,6 +269,7 @@ func (m RestorePreviewModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.canProceed = msg.canProceed
m.existingDBCount = msg.existingDBCount
m.existingDBs = msg.existingDBs
m.existingDBError = msg.existingDBError
// Auto-forward in auto-confirm mode
if m.config.TUIAutoConfirm {
return m.parent, tea.Quit
@@ -275,10 +288,17 @@ func (m RestorePreviewModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
case "c":
if m.mode == "restore-cluster" {
// Toggle cluster cleanup
// Toggle cluster cleanup - databases will be re-detected at execution time
m.cleanClusterFirst = !m.cleanClusterFirst
if m.cleanClusterFirst {
m.message = checkWarningStyle.Render(fmt.Sprintf("[WARN] Will drop %d existing database(s) before restore", m.existingDBCount))
if m.existingDBError != "" {
// Detection failed in preview - will re-detect at execution
m.message = checkWarningStyle.Render("[WARN] Will clean existing databases before restore (detection pending)")
} else if m.existingDBCount > 0 {
m.message = checkWarningStyle.Render(fmt.Sprintf("[WARN] Will drop %d existing database(s) before restore", m.existingDBCount))
} else {
m.message = infoStyle.Render("[INFO] Cleanup enabled (no databases currently detected)")
}
} else {
m.message = fmt.Sprintf("Clean cluster first: disabled")
}
@@ -382,7 +402,12 @@ func (m RestorePreviewModel) View() string {
s.WriteString("\n")
s.WriteString(fmt.Sprintf(" Host: %s:%d\n", m.config.Host, m.config.Port))
if m.existingDBCount > 0 {
if m.existingDBError != "" {
// Show warning when database listing failed - but still allow cleanup toggle
s.WriteString(checkWarningStyle.Render(" Existing Databases: Detection failed\n"))
s.WriteString(infoStyle.Render(fmt.Sprintf(" (%s)\n", m.existingDBError)))
s.WriteString(infoStyle.Render(" (Will re-detect at restore time)\n"))
} else if m.existingDBCount > 0 {
s.WriteString(fmt.Sprintf(" Existing Databases: %d found\n", m.existingDBCount))
// Show first few database names
@@ -395,17 +420,20 @@ func (m RestorePreviewModel) View() string {
}
s.WriteString(fmt.Sprintf(" - %s\n", db))
}
cleanIcon := "[N]"
cleanStyle := infoStyle
if m.cleanClusterFirst {
cleanIcon = "[Y]"
cleanStyle = checkWarningStyle
}
s.WriteString(cleanStyle.Render(fmt.Sprintf(" Clean All First: %s %v (press 'c' to toggle)\n", cleanIcon, m.cleanClusterFirst)))
} else {
s.WriteString(" Existing Databases: None (clean slate)\n")
}
// Always show cleanup toggle for cluster restore
cleanIcon := "[N]"
cleanStyle := infoStyle
if m.cleanClusterFirst {
cleanIcon := "[Y]"
cleanStyle = checkWarningStyle
s.WriteString(cleanStyle.Render(fmt.Sprintf(" Clean All First: %s enabled (press 'c' to toggle)\n", cleanIcon)))
} else {
s.WriteString(cleanStyle.Render(fmt.Sprintf(" Clean All First: %s disabled (press 'c' to toggle)\n", cleanIcon)))
}
s.WriteString("\n")
}
@@ -453,10 +481,18 @@ func (m RestorePreviewModel) View() string {
s.WriteString(infoStyle.Render(" All existing data in target database will be dropped!"))
s.WriteString("\n\n")
}
if m.cleanClusterFirst && m.existingDBCount > 0 {
if m.cleanClusterFirst {
s.WriteString(checkWarningStyle.Render("[DANGER] WARNING: Cluster cleanup enabled"))
s.WriteString("\n")
s.WriteString(checkWarningStyle.Render(fmt.Sprintf(" %d existing database(s) will be DROPPED before restore!", m.existingDBCount)))
if m.existingDBError != "" {
s.WriteString(checkWarningStyle.Render(" Existing databases will be DROPPED before restore!"))
s.WriteString("\n")
s.WriteString(infoStyle.Render(" (Database count will be detected at restore time)"))
} else if m.existingDBCount > 0 {
s.WriteString(checkWarningStyle.Render(fmt.Sprintf(" %d existing database(s) will be DROPPED before restore!", m.existingDBCount)))
} else {
s.WriteString(infoStyle.Render(" No databases currently detected - cleanup will verify at restore time"))
}
s.WriteString("\n")
s.WriteString(infoStyle.Render(" This ensures a clean disaster recovery scenario"))
s.WriteString("\n\n")