Compare commits

7 commits:

| SHA1 |
|---|
| 62d58c77af |
| c5be9bcd2b |
| b120f1507e |
| dd1db844ce |
| 4ea3ec2cf8 |
| 9200024e50 |
| 698b8a761c |
@@ -4,8 +4,8 @@ This directory contains pre-compiled binaries for the DB Backup Tool across mult
 
 ## Build Information
 - **Version**: 3.42.50
-- **Build Time**: 2026-01-16_14:53:54_UTC
-- **Git Commit**: 5728b46
+- **Build Time**: 2026-01-17_12:25:20_UTC
+- **Git Commit**: c5be9bc
 
 ## Recent Updates (v1.1.0)
 - ✅ Fixed TUI progress display with line-by-line output
@@ -28,6 +28,7 @@ var (
     restoreClean       bool
     restoreCreate      bool
     restoreJobs        int
+    restoreParallelDBs int // Number of parallel database restores
     restoreTarget      string
     restoreVerbose     bool
     restoreNoProgress  bool
@@ -289,6 +290,7 @@ func init() {
     restoreClusterCmd.Flags().BoolVar(&restoreForce, "force", false, "Skip safety checks and confirmations")
    restoreClusterCmd.Flags().BoolVar(&restoreCleanCluster, "clean-cluster", false, "Drop all existing user databases before restore (disaster recovery)")
     restoreClusterCmd.Flags().IntVar(&restoreJobs, "jobs", 0, "Number of parallel decompression jobs (0 = auto)")
+    restoreClusterCmd.Flags().IntVar(&restoreParallelDBs, "parallel-dbs", 0, "Number of databases to restore in parallel (0 = use config default, 1 = sequential, -1 = auto-detect based on CPU/RAM)")
     restoreClusterCmd.Flags().StringVar(&restoreWorkdir, "workdir", "", "Working directory for extraction (use when system disk is small, e.g. /mnt/storage/restore_tmp)")
     restoreClusterCmd.Flags().BoolVar(&restoreVerbose, "verbose", false, "Show detailed restore progress")
     restoreClusterCmd.Flags().BoolVar(&restoreNoProgress, "no-progress", false, "Disable progress indicators")
@@ -783,6 +785,17 @@ func runRestoreCluster(cmd *cobra.Command, args []string) error {
         }
     }
 
+    // Override cluster parallelism if --parallel-dbs is specified
+    if restoreParallelDBs == -1 {
+        // Auto-detect optimal parallelism based on system resources
+        autoParallel := restore.CalculateOptimalParallel()
+        cfg.ClusterParallelism = autoParallel
+        log.Info("Auto-detected optimal parallelism for database restores", "parallel_dbs", autoParallel, "mode", "auto")
+    } else if restoreParallelDBs > 0 {
+        cfg.ClusterParallelism = restoreParallelDBs
+        log.Info("Using custom parallelism for database restores", "parallel_dbs", restoreParallelDBs)
+    }
+
     // Create restore engine
     engine := restore.New(cfg, log, db)
 
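Note: taken together, these two hunks wire the new --parallel-dbs flag through to the restore engine: -1 asks restore.CalculateOptimalParallel() to pick a worker count from CPU and RAM, any positive value overrides cfg.ClusterParallelism directly, and 0 leaves the config default untouched. A hedged usage sketch (assuming the binary is invoked as `dbbackup restore cluster`, which this compare does not show): `dbbackup restore cluster <archive> --parallel-dbs=-1 --workdir /mnt/storage/restore_tmp`.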
@@ -94,7 +94,7 @@
           "uid": "${DS_PROMETHEUS}"
         },
         "editorMode": "code",
-        "expr": "dbbackup_rpo_seconds{instance=~\"$instance\"} < 86400",
+        "expr": "dbbackup_rpo_seconds{instance=~\"$instance\"} < bool 604800",
         "legendFormat": "{{database}}",
         "range": true,
         "refId": "A"
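Note on the panel query change: the RPO threshold moves from 86400 seconds (24 hours) to 604800 seconds (7 days), and the comparison now uses PromQL's `bool` modifier, so the expression returns a 0/1 value for every matching database series instead of filtering out the series that miss the threshold.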
@@ -711,19 +711,6 @@
       },
       "pluginVersion": "10.2.0",
       "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
-          },
-          "editorMode": "code",
-          "expr": "dbbackup_rpo_seconds{instance=~\"$instance\"} < 86400",
-          "format": "table",
-          "instant": true,
-          "legendFormat": "__auto",
-          "range": false,
-          "refId": "Status"
-        },
         {
           "datasource": {
             "type": "prometheus",
@@ -769,26 +756,30 @@
               "Time": true,
               "Time 1": true,
               "Time 2": true,
-              "Time 3": true,
               "__name__": true,
               "__name__ 1": true,
               "__name__ 2": true,
-              "__name__ 3": true,
               "instance 1": true,
               "instance 2": true,
-              "instance 3": true,
               "job": true,
               "job 1": true,
               "job 2": true,
-              "job 3": true
+              "engine 1": true,
+              "engine 2": true
+            },
+            "indexByName": {
+              "Database": 0,
+              "Instance": 1,
+              "Engine": 2,
+              "RPO": 3,
+              "Size": 4
             },
-            "indexByName": {},
             "renameByName": {
               "Value #RPO": "RPO",
               "Value #Size": "Size",
-              "Value #Status": "Status",
               "database": "Database",
-              "instance": "Instance"
+              "instance": "Instance",
+              "engine": "Engine"
             }
           }
         }
@@ -1275,7 +1266,7 @@
         "query": "label_values(dbbackup_rpo_seconds, instance)",
         "refId": "StandardVariableQuery"
       },
-      "refresh": 1,
+      "refresh": 2,
       "regex": "",
       "skipUrlSync": false,
       "sort": 1,
@@ -68,8 +68,8 @@ func ClassifyError(errorMsg string) *ErrorClassification {
             Type:     "critical",
             Category: "locks",
             Message:  errorMsg,
-            Hint:     "Lock table exhausted - typically caused by large objects (BLOBs) during restore",
-            Action:   "Option 1: Increase max_locks_per_transaction to 1024+ in postgresql.conf (requires restart). Option 2: Update dbbackup and retry - phased restore now auto-enabled for BLOB databases",
+            Hint:     "Lock table exhausted. Total capacity = max_locks_per_transaction × (max_connections + max_prepared_transactions). If you reduced VM size or max_connections, you need higher max_locks_per_transaction to compensate.",
+            Action:   "Fix: ALTER SYSTEM SET max_locks_per_transaction = 4096; then RESTART PostgreSQL. For smaller VMs with fewer connections, you need higher max_locks_per_transaction values.",
             Severity: 2,
         }
     case "permission_denied":
@@ -142,8 +142,8 @@ func ClassifyError(errorMsg string) *ErrorClassification {
             Type:     "critical",
             Category: "locks",
             Message:  errorMsg,
-            Hint:     "Lock table exhausted - typically caused by large objects (BLOBs) during restore",
-            Action:   "Option 1: Increase max_locks_per_transaction to 1024+ in postgresql.conf (requires restart). Option 2: Update dbbackup and retry - phased restore now auto-enabled for BLOB databases",
+            Hint:     "Lock table exhausted. Total capacity = max_locks_per_transaction × (max_connections + max_prepared_transactions). If you reduced VM size or max_connections, you need higher max_locks_per_transaction to compensate.",
+            Action:   "Fix: ALTER SYSTEM SET max_locks_per_transaction = 4096; then RESTART PostgreSQL. For smaller VMs with fewer connections, you need higher max_locks_per_transaction values.",
             Severity: 2,
         }
     }
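A quick worked example of the formula in the new hint: with stock PostgreSQL defaults (max_locks_per_transaction = 64, max_connections = 100, max_prepared_transactions = 0) the shared lock table holds only 64 × (100 + 0) = 6,400 locks, which a BLOB-heavy pg_restore can exhaust easily. Applying the suggested `ALTER SYSTEM SET max_locks_per_transaction = 4096;` and restarting PostgreSQL raises the ceiling to 4096 × 100 = 409,600 locks.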
@@ -38,6 +38,10 @@ type DatabaseProgressCallback func(done, total int, dbName string)
 // Parameters: done count, total count, database name, elapsed time for current restore phase, avg duration per DB
 type DatabaseProgressWithTimingCallback func(done, total int, dbName string, phaseElapsed, avgPerDB time.Duration)
 
+// DatabaseProgressByBytesCallback is called with progress weighted by database sizes (bytes)
+// Parameters: bytes completed, total bytes, current database name, databases done count, total database count
+type DatabaseProgressByBytesCallback func(bytesDone, bytesTotal int64, dbName string, dbDone, dbTotal int)
+
 // Engine handles database restore operations
 type Engine struct {
     cfg *config.Config
@@ -49,9 +53,10 @@ type Engine struct {
     debugLogPath string // Path to save debug log on error
 
     // TUI progress callback for detailed progress reporting
     progressCallback         ProgressCallback
     dbProgressCallback       DatabaseProgressCallback
     dbProgressTimingCallback DatabaseProgressWithTimingCallback
+    dbProgressByBytesCallback DatabaseProgressByBytesCallback
 }
 
 // New creates a new restore engine
@@ -122,6 +127,11 @@ func (e *Engine) SetDatabaseProgressWithTimingCallback(cb DatabaseProgressWithTi
     e.dbProgressTimingCallback = cb
 }
 
+// SetDatabaseProgressByBytesCallback sets a callback for progress weighted by database sizes
+func (e *Engine) SetDatabaseProgressByBytesCallback(cb DatabaseProgressByBytesCallback) {
+    e.dbProgressByBytesCallback = cb
+}
+
 // reportProgress safely calls the progress callback if set
 func (e *Engine) reportProgress(current, total int64, description string) {
     if e.progressCallback != nil {
@@ -143,6 +153,13 @@ func (e *Engine) reportDatabaseProgressWithTiming(done, total int, dbName string
     }
 }
 
+// reportDatabaseProgressByBytes safely calls the bytes-weighted callback if set
+func (e *Engine) reportDatabaseProgressByBytes(bytesDone, bytesTotal int64, dbName string, dbDone, dbTotal int) {
+    if e.dbProgressByBytesCallback != nil {
+        e.dbProgressByBytesCallback(bytesDone, bytesTotal, dbName, dbDone, dbTotal)
+    }
+}
+
 // loggerAdapter adapts our logger to the progress.Logger interface
 type loggerAdapter struct {
     logger logger.Logger
@@ -861,6 +878,25 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
     // Create temporary extraction directory in configured WorkDir
     workDir := e.cfg.GetEffectiveWorkDir()
     tempDir := filepath.Join(workDir, fmt.Sprintf(".restore_%d", time.Now().Unix()))
+
+    // Check disk space for extraction (need ~3x archive size: compressed + extracted + working space)
+    if archiveInfo != nil {
+        requiredBytes := uint64(archiveInfo.Size()) * 3
+        extractionCheck := checks.CheckDiskSpace(workDir)
+        if extractionCheck.AvailableBytes < requiredBytes {
+            operation.Fail("Insufficient disk space for extraction")
+            return fmt.Errorf("insufficient disk space for extraction in %s: need %.1f GB, have %.1f GB (archive size: %.1f GB × 3)",
+                workDir,
+                float64(requiredBytes)/(1024*1024*1024),
+                float64(extractionCheck.AvailableBytes)/(1024*1024*1024),
+                float64(archiveInfo.Size())/(1024*1024*1024))
+        }
+        e.log.Info("Disk space check for extraction passed",
+            "workdir", workDir,
+            "required_gb", float64(requiredBytes)/(1024*1024*1024),
+            "available_gb", float64(extractionCheck.AvailableBytes)/(1024*1024*1024))
+    }
+
     if err := os.MkdirAll(tempDir, 0755); err != nil {
         operation.Fail("Failed to create temporary directory")
         return fmt.Errorf("failed to create temp directory in %s: %w", workDir, err)
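The new extraction check is a straight multiplication: a 20 GB cluster archive needs roughly 60 GB free in the working directory (compressed archive, extracted dumps, and working space). When the system disk cannot hold that, the existing --workdir flag shown earlier in this compare can point extraction at a larger volume such as /mnt/storage/restore_tmp.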
@@ -874,6 +910,16 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
         return fmt.Errorf("failed to extract archive: %w", err)
     }
 
+    // Check context validity after extraction (debugging context cancellation issues)
+    if ctx.Err() != nil {
+        e.log.Error("Context cancelled after extraction - this should not happen",
+            "context_error", ctx.Err(),
+            "extraction_completed", true)
+        operation.Fail("Context cancelled unexpectedly")
+        return fmt.Errorf("context cancelled after extraction completed: %w", ctx.Err())
+    }
+    e.log.Info("Extraction completed, context still valid")
+
     // Check if user has superuser privileges (required for ownership restoration)
     e.progress.Update("Checking privileges...")
     isSuperuser, err := e.checkSuperuser(ctx)
@@ -1024,12 +1070,27 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
     var restoreErrorsMu sync.Mutex
     totalDBs := 0
 
-    // Count total databases
+    // Count total databases and calculate total bytes for weighted progress
+    var totalBytes int64
+    dbSizes := make(map[string]int64) // Map database name to dump file size
     for _, entry := range entries {
         if !entry.IsDir() {
             totalDBs++
+            dumpFile := filepath.Join(dumpsDir, entry.Name())
+            if info, err := os.Stat(dumpFile); err == nil {
+                dbName := entry.Name()
+                dbName = strings.TrimSuffix(dbName, ".dump")
+                dbName = strings.TrimSuffix(dbName, ".sql.gz")
+                dbSizes[dbName] = info.Size()
+                totalBytes += info.Size()
+            }
         }
     }
+    e.log.Info("Calculated total restore size", "databases", totalDBs, "total_bytes", totalBytes)
+
+    // Track bytes completed for weighted progress
+    var bytesCompleted int64
+    var bytesCompletedMu sync.Mutex
 
     // Create ETA estimator for database restores
     estimator := progress.NewETAEstimator("Restoring cluster", totalDBs)
@@ -1057,6 +1118,18 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
     var successCount, failCount int32
     var mu sync.Mutex // Protect shared resources (progress, logger)
 
+    // CRITICAL: Check context before starting database restore loop
+    // This helps debug issues where context gets cancelled between extraction and restore
+    if ctx.Err() != nil {
+        e.log.Error("Context cancelled before database restore loop started",
+            "context_error", ctx.Err(),
+            "total_databases", totalDBs,
+            "parallelism", parallelism)
+        operation.Fail("Context cancelled before database restores could start")
+        return fmt.Errorf("context cancelled before database restore: %w", ctx.Err())
+    }
+    e.log.Info("Starting database restore loop", "databases", totalDBs, "parallelism", parallelism)
+
     // Timing tracking for restore phase progress
     restorePhaseStart := time.Now()
     var completedDBTimes []time.Duration // Track duration for each completed DB restore
@@ -1202,6 +1275,17 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
             completedDBTimes = append(completedDBTimes, dbRestoreDuration)
             completedDBTimesMu.Unlock()
 
+            // Update bytes completed for weighted progress
+            dbSize := dbSizes[dbName]
+            bytesCompletedMu.Lock()
+            bytesCompleted += dbSize
+            currentBytesCompleted := bytesCompleted
+            currentSuccessCount := int(atomic.LoadInt32(&successCount)) + 1 // +1 because we're about to increment
+            bytesCompletedMu.Unlock()
+
+            // Report weighted progress (bytes-based)
+            e.reportDatabaseProgressByBytes(currentBytesCompleted, totalBytes, dbName, currentSuccessCount, totalDBs)
+
             atomic.AddInt32(&successCount, 1)
 
             // Small delay to ensure PostgreSQL fully closes connections before next restore
@@ -2041,9 +2125,10 @@ func (e *Engine) quickValidateSQLDump(archivePath string, compressed bool) error
     return nil
 }
 
-// boostLockCapacity temporarily increases max_locks_per_transaction to prevent OOM
-// during large restores with many BLOBs. Returns the original value for later reset.
-// Uses ALTER SYSTEM + pg_reload_conf() so no restart is needed.
+// boostLockCapacity checks and reports on max_locks_per_transaction capacity.
+// IMPORTANT: max_locks_per_transaction requires a PostgreSQL RESTART to change!
+// This function now calculates total lock capacity based on max_connections and
+// warns the user if capacity is insufficient for the restore.
 func (e *Engine) boostLockCapacity(ctx context.Context) (int, error) {
     // Connect to PostgreSQL to run system commands
     connStr := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=postgres sslmode=disable",
@@ -2061,7 +2146,7 @@ func (e *Engine) boostLockCapacity(ctx context.Context) (int, error) {
     }
     defer db.Close()
 
-    // Get current value
+    // Get current max_locks_per_transaction
     var currentValue int
     err = db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&currentValue)
     if err != nil {
@@ -2074,22 +2159,56 @@ func (e *Engine) boostLockCapacity(ctx context.Context) (int, error) {
         fmt.Sscanf(currentValueStr, "%d", &currentValue)
     }
 
-    // Skip if already high enough
-    if currentValue >= 2048 {
-        e.log.Info("max_locks_per_transaction already sufficient", "value", currentValue)
-        return currentValue, nil
+    // Get max_connections to calculate total lock capacity
+    var maxConns int
+    if err := db.QueryRowContext(ctx, "SHOW max_connections").Scan(&maxConns); err != nil {
+        maxConns = 100 // default
     }
 
-    // Boost to 2048 (enough for most BLOB-heavy databases)
-    _, err = db.ExecContext(ctx, "ALTER SYSTEM SET max_locks_per_transaction = 2048")
-    if err != nil {
-        return currentValue, fmt.Errorf("failed to set max_locks_per_transaction: %w", err)
+    // Get max_prepared_transactions
+    var maxPreparedTxns int
+    if err := db.QueryRowContext(ctx, "SHOW max_prepared_transactions").Scan(&maxPreparedTxns); err != nil {
+        maxPreparedTxns = 0
     }
 
-    // Reload config without restart
-    _, err = db.ExecContext(ctx, "SELECT pg_reload_conf()")
-    if err != nil {
-        return currentValue, fmt.Errorf("failed to reload config: %w", err)
+    // Calculate total lock table capacity:
+    // Total locks = max_locks_per_transaction × (max_connections + max_prepared_transactions)
+    totalLockCapacity := currentValue * (maxConns + maxPreparedTxns)
+
+    e.log.Info("PostgreSQL lock table capacity",
+        "max_locks_per_transaction", currentValue,
+        "max_connections", maxConns,
+        "max_prepared_transactions", maxPreparedTxns,
+        "total_lock_capacity", totalLockCapacity)
+
+    // Minimum recommended total capacity for BLOB-heavy restores: 200,000 locks
+    minRecommendedCapacity := 200000
+    if totalLockCapacity < minRecommendedCapacity {
+        recommendedMaxLocks := minRecommendedCapacity / (maxConns + maxPreparedTxns)
+        if recommendedMaxLocks < 4096 {
+            recommendedMaxLocks = 4096
+        }
+
+        e.log.Warn("Lock table capacity may be insufficient for BLOB-heavy restores",
+            "current_total_capacity", totalLockCapacity,
+            "recommended_capacity", minRecommendedCapacity,
+            "current_max_locks", currentValue,
+            "recommended_max_locks", recommendedMaxLocks,
+            "note", "max_locks_per_transaction requires PostgreSQL RESTART to change")
+
+        // Write suggested fix to ALTER SYSTEM but warn about restart
+        _, err = db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET max_locks_per_transaction = %d", recommendedMaxLocks))
+        if err != nil {
+            e.log.Warn("Could not set recommended max_locks_per_transaction (needs superuser)", "error", err)
+        } else {
+            e.log.Warn("Wrote recommended max_locks_per_transaction to postgresql.auto.conf",
+                "value", recommendedMaxLocks,
+                "action", "RESTART PostgreSQL to apply: sudo systemctl restart postgresql")
+        }
+    } else {
+        e.log.Info("Lock table capacity is sufficient",
+            "total_capacity", totalLockCapacity,
+            "max_locks_per_transaction", currentValue)
     }
 
     return currentValue, nil
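To make the new behaviour concrete: on an instance with max_connections = 100 and max_prepared_transactions = 0, the 200,000-lock target works out to 200000 / 100 = 2,000 locks per transaction, which is below the 4,096 floor, so the function writes max_locks_per_transaction = 4096 via ALTER SYSTEM when it has the privileges to do so. Unlike the old pg_reload_conf() path, the new value only takes effect after PostgreSQL is restarted, which is exactly what the accompanying warning says.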
@@ -16,6 +16,57 @@ import (
     "github.com/shirou/gopsutil/v3/mem"
 )
 
+// CalculateOptimalParallel returns the recommended number of parallel workers
+// based on available system resources (CPU cores and RAM).
+// This is a standalone function that can be called from anywhere.
+// Returns 0 if resources cannot be detected.
+func CalculateOptimalParallel() int {
+    cpuCores := runtime.NumCPU()
+
+    vmem, err := mem.VirtualMemory()
+    if err != nil {
+        // Fallback: use half of CPU cores if memory detection fails
+        if cpuCores > 1 {
+            return cpuCores / 2
+        }
+        return 1
+    }
+
+    memAvailableGB := float64(vmem.Available) / (1024 * 1024 * 1024)
+
+    // Each pg_restore worker needs approximately 2-4GB of RAM
+    // Use conservative 3GB per worker to avoid OOM
+    const memPerWorkerGB = 3.0
+
+    // Calculate limits
+    maxByMem := int(memAvailableGB / memPerWorkerGB)
+    maxByCPU := cpuCores
+
+    // Use the minimum of memory and CPU limits
+    recommended := maxByMem
+    if maxByCPU < recommended {
+        recommended = maxByCPU
+    }
+
+    // Apply sensible bounds
+    if recommended < 1 {
+        recommended = 1
+    }
+    if recommended > 16 {
+        recommended = 16 // Cap at 16 to avoid diminishing returns
+    }
+
+    // If memory pressure is high (>80%), reduce parallelism
+    if vmem.UsedPercent > 80 && recommended > 1 {
+        recommended = recommended / 2
+        if recommended < 1 {
+            recommended = 1
+        }
+    }
+
+    return recommended
+}
+
 // PreflightResult contains all preflight check results
 type PreflightResult struct {
     // Linux system checks
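Worked through the numbers, the helper behaves as follows: on an 8-core host with about 10 GB of available RAM it returns min(8, int(10/3)) = 3 workers; on a 32-core, 256 GB machine the memory limit is far higher than the CPU limit, so the result is clamped to the 16-worker cap; and if memory usage is already above 80%, whatever it computed is halved, but never drops below 1.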
@@ -35,25 +86,29 @@ type PreflightResult struct {
 
 // LinuxChecks contains Linux kernel/system checks
 type LinuxChecks struct {
     ShmMax         int64   // /proc/sys/kernel/shmmax
     ShmAll         int64   // /proc/sys/kernel/shmall
     MemTotal       uint64  // Total RAM in bytes
     MemAvailable   uint64  // Available RAM in bytes
     MemUsedPercent float64 // Memory usage percentage
+    CPUCores            int // Number of CPU cores
+    RecommendedParallel int // Auto-calculated optimal parallel count
     ShmMaxOK       bool // Is shmmax sufficient?
     ShmAllOK       bool // Is shmall sufficient?
     MemAvailableOK bool // Is available RAM sufficient?
     IsLinux        bool // Are we running on Linux?
 }
 
 // PostgreSQLChecks contains PostgreSQL configuration checks
 type PostgreSQLChecks struct {
     MaxLocksPerTransaction int // Current setting
+    MaxPreparedTransactions int // Current setting (affects lock capacity)
+    TotalLockCapacity       int // Calculated: max_locks × (max_connections + max_prepared)
     MaintenanceWorkMem string // Current setting
     SharedBuffers      string // Current setting (info only)
     MaxConnections     int    // Current setting
     Version            string // PostgreSQL version
     IsSuperuser        bool   // Can we modify settings?
 }
 
 // ArchiveChecks contains analysis of the backup archive
@@ -98,6 +153,7 @@ func (e *Engine) RunPreflightChecks(ctx context.Context, dumpsDir string, entrie
 // checkSystemResources uses gopsutil for cross-platform system checks
 func (e *Engine) checkSystemResources(result *PreflightResult) {
     result.Linux.IsLinux = runtime.GOOS == "linux"
+    result.Linux.CPUCores = runtime.NumCPU()
 
     // Get memory info (works on Linux, macOS, Windows, BSD)
     if vmem, err := mem.VirtualMemory(); err == nil {
@@ -116,6 +172,9 @@
         e.log.Warn("Could not detect system memory", "error", err)
     }
 
+    // Calculate recommended parallel based on resources
+    result.Linux.RecommendedParallel = e.calculateRecommendedParallel(result)
+
     // Linux-specific kernel checks (shmmax, shmall)
     if result.Linux.IsLinux {
         e.checkLinuxKernel(result)
@@ -201,6 +260,29 @@ func (e *Engine) checkPostgreSQL(ctx context.Context, result *PreflightResult) {
         result.PostgreSQL.IsSuperuser = isSuperuser
     }
 
+    // Check max_prepared_transactions for lock capacity calculation
+    var maxPreparedTxns string
+    if err := db.QueryRowContext(ctx, "SHOW max_prepared_transactions").Scan(&maxPreparedTxns); err == nil {
+        result.PostgreSQL.MaxPreparedTransactions, _ = strconv.Atoi(maxPreparedTxns)
+    }
+
+    // CRITICAL: Calculate TOTAL lock table capacity
+    // Formula: max_locks_per_transaction × (max_connections + max_prepared_transactions)
+    // This is THE key capacity metric for BLOB-heavy restores
+    maxConns := result.PostgreSQL.MaxConnections
+    if maxConns == 0 {
+        maxConns = 100 // default
+    }
+    maxPrepared := result.PostgreSQL.MaxPreparedTransactions
+    totalLockCapacity := result.PostgreSQL.MaxLocksPerTransaction * (maxConns + maxPrepared)
+    result.PostgreSQL.TotalLockCapacity = totalLockCapacity
+
+    e.log.Info("PostgreSQL lock table capacity",
+        "max_locks_per_transaction", result.PostgreSQL.MaxLocksPerTransaction,
+        "max_connections", maxConns,
+        "max_prepared_transactions", maxPrepared,
+        "total_lock_capacity", totalLockCapacity)
+
     // CRITICAL: max_locks_per_transaction requires PostgreSQL RESTART to change!
     // Warn users loudly about this - it's the #1 cause of "out of shared memory" errors
     if result.PostgreSQL.MaxLocksPerTransaction < 256 {
@@ -212,10 +294,38 @@
         result.Warnings = append(result.Warnings,
             fmt.Sprintf("max_locks_per_transaction=%d is low (recommend 256+). "+
                 "This setting requires PostgreSQL RESTART to change. "+
-                "BLOB-heavy databases may fail with 'out of shared memory' error.",
+                "BLOB-heavy databases may fail with 'out of shared memory' error. "+
+                "Fix: Edit postgresql.conf, set max_locks_per_transaction=2048, then restart PostgreSQL.",
                 result.PostgreSQL.MaxLocksPerTransaction))
     }
 
+    // NEW: Check total lock capacity is sufficient for typical BLOB operations
+    // Minimum recommended: 200,000 for moderate BLOB databases
+    minRecommendedCapacity := 200000
+    if totalLockCapacity < minRecommendedCapacity {
+        recommendedMaxLocks := minRecommendedCapacity / (maxConns + maxPrepared)
+        if recommendedMaxLocks < 4096 {
+            recommendedMaxLocks = 4096
+        }
+
+        e.log.Warn("Total lock table capacity is LOW for BLOB-heavy restores",
+            "current_capacity", totalLockCapacity,
+            "recommended", minRecommendedCapacity,
+            "current_max_locks", result.PostgreSQL.MaxLocksPerTransaction,
+            "current_max_connections", maxConns,
+            "recommended_max_locks", recommendedMaxLocks,
+            "note", "VMs with fewer connections need higher max_locks_per_transaction")
+
+        result.Warnings = append(result.Warnings,
+            fmt.Sprintf("Total lock capacity=%d is low (recommend %d+). "+
+                "Capacity = max_locks_per_transaction(%d) × max_connections(%d). "+
+                "If you reduced VM size/connections, increase max_locks_per_transaction to %d. "+
+                "Fix: ALTER SYSTEM SET max_locks_per_transaction = %d; then RESTART PostgreSQL.",
+                totalLockCapacity, minRecommendedCapacity,
+                result.PostgreSQL.MaxLocksPerTransaction, maxConns,
+                recommendedMaxLocks, recommendedMaxLocks))
+    }
+
     // Parse shared_buffers and warn if very low
     sharedBuffersMB := parseMemoryToMB(result.PostgreSQL.SharedBuffers)
     if sharedBuffersMB > 0 && sharedBuffersMB < 256 {
@@ -324,20 +434,113 @@ func (e *Engine) calculateRecommendations(result *PreflightResult) {
     if result.Archive.TotalBlobCount > 50000 {
         lockBoost = 16384
     }
+    if result.Archive.TotalBlobCount > 100000 {
+        lockBoost = 32768
+    }
+    if result.Archive.TotalBlobCount > 200000 {
+        lockBoost = 65536
+    }
 
-    // Cap at reasonable maximum
-    if lockBoost > 16384 {
-        lockBoost = 16384
+    // For extreme cases, calculate actual requirement
+    // Rule of thumb: ~1 lock per BLOB, divided by max_connections (default 100)
+    // Add 50% safety margin
+    maxConns := result.PostgreSQL.MaxConnections
+    if maxConns == 0 {
+        maxConns = 100 // default
+    }
+    calculatedLocks := (result.Archive.TotalBlobCount / maxConns) * 3 / 2 // 1.5x safety margin
+    if calculatedLocks > lockBoost {
+        lockBoost = calculatedLocks
     }
 
     result.Archive.RecommendedLockBoost = lockBoost
+
+    // CRITICAL: Check if current max_locks_per_transaction is dangerously low for this BLOB count
+    currentLocks := result.PostgreSQL.MaxLocksPerTransaction
+    if currentLocks > 0 && result.Archive.TotalBlobCount > 0 {
+        // Estimate max BLOBs we can handle: locks * max_connections
+        maxSafeBLOBs := currentLocks * maxConns
+
+        if result.Archive.TotalBlobCount > maxSafeBLOBs {
+            severity := "WARNING"
+            if result.Archive.TotalBlobCount > maxSafeBLOBs*2 {
+                severity = "CRITICAL"
+                result.CanProceed = false
+            }
+
+            e.log.Error(fmt.Sprintf("%s: max_locks_per_transaction too low for BLOB count", severity),
+                "current_max_locks", currentLocks,
+                "total_blobs", result.Archive.TotalBlobCount,
+                "max_safe_blobs", maxSafeBLOBs,
+                "recommended_max_locks", lockBoost)
+
+            result.Errors = append(result.Errors,
+                fmt.Sprintf("%s: Archive contains %s BLOBs but max_locks_per_transaction=%d can only safely handle ~%s. "+
+                    "Increase max_locks_per_transaction to %d in postgresql.conf and RESTART PostgreSQL.",
+                    severity,
+                    humanize.Comma(int64(result.Archive.TotalBlobCount)),
+                    currentLocks,
+                    humanize.Comma(int64(maxSafeBLOBs)),
+                    lockBoost))
+        }
+    }
+
     // Log recommendation
     e.log.Info("Calculated recommended lock boost",
         "total_blobs", result.Archive.TotalBlobCount,
         "recommended_locks", lockBoost)
 }
 
+// calculateRecommendedParallel determines optimal parallelism based on system resources
+// Returns the recommended number of parallel workers for pg_restore
+func (e *Engine) calculateRecommendedParallel(result *PreflightResult) int {
+    cpuCores := result.Linux.CPUCores
+    if cpuCores == 0 {
+        cpuCores = runtime.NumCPU()
+    }
+
+    memAvailableGB := float64(result.Linux.MemAvailable) / (1024 * 1024 * 1024)
+
+    // Each pg_restore worker needs approximately 2-4GB of RAM
+    // Use conservative 3GB per worker to avoid OOM
+    const memPerWorkerGB = 3.0
+
+    // Calculate limits
+    maxByMem := int(memAvailableGB / memPerWorkerGB)
+    maxByCPU := cpuCores
+
+    // Use the minimum of memory and CPU limits
+    recommended := maxByMem
+    if maxByCPU < recommended {
+        recommended = maxByCPU
+    }
+
+    // Apply sensible bounds
+    if recommended < 1 {
+        recommended = 1
+    }
+    if recommended > 16 {
+        recommended = 16 // Cap at 16 to avoid diminishing returns
+    }
+
+    // If memory pressure is high (>80%), reduce parallelism
+    if result.Linux.MemUsedPercent > 80 && recommended > 1 {
+        recommended = recommended / 2
+        if recommended < 1 {
+            recommended = 1
+        }
+    }
+
+    e.log.Info("Calculated recommended parallel",
+        "cpu_cores", cpuCores,
+        "mem_available_gb", fmt.Sprintf("%.1f", memAvailableGB),
+        "max_by_mem", maxByMem,
+        "max_by_cpu", maxByCPU,
+        "recommended", recommended)
+
+    return recommended
+}
+
 // printPreflightSummary prints a nice summary of all checks
 func (e *Engine) printPreflightSummary(result *PreflightResult) {
     fmt.Println()
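To see how the new guardrail plays out, take an archive with 500,000 large objects on a server still running the defaults max_locks_per_transaction = 64 and max_connections = 100: the BLOB tiers set the recommended boost to 65,536, maxSafeBLOBs comes out at 64 × 100 = 6,400, and because 500,000 is more than twice that, the check flags the restore as CRITICAL, appends a blocking error, and sets result.CanProceed = false so the preflight summary below refuses to continue.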
@@ -350,6 +553,8 @@ func (e *Engine) printPreflightSummary(result *PreflightResult) {
     printCheck("Total RAM", humanize.Bytes(result.Linux.MemTotal), true)
     printCheck("Available RAM", humanize.Bytes(result.Linux.MemAvailable), result.Linux.MemAvailableOK || result.Linux.MemAvailable == 0)
     printCheck("Memory Usage", fmt.Sprintf("%.1f%%", result.Linux.MemUsedPercent), result.Linux.MemUsedPercent < 85)
+    printCheck("CPU Cores", fmt.Sprintf("%d", result.Linux.CPUCores), true)
+    printCheck("Recommended Parallel", fmt.Sprintf("%d (auto-calculated)", result.Linux.RecommendedParallel), true)
 
     // Linux-specific kernel checks
     if result.Linux.IsLinux && result.Linux.ShmMax > 0 {
@@ -365,6 +570,13 @@
         humanize.Comma(int64(result.PostgreSQL.MaxLocksPerTransaction)),
         humanize.Comma(int64(result.Archive.RecommendedLockBoost))),
         true)
+    printCheck("max_connections", humanize.Comma(int64(result.PostgreSQL.MaxConnections)), true)
+    // Show total lock capacity with warning if low
+    totalCapacityOK := result.PostgreSQL.TotalLockCapacity >= 200000
+    printCheck("Total Lock Capacity",
+        fmt.Sprintf("%s (max_locks × max_conns)",
+            humanize.Comma(int64(result.PostgreSQL.TotalLockCapacity))),
+        totalCapacityOK)
     printCheck("maintenance_work_mem", fmt.Sprintf("%s → 2GB (auto-boost)",
         result.PostgreSQL.MaintenanceWorkMem), true)
     printInfo("shared_buffers", result.PostgreSQL.SharedBuffers)
@@ -386,6 +598,14 @@
         }
     }
 
+    // Errors (blocking issues)
+    if len(result.Errors) > 0 {
+        fmt.Println("\n ✗ ERRORS (must fix before proceeding):")
+        for _, e := range result.Errors {
+            fmt.Printf(" • %s\n", e)
+        }
+    }
+
     // Warnings
     if len(result.Warnings) > 0 {
         fmt.Println("\n ⚠ Warnings:")
@@ -394,6 +614,23 @@
         }
     }
 
+    // Final status
+    fmt.Println()
+    if !result.CanProceed {
+        fmt.Println(" ┌─────────────────────────────────────────────────────────┐")
+        fmt.Println(" │ ✗ PREFLIGHT FAILED - Cannot proceed with restore │")
+        fmt.Println(" │ Fix the errors above and try again. │")
+        fmt.Println(" └─────────────────────────────────────────────────────────┘")
+    } else if len(result.Warnings) > 0 {
+        fmt.Println(" ┌─────────────────────────────────────────────────────────┐")
+        fmt.Println(" │ ⚠ PREFLIGHT PASSED WITH WARNINGS - Proceed with care │")
+        fmt.Println(" └─────────────────────────────────────────────────────────┘")
+    } else {
+        fmt.Println(" ┌─────────────────────────────────────────────────────────┐")
+        fmt.Println(" │ ✓ PREFLIGHT PASSED - Ready to restore │")
+        fmt.Println(" └─────────────────────────────────────────────────────────┘")
+    }
+
     fmt.Println(strings.Repeat("─", 60))
     fmt.Println()
 }
@@ -159,6 +159,10 @@ type sharedProgressState struct {
     overallPhase   int
     extractionDone bool
 
+    // Weighted progress by database sizes (bytes)
+    dbBytesTotal int64 // Total bytes across all databases
+    dbBytesDone  int64 // Bytes completed (sum of finished DB sizes)
+
     // Rolling window for speed calculation
     speedSamples []restoreSpeedSample
 }
@@ -186,12 +190,12 @@ func clearCurrentRestoreProgress() {
     currentRestoreProgressState = nil
 }
 
-func getCurrentRestoreProgress() (bytesTotal, bytesDone int64, description string, hasUpdate bool, dbTotal, dbDone int, speed float64, dbPhaseElapsed, dbAvgPerDB time.Duration, currentDB string, overallPhase int, extractionDone bool) {
+func getCurrentRestoreProgress() (bytesTotal, bytesDone int64, description string, hasUpdate bool, dbTotal, dbDone int, speed float64, dbPhaseElapsed, dbAvgPerDB time.Duration, currentDB string, overallPhase int, extractionDone bool, dbBytesTotal, dbBytesDone int64) {
     currentRestoreProgressMu.Lock()
     defer currentRestoreProgressMu.Unlock()
 
     if currentRestoreProgressState == nil {
-        return 0, 0, "", false, 0, 0, 0, 0, 0, "", 0, false
+        return 0, 0, "", false, 0, 0, 0, 0, 0, "", 0, false, 0, 0
     }
 
     currentRestoreProgressState.mu.Lock()
@@ -205,7 +209,8 @@ func getCurrentRestoreProgress() (bytesTotal, bytesDone int64, description strin
         currentRestoreProgressState.dbTotal, currentRestoreProgressState.dbDone, speed,
         currentRestoreProgressState.dbPhaseElapsed, currentRestoreProgressState.dbAvgPerDB,
         currentRestoreProgressState.currentDB, currentRestoreProgressState.overallPhase,
-        currentRestoreProgressState.extractionDone
+        currentRestoreProgressState.extractionDone,
+        currentRestoreProgressState.dbBytesTotal, currentRestoreProgressState.dbBytesDone
 }
 
 // calculateRollingSpeed calculates speed from recent samples (last 5 seconds)
@@ -359,6 +364,20 @@ func executeRestoreWithTUIProgress(parentCtx context.Context, cfg *config.Config
         progressState.bytesDone = 0
     })
 
+    // Set up weighted (bytes-based) progress callback for accurate cluster restore progress
+    engine.SetDatabaseProgressByBytesCallback(func(bytesDone, bytesTotal int64, dbName string, dbDone, dbTotal int) {
+        progressState.mu.Lock()
+        defer progressState.mu.Unlock()
+        progressState.dbBytesDone = bytesDone
+        progressState.dbBytesTotal = bytesTotal
+        progressState.dbDone = dbDone
+        progressState.dbTotal = dbTotal
+        progressState.currentDB = dbName
+        progressState.overallPhase = 3
+        progressState.extractionDone = true
+        progressState.hasUpdate = true
+    })
+
     // Store progress state in a package-level variable for the ticker to access
     // This is a workaround because tea messages can't be sent from callbacks
     setCurrentRestoreProgress(progressState)
@@ -412,7 +431,7 @@ func (m RestoreExecutionModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
         m.elapsed = time.Since(m.startTime)
 
         // Poll shared progress state for real-time updates
-        bytesTotal, bytesDone, description, hasUpdate, dbTotal, dbDone, speed, dbPhaseElapsed, dbAvgPerDB, currentDB, overallPhase, extractionDone := getCurrentRestoreProgress()
+        bytesTotal, bytesDone, description, hasUpdate, dbTotal, dbDone, speed, dbPhaseElapsed, dbAvgPerDB, currentDB, overallPhase, extractionDone, dbBytesTotal, dbBytesDone := getCurrentRestoreProgress()
         if hasUpdate && bytesTotal > 0 && !extractionDone {
             // Phase 1: Extraction
             m.bytesTotal = bytesTotal
@@ -443,8 +462,16 @@
             } else {
                 m.status = "Finalizing..."
             }
-            m.phase = fmt.Sprintf("Phase 3/3: Databases (%d/%d)", dbDone, dbTotal)
-            m.progress = int((dbDone * 100) / dbTotal)
+
+            // Use weighted progress by bytes if available, otherwise use count
+            if dbBytesTotal > 0 {
+                weightedPercent := int((dbBytesDone * 100) / dbBytesTotal)
+                m.phase = fmt.Sprintf("Phase 3/3: Databases (%d/%d) - %.1f%% by size", dbDone, dbTotal, float64(dbBytesDone*100)/float64(dbBytesTotal))
+                m.progress = weightedPercent
+            } else {
+                m.phase = fmt.Sprintf("Phase 3/3: Databases (%d/%d)", dbDone, dbTotal)
+                m.progress = int((dbDone * 100) / dbTotal)
+            }
         } else if hasUpdate && extractionDone && dbTotal == 0 {
             // Phase 2: Globals restore (brief phase between extraction and databases)
             m.overallPhase = 2
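The payoff of the weighted progress path is easiest to see with skewed database sizes: for a cluster of three dumps of 90 GB, 9 GB and 1 GB, finishing the 90 GB database moves the count-based bar to only 33%, while the bytes-weighted bar already reads 90%, which matches how much restore work actually remains.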