v3.42.9: Fix all timeout bugs and deadlocks

CRITICAL FIXES:
- Encryption detection false positive (IsBackupEncrypted returned true for ALL files)
- 12 cmd.Wait() deadlocks fixed with channel-based context handling
- TUI timeout bugs: 60s->10min for safety checks, 15s->60s for DB listing
- diagnose.go timeouts: 60s->5min for tar/pg_restore operations
- Panic recovery added to parallel backup/restore goroutines
- Variable shadowing fix in restore/engine.go

These bugs caused pg_dump backups to fail when run through the TUI for months.
2026-01-08 05:56:31 +01:00
parent 627061cdbb
commit 9c65821250
22 changed files with 1099 additions and 304 deletions
--- a/internal/tui/backup_exec.go
+++ b/internal/tui/backup_exec.go
@@ -83,10 +83,10 @@ type backupCompleteMsg struct {

 func executeBackupWithTUIProgress(parentCtx context.Context, cfg *config.Config, log logger.Logger, backupType, dbName string, ratio int) tea.Cmd {
 	return func() tea.Msg {
-		// Use configurable cluster timeout (minutes) from config; default set in config.New()
-		// Use parent context to inherit cancellation from TUI
-		clusterTimeout := time.Duration(cfg.ClusterTimeoutMinutes) * time.Minute
-		ctx, cancel := context.WithTimeout(parentCtx, clusterTimeout)
+		// NO TIMEOUT for backup operations - a backup takes as long as it takes
+		// Large databases can take many hours
+		// Only manual cancellation (Ctrl+C) should stop the backup
+		ctx, cancel := context.WithCancel(parentCtx)
 		defer cancel()

 		start := time.Now()
--- a/internal/tui/dbselector.go
+++ b/internal/tui/dbselector.go
@@ -53,7 +53,8 @@ type databaseListMsg struct {

 func fetchDatabases(cfg *config.Config, log logger.Logger) tea.Cmd {
 	return func() tea.Msg {
-		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+		// 60 seconds for database listing - busy servers may be slow
+		ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
 		defer cancel()

 		dbClient, err := database.New(cfg, log)
--- a/internal/tui/restore_exec.go
+++ b/internal/tui/restore_exec.go
@@ -111,10 +111,10 @@ type restoreCompleteMsg struct {

 func executeRestoreWithTUIProgress(parentCtx context.Context, cfg *config.Config, log logger.Logger, archive ArchiveInfo, targetDB string, cleanFirst, createIfMissing bool, restoreType string, cleanClusterFirst bool, existingDBs []string, saveDebugLog bool) tea.Cmd {
 	return func() tea.Msg {
-		// Use configurable cluster timeout (minutes) from config; default set in config.New()
-		// Use parent context to inherit cancellation from TUI
-		restoreTimeout := time.Duration(cfg.ClusterTimeoutMinutes) * time.Minute
-		ctx, cancel := context.WithTimeout(parentCtx, restoreTimeout)
+		// NO TIMEOUT for restore operations - a restore takes as long as it takes
+		// Large databases with large objects can take many hours
+		// Only manual cancellation (Ctrl+C) should stop the restore
+		ctx, cancel := context.WithCancel(parentCtx)
 		defer cancel()

 		start := time.Now()
@@ -138,8 +138,8 @@ func executeRestoreWithTUIProgress(parentCtx context.Context, cfg *config.Config
 			// This matches how cluster restore works - uses CLI tools, not database connections
 			droppedCount := 0
 			for _, dbName := range existingDBs {
-				// Create timeout context for each database drop (30 seconds per DB)
-				dropCtx, dropCancel := context.WithTimeout(ctx, 30*time.Second)
+				// Create timeout context for each database drop (5 minutes per DB - large DBs take time)
+				dropCtx, dropCancel := context.WithTimeout(ctx, 5*time.Minute)
 				if err := dropDatabaseCLI(dropCtx, cfg, dbName); err != nil {
 					log.Warn("Failed to drop database", "name", dbName, "error", err)
 					// Continue with other databases
--- a/internal/tui/restore_preview.go
+++ b/internal/tui/restore_preview.go
@@ -106,7 +106,8 @@ type safetyCheckCompleteMsg struct {

 func runSafetyChecks(cfg *config.Config, log logger.Logger, archive ArchiveInfo, targetDB string) tea.Cmd {
 	return func() tea.Msg {
-		ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+		// 10 minutes for safety checks - large archives can take a long time to diagnose
+		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
 		defer cancel()

 		safety := restore.NewSafety(cfg, log)
@@ -444,7 +445,7 @@ func (m RestorePreviewModel) View() string {
 	// Advanced Options
 	s.WriteString(archiveHeaderStyle.Render("⚙️  Advanced Options"))
 	s.WriteString("\n")
-	
+
 	// Work directory option
 	workDirIcon := "✗"
 	workDirStyle := infoStyle
@@ -460,7 +461,7 @@ func (m RestorePreviewModel) View() string {
 		s.WriteString(infoStyle.Render("    ⚠️  Large archives need more space than /tmp may have"))
 		s.WriteString("\n")
 	}
-	
+
 	// Debug log option
 	debugIcon := "✗"
 	debugStyle := infoStyle
--- a/internal/tui/status.go
+++ b/internal/tui/status.go
@@ -70,7 +70,8 @@ type statusMsg struct {

 func fetchStatus(cfg *config.Config, log logger.Logger) tea.Cmd {
 	return func() tea.Msg {
-		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+		// 30 seconds for status check - slow networks or SSL negotiation
+		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 		defer cancel()

 		dbClient, err := database.New(cfg, log)