v5.8.22: Defensive fixes for potential restore hang issues

- Add context cancellation check during COPY data parsing loop (prevents hangs when parsing large tables with millions of rows)
- Add 5-second timeout for stderr reader in globals restore (prevents indefinite hang if psql process doesn't terminate cleanly)
- Reduce database drop timeout from 5 minutes to 60 seconds (improves TUI responsiveness during cluster cleanup)
2026-02-05 12:40:26 +00:00
parent 555177f5a7
commit a101fb81ab
4 changed files with 21 additions and 4 deletions
--- a/internal/engine/native/parallel_restore.go
+++ b/internal/engine/native/parallel_restore.go
@@ -440,6 +440,15 @@ func (e *ParallelRestoreEngine) parseStatementsWithContext(ctx context.Context,
 				currentCopyStmt.CopyData.WriteString(line)
 				currentCopyStmt.CopyData.WriteByte('\n')
 			}
+			// Check for context cancellation during COPY data parsing (large tables)
+			// Check every 10000 lines to avoid overhead
+			if lineCount%10000 == 0 {
+				select {
+				case <-ctx.Done():
+					return statements, ctx.Err()
+				default:
+				}
+			}
 			continue
 		}

--- a/internal/restore/engine.go
+++ b/internal/restore/engine.go
@@ -2525,7 +2525,14 @@ func (e *Engine) restoreGlobals(ctx context.Context, globalsFile string) error {
 		cmdErr = ctx.Err()
 	}

-	<-stderrDone
+	// Wait for stderr reader with timeout to prevent indefinite hang
+	// if the process doesn't fully terminate
+	select {
+	case <-stderrDone:
+		// Normal completion
+	case <-time.After(5 * time.Second):
+		e.log.Warn("Stderr reader timeout - forcefully continuing")
+	}

 	// Only fail on actual command errors or FATAL PostgreSQL errors
 	// Regular ERROR messages (like "role already exists") are expected
--- a/internal/tui/restore_exec.go
+++ b/internal/tui/restore_exec.go
@@ -414,8 +414,9 @@ func executeRestoreWithTUIProgress(parentCtx context.Context, cfg *config.Config
 				// This matches how cluster restore works - uses CLI tools, not database connections
 				droppedCount := 0
 				for _, dbName := range existingDBs {
-					// Create timeout context for each database drop (5 minutes per DB - large DBs take time)
-					dropCtx, dropCancel := context.WithTimeout(ctx, 5*time.Minute)
+					// Create timeout context for each database drop (60 seconds per DB)
+					// Reduced from 5 minutes for better TUI responsiveness
+					dropCtx, dropCancel := context.WithTimeout(ctx, 60*time.Second)
 					if err := dropDatabaseCLI(dropCtx, cfg, dbName); err != nil {
 						log.Warn("Failed to drop database", "name", dbName, "error", err)
 						// Continue with other databases
--- a/main.go
+++ b/main.go
@@ -16,7 +16,7 @@ import (

 // Build information (set by ldflags)
 var (
-	version   = "5.8.21"
+	version   = "5.8.22"
 	buildTime = "unknown"
 	gitCommit = "unknown"
 )