v5.8.22: Defensive fixes for potential restore hang issues
Some checks failed
CI/CD / Test (push) Successful in 3m25s
CI/CD / Lint (push) Successful in 1m33s
CI/CD / Integration Tests (push) Successful in 1m4s
CI/CD / Native Engine Tests (push) Successful in 1m2s
CI/CD / Build Binary (push) Successful in 56s
CI/CD / Test Release Build (push) Successful in 1m41s
CI/CD / Release Binaries (push) Failing after 11m55s
Some checks failed
CI/CD / Test (push) Successful in 3m25s
CI/CD / Lint (push) Successful in 1m33s
CI/CD / Integration Tests (push) Successful in 1m4s
CI/CD / Native Engine Tests (push) Successful in 1m2s
CI/CD / Build Binary (push) Successful in 56s
CI/CD / Test Release Build (push) Successful in 1m41s
CI/CD / Release Binaries (push) Failing after 11m55s
- Add context cancellation check during COPY data parsing loop (prevents hangs when parsing large tables with millions of rows)
- Add 5-second timeout for stderr reader in globals restore (prevents indefinite hang if psql process doesn't terminate cleanly)
- Reduce database drop timeout from 5 minutes to 60 seconds (improves TUI responsiveness during cluster cleanup)
This commit is contained in:
@@ -440,6 +440,15 @@ func (e *ParallelRestoreEngine) parseStatementsWithContext(ctx context.Context,
|
||||
currentCopyStmt.CopyData.WriteString(line)
|
||||
currentCopyStmt.CopyData.WriteByte('\n')
|
||||
}
|
||||
// Check for context cancellation during COPY data parsing (large tables)
|
||||
// Check every 10000 lines to avoid overhead
|
||||
if lineCount%10000 == 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return statements, ctx.Err()
|
||||
default:
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
|
||||
@@ -2525,7 +2525,14 @@ func (e *Engine) restoreGlobals(ctx context.Context, globalsFile string) error {
|
||||
cmdErr = ctx.Err()
|
||||
}
|
||||
|
||||
<-stderrDone
|
||||
// Wait for stderr reader with timeout to prevent indefinite hang
|
||||
// if the process doesn't fully terminate
|
||||
select {
|
||||
case <-stderrDone:
|
||||
// Normal completion
|
||||
case <-time.After(5 * time.Second):
|
||||
e.log.Warn("Stderr reader timeout - forcefully continuing")
|
||||
}
|
||||
|
||||
// Only fail on actual command errors or FATAL PostgreSQL errors
|
||||
// Regular ERROR messages (like "role already exists") are expected
|
||||
|
||||
@@ -414,8 +414,9 @@ func executeRestoreWithTUIProgress(parentCtx context.Context, cfg *config.Config
|
||||
// This matches how cluster restore works - uses CLI tools, not database connections
|
||||
droppedCount := 0
|
||||
for _, dbName := range existingDBs {
|
||||
// Create timeout context for each database drop (5 minutes per DB - large DBs take time)
|
||||
dropCtx, dropCancel := context.WithTimeout(ctx, 5*time.Minute)
|
||||
// Create timeout context for each database drop (60 seconds per DB)
|
||||
// Reduced from 5 minutes for better TUI responsiveness
|
||||
dropCtx, dropCancel := context.WithTimeout(ctx, 60*time.Second)
|
||||
if err := dropDatabaseCLI(dropCtx, cfg, dbName); err != nil {
|
||||
log.Warn("Failed to drop database", "name", dbName, "error", err)
|
||||
// Continue with other databases
|
||||
|
||||
Reference in New Issue
Block a user