v5.8.22: Defensive fixes for potential restore hang issues
Some checks failed
CI/CD / Test (push) Successful in 3m25s
CI/CD / Lint (push) Successful in 1m33s
CI/CD / Integration Tests (push) Successful in 1m4s
CI/CD / Native Engine Tests (push) Successful in 1m2s
CI/CD / Build Binary (push) Successful in 56s
CI/CD / Test Release Build (push) Successful in 1m41s
CI/CD / Release Binaries (push) Failing after 11m55s
Some checks failed
CI/CD / Test (push) Successful in 3m25s
CI/CD / Lint (push) Successful in 1m33s
CI/CD / Integration Tests (push) Successful in 1m4s
CI/CD / Native Engine Tests (push) Successful in 1m2s
CI/CD / Build Binary (push) Successful in 56s
CI/CD / Test Release Build (push) Successful in 1m41s
CI/CD / Release Binaries (push) Failing after 11m55s
- Add context cancellation check during COPY data parsing loop (prevents hangs when parsing large tables with millions of rows)
- Add 5-second timeout for stderr reader in globals restore (prevents indefinite hang if psql process doesn't terminate cleanly)
- Reduce database drop timeout from 5 minutes to 60 seconds (improves TUI responsiveness during cluster cleanup)
This commit is contained in:
@@ -440,6 +440,15 @@ func (e *ParallelRestoreEngine) parseStatementsWithContext(ctx context.Context,
|
|||||||
currentCopyStmt.CopyData.WriteString(line)
|
currentCopyStmt.CopyData.WriteString(line)
|
||||||
currentCopyStmt.CopyData.WriteByte('\n')
|
currentCopyStmt.CopyData.WriteByte('\n')
|
||||||
}
|
}
|
||||||
|
// Check for context cancellation during COPY data parsing (large tables)
|
||||||
|
// Check every 10000 lines to avoid overhead
|
||||||
|
if lineCount%10000 == 0 {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return statements, ctx.Err()
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2525,7 +2525,14 @@ func (e *Engine) restoreGlobals(ctx context.Context, globalsFile string) error {
|
|||||||
cmdErr = ctx.Err()
|
cmdErr = ctx.Err()
|
||||||
}
|
}
|
||||||
|
|
||||||
<-stderrDone
|
// Wait for stderr reader with timeout to prevent indefinite hang
|
||||||
|
// if the process doesn't fully terminate
|
||||||
|
select {
|
||||||
|
case <-stderrDone:
|
||||||
|
// Normal completion
|
||||||
|
case <-time.After(5 * time.Second):
|
||||||
|
e.log.Warn("Stderr reader timeout - forcefully continuing")
|
||||||
|
}
|
||||||
|
|
||||||
// Only fail on actual command errors or FATAL PostgreSQL errors
|
// Only fail on actual command errors or FATAL PostgreSQL errors
|
||||||
// Regular ERROR messages (like "role already exists") are expected
|
// Regular ERROR messages (like "role already exists") are expected
|
||||||
|
|||||||
@@ -414,8 +414,9 @@ func executeRestoreWithTUIProgress(parentCtx context.Context, cfg *config.Config
|
|||||||
// This matches how cluster restore works - uses CLI tools, not database connections
|
// This matches how cluster restore works - uses CLI tools, not database connections
|
||||||
droppedCount := 0
|
droppedCount := 0
|
||||||
for _, dbName := range existingDBs {
|
for _, dbName := range existingDBs {
|
||||||
// Create timeout context for each database drop (5 minutes per DB - large DBs take time)
|
// Create timeout context for each database drop (60 seconds per DB)
|
||||||
dropCtx, dropCancel := context.WithTimeout(ctx, 5*time.Minute)
|
// Reduced from 5 minutes for better TUI responsiveness
|
||||||
|
dropCtx, dropCancel := context.WithTimeout(ctx, 60*time.Second)
|
||||||
if err := dropDatabaseCLI(dropCtx, cfg, dbName); err != nil {
|
if err := dropDatabaseCLI(dropCtx, cfg, dbName); err != nil {
|
||||||
log.Warn("Failed to drop database", "name", dbName, "error", err)
|
log.Warn("Failed to drop database", "name", dbName, "error", err)
|
||||||
// Continue with other databases
|
// Continue with other databases
|
||||||
|
|||||||
Reference in New Issue
Block a user