fix(backup): dynamic timeout for large database backups
All checks were successful
CI/CD / Test (push) Successful in 1m11s
CI/CD / Lint (push) Successful in 1m20s
CI/CD / Build (amd64, darwin) (push) Successful in 29s
CI/CD / Build (amd64, linux) (push) Successful in 28s
CI/CD / Build (arm64, darwin) (push) Successful in 29s
CI/CD / Build (arm64, linux) (push) Successful in 29s

- 2-hour timeout was causing truncated backups for databases > 40GB
- Now scales: 2 hours base + 1 hour per 20GB
- 69GB database now gets ~5.5 hour timeout
- Fixed streaming compression error handling order

Fixes truncated resydb.sql.gz in cluster backups
This commit is contained in:
2026-01-06 15:09:29 +01:00
parent 938ee61686
commit 52d475506c

View File

@@ -502,7 +502,23 @@ func (e *Engine) BackupCluster(ctx context.Context) error {
cmd := e.db.BuildBackupCommand(name, dumpFile, options)
dbCtx, cancel := context.WithTimeout(ctx, 2*time.Hour)
// Calculate timeout based on database size:
// - Minimum 2 hours for small databases
// - Add 1 hour per 20GB for large databases
// - This allows ~69GB database to take up to 5+ hours
timeout := 2 * time.Hour
if size, err := e.db.GetDatabaseSize(ctx, name); err == nil {
sizeGB := size / (1024 * 1024 * 1024)
if sizeGB > 20 {
extraHours := (sizeGB / 20) + 1
timeout = time.Duration(2+extraHours) * time.Hour
mu.Lock()
e.printf(" Extended timeout: %v (for %dGB database)\n", timeout, sizeGB)
mu.Unlock()
}
}
dbCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
err := e.executeCommand(dbCtx, cmd, dumpFile)
cancel()
@@ -1356,16 +1372,21 @@ func (e *Engine) executeWithStreamingCompression(ctx context.Context, cmdArgs []
}
// Wait for pg_dump to complete
if err := dumpCmd.Wait(); err != nil {
return fmt.Errorf("pg_dump failed: %w", err)
}
dumpErr := dumpCmd.Wait()
// Close stdout pipe to signal compressor we're done
// Close stdout pipe to signal compressor we're done - MUST happen after Wait()
// but before we check for errors, so compressor gets EOF
dumpStdout.Close()
// Wait for compression to complete
if err := compressCmd.Wait(); err != nil {
return fmt.Errorf("compression failed: %w", err)
compressErr := compressCmd.Wait()
// Check errors in order
if dumpErr != nil {
return fmt.Errorf("pg_dump failed: %w", dumpErr)
}
if compressErr != nil {
return fmt.Errorf("compression failed: %w", compressErr)
}
e.log.Debug("Streaming compression completed", "output", compressedFile)