restore: fix OOM caused by --verbose output accumulation

CRITICAL OOM FIX:
- pg_restore --verbose outputs MASSIVE text (gigabytes for large DBs)
- Previous fix accumulated ALL errors in allErrors slice causing OOM
- Now limit error capture to last 10 errors only
- Discard verbose progress output entirely to prevent memory buildup

CHANGES:
- Replace allErrors slice with lastError string + errorCount counter
- Only log first 10 errors to prevent memory exhaustion
- Make --verbose optional via RestoreOptions.Verbose flag
- Disable --verbose for cluster restores (prevent OOM)
- Keep --verbose for single DB restores (better diagnostics)

This resolves 'runtime: out of memory' panic during cluster restore.
This commit is contained in:
2025-11-13 14:19:56 +00:00
parent 58d11bc4b3
commit ab3aceb5c0
3 changed files with 38 additions and 35 deletions

View File

@@ -60,12 +60,13 @@ type BackupOptions struct {
// RestoreOptions holds options for restore operations
type RestoreOptions struct {
Parallel int
Clean bool
IfExists bool
NoOwner bool
NoPrivileges bool
Parallel int
Clean bool
IfExists bool
NoOwner bool
NoPrivileges bool
SingleTransaction bool
Verbose bool // Enable verbose output (caution: can cause OOM on large restores)
}
// SampleStrategy defines how to sample data

View File

@@ -378,8 +378,10 @@ func (p *PostgreSQL) BuildRestoreCommand(database, inputFile string, options Res
// Skip data restore if table creation fails (prevents duplicate data errors)
cmd = append(cmd, "--no-data-for-failed-tables")
// Add verbose flag for better error reporting
cmd = append(cmd, "--verbose")
// Add verbose flag ONLY if requested (WARNING: can cause OOM on large cluster restores)
if options.Verbose {
cmd = append(cmd, "--verbose")
}
// Database and input
cmd = append(cmd, "--dbname="+database)

View File

@@ -161,6 +161,7 @@ func (e *Engine) restorePostgreSQLDump(ctx context.Context, archivePath, targetD
NoOwner: true,
NoPrivileges: true,
SingleTransaction: true,
Verbose: true, // Enable verbose for single database restores (not cluster)
}
cmd := e.db.BuildRestoreCommand(targetDB, archivePath, opts)
@@ -182,6 +183,7 @@ func (e *Engine) restorePostgreSQLDumpWithOwnership(ctx context.Context, archive
NoOwner: !preserveOwnership, // Preserve ownership if we're superuser
NoPrivileges: !preserveOwnership, // Preserve privileges if we're superuser
SingleTransaction: true,
Verbose: false, // CRITICAL: disable verbose to prevent OOM on large restores
}
e.log.Info("Restoring database",
@@ -287,17 +289,21 @@ func (e *Engine) executeRestoreCommand(ctx context.Context, cmdArgs []string) er
// Read stderr in chunks to log errors without loading all into memory
buf := make([]byte, 4096)
var lastError string
var allErrors []string
var errorCount int
const maxErrors = 10 // Limit captured errors to prevent OOM
for {
n, err := stderr.Read(buf)
if n > 0 {
chunk := string(buf[:n])
// Capture all errors/warnings for better diagnostics
if strings.Contains(chunk, "ERROR") || strings.Contains(chunk, "FATAL") || strings.Contains(chunk, "error:") {
lastError = chunk
allErrors = append(allErrors, strings.TrimSpace(chunk))
e.log.Warn("Restore stderr", "output", chunk)
// Only capture REAL errors, not verbose output
if strings.Contains(chunk, "ERROR:") || strings.Contains(chunk, "FATAL:") || strings.Contains(chunk, "error:") {
lastError = strings.TrimSpace(chunk)
errorCount++
if errorCount <= maxErrors {
e.log.Warn("Restore stderr", "output", chunk)
}
}
// Note: --verbose output is discarded to prevent OOM
}
if err != nil {
break
@@ -305,14 +311,9 @@ func (e *Engine) executeRestoreCommand(ctx context.Context, cmdArgs []string) er
}
if err := cmd.Wait(); err != nil {
// Include all captured errors in the return message for better diagnostics
errorDetails := lastError
if len(allErrors) > 0 {
errorDetails = strings.Join(allErrors, " | ")
}
e.log.Error("Restore command failed", "error", err, "stderr", errorDetails)
if errorDetails != "" {
return fmt.Errorf("restore failed: %w (stderr: %s)", err, errorDetails)
e.log.Error("Restore command failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
if lastError != "" {
return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d)", err, lastError, errorCount)
}
return fmt.Errorf("restore failed: %w", err)
}
@@ -352,17 +353,21 @@ func (e *Engine) executeRestoreWithDecompression(ctx context.Context, archivePat
// Read stderr in chunks to log errors without loading all into memory
buf := make([]byte, 4096)
var lastError string
var allErrors []string
var errorCount int
const maxErrors = 10 // Limit captured errors to prevent OOM
for {
n, err := stderr.Read(buf)
if n > 0 {
chunk := string(buf[:n])
// Capture all errors/warnings for better diagnostics
if strings.Contains(chunk, "ERROR") || strings.Contains(chunk, "FATAL") || strings.Contains(chunk, "error:") {
lastError = chunk
allErrors = append(allErrors, strings.TrimSpace(chunk))
e.log.Warn("Restore stderr", "output", chunk)
// Only capture REAL errors, not verbose output
if strings.Contains(chunk, "ERROR:") || strings.Contains(chunk, "FATAL:") || strings.Contains(chunk, "error:") {
lastError = strings.TrimSpace(chunk)
errorCount++
if errorCount <= maxErrors {
e.log.Warn("Restore stderr", "output", chunk)
}
}
// Note: --verbose output is discarded to prevent OOM
}
if err != nil {
break
@@ -370,14 +375,9 @@ func (e *Engine) executeRestoreWithDecompression(ctx context.Context, archivePat
}
if err := cmd.Wait(); err != nil {
// Include all captured errors in the return message for better diagnostics
errorDetails := lastError
if len(allErrors) > 0 {
errorDetails = strings.Join(allErrors, " | ")
}
e.log.Error("Restore with decompression failed", "error", err, "stderr", errorDetails)
if errorDetails != "" {
return fmt.Errorf("restore failed: %w (stderr: %s)", err, errorDetails)
e.log.Error("Restore with decompression failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
if lastError != "" {
return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d)", err, lastError, errorCount)
}
return fmt.Errorf("restore failed: %w", err)
}