restore: fix OOM caused by --verbose output accumulation
CRITICAL OOM FIX: - pg_restore --verbose outputs MASSIVE amounts of text (gigabytes for large DBs) - The previous fix accumulated ALL errors in the allErrors slice, causing OOM - Now retain only the most recent error message plus a running error count - Discard verbose progress output entirely to prevent memory buildup CHANGES: - Replace the allErrors slice with a lastError string + errorCount counter - Only log the first 10 errors to prevent memory exhaustion - Make --verbose optional via the RestoreOptions.Verbose flag - Disable --verbose for cluster restores (prevents OOM) - Keep --verbose for single DB restores (better diagnostics) This resolves the 'runtime: out of memory' panic during cluster restore.
This commit is contained in:
@@ -66,6 +66,7 @@ type RestoreOptions struct {
|
|||||||
NoOwner bool
|
NoOwner bool
|
||||||
NoPrivileges bool
|
NoPrivileges bool
|
||||||
SingleTransaction bool
|
SingleTransaction bool
|
||||||
|
Verbose bool // Enable verbose output (caution: can cause OOM on large restores)
|
||||||
}
|
}
|
||||||
|
|
||||||
// SampleStrategy defines how to sample data
|
// SampleStrategy defines how to sample data
|
||||||
|
|||||||
@@ -378,8 +378,10 @@ func (p *PostgreSQL) BuildRestoreCommand(database, inputFile string, options Res
|
|||||||
// Skip data restore if table creation fails (prevents duplicate data errors)
|
// Skip data restore if table creation fails (prevents duplicate data errors)
|
||||||
cmd = append(cmd, "--no-data-for-failed-tables")
|
cmd = append(cmd, "--no-data-for-failed-tables")
|
||||||
|
|
||||||
// Add verbose flag for better error reporting
|
// Add verbose flag ONLY if requested (WARNING: can cause OOM on large cluster restores)
|
||||||
|
if options.Verbose {
|
||||||
cmd = append(cmd, "--verbose")
|
cmd = append(cmd, "--verbose")
|
||||||
|
}
|
||||||
|
|
||||||
// Database and input
|
// Database and input
|
||||||
cmd = append(cmd, "--dbname="+database)
|
cmd = append(cmd, "--dbname="+database)
|
||||||
|
|||||||
@@ -161,6 +161,7 @@ func (e *Engine) restorePostgreSQLDump(ctx context.Context, archivePath, targetD
|
|||||||
NoOwner: true,
|
NoOwner: true,
|
||||||
NoPrivileges: true,
|
NoPrivileges: true,
|
||||||
SingleTransaction: true,
|
SingleTransaction: true,
|
||||||
|
Verbose: true, // Enable verbose for single database restores (not cluster)
|
||||||
}
|
}
|
||||||
|
|
||||||
cmd := e.db.BuildRestoreCommand(targetDB, archivePath, opts)
|
cmd := e.db.BuildRestoreCommand(targetDB, archivePath, opts)
|
||||||
@@ -182,6 +183,7 @@ func (e *Engine) restorePostgreSQLDumpWithOwnership(ctx context.Context, archive
|
|||||||
NoOwner: !preserveOwnership, // Preserve ownership if we're superuser
|
NoOwner: !preserveOwnership, // Preserve ownership if we're superuser
|
||||||
NoPrivileges: !preserveOwnership, // Preserve privileges if we're superuser
|
NoPrivileges: !preserveOwnership, // Preserve privileges if we're superuser
|
||||||
SingleTransaction: true,
|
SingleTransaction: true,
|
||||||
|
Verbose: false, // CRITICAL: disable verbose to prevent OOM on large restores
|
||||||
}
|
}
|
||||||
|
|
||||||
e.log.Info("Restoring database",
|
e.log.Info("Restoring database",
|
||||||
@@ -287,32 +289,31 @@ func (e *Engine) executeRestoreCommand(ctx context.Context, cmdArgs []string) er
|
|||||||
// Read stderr in chunks to log errors without loading all into memory
|
// Read stderr in chunks to log errors without loading all into memory
|
||||||
buf := make([]byte, 4096)
|
buf := make([]byte, 4096)
|
||||||
var lastError string
|
var lastError string
|
||||||
var allErrors []string
|
var errorCount int
|
||||||
|
const maxErrors = 10 // Limit captured errors to prevent OOM
|
||||||
for {
|
for {
|
||||||
n, err := stderr.Read(buf)
|
n, err := stderr.Read(buf)
|
||||||
if n > 0 {
|
if n > 0 {
|
||||||
chunk := string(buf[:n])
|
chunk := string(buf[:n])
|
||||||
// Capture all errors/warnings for better diagnostics
|
// Only capture REAL errors, not verbose output
|
||||||
if strings.Contains(chunk, "ERROR") || strings.Contains(chunk, "FATAL") || strings.Contains(chunk, "error:") {
|
if strings.Contains(chunk, "ERROR:") || strings.Contains(chunk, "FATAL:") || strings.Contains(chunk, "error:") {
|
||||||
lastError = chunk
|
lastError = strings.TrimSpace(chunk)
|
||||||
allErrors = append(allErrors, strings.TrimSpace(chunk))
|
errorCount++
|
||||||
|
if errorCount <= maxErrors {
|
||||||
e.log.Warn("Restore stderr", "output", chunk)
|
e.log.Warn("Restore stderr", "output", chunk)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Note: --verbose output is discarded to prevent OOM
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := cmd.Wait(); err != nil {
|
if err := cmd.Wait(); err != nil {
|
||||||
// Include all captured errors in the return message for better diagnostics
|
e.log.Error("Restore command failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
|
||||||
errorDetails := lastError
|
if lastError != "" {
|
||||||
if len(allErrors) > 0 {
|
return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d)", err, lastError, errorCount)
|
||||||
errorDetails = strings.Join(allErrors, " | ")
|
|
||||||
}
|
|
||||||
e.log.Error("Restore command failed", "error", err, "stderr", errorDetails)
|
|
||||||
if errorDetails != "" {
|
|
||||||
return fmt.Errorf("restore failed: %w (stderr: %s)", err, errorDetails)
|
|
||||||
}
|
}
|
||||||
return fmt.Errorf("restore failed: %w", err)
|
return fmt.Errorf("restore failed: %w", err)
|
||||||
}
|
}
|
||||||
@@ -352,32 +353,31 @@ func (e *Engine) executeRestoreWithDecompression(ctx context.Context, archivePat
|
|||||||
// Read stderr in chunks to log errors without loading all into memory
|
// Read stderr in chunks to log errors without loading all into memory
|
||||||
buf := make([]byte, 4096)
|
buf := make([]byte, 4096)
|
||||||
var lastError string
|
var lastError string
|
||||||
var allErrors []string
|
var errorCount int
|
||||||
|
const maxErrors = 10 // Limit captured errors to prevent OOM
|
||||||
for {
|
for {
|
||||||
n, err := stderr.Read(buf)
|
n, err := stderr.Read(buf)
|
||||||
if n > 0 {
|
if n > 0 {
|
||||||
chunk := string(buf[:n])
|
chunk := string(buf[:n])
|
||||||
// Capture all errors/warnings for better diagnostics
|
// Only capture REAL errors, not verbose output
|
||||||
if strings.Contains(chunk, "ERROR") || strings.Contains(chunk, "FATAL") || strings.Contains(chunk, "error:") {
|
if strings.Contains(chunk, "ERROR:") || strings.Contains(chunk, "FATAL:") || strings.Contains(chunk, "error:") {
|
||||||
lastError = chunk
|
lastError = strings.TrimSpace(chunk)
|
||||||
allErrors = append(allErrors, strings.TrimSpace(chunk))
|
errorCount++
|
||||||
|
if errorCount <= maxErrors {
|
||||||
e.log.Warn("Restore stderr", "output", chunk)
|
e.log.Warn("Restore stderr", "output", chunk)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Note: --verbose output is discarded to prevent OOM
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := cmd.Wait(); err != nil {
|
if err := cmd.Wait(); err != nil {
|
||||||
// Include all captured errors in the return message for better diagnostics
|
e.log.Error("Restore with decompression failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
|
||||||
errorDetails := lastError
|
if lastError != "" {
|
||||||
if len(allErrors) > 0 {
|
return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d)", err, lastError, errorCount)
|
||||||
errorDetails = strings.Join(allErrors, " | ")
|
|
||||||
}
|
|
||||||
e.log.Error("Restore with decompression failed", "error", err, "stderr", errorDetails)
|
|
||||||
if errorDetails != "" {
|
|
||||||
return fmt.Errorf("restore failed: %w (stderr: %s)", err, errorDetails)
|
|
||||||
}
|
}
|
||||||
return fmt.Errorf("restore failed: %w", err)
|
return fmt.Errorf("restore failed: %w", err)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user