From ab3aceb5c07939b4bbe7b9e84fb308ca42ff2f57 Mon Sep 17 00:00:00 2001 From: Renz Date: Thu, 13 Nov 2025 14:19:56 +0000 Subject: [PATCH] restore: fix OOM caused by --verbose output accumulation CRITICAL OOM FIX: - pg_restore --verbose outputs MASSIVE text (gigabytes for large DBs) - Previous fix accumulated ALL errors in allErrors slice causing OOM - Now limit error capture to last 10 errors only - Discard verbose progress output entirely to prevent memory buildup CHANGES: - Replace allErrors slice with lastError string + errorCount counter - Only log first 10 errors to prevent memory exhaustion - Make --verbose optional via RestoreOptions.Verbose flag - Disable --verbose for cluster restores (prevent OOM) - Keep --verbose for single DB restores (better diagnostics) This resolves 'runtime: out of memory' panic during cluster restore. --- internal/database/interface.go | 11 ++++--- internal/database/postgresql.go | 6 ++-- internal/restore/engine.go | 56 ++++++++++++++++----------------- 3 files changed, 38 insertions(+), 35 deletions(-) diff --git a/internal/database/interface.go b/internal/database/interface.go index 5431509..d3d25e3 100644 --- a/internal/database/interface.go +++ b/internal/database/interface.go @@ -60,12 +60,13 @@ type BackupOptions struct { // RestoreOptions holds options for restore operations type RestoreOptions struct { - Parallel int - Clean bool - IfExists bool - NoOwner bool - NoPrivileges bool + Parallel int + Clean bool + IfExists bool + NoOwner bool + NoPrivileges bool SingleTransaction bool + Verbose bool // Enable verbose output (caution: can cause OOM on large restores) } // SampleStrategy defines how to sample data diff --git a/internal/database/postgresql.go b/internal/database/postgresql.go index 6ecb256..51818b0 100644 --- a/internal/database/postgresql.go +++ b/internal/database/postgresql.go @@ -378,8 +378,10 @@ func (p *PostgreSQL) BuildRestoreCommand(database, inputFile string, options Res // Skip data restore if table creation fails (prevents duplicate data errors) cmd = append(cmd, "--no-data-for-failed-tables") - // Add verbose flag for better error reporting - cmd = append(cmd, "--verbose") + // Add verbose flag ONLY if requested (WARNING: can cause OOM on large cluster restores) + if options.Verbose { + cmd = append(cmd, "--verbose") + } // Database and input cmd = append(cmd, "--dbname="+database) diff --git a/internal/restore/engine.go b/internal/restore/engine.go index 33d7d61..431bd37 100644 --- a/internal/restore/engine.go +++ b/internal/restore/engine.go @@ -161,6 +161,7 @@ func (e *Engine) restorePostgreSQLDump(ctx context.Context, archivePath, targetD NoOwner: true, NoPrivileges: true, SingleTransaction: true, + Verbose: true, // Enable verbose for single database restores (not cluster) } cmd := e.db.BuildRestoreCommand(targetDB, archivePath, opts) @@ -182,6 +183,7 @@ func (e *Engine) restorePostgreSQLDumpWithOwnership(ctx context.Context, archive NoOwner: !preserveOwnership, // Preserve ownership if we're superuser NoPrivileges: !preserveOwnership, // Preserve privileges if we're superuser SingleTransaction: true, + Verbose: false, // CRITICAL: disable verbose to prevent OOM on large restores } e.log.Info("Restoring database", @@ -287,17 +289,21 @@ func (e *Engine) executeRestoreCommand(ctx context.Context, cmdArgs []string) er // Read stderr in chunks to log errors without loading all into memory buf := make([]byte, 4096) var lastError string - var allErrors []string + var errorCount int + const maxErrors = 10 // Limit captured errors to prevent OOM for { n, err := stderr.Read(buf) if n > 0 { chunk := string(buf[:n]) - // Capture all errors/warnings for better diagnostics - if strings.Contains(chunk, "ERROR") || strings.Contains(chunk, "FATAL") || strings.Contains(chunk, "error:") { - lastError = chunk - allErrors = append(allErrors, strings.TrimSpace(chunk)) - e.log.Warn("Restore stderr", "output", chunk) + // Only capture REAL errors, not verbose output + if strings.Contains(chunk, "ERROR:") || strings.Contains(chunk, "FATAL:") || strings.Contains(chunk, "error:") { + lastError = strings.TrimSpace(chunk) + errorCount++ + if errorCount <= maxErrors { + e.log.Warn("Restore stderr", "output", chunk) + } } + // Note: --verbose output is discarded to prevent OOM } if err != nil { break @@ -305,14 +311,9 @@ func (e *Engine) executeRestoreCommand(ctx context.Context, cmdArgs []string) er } if err := cmd.Wait(); err != nil { - // Include all captured errors in the return message for better diagnostics - errorDetails := lastError - if len(allErrors) > 0 { - errorDetails = strings.Join(allErrors, " | ") - } - e.log.Error("Restore command failed", "error", err, "stderr", errorDetails) - if errorDetails != "" { - return fmt.Errorf("restore failed: %w (stderr: %s)", err, errorDetails) + e.log.Error("Restore command failed", "error", err, "last_stderr", lastError, "error_count", errorCount) + if lastError != "" { + return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d)", err, lastError, errorCount) } return fmt.Errorf("restore failed: %w", err) } @@ -352,17 +353,21 @@ func (e *Engine) executeRestoreWithDecompression(ctx context.Context, archivePat // Read stderr in chunks to log errors without loading all into memory buf := make([]byte, 4096) var lastError string - var allErrors []string + var errorCount int + const maxErrors = 10 // Limit captured errors to prevent OOM for { n, err := stderr.Read(buf) if n > 0 { chunk := string(buf[:n]) - // Capture all errors/warnings for better diagnostics - if strings.Contains(chunk, "ERROR") || strings.Contains(chunk, "FATAL") || strings.Contains(chunk, "error:") { - lastError = chunk - allErrors = append(allErrors, strings.TrimSpace(chunk)) - e.log.Warn("Restore stderr", "output", chunk) + // Only capture REAL errors, not verbose output + if strings.Contains(chunk, "ERROR:") || strings.Contains(chunk, "FATAL:") || strings.Contains(chunk, "error:") { + lastError = strings.TrimSpace(chunk) + errorCount++ + if errorCount <= maxErrors { + e.log.Warn("Restore stderr", "output", chunk) + } } + // Note: --verbose output is discarded to prevent OOM } if err != nil { break @@ -370,14 +375,9 @@ func (e *Engine) executeRestoreWithDecompression(ctx context.Context, archivePat } if err := cmd.Wait(); err != nil { - // Include all captured errors in the return message for better diagnostics - errorDetails := lastError - if len(allErrors) > 0 { - errorDetails = strings.Join(allErrors, " | ") - } - e.log.Error("Restore with decompression failed", "error", err, "stderr", errorDetails) - if errorDetails != "" { - return fmt.Errorf("restore failed: %w (stderr: %s)", err, errorDetails) + e.log.Error("Restore with decompression failed", "error", err, "last_stderr", lastError, "error_count", errorCount) + if lastError != "" { + return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d)", err, lastError, errorCount) } return fmt.Errorf("restore failed: %w", err) }