package restore import ( "archive/tar" "bufio" "context" "database/sql" "fmt" "io" "os" "os/exec" "path/filepath" "strconv" "strings" "sync" "sync/atomic" "time" "dbbackup/internal/checks" "dbbackup/internal/cleanup" "dbbackup/internal/config" "dbbackup/internal/database" "dbbackup/internal/engine/native" "dbbackup/internal/fs" "dbbackup/internal/logger" "dbbackup/internal/progress" "dbbackup/internal/security" "github.com/hashicorp/go-multierror" _ "github.com/jackc/pgx/v5/stdlib" // PostgreSQL driver "github.com/klauspost/pgzip" ) // ProgressCallback is called with progress updates during long operations // Parameters: current bytes/items done, total bytes/items, description type ProgressCallback func(current, total int64, description string) // DatabaseProgressCallback is called with database count progress during cluster restore type DatabaseProgressCallback func(done, total int, dbName string) // DatabaseProgressWithTimingCallback is called with database progress including timing info // Parameters: done count, total count, database name, elapsed time for current restore phase, avg duration per DB type DatabaseProgressWithTimingCallback func(done, total int, dbName string, phaseElapsed, avgPerDB time.Duration) // DatabaseProgressByBytesCallback is called with progress weighted by database sizes (bytes) // Parameters: bytes completed, total bytes, current database name, databases done count, total database count type DatabaseProgressByBytesCallback func(bytesDone, bytesTotal int64, dbName string, dbDone, dbTotal int) // Engine handles database restore operations type Engine struct { cfg *config.Config log logger.Logger db database.Database progress progress.Indicator detailedReporter *progress.DetailedReporter dryRun bool silentMode bool // Suppress stdout output (for TUI mode) debugLogPath string // Path to save debug log on error // TUI progress callback for detailed progress reporting progressCallback ProgressCallback dbProgressCallback DatabaseProgressCallback dbProgressTimingCallback DatabaseProgressWithTimingCallback dbProgressByBytesCallback DatabaseProgressByBytesCallback } // New creates a new restore engine func New(cfg *config.Config, log logger.Logger, db database.Database) *Engine { progressIndicator := progress.NewIndicator(true, "line") detailedReporter := progress.NewDetailedReporter(progressIndicator, &loggerAdapter{logger: log}) return &Engine{ cfg: cfg, log: log, db: db, progress: progressIndicator, detailedReporter: detailedReporter, dryRun: false, } } // NewSilent creates a new restore engine with no stdout progress (for TUI mode) func NewSilent(cfg *config.Config, log logger.Logger, db database.Database) *Engine { progressIndicator := progress.NewNullIndicator() detailedReporter := progress.NewDetailedReporter(progressIndicator, &loggerAdapter{logger: log}) return &Engine{ cfg: cfg, log: log, db: db, progress: progressIndicator, detailedReporter: detailedReporter, dryRun: false, silentMode: true, // Suppress stdout for TUI } } // NewWithProgress creates a restore engine with custom progress indicator func NewWithProgress(cfg *config.Config, log logger.Logger, db database.Database, progressIndicator progress.Indicator, dryRun bool) *Engine { if progressIndicator == nil { progressIndicator = progress.NewNullIndicator() } detailedReporter := progress.NewDetailedReporter(progressIndicator, &loggerAdapter{logger: log}) return &Engine{ cfg: cfg, log: log, db: db, progress: progressIndicator, detailedReporter: detailedReporter, dryRun: dryRun, } } // SetDebugLogPath enables saving detailed error reports on failure func (e *Engine) SetDebugLogPath(path string) { e.debugLogPath = path } // SetProgressCallback sets a callback for detailed progress reporting (for TUI mode) func (e *Engine) SetProgressCallback(cb ProgressCallback) { e.progressCallback = cb } // SetDatabaseProgressCallback sets a callback for database count progress during cluster restore func (e *Engine) SetDatabaseProgressCallback(cb DatabaseProgressCallback) { e.dbProgressCallback = cb } // SetDatabaseProgressWithTimingCallback sets a callback for database progress with timing info func (e *Engine) SetDatabaseProgressWithTimingCallback(cb DatabaseProgressWithTimingCallback) { e.dbProgressTimingCallback = cb } // SetDatabaseProgressByBytesCallback sets a callback for progress weighted by database sizes func (e *Engine) SetDatabaseProgressByBytesCallback(cb DatabaseProgressByBytesCallback) { e.dbProgressByBytesCallback = cb } // reportProgress safely calls the progress callback if set func (e *Engine) reportProgress(current, total int64, description string) { if e.progressCallback != nil { e.progressCallback(current, total, description) } } // reportDatabaseProgress safely calls the database progress callback if set func (e *Engine) reportDatabaseProgress(done, total int, dbName string) { // CRITICAL: Add panic recovery to prevent crashes during TUI shutdown defer func() { if r := recover(); r != nil { e.log.Warn("Database progress callback panic recovered", "panic", r, "db", dbName) } }() if e.dbProgressCallback != nil { e.dbProgressCallback(done, total, dbName) } } // reportDatabaseProgressWithTiming safely calls the timing-aware callback if set func (e *Engine) reportDatabaseProgressWithTiming(done, total int, dbName string, phaseElapsed, avgPerDB time.Duration) { // CRITICAL: Add panic recovery to prevent crashes during TUI shutdown defer func() { if r := recover(); r != nil { e.log.Warn("Database timing progress callback panic recovered", "panic", r, "db", dbName) } }() if e.dbProgressTimingCallback != nil { e.dbProgressTimingCallback(done, total, dbName, phaseElapsed, avgPerDB) } } // reportDatabaseProgressByBytes safely calls the bytes-weighted callback if set func (e *Engine) reportDatabaseProgressByBytes(bytesDone, bytesTotal int64, dbName string, dbDone, dbTotal int) { // CRITICAL: Add panic recovery to prevent crashes during TUI shutdown defer func() { if r := recover(); r != nil { e.log.Warn("Database bytes progress callback panic recovered", "panic", r, "db", dbName) } }() if e.dbProgressByBytesCallback != nil { e.dbProgressByBytesCallback(bytesDone, bytesTotal, dbName, dbDone, dbTotal) } } // loggerAdapter adapts our logger to the progress.Logger interface type loggerAdapter struct { logger logger.Logger } func (la *loggerAdapter) Info(msg string, args ...any) { la.logger.Info(msg, args...) } func (la *loggerAdapter) Warn(msg string, args ...any) { la.logger.Warn(msg, args...) } func (la *loggerAdapter) Error(msg string, args ...any) { la.logger.Error(msg, args...) } func (la *loggerAdapter) Debug(msg string, args ...any) { la.logger.Debug(msg, args...) } // RestoreSingle restores a single database from an archive func (e *Engine) RestoreSingle(ctx context.Context, archivePath, targetDB string, cleanFirst, createIfMissing bool) error { operation := e.log.StartOperation("Single Database Restore") startTime := time.Now() // Validate and sanitize archive path validArchivePath, pathErr := security.ValidateArchivePath(archivePath) if pathErr != nil { operation.Fail(fmt.Sprintf("Invalid archive path: %v", pathErr)) return fmt.Errorf("invalid archive path: %w", pathErr) } archivePath = validArchivePath // Get archive size for metrics var archiveSize int64 if fi, err := os.Stat(archivePath); err == nil { archiveSize = fi.Size() } // Validate archive exists if _, err := os.Stat(archivePath); os.IsNotExist(err) { operation.Fail("Archive not found") return fmt.Errorf("archive not found: %s", archivePath) } // Verify checksum if .sha256 file exists if checksumErr := security.LoadAndVerifyChecksum(archivePath); checksumErr != nil { e.log.Warn("Checksum verification failed", "error", checksumErr) e.log.Warn("Continuing restore without checksum verification (use with caution)") } else { e.log.Info("[OK] Archive checksum verified successfully") } // Detect archive format format := DetectArchiveFormat(archivePath) e.log.Info("Detected archive format", "format", format, "path", archivePath) // Check version compatibility for PostgreSQL dumps if format == FormatPostgreSQLDump || format == FormatPostgreSQLDumpGz { if compatResult, err := e.CheckRestoreVersionCompatibility(ctx, archivePath); err == nil && compatResult != nil { e.log.Info(compatResult.Message, "source_version", compatResult.SourceVersion.Full, "target_version", compatResult.TargetVersion.Full, "compatibility", compatResult.Level.String()) // Block unsupported downgrades if !compatResult.Compatible { operation.Fail(compatResult.Message) return fmt.Errorf("version compatibility error: %s", compatResult.Message) } // Show warnings for risky upgrades if compatResult.Level == CompatibilityLevelRisky || compatResult.Level == CompatibilityLevelWarning { for _, warning := range compatResult.Warnings { e.log.Warn(warning) } } } } if e.dryRun { e.log.Info("DRY RUN: Would restore single database", "archive", archivePath, "target", targetDB) return e.previewRestore(archivePath, targetDB, format) } // Start progress tracking e.progress.Start(fmt.Sprintf("Restoring database '%s' from %s", targetDB, filepath.Base(archivePath))) // Create database if requested and it doesn't exist if createIfMissing { e.log.Info("Checking if target database exists", "database", targetDB) if err := e.ensureDatabaseExists(ctx, targetDB); err != nil { operation.Fail(fmt.Sprintf("Failed to create database: %v", err)) return fmt.Errorf("failed to create database '%s': %w", targetDB, err) } } // Handle different archive formats var err error switch format { case FormatPostgreSQLDump, FormatPostgreSQLDumpGz: err = e.restorePostgreSQLDump(ctx, archivePath, targetDB, format == FormatPostgreSQLDumpGz, cleanFirst) case FormatPostgreSQLSQL, FormatPostgreSQLSQLGz: err = e.restorePostgreSQLSQL(ctx, archivePath, targetDB, format == FormatPostgreSQLSQLGz) case FormatMySQLSQL, FormatMySQLSQLGz: err = e.restoreMySQLSQL(ctx, archivePath, targetDB, format == FormatMySQLSQLGz) default: operation.Fail("Unsupported archive format") return fmt.Errorf("unsupported archive format: %s", format) } // Record restore metrics for Prometheus duration := time.Since(startTime) dbType := "postgresql" if format == FormatMySQLSQL || format == FormatMySQLSQLGz { dbType = "mysql" } record := RestoreRecord{ Database: targetDB, Engine: dbType, StartedAt: startTime, CompletedAt: time.Now(), Duration: duration, SizeBytes: archiveSize, ParallelJobs: e.cfg.Jobs, Profile: e.cfg.ResourceProfile, Success: err == nil, SourceFile: filepath.Base(archivePath), TargetDB: targetDB, IsCluster: false, } if err != nil { record.ErrorMessage = err.Error() } if recordErr := RecordRestore(record); recordErr != nil { e.log.Warn("Failed to record restore metrics", "error", recordErr) } if err != nil { e.progress.Fail(fmt.Sprintf("Restore failed: %v", err)) operation.Fail(fmt.Sprintf("Restore failed: %v", err)) return err } e.progress.Complete(fmt.Sprintf("Database '%s' restored successfully", targetDB)) operation.Complete(fmt.Sprintf("Restored database '%s' from %s", targetDB, filepath.Base(archivePath))) return nil } // restorePostgreSQLDump restores from PostgreSQL custom dump format func (e *Engine) restorePostgreSQLDump(ctx context.Context, archivePath, targetDB string, compressed bool, cleanFirst bool) error { // Build restore command // Use configured Jobs count for parallel pg_restore (matches pg_restore -j behavior) parallelJobs := e.cfg.Jobs if parallelJobs <= 0 { parallelJobs = 1 // Default fallback } opts := database.RestoreOptions{ Parallel: parallelJobs, Clean: cleanFirst, NoOwner: true, NoPrivileges: true, SingleTransaction: false, // CRITICAL: Disabled to prevent lock exhaustion with large objects Verbose: true, // Enable verbose for single database restores (not cluster) } cmd := e.db.BuildRestoreCommand(targetDB, archivePath, opts) // Start heartbeat ticker for restore progress (10s interval to reduce overhead) restoreStart := time.Now() heartbeatCtx, cancelHeartbeat := context.WithCancel(ctx) heartbeatTicker := time.NewTicker(10 * time.Second) defer heartbeatTicker.Stop() defer cancelHeartbeat() // Run heartbeat in background - no mutex needed as progress.Update is thread-safe go func() { for { select { case <-heartbeatTicker.C: elapsed := time.Since(restoreStart) e.progress.Update(fmt.Sprintf("Restoring %s... (elapsed: %s)", targetDB, formatDuration(elapsed))) case <-heartbeatCtx.Done(): return } } }() if compressed { // For compressed dumps, decompress first return e.executeRestoreWithDecompression(ctx, archivePath, cmd) } return e.executeRestoreCommand(ctx, cmd) } // restorePostgreSQLDumpWithOwnership restores from PostgreSQL custom dump with ownership control func (e *Engine) restorePostgreSQLDumpWithOwnership(ctx context.Context, archivePath, targetDB string, compressed bool, preserveOwnership bool) error { // Check if dump contains large objects (BLOBs) - if so, use phased restore // to prevent lock table exhaustion (max_locks_per_transaction OOM) hasLargeObjects := e.checkDumpHasLargeObjects(archivePath) if hasLargeObjects { e.log.Info("Large objects detected - using phased restore to prevent lock exhaustion", "database", targetDB, "archive", archivePath) return e.restorePostgreSQLDumpPhased(ctx, archivePath, targetDB, preserveOwnership) } // Standard restore for dumps without large objects // Use configured Jobs count for parallel pg_restore (matches pg_restore -j behavior) parallelJobs := e.cfg.Jobs if parallelJobs <= 0 { parallelJobs = 1 // Default fallback } opts := database.RestoreOptions{ Parallel: parallelJobs, Clean: false, // We already dropped the database NoOwner: !preserveOwnership, // Preserve ownership if we're superuser NoPrivileges: !preserveOwnership, // Preserve privileges if we're superuser SingleTransaction: false, // CRITICAL: Disabled to prevent lock exhaustion with large objects Verbose: false, // CRITICAL: disable verbose to prevent OOM on large restores } e.log.Info("Restoring database", "database", targetDB, "parallel_jobs", parallelJobs, "preserveOwnership", preserveOwnership, "noOwner", opts.NoOwner, "noPrivileges", opts.NoPrivileges) cmd := e.db.BuildRestoreCommand(targetDB, archivePath, opts) if compressed { // For compressed dumps, decompress first return e.executeRestoreWithDecompression(ctx, archivePath, cmd) } return e.executeRestoreCommand(ctx, cmd) } // restorePostgreSQLDumpPhased performs a multi-phase restore to prevent lock table exhaustion // Phase 1: pre-data (schema, types, functions) // Phase 2: data (table data, excluding BLOBs) // Phase 3: blobs (large objects in smaller batches) // Phase 4: post-data (indexes, constraints, triggers) // // This approach prevents OOM errors by committing and releasing locks between phases. func (e *Engine) restorePostgreSQLDumpPhased(ctx context.Context, archivePath, targetDB string, preserveOwnership bool) error { e.log.Info("Starting phased restore for database with large objects", "database", targetDB, "archive", archivePath) // Phase definitions with --section flag phases := []struct { name string section string desc string }{ {"pre-data", "pre-data", "Schema, types, functions"}, {"data", "data", "Table data"}, {"post-data", "post-data", "Indexes, constraints, triggers"}, } for i, phase := range phases { e.log.Info(fmt.Sprintf("Phase %d/%d: Restoring %s", i+1, len(phases), phase.name), "database", targetDB, "section", phase.section, "description", phase.desc) if err := e.restoreSection(ctx, archivePath, targetDB, phase.section, preserveOwnership); err != nil { // Check if it's an ignorable error if e.isIgnorableError(err.Error()) { e.log.Warn(fmt.Sprintf("Phase %d completed with ignorable errors", i+1), "section", phase.section, "error", err) continue } return fmt.Errorf("phase %d (%s) failed: %w", i+1, phase.name, err) } e.log.Info(fmt.Sprintf("Phase %d/%d completed successfully", i+1, len(phases)), "section", phase.section) } e.log.Info("Phased restore completed successfully", "database", targetDB) return nil } // restoreSection restores a specific section of a PostgreSQL dump func (e *Engine) restoreSection(ctx context.Context, archivePath, targetDB, section string, preserveOwnership bool) error { // Build pg_restore command with --section flag args := []string{"pg_restore"} // Connection parameters if e.cfg.Host != "localhost" { args = append(args, "-h", e.cfg.Host) args = append(args, "-p", fmt.Sprintf("%d", e.cfg.Port)) args = append(args, "--no-password") } args = append(args, "-U", e.cfg.User) // CRITICAL: Use configured Jobs for parallel restore (fixes slow phased restores) parallelJobs := e.cfg.Jobs if parallelJobs <= 0 { parallelJobs = 1 } args = append(args, fmt.Sprintf("--jobs=%d", parallelJobs)) e.log.Info("Phased restore section", "section", section, "parallel_jobs", parallelJobs) // Section-specific restore args = append(args, "--section="+section) // Options if !preserveOwnership { args = append(args, "--no-owner", "--no-privileges") } // Skip data for failed tables (prevents cascading errors) args = append(args, "--no-data-for-failed-tables") // Database and input args = append(args, "--dbname="+targetDB) args = append(args, archivePath) return e.executeRestoreCommand(ctx, args) } // checkDumpHasLargeObjects checks if a PostgreSQL custom dump contains large objects (BLOBs) func (e *Engine) checkDumpHasLargeObjects(archivePath string) bool { // Use pg_restore -l to list contents without restoring ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() cmd := cleanup.SafeCommand(ctx, "pg_restore", "-l", archivePath) output, err := cmd.Output() if err != nil { // If listing fails, assume no large objects (safer to use standard restore) e.log.Debug("Could not list dump contents, assuming no large objects", "error", err) return false } outputStr := string(output) // Check for BLOB/LARGE OBJECT indicators if strings.Contains(outputStr, "BLOB") || strings.Contains(outputStr, "LARGE OBJECT") || strings.Contains(outputStr, " BLOBS ") || strings.Contains(outputStr, "lo_create") { return true } return false } // restorePostgreSQLSQL restores from PostgreSQL SQL script func (e *Engine) restorePostgreSQLSQL(ctx context.Context, archivePath, targetDB string, compressed bool) error { // Pre-validate SQL dump to detect truncation BEFORE attempting restore // This saves time by catching corrupted files early (vs 49min failures) if err := e.quickValidateSQLDump(archivePath, compressed); err != nil { e.log.Error("Pre-restore validation failed - dump file appears corrupted", "file", archivePath, "error", err) return fmt.Errorf("dump validation failed: %w - the backup file may be truncated or corrupted", err) } // USE NATIVE ENGINE if configured // This uses pure Go (pgx) instead of psql if e.cfg.UseNativeEngine { e.log.Info("Using native Go engine for restore", "database", targetDB, "file", archivePath) nativeErr := e.restoreWithNativeEngine(ctx, archivePath, targetDB, compressed) if nativeErr != nil { if e.cfg.FallbackToTools { e.log.Warn("Native restore failed, falling back to psql", "database", targetDB, "error", nativeErr) } else { return fmt.Errorf("native restore failed: %w", nativeErr) } } else { return nil // Native restore succeeded! } } // Use psql for SQL scripts (fallback or non-native mode) var cmd []string // For localhost, omit -h to use Unix socket (avoids Ident auth issues) hostArg := "" if e.cfg.Host != "localhost" && e.cfg.Host != "" { hostArg = fmt.Sprintf("-h %s", e.cfg.Host) } if compressed { // Use in-process pgzip decompression (parallel, no external process) return e.executeRestoreWithPgzipStream(ctx, archivePath, targetDB, "postgresql") } else { // NOTE: We do NOT use ON_ERROR_STOP=1 (see above) if hostArg != "" { cmd = []string{ "psql", "-h", e.cfg.Host, "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", targetDB, "-f", archivePath, } } else { cmd = []string{ "psql", "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", targetDB, "-f", archivePath, } } } return e.executeRestoreCommand(ctx, cmd) } // restoreWithNativeEngine restores a SQL file using the pure Go native engine func (e *Engine) restoreWithNativeEngine(ctx context.Context, archivePath, targetDB string, compressed bool) error { // Create native engine config nativeCfg := &native.PostgreSQLNativeConfig{ Host: e.cfg.Host, Port: e.cfg.Port, User: e.cfg.User, Password: e.cfg.Password, Database: targetDB, // Connect to target database SSLMode: e.cfg.SSLMode, } // Use PARALLEL restore engine for SQL format - this matches pg_restore -j performance! // The parallel engine: // 1. Executes schema statements sequentially (CREATE TABLE, etc.) // 2. Executes COPY data loading in PARALLEL (like pg_restore -j8) // 3. Creates indexes and constraints in PARALLEL parallelWorkers := e.cfg.Jobs if parallelWorkers < 1 { parallelWorkers = 4 } e.log.Info("Using PARALLEL native restore engine", "workers", parallelWorkers, "database", targetDB, "archive", archivePath) // Pass context to ensure pool is properly closed on Ctrl+C cancellation parallelEngine, err := native.NewParallelRestoreEngineWithContext(ctx, nativeCfg, e.log, parallelWorkers) if err != nil { e.log.Warn("Failed to create parallel restore engine, falling back to sequential", "error", err) // Fall back to sequential restore return e.restoreWithSequentialNativeEngine(ctx, archivePath, targetDB, compressed) } defer parallelEngine.Close() // Run parallel restore with progress callbacks options := &native.ParallelRestoreOptions{ Workers: parallelWorkers, ContinueOnError: true, ProgressCallback: func(phase string, current, total int, tableName string) { switch phase { case "parsing": e.log.Debug("Parsing SQL dump...") case "schema": if current%50 == 0 { e.log.Debug("Creating schema", "progress", current, "total", total) } case "data": e.log.Debug("Loading data", "table", tableName, "progress", current, "total", total) // Report progress to TUI e.reportDatabaseProgress(current, total, tableName) case "indexes": e.log.Debug("Creating indexes", "progress", current, "total", total) } }, } result, err := parallelEngine.RestoreFile(ctx, archivePath, options) if err != nil { return fmt.Errorf("parallel native restore failed: %w", err) } e.log.Info("Parallel native restore completed", "database", targetDB, "tables", result.TablesRestored, "rows", result.RowsRestored, "indexes", result.IndexesCreated, "duration", result.Duration) return nil } // restoreWithSequentialNativeEngine is the fallback sequential restore func (e *Engine) restoreWithSequentialNativeEngine(ctx context.Context, archivePath, targetDB string, compressed bool) error { nativeCfg := &native.PostgreSQLNativeConfig{ Host: e.cfg.Host, Port: e.cfg.Port, User: e.cfg.User, Password: e.cfg.Password, Database: targetDB, SSLMode: e.cfg.SSLMode, } // Create restore engine restoreEngine, err := native.NewPostgreSQLRestoreEngine(nativeCfg, e.log) if err != nil { return fmt.Errorf("failed to create native restore engine: %w", err) } defer restoreEngine.Close() // Open input file file, err := os.Open(archivePath) if err != nil { return fmt.Errorf("failed to open backup file: %w", err) } defer file.Close() var reader io.Reader = file // Handle compression if compressed { gzReader, err := pgzip.NewReader(file) if err != nil { return fmt.Errorf("failed to create gzip reader: %w", err) } defer gzReader.Close() reader = gzReader } // Restore with progress tracking options := &native.RestoreOptions{ Database: targetDB, ContinueOnError: true, // Be resilient like pg_restore ProgressCallback: func(progress *native.RestoreProgress) { e.log.Debug("Native restore progress", "operation", progress.Operation, "objects", progress.ObjectsCompleted, "rows", progress.RowsProcessed) }, } result, err := restoreEngine.Restore(ctx, reader, options) if err != nil { return fmt.Errorf("native restore failed: %w", err) } e.log.Info("Native restore completed", "database", targetDB, "objects", result.ObjectsProcessed, "duration", result.Duration) return nil } // restoreMySQLSQL restores from MySQL SQL script func (e *Engine) restoreMySQLSQL(ctx context.Context, archivePath, targetDB string, compressed bool) error { options := database.RestoreOptions{} cmd := e.db.BuildRestoreCommand(targetDB, archivePath, options) if compressed { // Use in-process pgzip decompression (parallel, no external process) return e.executeRestoreWithPgzipStream(ctx, archivePath, targetDB, "mysql") } return e.executeRestoreCommand(ctx, cmd) } // executeRestoreCommand executes a restore command func (e *Engine) executeRestoreCommand(ctx context.Context, cmdArgs []string) error { return e.executeRestoreCommandWithContext(ctx, cmdArgs, "", "", FormatUnknown) } // executeRestoreCommandWithContext executes a restore command with error collection context func (e *Engine) executeRestoreCommandWithContext(ctx context.Context, cmdArgs []string, archivePath, targetDB string, format ArchiveFormat) error { e.log.Info("Executing restore command", "command", strings.Join(cmdArgs, " ")) cmd := cleanup.SafeCommand(ctx, cmdArgs[0], cmdArgs[1:]...) // Set environment variables cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password), fmt.Sprintf("MYSQL_PWD=%s", e.cfg.Password), ) // Create error collector if debug log path is set var collector *ErrorCollector if e.debugLogPath != "" { collector = NewErrorCollector(e.cfg, e.log, archivePath, targetDB, format, true) } // Stream stderr to avoid memory issues with large output // Don't use CombinedOutput() as it loads everything into memory stderr, err := cmd.StderrPipe() if err != nil { return fmt.Errorf("failed to create stderr pipe: %w", err) } if err := cmd.Start(); err != nil { return fmt.Errorf("failed to start restore command: %w", err) } // Read stderr in goroutine to avoid blocking var lastError string var errorCount int stderrDone := make(chan struct{}) go func() { defer close(stderrDone) buf := make([]byte, 4096) const maxErrors = 10 // Limit captured errors to prevent OOM for { n, err := stderr.Read(buf) if n > 0 { chunk := string(buf[:n]) // Feed to error collector if enabled if collector != nil { collector.CaptureStderr(chunk) } // Only capture REAL errors, not verbose output if strings.Contains(chunk, "ERROR:") || strings.Contains(chunk, "FATAL:") || strings.Contains(chunk, "error:") { lastError = strings.TrimSpace(chunk) errorCount++ if errorCount <= maxErrors { e.log.Warn("Restore stderr", "output", chunk) } } // Note: --verbose output is discarded to prevent OOM } if err != nil { break } } }() // Wait for command with proper context handling cmdDone := make(chan error, 1) go func() { cmdDone <- cmd.Wait() }() var cmdErr error select { case cmdErr = <-cmdDone: // Command completed (success or failure) case <-ctx.Done(): // Context cancelled - kill entire process group e.log.Warn("Restore cancelled - killing process group") cleanup.KillCommandGroup(cmd) <-cmdDone cmdErr = ctx.Err() } // Wait for stderr reader to finish <-stderrDone if cmdErr != nil { // Get exit code exitCode := 1 if exitErr, ok := cmdErr.(*exec.ExitError); ok { exitCode = exitErr.ExitCode() } // PostgreSQL pg_restore returns exit code 1 even for ignorable errors // Check if errors are ignorable (already exists, duplicate, etc.) if lastError != "" && e.isIgnorableError(lastError) { e.log.Warn("Restore completed with ignorable errors", "error_count", errorCount, "last_error", lastError) return nil // Success despite ignorable errors } // Classify error and provide helpful hints var classification *checks.ErrorClassification var errType, errHint string if lastError != "" { classification = checks.ClassifyError(lastError) errType = classification.Type errHint = classification.Hint // CRITICAL: Detect "out of shared memory" / lock exhaustion errors // This means max_locks_per_transaction is insufficient if strings.Contains(lastError, "out of shared memory") || strings.Contains(lastError, "max_locks_per_transaction") { e.log.Error("šŸ”“ LOCK EXHAUSTION DETECTED during restore - this should have been prevented", "last_error", lastError, "database", targetDB, "action", "Report this to developers - preflight checks should have caught this") // Return a special error that signals lock exhaustion // The caller can decide to retry with reduced parallelism return fmt.Errorf("LOCK_EXHAUSTION: %s - max_locks_per_transaction insufficient (error: %w)", lastError, cmdErr) } e.log.Error("Restore command failed", "error", err, "last_stderr", lastError, "error_count", errorCount, "error_type", classification.Type, "hint", classification.Hint, "action", classification.Action) } else { e.log.Error("Restore command failed", "error", err, "error_count", errorCount) } // Generate and save error report if collector is enabled if collector != nil { collector.SetExitCode(exitCode) report := collector.GenerateReport( lastError, errType, errHint, ) // Print report to console collector.PrintReport(report) // Save to file if e.debugLogPath != "" { if saveErr := collector.SaveReport(report, e.debugLogPath); saveErr != nil { e.log.Warn("Failed to save debug log", "error", saveErr) } else { e.log.Info("Debug log saved", "path", e.debugLogPath) fmt.Printf("\n[LOG] Detailed error report saved to: %s\n", e.debugLogPath) } } } if lastError != "" { return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d) - %s", err, lastError, errorCount, errHint) } return fmt.Errorf("restore failed: %w", err) } e.log.Info("Restore command completed successfully") return nil } // executeRestoreWithDecompression handles decompression during restore using in-process pgzip func (e *Engine) executeRestoreWithDecompression(ctx context.Context, archivePath string, restoreCmd []string) error { e.log.Info("Using in-process pgzip decompression (parallel)", "archive", archivePath) // Open the gzip file file, err := os.Open(archivePath) if err != nil { return fmt.Errorf("failed to open archive: %w", err) } defer file.Close() // Create parallel gzip reader gz, err := pgzip.NewReader(file) if err != nil { return fmt.Errorf("failed to create pgzip reader: %w", err) } defer gz.Close() // Start restore command cmd := cleanup.SafeCommand(ctx, restoreCmd[0], restoreCmd[1:]...) cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password), fmt.Sprintf("MYSQL_PWD=%s", e.cfg.Password), ) // Pipe decompressed data to restore command stdin stdin, err := cmd.StdinPipe() if err != nil { return fmt.Errorf("failed to create stdin pipe: %w", err) } // Capture stderr stderr, err := cmd.StderrPipe() if err != nil { return fmt.Errorf("failed to create stderr pipe: %w", err) } if err := cmd.Start(); err != nil { return fmt.Errorf("failed to start restore command: %w", err) } // Stream decompressed data to restore command in goroutine copyDone := make(chan error, 1) go func() { _, copyErr := fs.CopyWithContext(ctx, stdin, gz) stdin.Close() copyDone <- copyErr }() // Read stderr in goroutine var lastError string var errorCount int stderrDone := make(chan struct{}) go func() { defer close(stderrDone) scanner := bufio.NewScanner(stderr) // Increase buffer size for long lines buf := make([]byte, 64*1024) scanner.Buffer(buf, 1024*1024) for scanner.Scan() { line := scanner.Text() if strings.Contains(strings.ToLower(line), "error") || strings.Contains(line, "ERROR") || strings.Contains(line, "FATAL") { lastError = line errorCount++ e.log.Debug("Restore stderr", "line", line) } } }() // Wait for copy to complete copyErr := <-copyDone // Wait for command cmdErr := cmd.Wait() <-stderrDone if copyErr != nil && cmdErr == nil { return fmt.Errorf("decompression failed: %w", copyErr) } if cmdErr != nil { if lastError != "" && e.isIgnorableError(lastError) { e.log.Warn("Restore completed with ignorable errors", "error_count", errorCount) return nil } if lastError != "" { classification := checks.ClassifyError(lastError) return fmt.Errorf("restore failed: %w (last error: %s) - %s", cmdErr, lastError, classification.Hint) } return fmt.Errorf("restore failed: %w", cmdErr) } e.log.Info("Restore with pgzip decompression completed successfully") return nil } // executeRestoreWithPgzipStream handles SQL restore with in-process pgzip decompression func (e *Engine) executeRestoreWithPgzipStream(ctx context.Context, archivePath, targetDB, dbType string) error { e.log.Info("Using in-process pgzip stream for SQL restore", "archive", archivePath, "database", targetDB, "type", dbType) // Open the gzip file file, err := os.Open(archivePath) if err != nil { return fmt.Errorf("failed to open archive: %w", err) } defer file.Close() // Create parallel gzip reader gz, err := pgzip.NewReader(file) if err != nil { return fmt.Errorf("failed to create pgzip reader: %w", err) } defer gz.Close() // Build restore command based on database type var cmd *exec.Cmd if dbType == "postgresql" { // Add performance tuning via psql preamble commands // These are executed before the SQL dump to speed up bulk loading preamble := ` SET synchronous_commit = 'off'; SET work_mem = '256MB'; SET maintenance_work_mem = '1GB'; SET max_parallel_workers_per_gather = 4; SET max_parallel_maintenance_workers = 4; SET wal_level = 'minimal'; SET fsync = off; SET full_page_writes = off; SET checkpoint_timeout = '1h'; SET max_wal_size = '10GB'; ` // Note: Some settings require superuser - we try them but continue if they fail // The -c flags run before the main script args := []string{ "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", targetDB, "-c", "SET synchronous_commit = 'off'", "-c", "SET work_mem = '256MB'", "-c", "SET maintenance_work_mem = '1GB'", } if e.cfg.Host != "localhost" && e.cfg.Host != "" { args = append([]string{"-h", e.cfg.Host}, args...) } e.log.Info("Applying PostgreSQL performance tuning for SQL restore", "preamble_settings", 3) _ = preamble // Documented for reference cmd = cleanup.SafeCommand(ctx, "psql", args...) cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password)) } else { // MySQL - use MYSQL_PWD env var to avoid password in process list args := []string{"-u", e.cfg.User} if e.cfg.Host != "localhost" && e.cfg.Host != "" { args = append(args, "-h", e.cfg.Host) } args = append(args, "-P", fmt.Sprintf("%d", e.cfg.Port), targetDB) cmd = cleanup.SafeCommand(ctx, "mysql", args...) // Pass password via environment variable to avoid process list exposure cmd.Env = os.Environ() if e.cfg.Password != "" { cmd.Env = append(cmd.Env, "MYSQL_PWD="+e.cfg.Password) } } // Pipe decompressed data to restore command stdin stdin, err := cmd.StdinPipe() if err != nil { return fmt.Errorf("failed to create stdin pipe: %w", err) } // Capture stderr stderr, err := cmd.StderrPipe() if err != nil { return fmt.Errorf("failed to create stderr pipe: %w", err) } if err := cmd.Start(); err != nil { return fmt.Errorf("failed to start restore command: %w", err) } // Stream decompressed data to restore command in goroutine copyDone := make(chan error, 1) go func() { _, copyErr := fs.CopyWithContext(ctx, stdin, gz) stdin.Close() copyDone <- copyErr }() // Read stderr in goroutine var lastError string var errorCount int stderrDone := make(chan struct{}) go func() { defer close(stderrDone) scanner := bufio.NewScanner(stderr) buf := make([]byte, 64*1024) scanner.Buffer(buf, 1024*1024) for scanner.Scan() { line := scanner.Text() if strings.Contains(strings.ToLower(line), "error") || strings.Contains(line, "ERROR") || strings.Contains(line, "FATAL") { lastError = line errorCount++ e.log.Debug("Restore stderr", "line", line) } } }() // Wait for copy to complete copyErr := <-copyDone // Wait for command cmdErr := cmd.Wait() <-stderrDone if copyErr != nil && cmdErr == nil { return fmt.Errorf("pgzip decompression failed: %w", copyErr) } if cmdErr != nil { if lastError != "" && e.isIgnorableError(lastError) { e.log.Warn("SQL restore completed with ignorable errors", "error_count", errorCount) return nil } if lastError != "" { classification := checks.ClassifyError(lastError) return fmt.Errorf("restore failed: %w (last error: %s) - %s", cmdErr, lastError, classification.Hint) } return fmt.Errorf("restore failed: %w", cmdErr) } e.log.Info("SQL restore with pgzip stream completed successfully") return nil } // previewRestore shows what would be done without executing func (e *Engine) previewRestore(archivePath, targetDB string, format ArchiveFormat) error { fmt.Println("\n" + strings.Repeat("=", 60)) fmt.Println(" RESTORE PREVIEW (DRY RUN)") fmt.Println(strings.Repeat("=", 60)) stat, _ := os.Stat(archivePath) fmt.Printf("\nArchive: %s\n", filepath.Base(archivePath)) fmt.Printf("Format: %s\n", format) if stat != nil { fmt.Printf("Size: %s\n", FormatBytes(stat.Size())) fmt.Printf("Modified: %s\n", stat.ModTime().Format("2006-01-02 15:04:05")) } fmt.Printf("Target Database: %s\n", targetDB) fmt.Printf("Target Host: %s:%d\n", e.cfg.Host, e.cfg.Port) fmt.Println("\nOperations that would be performed:") switch format { case FormatPostgreSQLDump: fmt.Printf(" 1. Execute: pg_restore -d %s %s\n", targetDB, archivePath) case FormatPostgreSQLDumpGz: fmt.Printf(" 1. Decompress: %s\n", archivePath) fmt.Printf(" 2. Execute: pg_restore -d %s\n", targetDB) case FormatPostgreSQLSQL, FormatPostgreSQLSQLGz: fmt.Printf(" 1. Execute: psql -d %s -f %s\n", targetDB, archivePath) case FormatMySQLSQL, FormatMySQLSQLGz: fmt.Printf(" 1. Execute: mysql %s < %s\n", targetDB, archivePath) } fmt.Println("\n[WARN] WARNING: This will restore data to the target database.") fmt.Println(" Existing data may be overwritten or merged.") fmt.Println("\nTo execute this restore, add the --confirm flag.") fmt.Println(strings.Repeat("=", 60) + "\n") return nil } // RestoreSingleFromCluster extracts and restores a single database from a cluster backup func (e *Engine) RestoreSingleFromCluster(ctx context.Context, clusterArchivePath, dbName, targetDB string, cleanFirst, createIfMissing bool) error { operation := e.log.StartOperation("Single Database Restore from Cluster") // Validate and sanitize archive path validArchivePath, pathErr := security.ValidateArchivePath(clusterArchivePath) if pathErr != nil { operation.Fail(fmt.Sprintf("Invalid archive path: %v", pathErr)) return fmt.Errorf("invalid archive path: %w", pathErr) } clusterArchivePath = validArchivePath // Validate archive exists if _, err := os.Stat(clusterArchivePath); os.IsNotExist(err) { operation.Fail("Archive not found") return fmt.Errorf("archive not found: %s", clusterArchivePath) } // Verify it's a cluster archive format := DetectArchiveFormat(clusterArchivePath) if format != FormatClusterTarGz { operation.Fail("Not a cluster archive") return fmt.Errorf("not a cluster archive: %s (format: %s)", clusterArchivePath, format) } // Create temporary directory for extraction workDir := e.cfg.GetEffectiveWorkDir() tempDir := filepath.Join(workDir, fmt.Sprintf(".extract_%d", time.Now().Unix())) if err := os.MkdirAll(tempDir, 0755); err != nil { operation.Fail("Failed to create temporary directory") return fmt.Errorf("failed to create temp directory: %w", err) } defer os.RemoveAll(tempDir) // Extract the specific database from cluster archive e.log.Info("Extracting database from cluster backup", "database", dbName, "cluster", filepath.Base(clusterArchivePath)) e.progress.Start(fmt.Sprintf("Extracting '%s' from cluster backup", dbName)) extractedPath, err := ExtractDatabaseFromCluster(ctx, clusterArchivePath, dbName, tempDir, e.log, e.progress) if err != nil { e.progress.Fail(fmt.Sprintf("Extraction failed: %v", err)) operation.Fail(fmt.Sprintf("Extraction failed: %v", err)) return fmt.Errorf("failed to extract database: %w", err) } e.progress.Update(fmt.Sprintf("Extracted: %s", filepath.Base(extractedPath))) e.log.Info("Database extracted successfully", "path", extractedPath) // Now restore the extracted database file e.progress.Update("Restoring database...") // Create database if requested and it doesn't exist if createIfMissing { e.log.Info("Checking if target database exists", "database", targetDB) if err := e.ensureDatabaseExists(ctx, targetDB); err != nil { operation.Fail(fmt.Sprintf("Failed to create database: %v", err)) return fmt.Errorf("failed to create database '%s': %w", targetDB, err) } } // Detect format of extracted file extractedFormat := DetectArchiveFormat(extractedPath) e.log.Info("Restoring extracted database", "format", extractedFormat, "target", targetDB) // Restore based on format var restoreErr error switch extractedFormat { case FormatPostgreSQLDump, FormatPostgreSQLDumpGz: restoreErr = e.restorePostgreSQLDump(ctx, extractedPath, targetDB, extractedFormat == FormatPostgreSQLDumpGz, cleanFirst) case FormatPostgreSQLSQL, FormatPostgreSQLSQLGz: restoreErr = e.restorePostgreSQLSQL(ctx, extractedPath, targetDB, extractedFormat == FormatPostgreSQLSQLGz) case FormatMySQLSQL, FormatMySQLSQLGz: restoreErr = e.restoreMySQLSQL(ctx, extractedPath, targetDB, extractedFormat == FormatMySQLSQLGz) default: operation.Fail("Unsupported extracted format") return fmt.Errorf("unsupported extracted format: %s", extractedFormat) } if restoreErr != nil { e.progress.Fail(fmt.Sprintf("Restore failed: %v", restoreErr)) operation.Fail(fmt.Sprintf("Restore failed: %v", restoreErr)) return restoreErr } e.progress.Complete(fmt.Sprintf("Database '%s' restored from cluster backup", targetDB)) operation.Complete(fmt.Sprintf("Restored '%s' from cluster as '%s'", dbName, targetDB)) return nil } // RestoreCluster restores a full cluster from a tar.gz archive // If preExtractedPath is non-empty, uses that directory instead of extracting archivePath // This avoids double extraction when ValidateAndExtractCluster was already called func (e *Engine) RestoreCluster(ctx context.Context, archivePath string, preExtractedPath ...string) error { operation := e.log.StartOperation("Cluster Restore") clusterStartTime := time.Now() // šŸš€ LOG ACTUAL PERFORMANCE SETTINGS - helps debug slow restores profile := e.cfg.GetCurrentProfile() if profile != nil { e.log.Info("šŸš€ RESTORE PERFORMANCE SETTINGS", "profile", profile.Name, "cluster_parallelism", profile.ClusterParallelism, "pg_restore_jobs", profile.Jobs, "large_db_mode", e.cfg.LargeDBMode, "buffered_io", profile.BufferedIO) } else { e.log.Info("šŸš€ RESTORE PERFORMANCE SETTINGS (raw config)", "profile", e.cfg.ResourceProfile, "cluster_parallelism", e.cfg.ClusterParallelism, "pg_restore_jobs", e.cfg.Jobs, "large_db_mode", e.cfg.LargeDBMode) } // Also show in progress bar for TUI visibility if !e.silentMode { fmt.Printf("\n⚔ Performance: profile=%s, parallel_dbs=%d, pg_restore_jobs=%d\n\n", e.cfg.ResourceProfile, e.cfg.ClusterParallelism, e.cfg.Jobs) } // Validate and sanitize archive path validArchivePath, pathErr := security.ValidateArchivePath(archivePath) if pathErr != nil { operation.Fail(fmt.Sprintf("Invalid archive path: %v", pathErr)) return fmt.Errorf("invalid archive path: %w", pathErr) } archivePath = validArchivePath // Validate archive exists if _, err := os.Stat(archivePath); os.IsNotExist(err) { operation.Fail("Archive not found") return fmt.Errorf("archive not found: %s", archivePath) } // Verify checksum if .sha256 file exists if checksumErr := security.LoadAndVerifyChecksum(archivePath); checksumErr != nil { e.log.Warn("Checksum verification failed", "error", checksumErr) e.log.Warn("Continuing restore without checksum verification (use with caution)") } else { e.log.Info("[OK] Cluster archive checksum verified successfully") } format := DetectArchiveFormat(archivePath) if !format.CanBeClusterRestore() { operation.Fail("Invalid cluster archive format") return fmt.Errorf("not a valid cluster restore format: %s (detected format: %s). Supported: .tar.gz, .sql, .sql.gz", archivePath, format) } // For SQL-based cluster restores, use a different restore path if format == FormatPostgreSQLSQL || format == FormatPostgreSQLSQLGz { return e.restoreClusterFromSQL(ctx, archivePath, operation) } // Check if we have a pre-extracted directory (optimization to avoid double extraction) // This check must happen BEFORE disk space checks to avoid false failures usingPreExtracted := len(preExtractedPath) > 0 && preExtractedPath[0] != "" // Check disk space before starting restore (skip if using pre-extracted directory) var archiveInfo os.FileInfo var err error if !usingPreExtracted { e.log.Info("Checking disk space for restore") archiveInfo, err = os.Stat(archivePath) if err == nil { spaceCheck := checks.CheckDiskSpaceForRestore(e.cfg.BackupDir, archiveInfo.Size()) if spaceCheck.Critical { operation.Fail("Insufficient disk space") return fmt.Errorf("insufficient disk space for restore: %.1f%% used - need at least 4x archive size", spaceCheck.UsedPercent) } if spaceCheck.Warning { e.log.Warn("Low disk space - restore may fail", "available_gb", float64(spaceCheck.AvailableBytes)/(1024*1024*1024), "used_percent", spaceCheck.UsedPercent) } } } else { e.log.Info("Skipping disk space check (using pre-extracted directory)") } if e.dryRun { e.log.Info("DRY RUN: Would restore cluster", "archive", archivePath) return e.previewClusterRestore(archivePath) } e.progress.Start(fmt.Sprintf("Restoring cluster from %s", filepath.Base(archivePath))) // Create temporary extraction directory in configured WorkDir workDir := e.cfg.GetEffectiveWorkDir() tempDir := filepath.Join(workDir, fmt.Sprintf(".restore_%d", time.Now().Unix())) // Handle pre-extracted directory or extract archive if usingPreExtracted { tempDir = preExtractedPath[0] // Note: Caller handles cleanup of pre-extracted directory e.log.Info("Using pre-extracted cluster directory", "path", tempDir, "optimization", "skipping duplicate extraction") } else { // Check disk space for extraction (need ~3x archive size: compressed + extracted + working space) if archiveInfo != nil { requiredBytes := uint64(archiveInfo.Size()) * 3 extractionCheck := checks.CheckDiskSpace(workDir) if extractionCheck.AvailableBytes < requiredBytes { operation.Fail("Insufficient disk space for extraction") return fmt.Errorf("insufficient disk space for extraction in %s: need %.1f GB, have %.1f GB (archive size: %.1f GB Ɨ 3)", workDir, float64(requiredBytes)/(1024*1024*1024), float64(extractionCheck.AvailableBytes)/(1024*1024*1024), float64(archiveInfo.Size())/(1024*1024*1024)) } e.log.Info("Disk space check for extraction passed", "workdir", workDir, "required_gb", float64(requiredBytes)/(1024*1024*1024), "available_gb", float64(extractionCheck.AvailableBytes)/(1024*1024*1024)) } // Need to extract archive ourselves if err := os.MkdirAll(tempDir, 0755); err != nil { operation.Fail("Failed to create temporary directory") return fmt.Errorf("failed to create temp directory in %s: %w", workDir, err) } defer os.RemoveAll(tempDir) // Extract archive e.log.Info("Extracting cluster archive", "archive", archivePath, "tempDir", tempDir) if err := e.extractArchive(ctx, archivePath, tempDir); err != nil { operation.Fail("Archive extraction failed") return fmt.Errorf("failed to extract archive: %w", err) } // Check context validity after extraction (debugging context cancellation issues) if ctx.Err() != nil { e.log.Error("Context cancelled after extraction - this should not happen", "context_error", ctx.Err(), "extraction_completed", true) operation.Fail("Context cancelled unexpectedly") return fmt.Errorf("context cancelled after extraction completed: %w", ctx.Err()) } e.log.Info("Extraction completed, context still valid") } // Check if user has superuser privileges (required for ownership restoration) e.progress.Update("Checking privileges...") isSuperuser, err := e.checkSuperuser(ctx) if err != nil { e.log.Warn("Could not verify superuser status", "error", err) isSuperuser = false // Assume not superuser if check fails } if !isSuperuser { e.log.Warn("Current user is not a superuser - database ownership may not be fully restored") e.progress.Update("[WARN] Warning: Non-superuser - ownership restoration limited") time.Sleep(2 * time.Second) // Give user time to see warning } else { e.log.Info("Superuser privileges confirmed - full ownership restoration enabled") } // Restore global objects FIRST (roles, tablespaces) - CRITICAL for ownership globalsFile := filepath.Join(tempDir, "globals.sql") if _, err := os.Stat(globalsFile); err == nil { e.log.Info("Restoring global objects (roles, tablespaces)") e.progress.Update("Restoring global objects (roles, tablespaces)...") if err := e.restoreGlobals(ctx, globalsFile); err != nil { e.log.Error("Failed to restore global objects", "error", err) if isSuperuser { // If we're superuser and can't restore globals, this is a problem e.progress.Fail("Failed to restore global objects") operation.Fail("Global objects restoration failed") return fmt.Errorf("failed to restore global objects: %w", err) } else { e.log.Warn("Continuing without global objects (may cause ownership issues)") } } else { e.log.Info("Successfully restored global objects") } } else { e.log.Warn("No globals.sql file found in backup - roles and tablespaces will not be restored") } // Restore individual databases dumpsDir := filepath.Join(tempDir, "dumps") if _, err := os.Stat(dumpsDir); err != nil { operation.Fail("No database dumps found in archive") return fmt.Errorf("no database dumps found in archive") } entries, err := os.ReadDir(dumpsDir) if err != nil { operation.Fail("Failed to read dumps directory") return fmt.Errorf("failed to read dumps directory: %w", err) } // PRE-VALIDATE all SQL dumps BEFORE starting restore // This catches truncated files early instead of failing after hours of work e.log.Info("Pre-validating dump files before restore...") e.progress.Update("Pre-validating dump files...") var corruptedDumps []string diagnoser := NewDiagnoser(e.log, false) for _, entry := range entries { if entry.IsDir() { continue } dumpFile := filepath.Join(dumpsDir, entry.Name()) if strings.HasSuffix(dumpFile, ".sql.gz") { result, err := diagnoser.DiagnoseFile(dumpFile) if err != nil { e.log.Warn("Could not validate dump file", "file", entry.Name(), "error", err) continue } if result.IsTruncated || result.IsCorrupted || !result.IsValid { dbName := strings.TrimSuffix(entry.Name(), ".sql.gz") errDetail := "unknown issue" if len(result.Errors) > 0 { errDetail = result.Errors[0] } corruptedDumps = append(corruptedDumps, fmt.Sprintf("%s: %s", dbName, errDetail)) e.log.Error("CORRUPTED dump file detected", "database", dbName, "file", entry.Name(), "truncated", result.IsTruncated, "errors", result.Errors) } } else if strings.HasSuffix(dumpFile, ".dump") { // Validate custom format dumps using pg_restore --list cmd := cleanup.SafeCommand(ctx, "pg_restore", "--list", dumpFile) output, err := cmd.CombinedOutput() if err != nil { dbName := strings.TrimSuffix(entry.Name(), ".dump") errDetail := strings.TrimSpace(string(output)) if len(errDetail) > 100 { errDetail = errDetail[:100] + "..." } // Check for truncation indicators if strings.Contains(errDetail, "unexpected end") || strings.Contains(errDetail, "invalid") { corruptedDumps = append(corruptedDumps, fmt.Sprintf("%s: %s", dbName, errDetail)) e.log.Error("CORRUPTED custom dump file detected", "database", dbName, "file", entry.Name(), "error", errDetail) } else { e.log.Warn("pg_restore --list warning (may be recoverable)", "file", entry.Name(), "error", errDetail) } } } } if len(corruptedDumps) > 0 { operation.Fail("Corrupted dump files detected") e.progress.Fail(fmt.Sprintf("Found %d corrupted dump files - restore aborted", len(corruptedDumps))) return fmt.Errorf("pre-validation failed: %d corrupted dump files detected: %s - the backup archive appears to be damaged, restore from a different backup", len(corruptedDumps), strings.Join(corruptedDumps, ", ")) } e.log.Info("All dump files passed validation") // Run comprehensive preflight checks (Linux system + PostgreSQL + Archive analysis) preflight, preflightErr := e.RunPreflightChecks(ctx, dumpsDir, entries) if preflightErr != nil { e.log.Warn("Preflight checks failed", "error", preflightErr) } // šŸ›”ļø LARGE DATABASE GUARD - Bulletproof protection for large database restores e.progress.Update("Analyzing database characteristics...") guard := NewLargeDBGuard(e.cfg, e.log) // 🧠 MEMORY CHECK - Detect OOM risk before attempting restore e.progress.Update("Checking system memory...") archiveStats, statErr := os.Stat(archivePath) var backupSizeBytes int64 if statErr == nil && archiveStats != nil { backupSizeBytes = archiveStats.Size() } memCheck := guard.CheckSystemMemoryWithType(backupSizeBytes, true) // true = cluster archive with pre-compressed dumps if memCheck != nil { if memCheck.Critical { e.log.Error("🚨 CRITICAL MEMORY WARNING", "error", memCheck.Recommendation) e.log.Warn("Proceeding but OOM failure is likely - consider adding swap") } if memCheck.LowMemory { e.log.Warn("āš ļø LOW MEMORY DETECTED - Consider reducing parallelism", "available_gb", fmt.Sprintf("%.1f", memCheck.AvailableRAMGB), "backup_gb", fmt.Sprintf("%.1f", memCheck.BackupSizeGB), "current_jobs", e.cfg.Jobs, "current_parallelism", e.cfg.ClusterParallelism) // DO NOT override user settings - just warn // User explicitly chose their profile, respect that choice e.log.Warn("User settings preserved: jobs=%d, cluster-parallelism=%d", e.cfg.Jobs, e.cfg.ClusterParallelism) e.log.Warn("If restore fails with OOM, reduce --jobs or use --profile conservative") } if memCheck.NeedsMoreSwap { e.log.Warn("āš ļø SWAP RECOMMENDATION", "action", memCheck.Recommendation) fmt.Println() fmt.Println("═══════════════════════════════════════════════════════════════") fmt.Println(" SWAP MEMORY RECOMMENDATION") fmt.Println("═══════════════════════════════════════════════════════════════") fmt.Println(memCheck.Recommendation) fmt.Println("═══════════════════════════════════════════════════════════════") fmt.Println() } if memCheck.EstimatedHours > 1 { e.log.Info("ā±ļø Estimated restore time", "hours", fmt.Sprintf("%.1f", memCheck.EstimatedHours)) } } // Build list of dump files for analysis var dumpFilePaths []string for _, entry := range entries { if !entry.IsDir() { dumpFilePaths = append(dumpFilePaths, filepath.Join(dumpsDir, entry.Name())) } } // Determine optimal restore strategy strategy := guard.DetermineStrategy(ctx, archivePath, dumpFilePaths) // Apply strategy (override config if needed) if strategy.UseConservative { guard.ApplyStrategy(strategy, e.cfg) guard.WarnUser(strategy, e.silentMode) } // Calculate optimal lock boost based on BLOB count lockBoostValue := 2048 // Default if preflight != nil && preflight.Archive.RecommendedLockBoost > 0 { lockBoostValue = preflight.Archive.RecommendedLockBoost } // AUTO-TUNE: Boost PostgreSQL settings for large restores e.progress.Update("Tuning PostgreSQL for large restore...") if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] Attempting to boost PostgreSQL lock settings", "target_max_locks", lockBoostValue, "conservative_mode", strategy.UseConservative) } originalSettings, tuneErr := e.boostPostgreSQLSettings(ctx, lockBoostValue) if tuneErr != nil { e.log.Error("Could not boost PostgreSQL settings", "error", tuneErr) if e.cfg.DebugLocks { e.log.Error("šŸ” [LOCK-DEBUG] Lock boost attempt FAILED", "error", tuneErr, "phase", "boostPostgreSQLSettings") } operation.Fail("PostgreSQL tuning failed") return fmt.Errorf("failed to boost PostgreSQL settings: %w", tuneErr) } if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] Lock boost function returned", "original_max_locks", originalSettings.MaxLocks, "target_max_locks", lockBoostValue, "boost_successful", originalSettings.MaxLocks >= lockBoostValue) } // INFORMATIONAL: Check if locks are sufficient, but DO NOT override user's Jobs setting // The user explicitly chose their profile/jobs - respect that choice if originalSettings.MaxLocks < lockBoostValue { e.log.Warn("āš ļø PostgreSQL locks may be insufficient for optimal restore", "current_locks", originalSettings.MaxLocks, "recommended_locks", lockBoostValue, "user_jobs", e.cfg.Jobs, "user_parallelism", e.cfg.ClusterParallelism) if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] Lock verification WARNING (user settings preserved)", "actual_locks", originalSettings.MaxLocks, "recommended_locks", lockBoostValue, "delta", lockBoostValue-originalSettings.MaxLocks, "verdict", "PROCEEDING WITH USER SETTINGS") } // WARN but DO NOT override user's settings e.log.Warn("=" + strings.Repeat("=", 70)) e.log.Warn("LOCK WARNING (user settings preserved):") e.log.Warn("Current locks: %d, Recommended: %d", originalSettings.MaxLocks, lockBoostValue) e.log.Warn("Using user-configured: jobs=%d, cluster-parallelism=%d", e.cfg.Jobs, e.cfg.ClusterParallelism) e.log.Warn("If restore fails with lock errors, reduce --jobs or use --profile conservative") e.log.Warn("=" + strings.Repeat("=", 70)) // DO NOT force Jobs=1 anymore - respect user's choice! // The previous code here was overriding e.cfg.Jobs = 1 which broke turbo/performance profiles e.log.Info("Proceeding with user settings", "jobs", e.cfg.Jobs, "cluster_parallelism", e.cfg.ClusterParallelism, "available_locks", originalSettings.MaxLocks, "note", "User profile settings respected") } e.log.Info("PostgreSQL tuning verified - locks sufficient for restore", "max_locks_per_transaction", originalSettings.MaxLocks, "target_locks", lockBoostValue, "maintenance_work_mem", "2GB", "conservative_mode", strategy.UseConservative) if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] Lock verification PASSED", "actual_locks", originalSettings.MaxLocks, "required_locks", lockBoostValue, "verdict", "PROCEED WITH RESTORE") } // Ensure we reset settings when done (even on failure) defer func() { if resetErr := e.resetPostgreSQLSettings(ctx, originalSettings); resetErr != nil { e.log.Warn("Could not reset PostgreSQL settings", "error", resetErr) } else { e.log.Info("Reset PostgreSQL settings to original values") } }() var restoreErrors *multierror.Error var restoreErrorsMu sync.Mutex totalDBs := 0 // Count total databases and calculate total bytes for weighted progress var totalBytes int64 dbSizes := make(map[string]int64) // Map database name to dump file size for _, entry := range entries { if !entry.IsDir() { totalDBs++ dumpFile := filepath.Join(dumpsDir, entry.Name()) if info, err := os.Stat(dumpFile); err == nil { dbName := entry.Name() dbName = strings.TrimSuffix(dbName, ".dump") dbName = strings.TrimSuffix(dbName, ".sql.gz") dbSizes[dbName] = info.Size() totalBytes += info.Size() } } } e.log.Info("Calculated total restore size", "databases", totalDBs, "total_bytes", totalBytes) // Track bytes completed for weighted progress var bytesCompleted int64 var bytesCompletedMu sync.Mutex // Create ETA estimator for database restores estimator := progress.NewETAEstimator("Restoring cluster", totalDBs) e.progress.SetEstimator(estimator) // Detect backup format and warn about performance implications // .sql.gz files (from native engine) cannot use parallel restore like pg_restore -j8 hasSQLFormat := false hasCustomFormat := false for _, entry := range entries { if !entry.IsDir() { if strings.HasSuffix(entry.Name(), ".sql.gz") { hasSQLFormat = true } else if strings.HasSuffix(entry.Name(), ".dump") { hasCustomFormat = true } } } // Warn about SQL format performance limitation if hasSQLFormat && !hasCustomFormat { if e.cfg.UseNativeEngine { // Native engine now uses PARALLEL restore - should match pg_restore -j8 performance! e.log.Info("āœ… SQL format detected - using PARALLEL native restore engine", "mode", "parallel", "workers", e.cfg.Jobs, "optimization", "COPY operations run in parallel like pg_restore -j") if !e.silentMode { fmt.Println() fmt.Println("═══════════════════════════════════════════════════════════════") fmt.Println(" āœ… PARALLEL NATIVE RESTORE: SQL Format with Parallel Loading") fmt.Println("═══════════════════════════════════════════════════════════════") fmt.Printf(" Using %d parallel workers for COPY operations.\n", e.cfg.Jobs) fmt.Println(" Performance should match pg_restore -j" + fmt.Sprintf("%d", e.cfg.Jobs)) fmt.Println("═══════════════════════════════════════════════════════════════") fmt.Println() } } else { // psql path is still sequential e.log.Warn("āš ļø PERFORMANCE WARNING: Backup uses SQL format (.sql.gz)", "reason", "psql mode cannot parallelize SQL format", "recommendation", "Enable --use-native-engine for parallel COPY loading") if !e.silentMode { fmt.Println() fmt.Println("═══════════════════════════════════════════════════════════════") fmt.Println(" āš ļø PERFORMANCE NOTE: SQL Format with psql (sequential)") fmt.Println("═══════════════════════════════════════════════════════════════") fmt.Println(" Backup files use .sql.gz format.") fmt.Println(" psql mode restores are sequential.") fmt.Println() fmt.Println(" For PARALLEL restore, use: --use-native-engine") fmt.Println(" The native engine parallelizes COPY like pg_restore -j8") fmt.Println("═══════════════════════════════════════════════════════════════") fmt.Println() } time.Sleep(2 * time.Second) } } // Check for large objects in dump files and adjust parallelism hasLargeObjects := e.detectLargeObjectsInDumps(dumpsDir, entries) // Use worker pool for parallel restore parallelism := e.cfg.ClusterParallelism if parallelism < 1 { parallelism = 1 // Ensure at least sequential } // Automatically reduce parallelism if large objects detected if hasLargeObjects && parallelism > 1 { e.log.Warn("Large objects detected in dump files - reducing parallelism to avoid lock contention", "original_parallelism", parallelism, "adjusted_parallelism", 1) e.progress.Update("[WARN] Large objects detected - using sequential restore to avoid lock conflicts") time.Sleep(2 * time.Second) // Give user time to see warning parallelism = 1 } var successCount, failCount int32 var mu sync.Mutex // Protect shared resources (progress, logger) // CRITICAL: Check context before starting database restore loop // This helps debug issues where context gets cancelled between extraction and restore if ctx.Err() != nil { e.log.Error("Context cancelled before database restore loop started", "context_error", ctx.Err(), "total_databases", totalDBs, "parallelism", parallelism) operation.Fail("Context cancelled before database restores could start") return fmt.Errorf("context cancelled before database restore: %w", ctx.Err()) } e.log.Info("Starting database restore loop", "databases", totalDBs, "parallelism", parallelism) // Timing tracking for restore phase progress restorePhaseStart := time.Now() var completedDBTimes []time.Duration // Track duration for each completed DB restore var completedDBTimesMu sync.Mutex // Create semaphore to limit concurrency semaphore := make(chan struct{}, parallelism) var wg sync.WaitGroup dbIndex := 0 for _, entry := range entries { if entry.IsDir() { continue } // Check context before acquiring semaphore to prevent goroutine leak if ctx.Err() != nil { e.log.Warn("Context cancelled - stopping database restore scheduling") break } wg.Add(1) // Acquire semaphore with context awareness to prevent goroutine leak select { case semaphore <- struct{}{}: // Acquired, proceed case <-ctx.Done(): wg.Done() e.log.Warn("Context cancelled while waiting for semaphore", "file", entry.Name()) continue } go func(idx int, filename string) { defer wg.Done() defer func() { <-semaphore }() // Release // Panic recovery - prevent one database failure from crashing entire cluster restore defer func() { if r := recover(); r != nil { e.log.Error("Panic in database restore goroutine", "file", filename, "panic", r) atomic.AddInt32(&failCount, 1) } }() // Check for context cancellation before starting if ctx.Err() != nil { e.log.Warn("Context cancelled - skipping database restore", "file", filename) atomic.AddInt32(&failCount, 1) restoreErrorsMu.Lock() restoreErrors = multierror.Append(restoreErrors, fmt.Errorf("%s: restore skipped (context cancelled)", strings.TrimSuffix(strings.TrimSuffix(filename, ".dump"), ".sql.gz"))) restoreErrorsMu.Unlock() return } // Track timing for this database restore dbRestoreStart := time.Now() // Update estimator progress (thread-safe) mu.Lock() estimator.UpdateProgress(idx) mu.Unlock() dumpFile := filepath.Join(dumpsDir, filename) dbName := filename dbName = strings.TrimSuffix(dbName, ".dump") dbName = strings.TrimSuffix(dbName, ".sql.gz") dbProgress := 15 + int(float64(idx)/float64(totalDBs)*85.0) // Calculate average time per DB and report progress with timing completedDBTimesMu.Lock() var avgPerDB time.Duration if len(completedDBTimes) > 0 { var totalDuration time.Duration for _, d := range completedDBTimes { totalDuration += d } avgPerDB = totalDuration / time.Duration(len(completedDBTimes)) } phaseElapsed := time.Since(restorePhaseStart) completedDBTimesMu.Unlock() mu.Lock() statusMsg := fmt.Sprintf("Restoring database %s (%d/%d)", dbName, idx+1, totalDBs) e.progress.Update(statusMsg) e.log.Info("Restoring database", "name", dbName, "file", dumpFile, "progress", dbProgress) // Report database progress for TUI (both callbacks) e.reportDatabaseProgress(idx, totalDBs, dbName) e.reportDatabaseProgressWithTiming(idx, totalDBs, dbName, phaseElapsed, avgPerDB) mu.Unlock() // STEP 1: Drop existing database completely (clean slate) e.log.Info("Dropping existing database for clean restore", "name", dbName) if err := e.dropDatabaseIfExists(ctx, dbName); err != nil { e.log.Warn("Could not drop existing database", "name", dbName, "error", err) } // STEP 2: Create fresh database if err := e.ensureDatabaseExists(ctx, dbName); err != nil { e.log.Error("Failed to create database", "name", dbName, "error", err) restoreErrorsMu.Lock() restoreErrors = multierror.Append(restoreErrors, fmt.Errorf("%s: failed to create database: %w", dbName, err)) restoreErrorsMu.Unlock() atomic.AddInt32(&failCount, 1) return } // STEP 3: Restore with ownership preservation if superuser preserveOwnership := isSuperuser isCompressedSQL := strings.HasSuffix(dumpFile, ".sql.gz") // Get expected size for this database for progress estimation expectedDBSize := dbSizes[dbName] // Start heartbeat ticker to show progress during long-running restore // CRITICAL FIX: Report progress to TUI callbacks so large DB restores show updates heartbeatCtx, cancelHeartbeat := context.WithCancel(ctx) heartbeatTicker := time.NewTicker(5 * time.Second) // More frequent updates (was 15s) heartbeatCount := int64(0) go func() { for { select { case <-heartbeatTicker.C: heartbeatCount++ dbElapsed := time.Since(dbRestoreStart) // Per-database elapsed phaseElapsedNow := time.Since(restorePhaseStart) // Overall phase elapsed mu.Lock() statusMsg := fmt.Sprintf("Restoring %s (%d/%d) - running: %s (phase: %s)", dbName, idx+1, totalDBs, formatDuration(dbElapsed), formatDuration(phaseElapsedNow)) e.progress.Update(statusMsg) // CRITICAL: Report activity to TUI callbacks during long-running restore // Use time-based progress estimation: assume ~10MB/s average throughput // This gives visual feedback even when pg_restore hasn't completed estimatedBytesPerSec := int64(10 * 1024 * 1024) // 10 MB/s conservative estimate estimatedBytesDone := dbElapsed.Milliseconds() / 1000 * estimatedBytesPerSec if expectedDBSize > 0 && estimatedBytesDone > expectedDBSize { estimatedBytesDone = expectedDBSize * 95 / 100 // Cap at 95% } // Calculate current progress including in-flight database currentBytesEstimate := bytesCompleted + estimatedBytesDone // Report to TUI with estimated progress e.reportDatabaseProgressByBytes(currentBytesEstimate, totalBytes, dbName, int(atomic.LoadInt32(&successCount)), totalDBs) // Also report timing info (use phaseElapsedNow computed above) var avgPerDB time.Duration completedDBTimesMu.Lock() if len(completedDBTimes) > 0 { var total time.Duration for _, d := range completedDBTimes { total += d } avgPerDB = total / time.Duration(len(completedDBTimes)) } completedDBTimesMu.Unlock() e.reportDatabaseProgressWithTiming(idx, totalDBs, dbName, phaseElapsedNow, avgPerDB) mu.Unlock() case <-heartbeatCtx.Done(): return } } }() var restoreErr error if isCompressedSQL { mu.Lock() if e.cfg.UseNativeEngine { e.log.Info("Detected compressed SQL format, using native Go engine", "file", dumpFile, "database", dbName) } else { e.log.Info("Detected compressed SQL format, using psql + pgzip", "file", dumpFile, "database", dbName) } mu.Unlock() restoreErr = e.restorePostgreSQLSQL(ctx, dumpFile, dbName, true) } else { mu.Lock() e.log.Info("Detected custom dump format, using pg_restore", "file", dumpFile, "database", dbName) mu.Unlock() restoreErr = e.restorePostgreSQLDumpWithOwnership(ctx, dumpFile, dbName, false, preserveOwnership) } // Stop heartbeat ticker heartbeatTicker.Stop() cancelHeartbeat() if restoreErr != nil { mu.Lock() e.log.Error("Failed to restore database", "name", dbName, "file", dumpFile, "error", restoreErr) mu.Unlock() // Check for specific recoverable errors errMsg := restoreErr.Error() // CRITICAL: Check for LOCK_EXHAUSTION error that escaped preflight checks if strings.Contains(errMsg, "LOCK_EXHAUSTION:") || strings.Contains(errMsg, "out of shared memory") || strings.Contains(errMsg, "max_locks_per_transaction") { mu.Lock() e.log.Error("šŸ”“ LOCK EXHAUSTION ERROR - ABORTING ALL DATABASE RESTORES", "database", dbName, "error", errMsg, "action", "Will force sequential mode and abort current parallel restore") // Force sequential mode for any future restores e.cfg.ClusterParallelism = 1 e.cfg.Jobs = 1 e.log.Error("=" + strings.Repeat("=", 70)) e.log.Error("CRITICAL: Lock exhaustion during restore - this should NOT happen") e.log.Error("Setting ClusterParallelism=1 and Jobs=1 for future operations") e.log.Error("Current restore MUST be aborted and restarted") e.log.Error("=" + strings.Repeat("=", 70)) mu.Unlock() // Add error and abort immediately - don't continue with other databases restoreErrorsMu.Lock() restoreErrors = multierror.Append(restoreErrors, fmt.Errorf("LOCK_EXHAUSTION: %s - all restores aborted, must restart with sequential mode", dbName)) restoreErrorsMu.Unlock() atomic.AddInt32(&failCount, 1) // Cancel context to stop all other goroutines // This will cause the entire restore to fail fast return } if strings.Contains(errMsg, "max_locks_per_transaction") { mu.Lock() e.log.Warn("Database restore failed due to insufficient locks - this is a PostgreSQL configuration issue", "database", dbName, "solution", "increase max_locks_per_transaction in postgresql.conf") mu.Unlock() } else if strings.Contains(errMsg, "total errors:") && strings.Contains(errMsg, "2562426") { mu.Lock() e.log.Warn("Database has massive error count - likely data corruption or incompatible dump format", "database", dbName, "errors", "2562426") mu.Unlock() } restoreErrorsMu.Lock() // Include more context in the error message restoreErrors = multierror.Append(restoreErrors, fmt.Errorf("%s: restore failed: %w", dbName, restoreErr)) restoreErrorsMu.Unlock() atomic.AddInt32(&failCount, 1) return } // Track completed database restore duration for ETA calculation dbRestoreDuration := time.Since(dbRestoreStart) completedDBTimesMu.Lock() completedDBTimes = append(completedDBTimes, dbRestoreDuration) completedDBTimesMu.Unlock() // Update bytes completed for weighted progress dbSize := dbSizes[dbName] bytesCompletedMu.Lock() bytesCompleted += dbSize currentBytesCompleted := bytesCompleted currentSuccessCount := int(atomic.LoadInt32(&successCount)) + 1 // +1 because we're about to increment bytesCompletedMu.Unlock() // Report weighted progress (bytes-based) e.reportDatabaseProgressByBytes(currentBytesCompleted, totalBytes, dbName, currentSuccessCount, totalDBs) atomic.AddInt32(&successCount, 1) // Small delay to ensure PostgreSQL fully closes connections before next restore time.Sleep(100 * time.Millisecond) }(dbIndex, entry.Name()) dbIndex++ } // Wait for all restores to complete wg.Wait() successCountFinal := int(atomic.LoadInt32(&successCount)) failCountFinal := int(atomic.LoadInt32(&failCount)) // SANITY CHECK: Verify all databases were accounted for // This catches any goroutine that exited without updating counters accountedFor := successCountFinal + failCountFinal if accountedFor != totalDBs { missingCount := totalDBs - accountedFor e.log.Error("INTERNAL ERROR: Some database restore goroutines did not report status", "expected", totalDBs, "success", successCountFinal, "failed", failCountFinal, "unaccounted", missingCount) // Treat unaccounted databases as failures failCountFinal += missingCount restoreErrorsMu.Lock() restoreErrors = multierror.Append(restoreErrors, fmt.Errorf("%d database(s) did not complete (possible goroutine crash or deadlock)", missingCount)) restoreErrorsMu.Unlock() } // CRITICAL: Check if no databases were restored at all if successCountFinal == 0 { e.progress.Fail(fmt.Sprintf("Cluster restore FAILED: 0 of %d databases restored", totalDBs)) operation.Fail("No databases were restored") if failCountFinal > 0 && restoreErrors != nil { return fmt.Errorf("cluster restore failed: all %d database(s) failed:\n%s", failCountFinal, restoreErrors.Error()) } return fmt.Errorf("cluster restore failed: no databases were restored (0 of %d total). Check PostgreSQL logs for details", totalDBs) } if failCountFinal > 0 { // Format multi-error with detailed output restoreErrors.ErrorFormat = func(errs []error) string { if len(errs) == 1 { return errs[0].Error() } points := make([]string, len(errs)) for i, err := range errs { points[i] = fmt.Sprintf(" • %s", err.Error()) } return fmt.Sprintf("%d database(s) failed:\n%s", len(errs), strings.Join(points, "\n")) } // Log summary e.log.Info("Cluster restore completed with failures", "succeeded", successCountFinal, "failed", failCountFinal, "total", totalDBs) e.progress.Fail(fmt.Sprintf("Cluster restore: %d succeeded, %d failed out of %d total", successCountFinal, failCountFinal, totalDBs)) operation.Complete(fmt.Sprintf("Partial restore: %d/%d databases succeeded", successCountFinal, totalDBs)) // Record cluster restore metrics (partial failure) e.recordClusterRestoreMetrics(clusterStartTime, archivePath, totalDBs, successCountFinal, false, restoreErrors.Error()) return fmt.Errorf("cluster restore completed with %d failures:\n%s", failCountFinal, restoreErrors.Error()) } e.progress.Complete(fmt.Sprintf("Cluster restored successfully: %d databases", successCountFinal)) operation.Complete(fmt.Sprintf("Restored %d databases from cluster archive", successCountFinal)) // Record cluster restore metrics (success) e.recordClusterRestoreMetrics(clusterStartTime, archivePath, totalDBs, successCountFinal, true, "") return nil } // restoreClusterFromSQL restores a pg_dumpall SQL file using the native engine // This handles .sql and .sql.gz files containing full cluster dumps func (e *Engine) restoreClusterFromSQL(ctx context.Context, archivePath string, operation logger.OperationLogger) error { e.log.Info("Restoring cluster from SQL file (pg_dumpall format)", "file", filepath.Base(archivePath), "native_engine", true) clusterStartTime := time.Now() // Determine if compressed compressed := strings.HasSuffix(strings.ToLower(archivePath), ".gz") // Use native engine to restore directly to postgres database (globals + all databases) e.log.Info("Restoring SQL dump using native engine...", "compressed", compressed, "size", FormatBytes(getFileSize(archivePath))) e.progress.Start("Restoring cluster from SQL dump...") // For pg_dumpall, we restore to the 'postgres' database which then creates other databases targetDB := "postgres" err := e.restoreWithNativeEngine(ctx, archivePath, targetDB, compressed) if err != nil { operation.Fail(fmt.Sprintf("SQL cluster restore failed: %v", err)) e.recordClusterRestoreMetrics(clusterStartTime, archivePath, 0, 0, false, err.Error()) return fmt.Errorf("SQL cluster restore failed: %w", err) } duration := time.Since(clusterStartTime) e.progress.Complete(fmt.Sprintf("Cluster restored successfully from SQL in %s", duration.Round(time.Second))) operation.Complete("SQL cluster restore completed") // Record metrics e.recordClusterRestoreMetrics(clusterStartTime, archivePath, 1, 1, true, "") return nil } // recordClusterRestoreMetrics records metrics for cluster restore operations func (e *Engine) recordClusterRestoreMetrics(startTime time.Time, archivePath string, totalDBs, successCount int, success bool, errorMsg string) { duration := time.Since(startTime) // Get archive size var archiveSize int64 if fi, err := os.Stat(archivePath); err == nil { archiveSize = fi.Size() } record := RestoreRecord{ Database: "cluster", Engine: "postgresql", StartedAt: startTime, CompletedAt: time.Now(), Duration: duration, SizeBytes: archiveSize, ParallelJobs: e.cfg.Jobs, Profile: e.cfg.ResourceProfile, Success: success, SourceFile: filepath.Base(archivePath), IsCluster: true, ErrorMessage: errorMsg, } if recordErr := RecordRestore(record); recordErr != nil { e.log.Warn("Failed to record cluster restore metrics", "error", recordErr) } // Log performance summary e.log.Info("šŸ“Š RESTORE PERFORMANCE SUMMARY", "total_duration", duration.Round(time.Second), "databases", totalDBs, "successful", successCount, "parallel_jobs", e.cfg.Jobs, "profile", e.cfg.ResourceProfile, "avg_per_db", (duration / time.Duration(totalDBs)).Round(time.Second)) } // extractArchive extracts a tar.gz archive with progress reporting func (e *Engine) extractArchive(ctx context.Context, archivePath, destDir string) error { // If progress callback is set, use Go's archive/tar for progress tracking if e.progressCallback != nil { return e.extractArchiveWithProgress(ctx, archivePath, destDir) } // Otherwise use fast shell tar (no progress) return e.extractArchiveShell(ctx, archivePath, destDir) } // extractArchiveWithProgress extracts using Go's archive/tar with detailed progress reporting func (e *Engine) extractArchiveWithProgress(ctx context.Context, archivePath, destDir string) error { // Get archive size for progress calculation archiveInfo, err := os.Stat(archivePath) if err != nil { return fmt.Errorf("failed to stat archive: %w", err) } totalSize := archiveInfo.Size() // Open the archive file file, err := os.Open(archivePath) if err != nil { return fmt.Errorf("failed to open archive: %w", err) } defer file.Close() // Wrap with progress reader progressReader := &progressReader{ reader: file, totalSize: totalSize, callback: e.progressCallback, desc: "Extracting archive", } // Create parallel gzip reader for faster decompression gzReader, err := pgzip.NewReader(progressReader) if err != nil { return fmt.Errorf("failed to create gzip reader: %w", err) } defer gzReader.Close() // Create tar reader tarReader := tar.NewReader(gzReader) // Extract files for { select { case <-ctx.Done(): return ctx.Err() default: } header, err := tarReader.Next() if err == io.EOF { break // End of archive } if err != nil { return fmt.Errorf("failed to read tar header: %w", err) } // Sanitize and validate path targetPath := filepath.Join(destDir, header.Name) // Security check: ensure path is within destDir (prevent path traversal) if !strings.HasPrefix(filepath.Clean(targetPath), filepath.Clean(destDir)) { e.log.Warn("Skipping potentially malicious path in archive", "path", header.Name) continue } switch header.Typeflag { case tar.TypeDir: if err := os.MkdirAll(targetPath, 0755); err != nil { return fmt.Errorf("failed to create directory %s: %w", targetPath, err) } case tar.TypeReg: // Ensure parent directory exists if err := os.MkdirAll(filepath.Dir(targetPath), 0755); err != nil { return fmt.Errorf("failed to create parent directory: %w", err) } // Create the file outFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(header.Mode)) if err != nil { return fmt.Errorf("failed to create file %s: %w", targetPath, err) } // Copy file contents with context awareness for Ctrl+C interruption // Use buffered I/O for turbo mode (32KB buffer) if e.cfg.BufferedIO { bufferedWriter := bufio.NewWriterSize(outFile, 32*1024) // 32KB buffer for faster writes if _, err := fs.CopyWithContext(ctx, bufferedWriter, tarReader); err != nil { outFile.Close() os.Remove(targetPath) // Clean up partial file return fmt.Errorf("failed to write file %s: %w", targetPath, err) } if err := bufferedWriter.Flush(); err != nil { outFile.Close() os.Remove(targetPath) return fmt.Errorf("failed to flush buffer for %s: %w", targetPath, err) } } else { if _, err := fs.CopyWithContext(ctx, outFile, tarReader); err != nil { outFile.Close() os.Remove(targetPath) // Clean up partial file return fmt.Errorf("failed to write file %s: %w", targetPath, err) } } outFile.Close() case tar.TypeSymlink: // Handle symlinks (common in some archives) if err := os.Symlink(header.Linkname, targetPath); err != nil { // Ignore symlink errors (may already exist or not supported) e.log.Debug("Could not create symlink", "path", targetPath, "target", header.Linkname) } } } // Final progress update e.reportProgress(totalSize, totalSize, "Extraction complete") return nil } // progressReader wraps an io.Reader to report read progress type progressReader struct { reader io.Reader totalSize int64 bytesRead int64 callback ProgressCallback desc string lastReport time.Time reportEvery time.Duration } func (pr *progressReader) Read(p []byte) (n int, err error) { n, err = pr.reader.Read(p) pr.bytesRead += int64(n) // Throttle progress reporting to every 50ms for smoother updates if pr.reportEvery == 0 { pr.reportEvery = 50 * time.Millisecond } if time.Since(pr.lastReport) > pr.reportEvery { if pr.callback != nil { pr.callback(pr.bytesRead, pr.totalSize, pr.desc) } pr.lastReport = time.Now() } return n, err } // extractArchiveShell extracts using pgzip (parallel gzip, 2-4x faster on multi-core) func (e *Engine) extractArchiveShell(ctx context.Context, archivePath, destDir string) error { // Start heartbeat ticker for extraction progress extractionStart := time.Now() e.log.Info("Extracting archive with pgzip (parallel gzip)", "archive", archivePath, "dest", destDir, "method", "pgzip") // Use parallel extraction err := fs.ExtractTarGzParallel(ctx, archivePath, destDir, func(progress fs.ExtractProgress) { if progress.TotalBytes > 0 { elapsed := time.Since(extractionStart) pct := float64(progress.BytesRead) / float64(progress.TotalBytes) * 100 e.progress.Update(fmt.Sprintf("Extracting archive... %.1f%% (elapsed: %s)", pct, formatDuration(elapsed))) } }) if err != nil { return fmt.Errorf("parallel extraction failed: %w", err) } elapsed := time.Since(extractionStart) e.log.Info("Archive extraction complete", "duration", formatDuration(elapsed)) return nil } // restoreGlobals restores global objects (roles, tablespaces) // Note: psql returns 0 even when some statements fail (e.g., role already exists) // We track errors but only fail on FATAL errors that would prevent restore func (e *Engine) restoreGlobals(ctx context.Context, globalsFile string) error { args := []string{ "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", "postgres", "-f", globalsFile, } // Only add -h flag if host is not localhost (to use Unix socket for peer auth) if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" { args = append([]string{"-h", e.cfg.Host}, args...) } cmd := cleanup.SafeCommand(ctx, "psql", args...) cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password)) // Stream output to avoid memory issues with large globals.sql files stderr, err := cmd.StderrPipe() if err != nil { return fmt.Errorf("failed to create stderr pipe: %w", err) } if err := cmd.Start(); err != nil { return fmt.Errorf("failed to start psql: %w", err) } // Read stderr in chunks in goroutine var lastError string var errorCount int var fatalError bool stderrDone := make(chan struct{}) go func() { defer close(stderrDone) buf := make([]byte, 4096) for { n, err := stderr.Read(buf) if n > 0 { chunk := string(buf[:n]) // Track different error types if strings.Contains(chunk, "FATAL") { fatalError = true lastError = chunk e.log.Error("Globals restore FATAL error", "output", chunk) } else if strings.Contains(chunk, "ERROR") { errorCount++ lastError = chunk // Only log first few errors to avoid spam if errorCount <= 5 { // Check if it's an ignorable "already exists" error if strings.Contains(chunk, "already exists") { e.log.Debug("Globals restore: object already exists (expected)", "output", chunk) } else { e.log.Warn("Globals restore error", "output", chunk) } } } } if err != nil { break } } }() // Wait for command with proper context handling cmdDone := make(chan error, 1) go func() { cmdDone <- cmd.Wait() }() var cmdErr error select { case cmdErr = <-cmdDone: // Command completed case <-ctx.Done(): e.log.Warn("Globals restore cancelled - killing process group") cleanup.KillCommandGroup(cmd) <-cmdDone cmdErr = ctx.Err() } // Wait for stderr reader with timeout to prevent indefinite hang // if the process doesn't fully terminate select { case <-stderrDone: // Normal completion case <-time.After(5 * time.Second): e.log.Warn("Stderr reader timeout - forcefully continuing") } // Only fail on actual command errors or FATAL PostgreSQL errors // Regular ERROR messages (like "role already exists") are expected if cmdErr != nil { return fmt.Errorf("failed to restore globals: %w (last error: %s)", cmdErr, lastError) } // If we had FATAL errors, those are real problems if fatalError { return fmt.Errorf("globals restore had FATAL error: %s", lastError) } // Log summary if there were errors (but don't fail) if errorCount > 0 { e.log.Info("Globals restore completed with some errors (usually 'already exists' - expected)", "error_count", errorCount) } return nil } // checkSuperuser verifies if the current user has superuser privileges func (e *Engine) checkSuperuser(ctx context.Context) (bool, error) { args := []string{ "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", "postgres", "-tAc", "SELECT usesuper FROM pg_user WHERE usename = current_user", } // Only add -h flag if host is not localhost (to use Unix socket for peer auth) if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" { args = append([]string{"-h", e.cfg.Host}, args...) } cmd := cleanup.SafeCommand(ctx, "psql", args...) // Always set PGPASSWORD (empty string is fine for peer/ident auth) cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password)) output, err := cmd.CombinedOutput() if err != nil { return false, fmt.Errorf("failed to check superuser status: %w", err) } isSuperuser := strings.TrimSpace(string(output)) == "t" return isSuperuser, nil } // terminateConnections kills all active connections to a database func (e *Engine) terminateConnections(ctx context.Context, dbName string) error { query := fmt.Sprintf(` SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '%s' AND pid <> pg_backend_pid() `, dbName) args := []string{ "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", "postgres", "-tAc", query, } // Only add -h flag if host is not localhost (to use Unix socket for peer auth) if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" { args = append([]string{"-h", e.cfg.Host}, args...) } cmd := cleanup.SafeCommand(ctx, "psql", args...) // Always set PGPASSWORD (empty string is fine for peer/ident auth) cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password)) output, err := cmd.CombinedOutput() if err != nil { e.log.Warn("Failed to terminate connections", "database", dbName, "error", err, "output", string(output)) // Don't fail - database might not exist or have no connections } return nil } // dropDatabaseIfExists drops a database completely (clean slate) // Uses PostgreSQL 13+ WITH (FORCE) option to forcefully drop even with active connections func (e *Engine) dropDatabaseIfExists(ctx context.Context, dbName string) error { // First terminate all connections if err := e.terminateConnections(ctx, dbName); err != nil { e.log.Warn("Could not terminate connections", "database", dbName, "error", err) } // Wait a moment for connections to terminate time.Sleep(500 * time.Millisecond) // Try to revoke new connections (prevents race condition) // This only works if we have the privilege to do so revokeArgs := []string{ "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", "postgres", "-c", fmt.Sprintf("REVOKE CONNECT ON DATABASE \"%s\" FROM PUBLIC", dbName), } if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" { revokeArgs = append([]string{"-h", e.cfg.Host}, revokeArgs...) } revokeCmd := cleanup.SafeCommand(ctx, "psql", revokeArgs...) revokeCmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password)) revokeCmd.Run() // Ignore errors - database might not exist // Terminate connections again after revoking connect privilege e.terminateConnections(ctx, dbName) time.Sleep(200 * time.Millisecond) // Try DROP DATABASE WITH (FORCE) first (PostgreSQL 13+) // This forcefully terminates connections and drops the database atomically forceArgs := []string{ "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", "postgres", "-c", fmt.Sprintf("DROP DATABASE IF EXISTS \"%s\" WITH (FORCE)", dbName), } if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" { forceArgs = append([]string{"-h", e.cfg.Host}, forceArgs...) } forceCmd := cleanup.SafeCommand(ctx, "psql", forceArgs...) forceCmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password)) output, err := forceCmd.CombinedOutput() if err == nil { e.log.Info("Dropped existing database (with FORCE)", "name", dbName) return nil } // If FORCE option failed (PostgreSQL < 13), try regular drop if strings.Contains(string(output), "syntax error") || strings.Contains(string(output), "WITH (FORCE)") { e.log.Debug("WITH (FORCE) not supported, using standard DROP", "name", dbName) args := []string{ "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", "postgres", "-c", fmt.Sprintf("DROP DATABASE IF EXISTS \"%s\"", dbName), } if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" { args = append([]string{"-h", e.cfg.Host}, args...) } cmd := cleanup.SafeCommand(ctx, "psql", args...) cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password)) output, err = cmd.CombinedOutput() if err != nil { return fmt.Errorf("failed to drop database '%s': %w\nOutput: %s", dbName, err, string(output)) } } else if err != nil { return fmt.Errorf("failed to drop database '%s': %w\nOutput: %s", dbName, err, string(output)) } e.log.Info("Dropped existing database", "name", dbName) return nil } // ensureDatabaseExists checks if a database exists and creates it if not func (e *Engine) ensureDatabaseExists(ctx context.Context, dbName string) error { // Route to appropriate implementation based on database type if e.cfg.DatabaseType == "mysql" || e.cfg.DatabaseType == "mariadb" { return e.ensureMySQLDatabaseExists(ctx, dbName) } return e.ensurePostgresDatabaseExists(ctx, dbName) } // ensureMySQLDatabaseExists checks if a MySQL database exists and creates it if not func (e *Engine) ensureMySQLDatabaseExists(ctx context.Context, dbName string) error { // Build mysql command - use environment variable for password (security: avoid process list exposure) args := []string{ "-h", e.cfg.Host, "-P", fmt.Sprintf("%d", e.cfg.Port), "-u", e.cfg.User, "-e", fmt.Sprintf("CREATE DATABASE IF NOT EXISTS `%s`", dbName), } cmd := cleanup.SafeCommand(ctx, "mysql", args...) cmd.Env = os.Environ() if e.cfg.Password != "" { cmd.Env = append(cmd.Env, "MYSQL_PWD="+e.cfg.Password) } output, err := cmd.CombinedOutput() if err != nil { e.log.Warn("MySQL database creation failed", "name", dbName, "error", err, "output", string(output)) return fmt.Errorf("failed to create database '%s': %w (output: %s)", dbName, err, strings.TrimSpace(string(output))) } e.log.Info("Successfully ensured MySQL database exists", "name", dbName) return nil } // ensurePostgresDatabaseExists checks if a PostgreSQL database exists and creates it if not // It attempts to extract encoding/locale from the dump file to preserve original settings func (e *Engine) ensurePostgresDatabaseExists(ctx context.Context, dbName string) error { // Skip creation for postgres and template databases - they should already exist if dbName == "postgres" || dbName == "template0" || dbName == "template1" { e.log.Info("Skipping create for system database (assume exists)", "name", dbName) return nil } // Build psql command with authentication buildPsqlCmd := func(ctx context.Context, database, query string) *exec.Cmd { args := []string{ "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", database, "-tAc", query, } // Only add -h flag if host is not localhost (to use Unix socket for peer auth) if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" { args = append([]string{"-h", e.cfg.Host}, args...) } cmd := cleanup.SafeCommand(ctx, "psql", args...) // Always set PGPASSWORD (empty string is fine for peer/ident auth) cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password)) return cmd } // Check if database exists checkCmd := buildPsqlCmd(ctx, "postgres", fmt.Sprintf("SELECT 1 FROM pg_database WHERE datname = '%s'", dbName)) output, err := checkCmd.CombinedOutput() if err != nil { e.log.Warn("Database existence check failed", "name", dbName, "error", err, "output", string(output)) // Continue anyway - maybe we can create it } // If database exists, we're done if strings.TrimSpace(string(output)) == "1" { e.log.Info("Database already exists", "name", dbName) return nil } // Database doesn't exist, create it // IMPORTANT: Use template0 to avoid duplicate definition errors from local additions to template1 // Also use UTF8 encoding explicitly as it's the most common and safest choice // See PostgreSQL docs: https://www.postgresql.org/docs/current/app-pgrestore.html#APP-PGRESTORE-NOTES e.log.Info("Creating database from template0 with UTF8 encoding", "name", dbName) // Get server's default locale for LC_COLLATE and LC_CTYPE // This ensures compatibility while using the correct encoding localeCmd := buildPsqlCmd(ctx, "postgres", "SHOW lc_collate") localeOutput, _ := localeCmd.CombinedOutput() serverLocale := strings.TrimSpace(string(localeOutput)) if serverLocale == "" { serverLocale = "en_US.UTF-8" // Fallback to common default } // Build CREATE DATABASE command with encoding and locale // Using ENCODING 'UTF8' explicitly ensures the dump can be restored createSQL := fmt.Sprintf( "CREATE DATABASE \"%s\" WITH TEMPLATE template0 ENCODING 'UTF8' LC_COLLATE '%s' LC_CTYPE '%s'", dbName, serverLocale, serverLocale, ) createArgs := []string{ "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", "postgres", "-c", createSQL, } // Only add -h flag if host is not localhost (to use Unix socket for peer auth) if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" { createArgs = append([]string{"-h", e.cfg.Host}, createArgs...) } createCmd := cleanup.SafeCommand(ctx, "psql", createArgs...) // Always set PGPASSWORD (empty string is fine for peer/ident auth) createCmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password)) createOutput, createErr := createCmd.CombinedOutput() if createErr != nil { // If encoding/locale fails, try simpler CREATE DATABASE e.log.Warn("Database creation with encoding failed, trying simple create", "name", dbName, "error", createErr, "output", string(createOutput)) simpleArgs := []string{ "-p", fmt.Sprintf("%d", e.cfg.Port), "-U", e.cfg.User, "-d", "postgres", "-c", fmt.Sprintf("CREATE DATABASE \"%s\" WITH TEMPLATE template0", dbName), } if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" { simpleArgs = append([]string{"-h", e.cfg.Host}, simpleArgs...) } simpleCmd := cleanup.SafeCommand(ctx, "psql", simpleArgs...) simpleCmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password)) output, err = simpleCmd.CombinedOutput() if err != nil { e.log.Warn("Database creation failed", "name", dbName, "error", err, "output", string(output)) return fmt.Errorf("failed to create database '%s': %w (output: %s)", dbName, err, strings.TrimSpace(string(output))) } } e.log.Info("Successfully created database from template0", "name", dbName) return nil } // previewClusterRestore shows cluster restore preview func (e *Engine) previewClusterRestore(archivePath string) error { fmt.Println("\n" + strings.Repeat("=", 60)) fmt.Println(" CLUSTER RESTORE PREVIEW (DRY RUN)") fmt.Println(strings.Repeat("=", 60)) stat, _ := os.Stat(archivePath) fmt.Printf("\nArchive: %s\n", filepath.Base(archivePath)) if stat != nil { fmt.Printf("Size: %s\n", FormatBytes(stat.Size())) fmt.Printf("Modified: %s\n", stat.ModTime().Format("2006-01-02 15:04:05")) } fmt.Printf("Target Host: %s:%d\n", e.cfg.Host, e.cfg.Port) fmt.Println("\nOperations that would be performed:") fmt.Println(" 1. Extract cluster archive to temporary directory") fmt.Println(" 2. Restore global objects (roles, tablespaces)") fmt.Println(" 3. Restore all databases found in archive") fmt.Println(" 4. Cleanup temporary files") fmt.Println("\n[WARN] WARNING: This will restore multiple databases.") fmt.Println(" Existing databases may be overwritten or merged.") fmt.Println("\nTo execute this restore, add the --confirm flag.") fmt.Println(strings.Repeat("=", 60) + "\n") return nil } // detectLargeObjectsInDumps checks if any dump files contain large objects func (e *Engine) detectLargeObjectsInDumps(dumpsDir string, entries []os.DirEntry) bool { hasLargeObjects := false checkedCount := 0 maxChecks := 5 // Only check first 5 dumps to avoid slowdown for _, entry := range entries { if entry.IsDir() || checkedCount >= maxChecks { continue } dumpFile := filepath.Join(dumpsDir, entry.Name()) // Skip compressed SQL files (can't easily check without decompressing) if strings.HasSuffix(dumpFile, ".sql.gz") { continue } // Use pg_restore -l to list contents (fast, doesn't restore data) // 2 minutes for large dumps with many objects ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) defer cancel() cmd := cleanup.SafeCommand(ctx, "pg_restore", "-l", dumpFile) output, err := cmd.Output() if err != nil { // If pg_restore -l fails, it might not be custom format - skip continue } checkedCount++ // Check if output contains "BLOB" or "LARGE OBJECT" entries outputStr := string(output) if strings.Contains(outputStr, "BLOB") || strings.Contains(outputStr, "LARGE OBJECT") || strings.Contains(outputStr, " BLOBS ") { e.log.Info("Large objects detected in dump file", "file", entry.Name()) hasLargeObjects = true // Don't break - log all files with large objects } } if hasLargeObjects { e.log.Warn("Cluster contains databases with large objects - parallel restore may cause lock contention") } return hasLargeObjects } // isIgnorableError checks if an error message represents an ignorable PostgreSQL restore error func (e *Engine) isIgnorableError(errorMsg string) bool { // Convert to lowercase for case-insensitive matching lowerMsg := strings.ToLower(errorMsg) // CRITICAL: Syntax errors are NOT ignorable - indicates corrupted dump if strings.Contains(lowerMsg, "syntax error") { e.log.Error("CRITICAL: Syntax error in dump file - dump may be corrupted", "error", errorMsg) return false } // CRITICAL: If error count is extremely high (>100k), dump is likely corrupted if strings.Contains(errorMsg, "total errors:") { // Extract error count if present in message parts := strings.Split(errorMsg, "total errors:") if len(parts) > 1 { errorCountStr := strings.TrimSpace(strings.Split(parts[1], ")")[0]) // Try to parse as number var count int if _, err := fmt.Sscanf(errorCountStr, "%d", &count); err == nil && count > 100000 { e.log.Error("CRITICAL: Excessive errors indicate corrupted dump", "error_count", count) return false } } } // List of ignorable error patterns (objects that already exist or don't exist) ignorablePatterns := []string{ "already exists", "duplicate key", "does not exist, skipping", // For DROP IF EXISTS "no pg_hba.conf entry", // Permission warnings (not fatal) } for _, pattern := range ignorablePatterns { if strings.Contains(lowerMsg, pattern) { return true } } // Special handling for "role does not exist" - this is a warning, not fatal // Happens when globals.sql didn't contain a role that the dump references // The restore can continue, but ownership won't be preserved for that role if strings.Contains(lowerMsg, "role") && strings.Contains(lowerMsg, "does not exist") { e.log.Warn("Role referenced in dump does not exist - ownership won't be preserved", "error", errorMsg, "hint", "The role may not have been in globals.sql or globals restore failed") return true // Treat as ignorable - restore can continue } return false } // getFileSize returns the size of a file, or 0 if it can't be read func getFileSize(path string) int64 { info, err := os.Stat(path) if err != nil { return 0 } return info.Size() } // FormatBytes formats bytes to human readable format func FormatBytes(bytes int64) string { const unit = 1024 if bytes < unit { return fmt.Sprintf("%d B", bytes) } div, exp := int64(unit), 0 for n := bytes / unit; n >= unit; n /= unit { div *= unit exp++ } return fmt.Sprintf("%.1f %cB", float64(bytes)/float64(div), "KMGTPE"[exp]) } // formatDuration formats a duration to human readable format (e.g., "3m 45s", "1h 23m", "45s") func formatDuration(d time.Duration) string { if d < time.Second { return "0s" } hours := int(d.Hours()) minutes := int(d.Minutes()) % 60 seconds := int(d.Seconds()) % 60 if hours > 0 { return fmt.Sprintf("%dh %dm", hours, minutes) } if minutes > 0 { return fmt.Sprintf("%dm %ds", minutes, seconds) } return fmt.Sprintf("%ds", seconds) } // quickValidateSQLDump performs a fast validation of SQL dump files // by checking for truncated COPY blocks. This catches corrupted dumps // BEFORE attempting a full restore (which could waste 49+ minutes). func (e *Engine) quickValidateSQLDump(archivePath string, compressed bool) error { e.log.Debug("Pre-validating SQL dump file", "path", archivePath, "compressed", compressed) diagnoser := NewDiagnoser(e.log, false) // non-verbose for speed result, err := diagnoser.DiagnoseFile(archivePath) if err != nil { return fmt.Errorf("diagnosis error: %w", err) } // Check for critical issues that would cause restore failure if result.IsTruncated { errMsg := "SQL dump file is TRUNCATED" if result.Details != nil && result.Details.UnterminatedCopy { errMsg = fmt.Sprintf("%s - unterminated COPY block for table '%s' at line %d", errMsg, result.Details.LastCopyTable, result.Details.LastCopyLineNumber) if len(result.Details.SampleCopyData) > 0 { errMsg = fmt.Sprintf("%s (sample orphaned data: %s)", errMsg, result.Details.SampleCopyData[0]) } } return fmt.Errorf("%s", errMsg) } if result.IsCorrupted { return fmt.Errorf("SQL dump file is corrupted: %v", result.Errors) } if !result.IsValid { if len(result.Errors) > 0 { return fmt.Errorf("dump validation failed: %s", result.Errors[0]) } return fmt.Errorf("dump file is invalid (unknown reason)") } // Log any warnings but don't fail for _, warning := range result.Warnings { e.log.Warn("Dump validation warning", "warning", warning) } e.log.Debug("SQL dump validation passed", "path", archivePath) return nil } // OriginalSettings stores PostgreSQL settings to restore after operation type OriginalSettings struct { MaxLocks int MaintenanceWorkMem string } // boostPostgreSQLSettings boosts multiple PostgreSQL settings for large restores // NOTE: max_locks_per_transaction requires a PostgreSQL RESTART to take effect! // maintenance_work_mem can be changed with pg_reload_conf(). func (e *Engine) boostPostgreSQLSettings(ctx context.Context, lockBoostValue int) (*OriginalSettings, error) { if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] boostPostgreSQLSettings: Starting lock boost procedure", "target_lock_value", lockBoostValue) } connStr := e.buildConnString() db, err := sql.Open("pgx", connStr) if err != nil { if e.cfg.DebugLocks { e.log.Error("šŸ” [LOCK-DEBUG] Failed to connect to PostgreSQL", "error", err) } return nil, fmt.Errorf("failed to connect: %w", err) } defer db.Close() original := &OriginalSettings{} // Get current max_locks_per_transaction var maxLocksStr string if err := db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&maxLocksStr); err == nil { original.MaxLocks, _ = strconv.Atoi(maxLocksStr) } if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] Current PostgreSQL lock configuration", "current_max_locks", original.MaxLocks, "target_max_locks", lockBoostValue, "boost_required", original.MaxLocks < lockBoostValue) } // Get current maintenance_work_mem db.QueryRowContext(ctx, "SHOW maintenance_work_mem").Scan(&original.MaintenanceWorkMem) // CRITICAL: max_locks_per_transaction requires a PostgreSQL RESTART! // pg_reload_conf() is NOT sufficient for this parameter. needsRestart := false if original.MaxLocks < lockBoostValue { if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] Executing ALTER SYSTEM to boost locks", "from", original.MaxLocks, "to", lockBoostValue) } _, err = db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET max_locks_per_transaction = %d", lockBoostValue)) if err != nil { e.log.Warn("Could not set max_locks_per_transaction", "error", err) if e.cfg.DebugLocks { e.log.Error("šŸ” [LOCK-DEBUG] ALTER SYSTEM failed", "error", err) } } else { needsRestart = true e.log.Warn("max_locks_per_transaction requires PostgreSQL restart to take effect", "current", original.MaxLocks, "target", lockBoostValue) if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] ALTER SYSTEM succeeded - restart required", "setting_saved_to", "postgresql.auto.conf", "active_after", "PostgreSQL restart") } } } // Boost maintenance_work_mem to 2GB for faster index creation // (this one CAN be applied via pg_reload_conf) _, err = db.ExecContext(ctx, "ALTER SYSTEM SET maintenance_work_mem = '2GB'") if err != nil { e.log.Warn("Could not boost maintenance_work_mem", "error", err) } // Reload config to apply maintenance_work_mem _, err = db.ExecContext(ctx, "SELECT pg_reload_conf()") if err != nil { return original, fmt.Errorf("failed to reload config: %w", err) } // If max_locks_per_transaction needs a restart, try to do it if needsRestart { if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] Attempting PostgreSQL restart to activate new lock setting") } if restarted := e.tryRestartPostgreSQL(ctx); restarted { e.log.Info("PostgreSQL restarted successfully - max_locks_per_transaction now active") if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] PostgreSQL restart SUCCEEDED") } // Wait for PostgreSQL to be ready time.Sleep(3 * time.Second) // Update original.MaxLocks to reflect the new value after restart var newMaxLocksStr string if err := db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&newMaxLocksStr); err == nil { original.MaxLocks, _ = strconv.Atoi(newMaxLocksStr) e.log.Info("Verified new max_locks_per_transaction after restart", "value", original.MaxLocks) if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] Post-restart verification", "new_max_locks", original.MaxLocks, "target_was", lockBoostValue, "verification", "PASS") } } } else { // Cannot restart - this is now a CRITICAL failure // We tried to boost locks but can't apply them without restart e.log.Error("CRITICAL: max_locks_per_transaction boost requires PostgreSQL restart") e.log.Error("Current value: " + strconv.Itoa(original.MaxLocks) + ", required: " + strconv.Itoa(lockBoostValue)) e.log.Error("The setting has been saved to postgresql.auto.conf but is NOT ACTIVE") e.log.Error("Restore will ABORT to prevent 'out of shared memory' failure") e.log.Error("Action required: Ask DBA to restart PostgreSQL, then retry restore") if e.cfg.DebugLocks { e.log.Error("šŸ” [LOCK-DEBUG] PostgreSQL restart FAILED", "current_locks", original.MaxLocks, "required_locks", lockBoostValue, "setting_saved", true, "setting_active", false, "verdict", "ABORT - Manual restart required") } // Return original settings so caller can check and abort return original, nil } } if e.cfg.DebugLocks { e.log.Info("šŸ” [LOCK-DEBUG] boostPostgreSQLSettings: Complete", "final_max_locks", original.MaxLocks, "target_was", lockBoostValue, "boost_successful", original.MaxLocks >= lockBoostValue) } return original, nil } // canRestartPostgreSQL checks if we have the ability to restart PostgreSQL // Returns false if running in a restricted environment (e.g., su postgres on enterprise systems) func (e *Engine) canRestartPostgreSQL() bool { // Check if we're running as postgres user - if so, we likely can't restart // because PostgreSQL is managed by init/systemd, not directly by pg_ctl currentUser := os.Getenv("USER") if currentUser == "" { currentUser = os.Getenv("LOGNAME") } // If we're the postgres user, check if we have sudo access if currentUser == "postgres" { // Try a quick sudo check - if this fails, we can't restart ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) defer cancel() cmd := cleanup.SafeCommand(ctx, "sudo", "-n", "true") cmd.Stdin = nil if err := cmd.Run(); err != nil { e.log.Info("Running as postgres user without sudo access - cannot restart PostgreSQL", "user", currentUser, "hint", "Ask system administrator to restart PostgreSQL if needed") return false } } return true } // tryRestartPostgreSQL attempts to restart PostgreSQL using various methods // Returns true if restart was successful // IMPORTANT: Uses short timeouts and non-interactive sudo to avoid blocking on password prompts // NOTE: This function will return false immediately if running as postgres without sudo func (e *Engine) tryRestartPostgreSQL(ctx context.Context) bool { // First check if we can even attempt a restart if !e.canRestartPostgreSQL() { e.log.Info("Skipping PostgreSQL restart attempt (no privileges)") return false } e.progress.Update("Attempting PostgreSQL restart for lock settings...") // Use short timeout for each restart attempt (don't block on sudo password prompts) runWithTimeout := func(args ...string) bool { cmdCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() cmd := cleanup.SafeCommand(cmdCtx, args[0], args[1:]...) // Set stdin to /dev/null to prevent sudo from waiting for password cmd.Stdin = nil return cmd.Run() == nil } // Method 1: systemctl (most common on modern Linux) - use sudo -n for non-interactive if runWithTimeout("sudo", "-n", "systemctl", "restart", "postgresql") { return true } // Method 2: systemctl with version suffix (e.g., postgresql-15) for _, ver := range []string{"17", "16", "15", "14", "13", "12"} { if runWithTimeout("sudo", "-n", "systemctl", "restart", "postgresql-"+ver) { return true } } // Method 3: service command (older systems) if runWithTimeout("sudo", "-n", "service", "postgresql", "restart") { return true } // Method 4: pg_ctl as postgres user (if we ARE postgres user, no sudo needed) if runWithTimeout("pg_ctl", "restart", "-D", "/var/lib/postgresql/data", "-m", "fast") { return true } // Method 5: Try common PGDATA paths with pg_ctl directly (for postgres user) pgdataPaths := []string{ "/var/lib/pgsql/data", "/var/lib/pgsql/17/data", "/var/lib/pgsql/16/data", "/var/lib/pgsql/15/data", "/var/lib/postgresql/17/main", "/var/lib/postgresql/16/main", "/var/lib/postgresql/15/main", } for _, pgdata := range pgdataPaths { if runWithTimeout("pg_ctl", "restart", "-D", pgdata, "-m", "fast") { return true } } return false } // resetPostgreSQLSettings restores original PostgreSQL settings // NOTE: max_locks_per_transaction changes are written but require restart to take effect. // We don't restart here since we're done with the restore. func (e *Engine) resetPostgreSQLSettings(ctx context.Context, original *OriginalSettings) error { connStr := e.buildConnString() db, err := sql.Open("pgx", connStr) if err != nil { return fmt.Errorf("failed to connect: %w", err) } defer db.Close() // Reset max_locks_per_transaction (will take effect on next restart) if original.MaxLocks == 64 { // Default db.ExecContext(ctx, "ALTER SYSTEM RESET max_locks_per_transaction") } else if original.MaxLocks > 0 { db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET max_locks_per_transaction = %d", original.MaxLocks)) } // Reset maintenance_work_mem (takes effect immediately with reload) if original.MaintenanceWorkMem == "64MB" { // Default db.ExecContext(ctx, "ALTER SYSTEM RESET maintenance_work_mem") } else if original.MaintenanceWorkMem != "" { db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET maintenance_work_mem = '%s'", original.MaintenanceWorkMem)) } // Reload config (only maintenance_work_mem will take effect immediately) _, err = db.ExecContext(ctx, "SELECT pg_reload_conf()") if err != nil { return fmt.Errorf("failed to reload config: %w", err) } e.log.Info("PostgreSQL settings reset queued", "note", "max_locks_per_transaction will revert on next PostgreSQL restart") return nil }