dbbackup/internal/restore/engine.go
v5.8.22: Defensive fixes for potential restore hang issues
- Add context cancellation check during COPY data parsing loop
  (prevents hangs when parsing large tables with millions of rows)
- Add 5-second timeout for stderr reader in globals restore
  (prevents indefinite hang if psql process doesn't terminate cleanly)
- Reduce database drop timeout from 5 minutes to 60 seconds
  (improves TUI responsiveness during cluster cleanup)
2026-02-05 12:40:26 +00:00

package restore
import (
"archive/tar"
"bufio"
"context"
"database/sql"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"dbbackup/internal/checks"
"dbbackup/internal/cleanup"
"dbbackup/internal/config"
"dbbackup/internal/database"
"dbbackup/internal/engine/native"
"dbbackup/internal/fs"
"dbbackup/internal/logger"
"dbbackup/internal/progress"
"dbbackup/internal/security"
"github.com/hashicorp/go-multierror"
_ "github.com/jackc/pgx/v5/stdlib" // PostgreSQL driver
"github.com/klauspost/pgzip"
)
// ProgressCallback is called with progress updates during long operations
// Parameters: current bytes/items done, total bytes/items, description
type ProgressCallback func(current, total int64, description string)
// DatabaseProgressCallback is called with database count progress during cluster restore
type DatabaseProgressCallback func(done, total int, dbName string)
// DatabaseProgressWithTimingCallback is called with database progress including timing info
// Parameters: done count, total count, database name, elapsed time for current restore phase, avg duration per DB
type DatabaseProgressWithTimingCallback func(done, total int, dbName string, phaseElapsed, avgPerDB time.Duration)
// DatabaseProgressByBytesCallback is called with progress weighted by database sizes (bytes)
// Parameters: bytes completed, total bytes, current database name, databases done count, total database count
type DatabaseProgressByBytesCallback func(bytesDone, bytesTotal int64, dbName string, dbDone, dbTotal int)
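// Illustrative wiring of a progress callback from a TUI layer - a sketch,
// not part of the API surface (the percent rendering is an assumption):
//
//	eng.SetProgressCallback(func(current, total int64, desc string) {
//		if total > 0 {
//			fmt.Printf("\r%s: %.0f%%", desc, 100*float64(current)/float64(total))
//		}
//	})
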
// Engine handles database restore operations
type Engine struct {
cfg *config.Config
log logger.Logger
db database.Database
progress progress.Indicator
detailedReporter *progress.DetailedReporter
dryRun bool
silentMode bool // Suppress stdout output (for TUI mode)
debugLogPath string // Path to save debug log on error
// TUI progress callback for detailed progress reporting
progressCallback ProgressCallback
dbProgressCallback DatabaseProgressCallback
dbProgressTimingCallback DatabaseProgressWithTimingCallback
dbProgressByBytesCallback DatabaseProgressByBytesCallback
}
// New creates a new restore engine
func New(cfg *config.Config, log logger.Logger, db database.Database) *Engine {
progressIndicator := progress.NewIndicator(true, "line")
detailedReporter := progress.NewDetailedReporter(progressIndicator, &loggerAdapter{logger: log})
return &Engine{
cfg: cfg,
log: log,
db: db,
progress: progressIndicator,
detailedReporter: detailedReporter,
dryRun: false,
}
}
// NewSilent creates a new restore engine with no stdout progress (for TUI mode)
func NewSilent(cfg *config.Config, log logger.Logger, db database.Database) *Engine {
progressIndicator := progress.NewNullIndicator()
detailedReporter := progress.NewDetailedReporter(progressIndicator, &loggerAdapter{logger: log})
return &Engine{
cfg: cfg,
log: log,
db: db,
progress: progressIndicator,
detailedReporter: detailedReporter,
dryRun: false,
silentMode: true, // Suppress stdout for TUI
}
}
// NewWithProgress creates a restore engine with custom progress indicator
func NewWithProgress(cfg *config.Config, log logger.Logger, db database.Database, progressIndicator progress.Indicator, dryRun bool) *Engine {
if progressIndicator == nil {
progressIndicator = progress.NewNullIndicator()
}
detailedReporter := progress.NewDetailedReporter(progressIndicator, &loggerAdapter{logger: log})
return &Engine{
cfg: cfg,
log: log,
db: db,
progress: progressIndicator,
detailedReporter: detailedReporter,
dryRun: dryRun,
}
}
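// Typical construction and use - a minimal sketch assuming cfg, log, and db
// are initialized elsewhere (paths and names are placeholders):
//
//	eng := New(cfg, log, db)
//	eng.SetDebugLogPath("/tmp/restore-debug.log")
//	if err := eng.RestoreSingle(ctx, "/backups/app.dump", "app", false, true); err != nil {
//		log.Error("restore failed", "error", err)
//	}
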
// SetDebugLogPath enables saving detailed error reports on failure
func (e *Engine) SetDebugLogPath(path string) {
e.debugLogPath = path
}
// SetProgressCallback sets a callback for detailed progress reporting (for TUI mode)
func (e *Engine) SetProgressCallback(cb ProgressCallback) {
e.progressCallback = cb
}
// SetDatabaseProgressCallback sets a callback for database count progress during cluster restore
func (e *Engine) SetDatabaseProgressCallback(cb DatabaseProgressCallback) {
e.dbProgressCallback = cb
}
// SetDatabaseProgressWithTimingCallback sets a callback for database progress with timing info
func (e *Engine) SetDatabaseProgressWithTimingCallback(cb DatabaseProgressWithTimingCallback) {
e.dbProgressTimingCallback = cb
}
// SetDatabaseProgressByBytesCallback sets a callback for progress weighted by database sizes
func (e *Engine) SetDatabaseProgressByBytesCallback(cb DatabaseProgressByBytesCallback) {
e.dbProgressByBytesCallback = cb
}
// reportProgress safely calls the progress callback if set
func (e *Engine) reportProgress(current, total int64, description string) {
if e.progressCallback != nil {
e.progressCallback(current, total, description)
}
}
// reportDatabaseProgress safely calls the database progress callback if set
func (e *Engine) reportDatabaseProgress(done, total int, dbName string) {
// CRITICAL: Add panic recovery to prevent crashes during TUI shutdown
defer func() {
if r := recover(); r != nil {
e.log.Warn("Database progress callback panic recovered", "panic", r, "db", dbName)
}
}()
if e.dbProgressCallback != nil {
e.dbProgressCallback(done, total, dbName)
}
}
// reportDatabaseProgressWithTiming safely calls the timing-aware callback if set
func (e *Engine) reportDatabaseProgressWithTiming(done, total int, dbName string, phaseElapsed, avgPerDB time.Duration) {
// CRITICAL: Add panic recovery to prevent crashes during TUI shutdown
defer func() {
if r := recover(); r != nil {
e.log.Warn("Database timing progress callback panic recovered", "panic", r, "db", dbName)
}
}()
if e.dbProgressTimingCallback != nil {
e.dbProgressTimingCallback(done, total, dbName, phaseElapsed, avgPerDB)
}
}
// reportDatabaseProgressByBytes safely calls the bytes-weighted callback if set
func (e *Engine) reportDatabaseProgressByBytes(bytesDone, bytesTotal int64, dbName string, dbDone, dbTotal int) {
// CRITICAL: Add panic recovery to prevent crashes during TUI shutdown
defer func() {
if r := recover(); r != nil {
e.log.Warn("Database bytes progress callback panic recovered", "panic", r, "db", dbName)
}
}()
if e.dbProgressByBytesCallback != nil {
e.dbProgressByBytesCallback(bytesDone, bytesTotal, dbName, dbDone, dbTotal)
}
}
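// The three wrappers above repeat the same recover-and-log shape. A generic
// helper could consolidate them - a hypothetical sketch, not used below:
//
//	func (e *Engine) safeInvoke(name string, fn func()) {
//		defer func() {
//			if r := recover(); r != nil {
//				e.log.Warn("Progress callback panic recovered", "callback", name, "panic", r)
//			}
//		}()
//		fn()
//	}
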
// loggerAdapter adapts our logger to the progress.Logger interface
type loggerAdapter struct {
logger logger.Logger
}
func (la *loggerAdapter) Info(msg string, args ...any) {
la.logger.Info(msg, args...)
}
func (la *loggerAdapter) Warn(msg string, args ...any) {
la.logger.Warn(msg, args...)
}
func (la *loggerAdapter) Error(msg string, args ...any) {
la.logger.Error(msg, args...)
}
func (la *loggerAdapter) Debug(msg string, args ...any) {
la.logger.Debug(msg, args...)
}
// RestoreSingle restores a single database from an archive
func (e *Engine) RestoreSingle(ctx context.Context, archivePath, targetDB string, cleanFirst, createIfMissing bool) error {
operation := e.log.StartOperation("Single Database Restore")
startTime := time.Now()
// Validate and sanitize archive path
validArchivePath, pathErr := security.ValidateArchivePath(archivePath)
if pathErr != nil {
operation.Fail(fmt.Sprintf("Invalid archive path: %v", pathErr))
return fmt.Errorf("invalid archive path: %w", pathErr)
}
archivePath = validArchivePath
// Get archive size for metrics
var archiveSize int64
if fi, err := os.Stat(archivePath); err == nil {
archiveSize = fi.Size()
}
// Validate archive exists
if _, err := os.Stat(archivePath); os.IsNotExist(err) {
operation.Fail("Archive not found")
return fmt.Errorf("archive not found: %s", archivePath)
}
// Verify checksum if .sha256 file exists
if checksumErr := security.LoadAndVerifyChecksum(archivePath); checksumErr != nil {
e.log.Warn("Checksum verification failed", "error", checksumErr)
e.log.Warn("Continuing restore without checksum verification (use with caution)")
} else {
e.log.Info("[OK] Archive checksum verified successfully")
}
// Detect archive format
format := DetectArchiveFormat(archivePath)
e.log.Info("Detected archive format", "format", format, "path", archivePath)
// Check version compatibility for PostgreSQL dumps
if format == FormatPostgreSQLDump || format == FormatPostgreSQLDumpGz {
if compatResult, err := e.CheckRestoreVersionCompatibility(ctx, archivePath); err == nil && compatResult != nil {
e.log.Info(compatResult.Message,
"source_version", compatResult.SourceVersion.Full,
"target_version", compatResult.TargetVersion.Full,
"compatibility", compatResult.Level.String())
// Block unsupported downgrades
if !compatResult.Compatible {
operation.Fail(compatResult.Message)
return fmt.Errorf("version compatibility error: %s", compatResult.Message)
}
// Show warnings for risky upgrades
if compatResult.Level == CompatibilityLevelRisky || compatResult.Level == CompatibilityLevelWarning {
for _, warning := range compatResult.Warnings {
e.log.Warn(warning)
}
}
}
}
if e.dryRun {
e.log.Info("DRY RUN: Would restore single database", "archive", archivePath, "target", targetDB)
return e.previewRestore(archivePath, targetDB, format)
}
// Start progress tracking
e.progress.Start(fmt.Sprintf("Restoring database '%s' from %s", targetDB, filepath.Base(archivePath)))
// Create database if requested and it doesn't exist
if createIfMissing {
e.log.Info("Checking if target database exists", "database", targetDB)
if err := e.ensureDatabaseExists(ctx, targetDB); err != nil {
operation.Fail(fmt.Sprintf("Failed to create database: %v", err))
return fmt.Errorf("failed to create database '%s': %w", targetDB, err)
}
}
// Handle different archive formats
var err error
switch format {
case FormatPostgreSQLDump, FormatPostgreSQLDumpGz:
err = e.restorePostgreSQLDump(ctx, archivePath, targetDB, format == FormatPostgreSQLDumpGz, cleanFirst)
case FormatPostgreSQLSQL, FormatPostgreSQLSQLGz:
err = e.restorePostgreSQLSQL(ctx, archivePath, targetDB, format == FormatPostgreSQLSQLGz)
case FormatMySQLSQL, FormatMySQLSQLGz:
err = e.restoreMySQLSQL(ctx, archivePath, targetDB, format == FormatMySQLSQLGz)
default:
operation.Fail("Unsupported archive format")
return fmt.Errorf("unsupported archive format: %s", format)
}
// Record restore metrics for Prometheus
duration := time.Since(startTime)
dbType := "postgresql"
if format == FormatMySQLSQL || format == FormatMySQLSQLGz {
dbType = "mysql"
}
record := RestoreRecord{
Database: targetDB,
Engine: dbType,
StartedAt: startTime,
CompletedAt: time.Now(),
Duration: duration,
SizeBytes: archiveSize,
ParallelJobs: e.cfg.Jobs,
Profile: e.cfg.ResourceProfile,
Success: err == nil,
SourceFile: filepath.Base(archivePath),
TargetDB: targetDB,
IsCluster: false,
}
if err != nil {
record.ErrorMessage = err.Error()
}
if recordErr := RecordRestore(record); recordErr != nil {
e.log.Warn("Failed to record restore metrics", "error", recordErr)
}
if err != nil {
e.progress.Fail(fmt.Sprintf("Restore failed: %v", err))
operation.Fail(fmt.Sprintf("Restore failed: %v", err))
return err
}
e.progress.Complete(fmt.Sprintf("Database '%s' restored successfully", targetDB))
operation.Complete(fmt.Sprintf("Restored database '%s' from %s", targetDB, filepath.Base(archivePath)))
return nil
}
// restorePostgreSQLDump restores from PostgreSQL custom dump format
func (e *Engine) restorePostgreSQLDump(ctx context.Context, archivePath, targetDB string, compressed bool, cleanFirst bool) error {
// Build restore command
// Use configured Jobs count for parallel pg_restore (matches pg_restore -j behavior)
parallelJobs := e.cfg.Jobs
if parallelJobs <= 0 {
parallelJobs = 1 // Default fallback
}
opts := database.RestoreOptions{
Parallel: parallelJobs,
Clean: cleanFirst,
NoOwner: true,
NoPrivileges: true,
SingleTransaction: false, // CRITICAL: Disabled to prevent lock exhaustion with large objects
Verbose: true, // Enable verbose for single database restores (not cluster)
}
cmd := e.db.BuildRestoreCommand(targetDB, archivePath, opts)
// Start heartbeat ticker for restore progress (10s interval to reduce overhead)
restoreStart := time.Now()
heartbeatCtx, cancelHeartbeat := context.WithCancel(ctx)
heartbeatTicker := time.NewTicker(10 * time.Second)
defer heartbeatTicker.Stop()
defer cancelHeartbeat()
// Run heartbeat in background - no mutex needed as progress.Update is thread-safe
go func() {
for {
select {
case <-heartbeatTicker.C:
elapsed := time.Since(restoreStart)
e.progress.Update(fmt.Sprintf("Restoring %s... (elapsed: %s)", targetDB, formatDuration(elapsed)))
case <-heartbeatCtx.Done():
return
}
}
}()
if compressed {
// For compressed dumps, decompress first
return e.executeRestoreWithDecompression(ctx, archivePath, cmd)
}
return e.executeRestoreCommand(ctx, cmd)
}
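// For reference, the options above translate to a pg_restore invocation of
// roughly this shape (the exact flags come from database.BuildRestoreCommand):
//
//	pg_restore --jobs=N [--clean] --no-owner --no-privileges --verbose -d <targetDB> <archive>
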
// restorePostgreSQLDumpWithOwnership restores from PostgreSQL custom dump with ownership control
func (e *Engine) restorePostgreSQLDumpWithOwnership(ctx context.Context, archivePath, targetDB string, compressed bool, preserveOwnership bool) error {
// Check if dump contains large objects (BLOBs) - if so, use phased restore
// to prevent lock table exhaustion (max_locks_per_transaction OOM)
hasLargeObjects := e.checkDumpHasLargeObjects(archivePath)
if hasLargeObjects {
e.log.Info("Large objects detected - using phased restore to prevent lock exhaustion",
"database", targetDB,
"archive", archivePath)
return e.restorePostgreSQLDumpPhased(ctx, archivePath, targetDB, preserveOwnership)
}
// Standard restore for dumps without large objects
// Use configured Jobs count for parallel pg_restore (matches pg_restore -j behavior)
parallelJobs := e.cfg.Jobs
if parallelJobs <= 0 {
parallelJobs = 1 // Default fallback
}
opts := database.RestoreOptions{
Parallel: parallelJobs,
Clean: false, // We already dropped the database
NoOwner: !preserveOwnership, // Preserve ownership if we're superuser
NoPrivileges: !preserveOwnership, // Preserve privileges if we're superuser
SingleTransaction: false, // CRITICAL: Disabled to prevent lock exhaustion with large objects
Verbose: false, // CRITICAL: disable verbose to prevent OOM on large restores
}
e.log.Info("Restoring database",
"database", targetDB,
"parallel_jobs", parallelJobs,
"preserveOwnership", preserveOwnership,
"noOwner", opts.NoOwner,
"noPrivileges", opts.NoPrivileges)
cmd := e.db.BuildRestoreCommand(targetDB, archivePath, opts)
if compressed {
// For compressed dumps, decompress first
return e.executeRestoreWithDecompression(ctx, archivePath, cmd)
}
return e.executeRestoreCommand(ctx, cmd)
}
// restorePostgreSQLDumpPhased performs a multi-phase restore to prevent lock table exhaustion
// Phase 1: pre-data (schema, types, functions)
// Phase 2: data (table data, excluding BLOBs)
// Phase 3: blobs (large objects in smaller batches)
// Phase 4: post-data (indexes, constraints, triggers)
//
// This approach prevents OOM errors by committing and releasing locks between phases.
func (e *Engine) restorePostgreSQLDumpPhased(ctx context.Context, archivePath, targetDB string, preserveOwnership bool) error {
e.log.Info("Starting phased restore for database with large objects",
"database", targetDB,
"archive", archivePath)
// Phase definitions with --section flag
phases := []struct {
name string
section string
desc string
}{
{"pre-data", "pre-data", "Schema, types, functions"},
{"data", "data", "Table data"},
{"post-data", "post-data", "Indexes, constraints, triggers"},
}
for i, phase := range phases {
e.log.Info(fmt.Sprintf("Phase %d/%d: Restoring %s", i+1, len(phases), phase.name),
"database", targetDB,
"section", phase.section,
"description", phase.desc)
if err := e.restoreSection(ctx, archivePath, targetDB, phase.section, preserveOwnership); err != nil {
// Check if it's an ignorable error
if e.isIgnorableError(err.Error()) {
e.log.Warn(fmt.Sprintf("Phase %d completed with ignorable errors", i+1),
"section", phase.section,
"error", err)
continue
}
return fmt.Errorf("phase %d (%s) failed: %w", i+1, phase.name, err)
}
e.log.Info(fmt.Sprintf("Phase %d/%d completed successfully", i+1, len(phases)),
"section", phase.section)
}
e.log.Info("Phased restore completed successfully", "database", targetDB)
return nil
}
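// The phased approach is roughly equivalent to three sequential pg_restore
// runs, with locks committed and released between them (illustrative):
//
//	pg_restore --section=pre-data  -d <db> <archive>   # schema, types, functions
//	pg_restore --section=data      -d <db> <archive>   # table data
//	pg_restore --section=post-data -d <db> <archive>   # indexes, constraints, triggers
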
// restoreSection restores a specific section of a PostgreSQL dump
func (e *Engine) restoreSection(ctx context.Context, archivePath, targetDB, section string, preserveOwnership bool) error {
// Build pg_restore command with --section flag
args := []string{"pg_restore"}
// Connection parameters (kept consistent with the other restore paths:
// skip -h for localhost/empty host so the Unix socket is used, but always
// pass the configured port)
if e.cfg.Host != "localhost" && e.cfg.Host != "" {
args = append(args, "-h", e.cfg.Host)
args = append(args, "--no-password")
}
args = append(args, "-p", fmt.Sprintf("%d", e.cfg.Port))
args = append(args, "-U", e.cfg.User)
// CRITICAL: Use configured Jobs for parallel restore (fixes slow phased restores)
parallelJobs := e.cfg.Jobs
if parallelJobs <= 0 {
parallelJobs = 1
}
args = append(args, fmt.Sprintf("--jobs=%d", parallelJobs))
e.log.Info("Phased restore section", "section", section, "parallel_jobs", parallelJobs)
// Section-specific restore
args = append(args, "--section="+section)
// Options
if !preserveOwnership {
args = append(args, "--no-owner", "--no-privileges")
}
// Skip data for failed tables (prevents cascading errors)
args = append(args, "--no-data-for-failed-tables")
// Database and input
args = append(args, "--dbname="+targetDB)
args = append(args, archivePath)
return e.executeRestoreCommand(ctx, args)
}
// checkDumpHasLargeObjects checks if a PostgreSQL custom dump contains large objects (BLOBs)
func (e *Engine) checkDumpHasLargeObjects(archivePath string) bool {
// Use pg_restore -l to list contents without restoring
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
cmd := cleanup.SafeCommand(ctx, "pg_restore", "-l", archivePath)
output, err := cmd.Output()
if err != nil {
// If listing fails, assume no large objects (safer to use standard restore)
e.log.Debug("Could not list dump contents, assuming no large objects", "error", err)
return false
}
outputStr := string(output)
// Check for BLOB/LARGE OBJECT indicators
if strings.Contains(outputStr, "BLOB") ||
strings.Contains(outputStr, "LARGE OBJECT") ||
strings.Contains(outputStr, " BLOBS ") ||
strings.Contains(outputStr, "lo_create") {
return true
}
return false
}
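// An illustrative pg_restore -l TOC line that would trigger the detection
// above (the exact format varies by PostgreSQL version; newer versions
// print "LARGE OBJECT" instead of "BLOB"):
//
//	3756; 2613 16390 BLOB 16390 app_owner
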
// restorePostgreSQLSQL restores from PostgreSQL SQL script
func (e *Engine) restorePostgreSQLSQL(ctx context.Context, archivePath, targetDB string, compressed bool) error {
// Pre-validate SQL dump to detect truncation BEFORE attempting restore
// This saves time by catching corrupted files early (vs 49min failures)
if err := e.quickValidateSQLDump(archivePath, compressed); err != nil {
e.log.Error("Pre-restore validation failed - dump file appears corrupted",
"file", archivePath,
"error", err)
return fmt.Errorf("dump validation failed: %w - the backup file may be truncated or corrupted", err)
}
// USE NATIVE ENGINE if configured
// This uses pure Go (pgx) instead of psql
if e.cfg.UseNativeEngine {
e.log.Info("Using native Go engine for restore", "database", targetDB, "file", archivePath)
nativeErr := e.restoreWithNativeEngine(ctx, archivePath, targetDB, compressed)
if nativeErr != nil {
if e.cfg.FallbackToTools {
e.log.Warn("Native restore failed, falling back to psql", "database", targetDB, "error", nativeErr)
} else {
return fmt.Errorf("native restore failed: %w", nativeErr)
}
} else {
return nil // Native restore succeeded!
}
}
// Use psql for SQL scripts (fallback or non-native mode)
var cmd []string
// For localhost (or an empty host), omit -h to use the Unix socket (avoids Ident auth issues)
useHostFlag := e.cfg.Host != "localhost" && e.cfg.Host != ""
if compressed {
// Use in-process pgzip decompression (parallel, no external process)
return e.executeRestoreWithPgzipStream(ctx, archivePath, targetDB, "postgresql")
} else {
// NOTE: We intentionally do NOT use ON_ERROR_STOP=1 - the restore should
// continue past ignorable errors (matching pg_restore's resilient behavior)
if useHostFlag {
cmd = []string{
"psql",
"-h", e.cfg.Host,
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", targetDB,
"-f", archivePath,
}
} else {
cmd = []string{
"psql",
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", targetDB,
"-f", archivePath,
}
}
}
return e.executeRestoreCommand(ctx, cmd)
}
// restoreWithNativeEngine restores a SQL file using the pure Go native engine
func (e *Engine) restoreWithNativeEngine(ctx context.Context, archivePath, targetDB string, compressed bool) error {
// Create native engine config
nativeCfg := &native.PostgreSQLNativeConfig{
Host: e.cfg.Host,
Port: e.cfg.Port,
User: e.cfg.User,
Password: e.cfg.Password,
Database: targetDB, // Connect to target database
SSLMode: e.cfg.SSLMode,
}
// Use PARALLEL restore engine for SQL format - this matches pg_restore -j performance!
// The parallel engine:
// 1. Executes schema statements sequentially (CREATE TABLE, etc.)
// 2. Executes COPY data loading in PARALLEL (like pg_restore -j8)
// 3. Creates indexes and constraints in PARALLEL
parallelWorkers := e.cfg.Jobs
if parallelWorkers < 1 {
parallelWorkers = 4
}
e.log.Info("Using PARALLEL native restore engine",
"workers", parallelWorkers,
"database", targetDB,
"archive", archivePath)
// Pass context to ensure pool is properly closed on Ctrl+C cancellation
parallelEngine, err := native.NewParallelRestoreEngineWithContext(ctx, nativeCfg, e.log, parallelWorkers)
if err != nil {
e.log.Warn("Failed to create parallel restore engine, falling back to sequential", "error", err)
// Fall back to sequential restore
return e.restoreWithSequentialNativeEngine(ctx, archivePath, targetDB, compressed)
}
defer parallelEngine.Close()
// Run parallel restore with progress callbacks
options := &native.ParallelRestoreOptions{
Workers: parallelWorkers,
ContinueOnError: true,
ProgressCallback: func(phase string, current, total int, tableName string) {
switch phase {
case "parsing":
e.log.Debug("Parsing SQL dump...")
case "schema":
if current%50 == 0 {
e.log.Debug("Creating schema", "progress", current, "total", total)
}
case "data":
e.log.Debug("Loading data", "table", tableName, "progress", current, "total", total)
// Report progress to TUI
e.reportDatabaseProgress(current, total, tableName)
case "indexes":
e.log.Debug("Creating indexes", "progress", current, "total", total)
}
},
}
result, err := parallelEngine.RestoreFile(ctx, archivePath, options)
if err != nil {
return fmt.Errorf("parallel native restore failed: %w", err)
}
e.log.Info("Parallel native restore completed",
"database", targetDB,
"tables", result.TablesRestored,
"rows", result.RowsRestored,
"indexes", result.IndexesCreated,
"duration", result.Duration)
return nil
}
// restoreWithSequentialNativeEngine is the fallback sequential restore
func (e *Engine) restoreWithSequentialNativeEngine(ctx context.Context, archivePath, targetDB string, compressed bool) error {
nativeCfg := &native.PostgreSQLNativeConfig{
Host: e.cfg.Host,
Port: e.cfg.Port,
User: e.cfg.User,
Password: e.cfg.Password,
Database: targetDB,
SSLMode: e.cfg.SSLMode,
}
// Create restore engine
restoreEngine, err := native.NewPostgreSQLRestoreEngine(nativeCfg, e.log)
if err != nil {
return fmt.Errorf("failed to create native restore engine: %w", err)
}
defer restoreEngine.Close()
// Open input file
file, err := os.Open(archivePath)
if err != nil {
return fmt.Errorf("failed to open backup file: %w", err)
}
defer file.Close()
var reader io.Reader = file
// Handle compression
if compressed {
gzReader, err := pgzip.NewReader(file)
if err != nil {
return fmt.Errorf("failed to create gzip reader: %w", err)
}
defer gzReader.Close()
reader = gzReader
}
// Restore with progress tracking
options := &native.RestoreOptions{
Database: targetDB,
ContinueOnError: true, // Be resilient like pg_restore
ProgressCallback: func(progress *native.RestoreProgress) {
e.log.Debug("Native restore progress",
"operation", progress.Operation,
"objects", progress.ObjectsCompleted,
"rows", progress.RowsProcessed)
},
}
result, err := restoreEngine.Restore(ctx, reader, options)
if err != nil {
return fmt.Errorf("native restore failed: %w", err)
}
e.log.Info("Native restore completed",
"database", targetDB,
"objects", result.ObjectsProcessed,
"duration", result.Duration)
return nil
}
// restoreMySQLSQL restores from MySQL SQL script
func (e *Engine) restoreMySQLSQL(ctx context.Context, archivePath, targetDB string, compressed bool) error {
options := database.RestoreOptions{}
cmd := e.db.BuildRestoreCommand(targetDB, archivePath, options)
if compressed {
// Use in-process pgzip decompression (parallel, no external process)
return e.executeRestoreWithPgzipStream(ctx, archivePath, targetDB, "mysql")
}
return e.executeRestoreCommand(ctx, cmd)
}
// executeRestoreCommand executes a restore command
func (e *Engine) executeRestoreCommand(ctx context.Context, cmdArgs []string) error {
return e.executeRestoreCommandWithContext(ctx, cmdArgs, "", "", FormatUnknown)
}
// executeRestoreCommandWithContext executes a restore command with error collection context
func (e *Engine) executeRestoreCommandWithContext(ctx context.Context, cmdArgs []string, archivePath, targetDB string, format ArchiveFormat) error {
e.log.Info("Executing restore command", "command", strings.Join(cmdArgs, " "))
cmd := cleanup.SafeCommand(ctx, cmdArgs[0], cmdArgs[1:]...)
// Set environment variables
cmd.Env = append(os.Environ(),
fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password),
fmt.Sprintf("MYSQL_PWD=%s", e.cfg.Password),
)
// Create error collector if debug log path is set
var collector *ErrorCollector
if e.debugLogPath != "" {
collector = NewErrorCollector(e.cfg, e.log, archivePath, targetDB, format, true)
}
// Stream stderr to avoid memory issues with large output
// Don't use CombinedOutput() as it loads everything into memory
stderr, err := cmd.StderrPipe()
if err != nil {
return fmt.Errorf("failed to create stderr pipe: %w", err)
}
if err := cmd.Start(); err != nil {
return fmt.Errorf("failed to start restore command: %w", err)
}
// Read stderr in goroutine to avoid blocking
var lastError string
var errorCount int
stderrDone := make(chan struct{})
go func() {
defer close(stderrDone)
buf := make([]byte, 4096)
const maxErrors = 10 // Limit captured errors to prevent OOM
for {
n, err := stderr.Read(buf)
if n > 0 {
chunk := string(buf[:n])
// Feed to error collector if enabled
if collector != nil {
collector.CaptureStderr(chunk)
}
// Only capture REAL errors, not verbose output
if strings.Contains(chunk, "ERROR:") || strings.Contains(chunk, "FATAL:") || strings.Contains(chunk, "error:") {
lastError = strings.TrimSpace(chunk)
errorCount++
if errorCount <= maxErrors {
e.log.Warn("Restore stderr", "output", chunk)
}
}
// Note: --verbose output is discarded to prevent OOM
}
if err != nil {
break
}
}
}()
// Wait for command with proper context handling
cmdDone := make(chan error, 1)
go func() {
cmdDone <- cmd.Wait()
}()
var cmdErr error
select {
case cmdErr = <-cmdDone:
// Command completed (success or failure)
case <-ctx.Done():
// Context cancelled - kill entire process group
e.log.Warn("Restore cancelled - killing process group")
cleanup.KillCommandGroup(cmd)
<-cmdDone
cmdErr = ctx.Err()
}
// Wait for stderr reader to finish
<-stderrDone
if cmdErr != nil {
// Get exit code
exitCode := 1
if exitErr, ok := cmdErr.(*exec.ExitError); ok {
exitCode = exitErr.ExitCode()
}
// PostgreSQL pg_restore returns exit code 1 even for ignorable errors
// Check if errors are ignorable (already exists, duplicate, etc.)
if lastError != "" && e.isIgnorableError(lastError) {
e.log.Warn("Restore completed with ignorable errors", "error_count", errorCount, "last_error", lastError)
return nil // Success despite ignorable errors
}
// Classify error and provide helpful hints
var classification *checks.ErrorClassification
var errType, errHint string
if lastError != "" {
classification = checks.ClassifyError(lastError)
errType = classification.Type
errHint = classification.Hint
// CRITICAL: Detect "out of shared memory" / lock exhaustion errors
// This means max_locks_per_transaction is insufficient
if strings.Contains(lastError, "out of shared memory") ||
strings.Contains(lastError, "max_locks_per_transaction") {
e.log.Error("🔴 LOCK EXHAUSTION DETECTED during restore - this should have been prevented",
"last_error", lastError,
"database", targetDB,
"action", "Report this to developers - preflight checks should have caught this")
// Return a special error that signals lock exhaustion
// The caller can decide to retry with reduced parallelism
return fmt.Errorf("LOCK_EXHAUSTION: %s - max_locks_per_transaction insufficient (error: %w)", lastError, cmdErr)
}
e.log.Error("Restore command failed",
"error", err,
"last_stderr", lastError,
"error_count", errorCount,
"error_type", classification.Type,
"hint", classification.Hint,
"action", classification.Action)
} else {
e.log.Error("Restore command failed", "error", err, "error_count", errorCount)
}
// Generate and save error report if collector is enabled
if collector != nil {
collector.SetExitCode(exitCode)
report := collector.GenerateReport(
lastError,
errType,
errHint,
)
// Print report to console
collector.PrintReport(report)
// Save to file
if e.debugLogPath != "" {
if saveErr := collector.SaveReport(report, e.debugLogPath); saveErr != nil {
e.log.Warn("Failed to save debug log", "error", saveErr)
} else {
e.log.Info("Debug log saved", "path", e.debugLogPath)
fmt.Printf("\n[LOG] Detailed error report saved to: %s\n", e.debugLogPath)
}
}
}
if lastError != "" {
return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d) - %s",
err, lastError, errorCount, errHint)
}
return fmt.Errorf("restore failed: %w", err)
}
e.log.Info("Restore command completed successfully")
return nil
}
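// Illustrative stderr lines that isIgnorableError is expected to accept
// (typical output when objects already exist; examples are assumptions,
// the actual matching lives in isIgnorableError):
//
//	ERROR:  role "app_user" already exists
//	ERROR:  relation "users" already exists
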
// executeRestoreWithDecompression handles decompression during restore using in-process pgzip
func (e *Engine) executeRestoreWithDecompression(ctx context.Context, archivePath string, restoreCmd []string) error {
e.log.Info("Using in-process pgzip decompression (parallel)", "archive", archivePath)
// Open the gzip file
file, err := os.Open(archivePath)
if err != nil {
return fmt.Errorf("failed to open archive: %w", err)
}
defer file.Close()
// Create parallel gzip reader
gz, err := pgzip.NewReader(file)
if err != nil {
return fmt.Errorf("failed to create pgzip reader: %w", err)
}
defer gz.Close()
// Start restore command
cmd := cleanup.SafeCommand(ctx, restoreCmd[0], restoreCmd[1:]...)
cmd.Env = append(os.Environ(),
fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password),
fmt.Sprintf("MYSQL_PWD=%s", e.cfg.Password),
)
// Pipe decompressed data to restore command stdin
stdin, err := cmd.StdinPipe()
if err != nil {
return fmt.Errorf("failed to create stdin pipe: %w", err)
}
// Capture stderr
stderr, err := cmd.StderrPipe()
if err != nil {
return fmt.Errorf("failed to create stderr pipe: %w", err)
}
if err := cmd.Start(); err != nil {
return fmt.Errorf("failed to start restore command: %w", err)
}
// Stream decompressed data to restore command in goroutine
copyDone := make(chan error, 1)
go func() {
_, copyErr := fs.CopyWithContext(ctx, stdin, gz)
stdin.Close()
copyDone <- copyErr
}()
// Read stderr in goroutine
var lastError string
var errorCount int
stderrDone := make(chan struct{})
go func() {
defer close(stderrDone)
scanner := bufio.NewScanner(stderr)
// Increase buffer size for long lines
buf := make([]byte, 64*1024)
scanner.Buffer(buf, 1024*1024)
for scanner.Scan() {
line := scanner.Text()
if strings.Contains(strings.ToLower(line), "error") ||
strings.Contains(line, "ERROR") ||
strings.Contains(line, "FATAL") {
lastError = line
errorCount++
e.log.Debug("Restore stderr", "line", line)
}
}
}()
// Wait for copy to complete
copyErr := <-copyDone
// Wait for command
cmdErr := cmd.Wait()
<-stderrDone
if copyErr != nil && cmdErr == nil {
return fmt.Errorf("decompression failed: %w", copyErr)
}
if cmdErr != nil {
if lastError != "" && e.isIgnorableError(lastError) {
e.log.Warn("Restore completed with ignorable errors", "error_count", errorCount)
return nil
}
if lastError != "" {
classification := checks.ClassifyError(lastError)
return fmt.Errorf("restore failed: %w (last error: %s) - %s", cmdErr, lastError, classification.Hint)
}
return fmt.Errorf("restore failed: %w", cmdErr)
}
e.log.Info("Restore with pgzip decompression completed successfully")
return nil
}
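// The streaming above is the in-process equivalent of a shell pipeline such
// as `gunzip -c archive.dump.gz | pg_restore ...`, except that pgzip
// decompresses across multiple goroutines, so it is not bottlenecked on a
// single core the way a plain gunzip pipe would be.
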
// executeRestoreWithPgzipStream handles SQL restore with in-process pgzip decompression
func (e *Engine) executeRestoreWithPgzipStream(ctx context.Context, archivePath, targetDB, dbType string) error {
e.log.Info("Using in-process pgzip stream for SQL restore", "archive", archivePath, "database", targetDB, "type", dbType)
// Open the gzip file
file, err := os.Open(archivePath)
if err != nil {
return fmt.Errorf("failed to open archive: %w", err)
}
defer file.Close()
// Create parallel gzip reader
gz, err := pgzip.NewReader(file)
if err != nil {
return fmt.Errorf("failed to create pgzip reader: %w", err)
}
defer gz.Close()
// Build restore command based on database type
var cmd *exec.Cmd
if dbType == "postgresql" {
// Performance tuning wish list for bulk loading. Only the session-level
// settings (synchronous_commit, work_mem, maintenance_work_mem) are applied
// below; the server-level ones (wal_level, fsync, full_page_writes,
// checkpoint_timeout, max_wal_size) cannot be changed with SET at all and
// are kept here for reference only.
preamble := `
SET synchronous_commit = 'off';
SET work_mem = '256MB';
SET maintenance_work_mem = '1GB';
SET max_parallel_workers_per_gather = 4;
SET max_parallel_maintenance_workers = 4;
SET wal_level = 'minimal';
SET fsync = off;
SET full_page_writes = off;
SET checkpoint_timeout = '1h';
SET max_wal_size = '10GB';
`
// NOTE: psql exits after processing -c flags and does NOT read stdin when
// any -c is present, so the session tuning must not be passed as -c here
// (doing so would skip the dump entirely). Instead it is sent through
// PGOPTIONS, which the server applies at connection start.
args := []string{
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", targetDB,
}
if e.cfg.Host != "localhost" && e.cfg.Host != "" {
args = append([]string{"-h", e.cfg.Host}, args...)
}
e.log.Info("Applying PostgreSQL performance tuning for SQL restore", "tuned_settings", 3)
_ = preamble // Reference only - see note above
cmd = cleanup.SafeCommand(ctx, "psql", args...)
cmd.Env = append(os.Environ(),
fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password),
"PGOPTIONS=-c synchronous_commit=off -c work_mem=256MB -c maintenance_work_mem=1GB",
)
} else {
// MySQL - use MYSQL_PWD env var to avoid password in process list
args := []string{"-u", e.cfg.User}
if e.cfg.Host != "localhost" && e.cfg.Host != "" {
args = append(args, "-h", e.cfg.Host)
}
args = append(args, "-P", fmt.Sprintf("%d", e.cfg.Port), targetDB)
cmd = cleanup.SafeCommand(ctx, "mysql", args...)
// Pass password via environment variable to avoid process list exposure
cmd.Env = os.Environ()
if e.cfg.Password != "" {
cmd.Env = append(cmd.Env, "MYSQL_PWD="+e.cfg.Password)
}
}
// Pipe decompressed data to restore command stdin
stdin, err := cmd.StdinPipe()
if err != nil {
return fmt.Errorf("failed to create stdin pipe: %w", err)
}
// Capture stderr
stderr, err := cmd.StderrPipe()
if err != nil {
return fmt.Errorf("failed to create stderr pipe: %w", err)
}
if err := cmd.Start(); err != nil {
return fmt.Errorf("failed to start restore command: %w", err)
}
// Stream decompressed data to restore command in goroutine
copyDone := make(chan error, 1)
go func() {
_, copyErr := fs.CopyWithContext(ctx, stdin, gz)
stdin.Close()
copyDone <- copyErr
}()
// Read stderr in goroutine
var lastError string
var errorCount int
stderrDone := make(chan struct{})
go func() {
defer close(stderrDone)
scanner := bufio.NewScanner(stderr)
buf := make([]byte, 64*1024)
scanner.Buffer(buf, 1024*1024)
for scanner.Scan() {
line := scanner.Text()
if strings.Contains(strings.ToLower(line), "error") ||
strings.Contains(line, "ERROR") ||
strings.Contains(line, "FATAL") {
lastError = line
errorCount++
e.log.Debug("Restore stderr", "line", line)
}
}
}()
// Wait for copy to complete
copyErr := <-copyDone
// Wait for command
cmdErr := cmd.Wait()
<-stderrDone
if copyErr != nil && cmdErr == nil {
return fmt.Errorf("pgzip decompression failed: %w", copyErr)
}
if cmdErr != nil {
if lastError != "" && e.isIgnorableError(lastError) {
e.log.Warn("SQL restore completed with ignorable errors", "error_count", errorCount)
return nil
}
if lastError != "" {
classification := checks.ClassifyError(lastError)
return fmt.Errorf("restore failed: %w (last error: %s) - %s", cmdErr, lastError, classification.Hint)
}
return fmt.Errorf("restore failed: %w", cmdErr)
}
e.log.Info("SQL restore with pgzip stream completed successfully")
return nil
}
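// With the tuning applied via PGOPTIONS, the effective invocation looks
// roughly like this (illustrative):
//
//	PGOPTIONS='-c synchronous_commit=off -c work_mem=256MB -c maintenance_work_mem=1GB' \
//	psql -h <host> -p <port> -U <user> -d <db> < decompressed-stream
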
// previewRestore shows what would be done without executing
func (e *Engine) previewRestore(archivePath, targetDB string, format ArchiveFormat) error {
fmt.Println("\n" + strings.Repeat("=", 60))
fmt.Println(" RESTORE PREVIEW (DRY RUN)")
fmt.Println(strings.Repeat("=", 60))
stat, _ := os.Stat(archivePath)
fmt.Printf("\nArchive: %s\n", filepath.Base(archivePath))
fmt.Printf("Format: %s\n", format)
if stat != nil {
fmt.Printf("Size: %s\n", FormatBytes(stat.Size()))
fmt.Printf("Modified: %s\n", stat.ModTime().Format("2006-01-02 15:04:05"))
}
fmt.Printf("Target Database: %s\n", targetDB)
fmt.Printf("Target Host: %s:%d\n", e.cfg.Host, e.cfg.Port)
fmt.Println("\nOperations that would be performed:")
switch format {
case FormatPostgreSQLDump:
fmt.Printf(" 1. Execute: pg_restore -d %s %s\n", targetDB, archivePath)
case FormatPostgreSQLDumpGz:
fmt.Printf(" 1. Decompress: %s\n", archivePath)
fmt.Printf(" 2. Execute: pg_restore -d %s\n", targetDB)
case FormatPostgreSQLSQL, FormatPostgreSQLSQLGz:
fmt.Printf(" 1. Execute: psql -d %s -f %s\n", targetDB, archivePath)
case FormatMySQLSQL, FormatMySQLSQLGz:
fmt.Printf(" 1. Execute: mysql %s < %s\n", targetDB, archivePath)
}
fmt.Println("\n[WARN] WARNING: This will restore data to the target database.")
fmt.Println(" Existing data may be overwritten or merged.")
fmt.Println("\nTo execute this restore, add the --confirm flag.")
fmt.Println(strings.Repeat("=", 60) + "\n")
return nil
}
// RestoreSingleFromCluster extracts and restores a single database from a cluster backup
func (e *Engine) RestoreSingleFromCluster(ctx context.Context, clusterArchivePath, dbName, targetDB string, cleanFirst, createIfMissing bool) error {
operation := e.log.StartOperation("Single Database Restore from Cluster")
// Validate and sanitize archive path
validArchivePath, pathErr := security.ValidateArchivePath(clusterArchivePath)
if pathErr != nil {
operation.Fail(fmt.Sprintf("Invalid archive path: %v", pathErr))
return fmt.Errorf("invalid archive path: %w", pathErr)
}
clusterArchivePath = validArchivePath
// Validate archive exists
if _, err := os.Stat(clusterArchivePath); os.IsNotExist(err) {
operation.Fail("Archive not found")
return fmt.Errorf("archive not found: %s", clusterArchivePath)
}
// Verify it's a cluster archive
format := DetectArchiveFormat(clusterArchivePath)
if format != FormatClusterTarGz {
operation.Fail("Not a cluster archive")
return fmt.Errorf("not a cluster archive: %s (format: %s)", clusterArchivePath, format)
}
// Create temporary directory for extraction
workDir := e.cfg.GetEffectiveWorkDir()
tempDir := filepath.Join(workDir, fmt.Sprintf(".extract_%d", time.Now().Unix()))
if err := os.MkdirAll(tempDir, 0755); err != nil {
operation.Fail("Failed to create temporary directory")
return fmt.Errorf("failed to create temp directory: %w", err)
}
defer os.RemoveAll(tempDir)
// Extract the specific database from cluster archive
e.log.Info("Extracting database from cluster backup", "database", dbName, "cluster", filepath.Base(clusterArchivePath))
e.progress.Start(fmt.Sprintf("Extracting '%s' from cluster backup", dbName))
extractedPath, err := ExtractDatabaseFromCluster(ctx, clusterArchivePath, dbName, tempDir, e.log, e.progress)
if err != nil {
e.progress.Fail(fmt.Sprintf("Extraction failed: %v", err))
operation.Fail(fmt.Sprintf("Extraction failed: %v", err))
return fmt.Errorf("failed to extract database: %w", err)
}
e.progress.Update(fmt.Sprintf("Extracted: %s", filepath.Base(extractedPath)))
e.log.Info("Database extracted successfully", "path", extractedPath)
// Now restore the extracted database file
e.progress.Update("Restoring database...")
// Create database if requested and it doesn't exist
if createIfMissing {
e.log.Info("Checking if target database exists", "database", targetDB)
if err := e.ensureDatabaseExists(ctx, targetDB); err != nil {
operation.Fail(fmt.Sprintf("Failed to create database: %v", err))
return fmt.Errorf("failed to create database '%s': %w", targetDB, err)
}
}
// Detect format of extracted file
extractedFormat := DetectArchiveFormat(extractedPath)
e.log.Info("Restoring extracted database", "format", extractedFormat, "target", targetDB)
// Restore based on format
var restoreErr error
switch extractedFormat {
case FormatPostgreSQLDump, FormatPostgreSQLDumpGz:
restoreErr = e.restorePostgreSQLDump(ctx, extractedPath, targetDB, extractedFormat == FormatPostgreSQLDumpGz, cleanFirst)
case FormatPostgreSQLSQL, FormatPostgreSQLSQLGz:
restoreErr = e.restorePostgreSQLSQL(ctx, extractedPath, targetDB, extractedFormat == FormatPostgreSQLSQLGz)
case FormatMySQLSQL, FormatMySQLSQLGz:
restoreErr = e.restoreMySQLSQL(ctx, extractedPath, targetDB, extractedFormat == FormatMySQLSQLGz)
default:
operation.Fail("Unsupported extracted format")
return fmt.Errorf("unsupported extracted format: %s", extractedFormat)
}
if restoreErr != nil {
e.progress.Fail(fmt.Sprintf("Restore failed: %v", restoreErr))
operation.Fail(fmt.Sprintf("Restore failed: %v", restoreErr))
return restoreErr
}
e.progress.Complete(fmt.Sprintf("Database '%s' restored from cluster backup", targetDB))
operation.Complete(fmt.Sprintf("Restored '%s' from cluster as '%s'", dbName, targetDB))
return nil
}
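// Expected cluster archive layout consumed here and by RestoreCluster, as
// produced by the backup side (sketch; database names are placeholders):
//
//	cluster-<timestamp>.tar.gz
//	├── globals.sql          # roles and tablespaces
//	└── dumps/
//	    ├── app.dump         # pg_dump custom format
//	    └── analytics.sql.gz # SQL format (native engine)
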
// RestoreCluster restores a full cluster from a tar.gz archive
// If preExtractedPath is non-empty, uses that directory instead of extracting archivePath
// This avoids double extraction when ValidateAndExtractCluster was already called
func (e *Engine) RestoreCluster(ctx context.Context, archivePath string, preExtractedPath ...string) error {
operation := e.log.StartOperation("Cluster Restore")
clusterStartTime := time.Now()
// 🚀 LOG ACTUAL PERFORMANCE SETTINGS - helps debug slow restores
profile := e.cfg.GetCurrentProfile()
if profile != nil {
e.log.Info("🚀 RESTORE PERFORMANCE SETTINGS",
"profile", profile.Name,
"cluster_parallelism", profile.ClusterParallelism,
"pg_restore_jobs", profile.Jobs,
"large_db_mode", e.cfg.LargeDBMode,
"buffered_io", profile.BufferedIO)
} else {
e.log.Info("🚀 RESTORE PERFORMANCE SETTINGS (raw config)",
"profile", e.cfg.ResourceProfile,
"cluster_parallelism", e.cfg.ClusterParallelism,
"pg_restore_jobs", e.cfg.Jobs,
"large_db_mode", e.cfg.LargeDBMode)
}
// Also show in progress bar for TUI visibility
if !e.silentMode {
fmt.Printf("\n⚡ Performance: profile=%s, parallel_dbs=%d, pg_restore_jobs=%d\n\n",
e.cfg.ResourceProfile, e.cfg.ClusterParallelism, e.cfg.Jobs)
}
// Validate and sanitize archive path
validArchivePath, pathErr := security.ValidateArchivePath(archivePath)
if pathErr != nil {
operation.Fail(fmt.Sprintf("Invalid archive path: %v", pathErr))
return fmt.Errorf("invalid archive path: %w", pathErr)
}
archivePath = validArchivePath
// Validate archive exists
if _, err := os.Stat(archivePath); os.IsNotExist(err) {
operation.Fail("Archive not found")
return fmt.Errorf("archive not found: %s", archivePath)
}
// Verify checksum if .sha256 file exists
if checksumErr := security.LoadAndVerifyChecksum(archivePath); checksumErr != nil {
e.log.Warn("Checksum verification failed", "error", checksumErr)
e.log.Warn("Continuing restore without checksum verification (use with caution)")
} else {
e.log.Info("[OK] Cluster archive checksum verified successfully")
}
format := DetectArchiveFormat(archivePath)
if !format.CanBeClusterRestore() {
operation.Fail("Invalid cluster archive format")
return fmt.Errorf("not a valid cluster restore format: %s (detected format: %s). Supported: .tar.gz, .sql, .sql.gz", archivePath, format)
}
// For SQL-based cluster restores, use a different restore path
if format == FormatPostgreSQLSQL || format == FormatPostgreSQLSQLGz {
return e.restoreClusterFromSQL(ctx, archivePath, operation)
}
// Check if we have a pre-extracted directory (optimization to avoid double extraction)
// This check must happen BEFORE disk space checks to avoid false failures
usingPreExtracted := len(preExtractedPath) > 0 && preExtractedPath[0] != ""
// Check disk space before starting restore (skip if using pre-extracted directory)
var archiveInfo os.FileInfo
var err error
if !usingPreExtracted {
e.log.Info("Checking disk space for restore")
archiveInfo, err = os.Stat(archivePath)
if err == nil {
spaceCheck := checks.CheckDiskSpaceForRestore(e.cfg.BackupDir, archiveInfo.Size())
if spaceCheck.Critical {
operation.Fail("Insufficient disk space")
return fmt.Errorf("insufficient disk space for restore: %.1f%% used - need at least 4x archive size", spaceCheck.UsedPercent)
}
if spaceCheck.Warning {
e.log.Warn("Low disk space - restore may fail",
"available_gb", float64(spaceCheck.AvailableBytes)/(1024*1024*1024),
"used_percent", spaceCheck.UsedPercent)
}
}
} else {
e.log.Info("Skipping disk space check (using pre-extracted directory)")
}
if e.dryRun {
e.log.Info("DRY RUN: Would restore cluster", "archive", archivePath)
return e.previewClusterRestore(archivePath)
}
e.progress.Start(fmt.Sprintf("Restoring cluster from %s", filepath.Base(archivePath)))
// Create temporary extraction directory in configured WorkDir
workDir := e.cfg.GetEffectiveWorkDir()
tempDir := filepath.Join(workDir, fmt.Sprintf(".restore_%d", time.Now().Unix()))
// Handle pre-extracted directory or extract archive
if usingPreExtracted {
tempDir = preExtractedPath[0]
// Note: Caller handles cleanup of pre-extracted directory
e.log.Info("Using pre-extracted cluster directory",
"path", tempDir,
"optimization", "skipping duplicate extraction")
} else {
// Check disk space for extraction (need ~3x archive size: compressed + extracted + working space)
if archiveInfo != nil {
requiredBytes := uint64(archiveInfo.Size()) * 3
extractionCheck := checks.CheckDiskSpace(workDir)
if extractionCheck.AvailableBytes < requiredBytes {
operation.Fail("Insufficient disk space for extraction")
return fmt.Errorf("insufficient disk space for extraction in %s: need %.1f GB, have %.1f GB (archive size: %.1f GB × 3)",
workDir,
float64(requiredBytes)/(1024*1024*1024),
float64(extractionCheck.AvailableBytes)/(1024*1024*1024),
float64(archiveInfo.Size())/(1024*1024*1024))
}
e.log.Info("Disk space check for extraction passed",
"workdir", workDir,
"required_gb", float64(requiredBytes)/(1024*1024*1024),
"available_gb", float64(extractionCheck.AvailableBytes)/(1024*1024*1024))
}
// Need to extract archive ourselves
if err := os.MkdirAll(tempDir, 0755); err != nil {
operation.Fail("Failed to create temporary directory")
return fmt.Errorf("failed to create temp directory in %s: %w", workDir, err)
}
defer os.RemoveAll(tempDir)
// Extract archive
e.log.Info("Extracting cluster archive", "archive", archivePath, "tempDir", tempDir)
if err := e.extractArchive(ctx, archivePath, tempDir); err != nil {
operation.Fail("Archive extraction failed")
return fmt.Errorf("failed to extract archive: %w", err)
}
// Check context validity after extraction (debugging context cancellation issues)
if ctx.Err() != nil {
e.log.Error("Context cancelled after extraction - this should not happen",
"context_error", ctx.Err(),
"extraction_completed", true)
operation.Fail("Context cancelled unexpectedly")
return fmt.Errorf("context cancelled after extraction completed: %w", ctx.Err())
}
e.log.Info("Extraction completed, context still valid")
}
// Check if user has superuser privileges (required for ownership restoration)
e.progress.Update("Checking privileges...")
isSuperuser, err := e.checkSuperuser(ctx)
if err != nil {
e.log.Warn("Could not verify superuser status", "error", err)
isSuperuser = false // Assume not superuser if check fails
}
if !isSuperuser {
e.log.Warn("Current user is not a superuser - database ownership may not be fully restored")
e.progress.Update("[WARN] Warning: Non-superuser - ownership restoration limited")
time.Sleep(2 * time.Second) // Give user time to see warning
} else {
e.log.Info("Superuser privileges confirmed - full ownership restoration enabled")
}
// Restore global objects FIRST (roles, tablespaces) - CRITICAL for ownership
globalsFile := filepath.Join(tempDir, "globals.sql")
if _, err := os.Stat(globalsFile); err == nil {
e.log.Info("Restoring global objects (roles, tablespaces)")
e.progress.Update("Restoring global objects (roles, tablespaces)...")
if err := e.restoreGlobals(ctx, globalsFile); err != nil {
e.log.Error("Failed to restore global objects", "error", err)
if isSuperuser {
// If we're superuser and can't restore globals, this is a problem
e.progress.Fail("Failed to restore global objects")
operation.Fail("Global objects restoration failed")
return fmt.Errorf("failed to restore global objects: %w", err)
} else {
e.log.Warn("Continuing without global objects (may cause ownership issues)")
}
} else {
e.log.Info("Successfully restored global objects")
}
} else {
e.log.Warn("No globals.sql file found in backup - roles and tablespaces will not be restored")
}
// Restore individual databases
dumpsDir := filepath.Join(tempDir, "dumps")
if _, err := os.Stat(dumpsDir); err != nil {
operation.Fail("No database dumps found in archive")
return fmt.Errorf("no database dumps found in archive")
}
entries, err := os.ReadDir(dumpsDir)
if err != nil {
operation.Fail("Failed to read dumps directory")
return fmt.Errorf("failed to read dumps directory: %w", err)
}
// PRE-VALIDATE all SQL dumps BEFORE starting restore
// This catches truncated files early instead of failing after hours of work
e.log.Info("Pre-validating dump files before restore...")
e.progress.Update("Pre-validating dump files...")
var corruptedDumps []string
diagnoser := NewDiagnoser(e.log, false)
for _, entry := range entries {
if entry.IsDir() {
continue
}
dumpFile := filepath.Join(dumpsDir, entry.Name())
if strings.HasSuffix(dumpFile, ".sql.gz") {
result, err := diagnoser.DiagnoseFile(dumpFile)
if err != nil {
e.log.Warn("Could not validate dump file", "file", entry.Name(), "error", err)
continue
}
if result.IsTruncated || result.IsCorrupted || !result.IsValid {
dbName := strings.TrimSuffix(entry.Name(), ".sql.gz")
errDetail := "unknown issue"
if len(result.Errors) > 0 {
errDetail = result.Errors[0]
}
corruptedDumps = append(corruptedDumps, fmt.Sprintf("%s: %s", dbName, errDetail))
e.log.Error("CORRUPTED dump file detected",
"database", dbName,
"file", entry.Name(),
"truncated", result.IsTruncated,
"errors", result.Errors)
}
} else if strings.HasSuffix(dumpFile, ".dump") {
// Validate custom format dumps using pg_restore --list
cmd := cleanup.SafeCommand(ctx, "pg_restore", "--list", dumpFile)
output, err := cmd.CombinedOutput()
if err != nil {
dbName := strings.TrimSuffix(entry.Name(), ".dump")
errDetail := strings.TrimSpace(string(output))
if len(errDetail) > 100 {
errDetail = errDetail[:100] + "..."
}
// Check for truncation indicators
if strings.Contains(errDetail, "unexpected end") || strings.Contains(errDetail, "invalid") {
corruptedDumps = append(corruptedDumps, fmt.Sprintf("%s: %s", dbName, errDetail))
e.log.Error("CORRUPTED custom dump file detected",
"database", dbName,
"file", entry.Name(),
"error", errDetail)
} else {
e.log.Warn("pg_restore --list warning (may be recoverable)",
"file", entry.Name(),
"error", errDetail)
}
}
}
}
if len(corruptedDumps) > 0 {
operation.Fail("Corrupted dump files detected")
e.progress.Fail(fmt.Sprintf("Found %d corrupted dump files - restore aborted", len(corruptedDumps)))
return fmt.Errorf("pre-validation failed: %d corrupted dump files detected: %s - the backup archive appears to be damaged, restore from a different backup",
len(corruptedDumps), strings.Join(corruptedDumps, ", "))
}
e.log.Info("All dump files passed validation")
// Run comprehensive preflight checks (Linux system + PostgreSQL + Archive analysis)
preflight, preflightErr := e.RunPreflightChecks(ctx, dumpsDir, entries)
if preflightErr != nil {
e.log.Warn("Preflight checks failed", "error", preflightErr)
}
// 🛡️ LARGE DATABASE GUARD - Bulletproof protection for large database restores
e.progress.Update("Analyzing database characteristics...")
guard := NewLargeDBGuard(e.cfg, e.log)
// 🧠 MEMORY CHECK - Detect OOM risk before attempting restore
e.progress.Update("Checking system memory...")
archiveStats, statErr := os.Stat(archivePath)
var backupSizeBytes int64
if statErr == nil && archiveStats != nil {
backupSizeBytes = archiveStats.Size()
}
memCheck := guard.CheckSystemMemoryWithType(backupSizeBytes, true) // true = cluster archive with pre-compressed dumps
if memCheck != nil {
if memCheck.Critical {
e.log.Error("🚨 CRITICAL MEMORY WARNING", "error", memCheck.Recommendation)
e.log.Warn("Proceeding but OOM failure is likely - consider adding swap")
}
if memCheck.LowMemory {
e.log.Warn("⚠️ LOW MEMORY DETECTED - Consider reducing parallelism",
"available_gb", fmt.Sprintf("%.1f", memCheck.AvailableRAMGB),
"backup_gb", fmt.Sprintf("%.1f", memCheck.BackupSizeGB),
"current_jobs", e.cfg.Jobs,
"current_parallelism", e.cfg.ClusterParallelism)
// DO NOT override user settings - just warn
// User explicitly chose their profile, respect that choice
e.log.Warn("User settings preserved: jobs=%d, cluster-parallelism=%d", e.cfg.Jobs, e.cfg.ClusterParallelism)
e.log.Warn("If restore fails with OOM, reduce --jobs or use --profile conservative")
}
if memCheck.NeedsMoreSwap {
e.log.Warn("⚠️ SWAP RECOMMENDATION", "action", memCheck.Recommendation)
fmt.Println()
fmt.Println("═══════════════════════════════════════════════════════════════")
fmt.Println(" SWAP MEMORY RECOMMENDATION")
fmt.Println("═══════════════════════════════════════════════════════════════")
fmt.Println(memCheck.Recommendation)
fmt.Println("═══════════════════════════════════════════════════════════════")
fmt.Println()
}
if memCheck.EstimatedHours > 1 {
e.log.Info("⏱️ Estimated restore time", "hours", fmt.Sprintf("%.1f", memCheck.EstimatedHours))
}
}
// Build list of dump files for analysis
var dumpFilePaths []string
for _, entry := range entries {
if !entry.IsDir() {
dumpFilePaths = append(dumpFilePaths, filepath.Join(dumpsDir, entry.Name()))
}
}
// Determine optimal restore strategy
strategy := guard.DetermineStrategy(ctx, archivePath, dumpFilePaths)
// Apply strategy (override config if needed)
if strategy.UseConservative {
guard.ApplyStrategy(strategy, e.cfg)
guard.WarnUser(strategy, e.silentMode)
}
// Calculate optimal lock boost based on BLOB count
lockBoostValue := 2048 // Default
if preflight != nil && preflight.Archive.RecommendedLockBoost > 0 {
lockBoostValue = preflight.Archive.RecommendedLockBoost
}
// AUTO-TUNE: Boost PostgreSQL settings for large restores
e.progress.Update("Tuning PostgreSQL for large restore...")
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] Attempting to boost PostgreSQL lock settings",
"target_max_locks", lockBoostValue,
"conservative_mode", strategy.UseConservative)
}
originalSettings, tuneErr := e.boostPostgreSQLSettings(ctx, lockBoostValue)
if tuneErr != nil {
e.log.Error("Could not boost PostgreSQL settings", "error", tuneErr)
if e.cfg.DebugLocks {
e.log.Error("🔍 [LOCK-DEBUG] Lock boost attempt FAILED",
"error", tuneErr,
"phase", "boostPostgreSQLSettings")
}
operation.Fail("PostgreSQL tuning failed")
return fmt.Errorf("failed to boost PostgreSQL settings: %w", tuneErr)
}
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] Lock boost function returned",
"original_max_locks", originalSettings.MaxLocks,
"target_max_locks", lockBoostValue,
"boost_successful", originalSettings.MaxLocks >= lockBoostValue)
}
// INFORMATIONAL: Check if locks are sufficient, but DO NOT override user's Jobs setting
// The user explicitly chose their profile/jobs - respect that choice
if originalSettings.MaxLocks < lockBoostValue {
e.log.Warn("⚠️ PostgreSQL locks may be insufficient for optimal restore",
"current_locks", originalSettings.MaxLocks,
"recommended_locks", lockBoostValue,
"user_jobs", e.cfg.Jobs,
"user_parallelism", e.cfg.ClusterParallelism)
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] Lock verification WARNING (user settings preserved)",
"actual_locks", originalSettings.MaxLocks,
"recommended_locks", lockBoostValue,
"delta", lockBoostValue-originalSettings.MaxLocks,
"verdict", "PROCEEDING WITH USER SETTINGS")
}
// WARN but DO NOT override user's settings
e.log.Warn("=" + strings.Repeat("=", 70))
e.log.Warn("LOCK WARNING (user settings preserved):")
e.log.Warn("Current locks: %d, Recommended: %d", originalSettings.MaxLocks, lockBoostValue)
e.log.Warn("Using user-configured: jobs=%d, cluster-parallelism=%d", e.cfg.Jobs, e.cfg.ClusterParallelism)
e.log.Warn("If restore fails with lock errors, reduce --jobs or use --profile conservative")
e.log.Warn("=" + strings.Repeat("=", 70))
// DO NOT force Jobs=1 anymore - respect user's choice!
// The previous code here was overriding e.cfg.Jobs = 1 which broke turbo/performance profiles
e.log.Info("Proceeding with user settings",
"jobs", e.cfg.Jobs,
"cluster_parallelism", e.cfg.ClusterParallelism,
"available_locks", originalSettings.MaxLocks,
"note", "User profile settings respected")
} else {
e.log.Info("PostgreSQL tuning verified - locks sufficient for restore",
"max_locks_per_transaction", originalSettings.MaxLocks,
"target_locks", lockBoostValue,
"maintenance_work_mem", "2GB",
"conservative_mode", strategy.UseConservative)
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] Lock verification PASSED",
"actual_locks", originalSettings.MaxLocks,
"required_locks", lockBoostValue,
"verdict", "PROCEED WITH RESTORE")
}
}
// Ensure we reset settings when done (even on failure)
defer func() {
if resetErr := e.resetPostgreSQLSettings(ctx, originalSettings); resetErr != nil {
e.log.Warn("Could not reset PostgreSQL settings", "error", resetErr)
} else {
e.log.Info("Reset PostgreSQL settings to original values")
}
}()
var restoreErrors *multierror.Error
var restoreErrorsMu sync.Mutex
totalDBs := 0
// Count total databases and calculate total bytes for weighted progress
var totalBytes int64
dbSizes := make(map[string]int64) // Map database name to dump file size
for _, entry := range entries {
if !entry.IsDir() {
totalDBs++
dumpFile := filepath.Join(dumpsDir, entry.Name())
if info, err := os.Stat(dumpFile); err == nil {
dbName := entry.Name()
dbName = strings.TrimSuffix(dbName, ".dump")
dbName = strings.TrimSuffix(dbName, ".sql.gz")
dbSizes[dbName] = info.Size()
totalBytes += info.Size()
}
}
}
e.log.Info("Calculated total restore size", "databases", totalDBs, "total_bytes", totalBytes)
// Track bytes completed for weighted progress
var bytesCompleted int64
var bytesCompletedMu sync.Mutex
// Create ETA estimator for database restores
estimator := progress.NewETAEstimator("Restoring cluster", totalDBs)
e.progress.SetEstimator(estimator)
// Detect backup format and warn about performance implications
// .sql.gz files (from native engine) cannot use parallel restore like pg_restore -j8
hasSQLFormat := false
hasCustomFormat := false
for _, entry := range entries {
if !entry.IsDir() {
if strings.HasSuffix(entry.Name(), ".sql.gz") {
hasSQLFormat = true
} else if strings.HasSuffix(entry.Name(), ".dump") {
hasCustomFormat = true
}
}
}
// Warn about SQL format performance limitation
if hasSQLFormat && !hasCustomFormat {
if e.cfg.UseNativeEngine {
// Native engine now uses PARALLEL restore - should match pg_restore -j8 performance!
e.log.Info("✅ SQL format detected - using PARALLEL native restore engine",
"mode", "parallel",
"workers", e.cfg.Jobs,
"optimization", "COPY operations run in parallel like pg_restore -j")
if !e.silentMode {
fmt.Println()
fmt.Println("═══════════════════════════════════════════════════════════════")
fmt.Println(" ✅ PARALLEL NATIVE RESTORE: SQL Format with Parallel Loading")
fmt.Println("═══════════════════════════════════════════════════════════════")
fmt.Printf(" Using %d parallel workers for COPY operations.\n", e.cfg.Jobs)
fmt.Println(" Performance should match pg_restore -j" + fmt.Sprintf("%d", e.cfg.Jobs))
fmt.Println("═══════════════════════════════════════════════════════════════")
fmt.Println()
}
} else {
// psql path is still sequential
e.log.Warn("⚠️ PERFORMANCE WARNING: Backup uses SQL format (.sql.gz)",
"reason", "psql mode cannot parallelize SQL format",
"recommendation", "Enable --use-native-engine for parallel COPY loading")
if !e.silentMode {
fmt.Println()
fmt.Println("═══════════════════════════════════════════════════════════════")
fmt.Println(" ⚠️ PERFORMANCE NOTE: SQL Format with psql (sequential)")
fmt.Println("═══════════════════════════════════════════════════════════════")
fmt.Println(" Backup files use .sql.gz format.")
fmt.Println(" psql mode restores are sequential.")
fmt.Println()
fmt.Println(" For PARALLEL restore, use: --use-native-engine")
fmt.Println(" The native engine parallelizes COPY like pg_restore -j8")
fmt.Println("═══════════════════════════════════════════════════════════════")
fmt.Println()
}
time.Sleep(2 * time.Second)
}
}
// Check for large objects in dump files and adjust parallelism
hasLargeObjects := e.detectLargeObjectsInDumps(dumpsDir, entries)
// Use worker pool for parallel restore
parallelism := e.cfg.ClusterParallelism
if parallelism < 1 {
parallelism = 1 // Ensure at least sequential
}
// Automatically reduce parallelism if large objects detected
if hasLargeObjects && parallelism > 1 {
e.log.Warn("Large objects detected in dump files - reducing parallelism to avoid lock contention",
"original_parallelism", parallelism,
"adjusted_parallelism", 1)
e.progress.Update("[WARN] Large objects detected - using sequential restore to avoid lock conflicts")
time.Sleep(2 * time.Second) // Give user time to see warning
parallelism = 1
}
var successCount, failCount int32
var mu sync.Mutex // Protect shared resources (progress, logger)
// CRITICAL: Check context before starting database restore loop
// This helps debug issues where context gets cancelled between extraction and restore
if ctx.Err() != nil {
e.log.Error("Context cancelled before database restore loop started",
"context_error", ctx.Err(),
"total_databases", totalDBs,
"parallelism", parallelism)
operation.Fail("Context cancelled before database restores could start")
return fmt.Errorf("context cancelled before database restore: %w", ctx.Err())
}
e.log.Info("Starting database restore loop", "databases", totalDBs, "parallelism", parallelism)
// Timing tracking for restore phase progress
restorePhaseStart := time.Now()
var completedDBTimes []time.Duration // Track duration for each completed DB restore
var completedDBTimesMu sync.Mutex
// Create semaphore to limit concurrency
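// A buffered channel acts as a counting semaphore: each goroutine sends a
// token before restoring and releases it when done, so at most 'parallelism'
// restores run concurrently.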
semaphore := make(chan struct{}, parallelism)
var wg sync.WaitGroup
dbIndex := 0
for _, entry := range entries {
if entry.IsDir() {
continue
}
// Check context before acquiring semaphore to prevent goroutine leak
if ctx.Err() != nil {
e.log.Warn("Context cancelled - stopping database restore scheduling")
break
}
wg.Add(1)
// Acquire semaphore with context awareness to prevent goroutine leak
select {
case semaphore <- struct{}{}:
// Acquired, proceed
case <-ctx.Done():
wg.Done()
e.log.Warn("Context cancelled while waiting for semaphore", "file", entry.Name())
continue
}
go func(idx int, filename string) {
defer wg.Done()
defer func() { <-semaphore }() // Release
// Panic recovery - prevent one database failure from crashing entire cluster restore
defer func() {
if r := recover(); r != nil {
e.log.Error("Panic in database restore goroutine", "file", filename, "panic", r)
atomic.AddInt32(&failCount, 1)
}
}()
// Check for context cancellation before starting
if ctx.Err() != nil {
e.log.Warn("Context cancelled - skipping database restore", "file", filename)
atomic.AddInt32(&failCount, 1)
restoreErrorsMu.Lock()
restoreErrors = multierror.Append(restoreErrors, fmt.Errorf("%s: restore skipped (context cancelled)", strings.TrimSuffix(strings.TrimSuffix(filename, ".dump"), ".sql.gz")))
restoreErrorsMu.Unlock()
return
}
// Track timing for this database restore
dbRestoreStart := time.Now()
// Update estimator progress (thread-safe)
mu.Lock()
estimator.UpdateProgress(idx)
mu.Unlock()
dumpFile := filepath.Join(dumpsDir, filename)
dbName := filename
dbName = strings.TrimSuffix(dbName, ".dump")
dbName = strings.TrimSuffix(dbName, ".sql.gz")
dbProgress := 15 + int(float64(idx)/float64(totalDBs)*85.0)
// Calculate average time per DB and report progress with timing
completedDBTimesMu.Lock()
var avgPerDB time.Duration
if len(completedDBTimes) > 0 {
var totalDuration time.Duration
for _, d := range completedDBTimes {
totalDuration += d
}
avgPerDB = totalDuration / time.Duration(len(completedDBTimes))
}
phaseElapsed := time.Since(restorePhaseStart)
completedDBTimesMu.Unlock()
mu.Lock()
statusMsg := fmt.Sprintf("Restoring database %s (%d/%d)", dbName, idx+1, totalDBs)
e.progress.Update(statusMsg)
e.log.Info("Restoring database", "name", dbName, "file", dumpFile, "progress", dbProgress)
// Report database progress for TUI (both callbacks)
e.reportDatabaseProgress(idx, totalDBs, dbName)
e.reportDatabaseProgressWithTiming(idx, totalDBs, dbName, phaseElapsed, avgPerDB)
mu.Unlock()
// STEP 1: Drop existing database completely (clean slate)
e.log.Info("Dropping existing database for clean restore", "name", dbName)
if err := e.dropDatabaseIfExists(ctx, dbName); err != nil {
e.log.Warn("Could not drop existing database", "name", dbName, "error", err)
}
// STEP 2: Create fresh database
if err := e.ensureDatabaseExists(ctx, dbName); err != nil {
e.log.Error("Failed to create database", "name", dbName, "error", err)
restoreErrorsMu.Lock()
restoreErrors = multierror.Append(restoreErrors, fmt.Errorf("%s: failed to create database: %w", dbName, err))
restoreErrorsMu.Unlock()
atomic.AddInt32(&failCount, 1)
return
}
// STEP 3: Restore with ownership preservation if superuser
preserveOwnership := isSuperuser
isCompressedSQL := strings.HasSuffix(dumpFile, ".sql.gz")
// Get expected size for this database for progress estimation
expectedDBSize := dbSizes[dbName]
// Start heartbeat ticker to show progress during long-running restore
// CRITICAL FIX: Report progress to TUI callbacks so large DB restores show updates
heartbeatCtx, cancelHeartbeat := context.WithCancel(ctx)
heartbeatTicker := time.NewTicker(5 * time.Second) // More frequent updates (was 15s)
heartbeatCount := int64(0)
go func() {
for {
select {
case <-heartbeatTicker.C:
heartbeatCount++
dbElapsed := time.Since(dbRestoreStart) // Per-database elapsed
phaseElapsedNow := time.Since(restorePhaseStart) // Overall phase elapsed
mu.Lock()
statusMsg := fmt.Sprintf("Restoring %s (%d/%d) - running: %s (phase: %s)",
dbName, idx+1, totalDBs, formatDuration(dbElapsed), formatDuration(phaseElapsedNow))
e.progress.Update(statusMsg)
// CRITICAL: Report activity to TUI callbacks during long-running restore
// Use time-based progress estimation: assume ~10MB/s average throughput
// This gives visual feedback even when pg_restore hasn't completed
estimatedBytesPerSec := int64(10 * 1024 * 1024) // 10 MB/s conservative estimate
estimatedBytesDone := dbElapsed.Milliseconds() / 1000 * estimatedBytesPerSec
if expectedDBSize > 0 && estimatedBytesDone > expectedDBSize {
estimatedBytesDone = expectedDBSize * 95 / 100 // Cap at 95%
}
// Calculate current progress including in-flight database
currentBytesEstimate := bytesCompleted + estimatedBytesDone
// Report to TUI with estimated progress
e.reportDatabaseProgressByBytes(currentBytesEstimate, totalBytes, dbName, int(atomic.LoadInt32(&successCount)), totalDBs)
// Also report timing info (use phaseElapsedNow computed above)
var avgPerDB time.Duration
completedDBTimesMu.Lock()
if len(completedDBTimes) > 0 {
var total time.Duration
for _, d := range completedDBTimes {
total += d
}
avgPerDB = total / time.Duration(len(completedDBTimes))
}
completedDBTimesMu.Unlock()
e.reportDatabaseProgressWithTiming(idx, totalDBs, dbName, phaseElapsedNow, avgPerDB)
mu.Unlock()
case <-heartbeatCtx.Done():
return
}
}
}()
var restoreErr error
if isCompressedSQL {
mu.Lock()
if e.cfg.UseNativeEngine {
e.log.Info("Detected compressed SQL format, using native Go engine", "file", dumpFile, "database", dbName)
} else {
e.log.Info("Detected compressed SQL format, using psql + pgzip", "file", dumpFile, "database", dbName)
}
mu.Unlock()
restoreErr = e.restorePostgreSQLSQL(ctx, dumpFile, dbName, true)
} else {
mu.Lock()
e.log.Info("Detected custom dump format, using pg_restore", "file", dumpFile, "database", dbName)
mu.Unlock()
restoreErr = e.restorePostgreSQLDumpWithOwnership(ctx, dumpFile, dbName, false, preserveOwnership)
}
// Stop heartbeat ticker
heartbeatTicker.Stop()
cancelHeartbeat()
if restoreErr != nil {
mu.Lock()
e.log.Error("Failed to restore database", "name", dbName, "file", dumpFile, "error", restoreErr)
mu.Unlock()
// Check for specific recoverable errors
errMsg := restoreErr.Error()
// CRITICAL: Check for LOCK_EXHAUSTION error that escaped preflight checks
if strings.Contains(errMsg, "LOCK_EXHAUSTION:") ||
strings.Contains(errMsg, "out of shared memory") ||
strings.Contains(errMsg, "max_locks_per_transaction") {
mu.Lock()
e.log.Error("🔴 LOCK EXHAUSTION ERROR - ABORTING ALL DATABASE RESTORES",
"database", dbName,
"error", errMsg,
"action", "Will force sequential mode and abort current parallel restore")
// Force sequential mode for any future restores
e.cfg.ClusterParallelism = 1
e.cfg.Jobs = 1
e.log.Error("=" + strings.Repeat("=", 70))
e.log.Error("CRITICAL: Lock exhaustion during restore - this should NOT happen")
e.log.Error("Setting ClusterParallelism=1 and Jobs=1 for future operations")
e.log.Error("Current restore MUST be aborted and restarted")
e.log.Error("=" + strings.Repeat("=", 70))
mu.Unlock()
// Add error and abort immediately - don't continue with other databases
restoreErrorsMu.Lock()
restoreErrors = multierror.Append(restoreErrors,
fmt.Errorf("LOCK_EXHAUSTION: %s - all restores aborted, must restart with sequential mode", dbName))
restoreErrorsMu.Unlock()
atomic.AddInt32(&failCount, 1)
// Return immediately - the error has been recorded above.
// NOTE: the shared context is not cancelled here, so restores already in
// flight will finish before the cluster restore reports this failure.
return
}
if strings.Contains(errMsg, "max_locks_per_transaction") {
mu.Lock()
e.log.Warn("Database restore failed due to insufficient locks - this is a PostgreSQL configuration issue",
"database", dbName,
"solution", "increase max_locks_per_transaction in postgresql.conf")
mu.Unlock()
} else if strings.Contains(errMsg, "total errors:") && strings.Contains(errMsg, "2562426") {
mu.Lock()
e.log.Warn("Database has massive error count - likely data corruption or incompatible dump format",
"database", dbName,
"errors", "2562426")
mu.Unlock()
}
restoreErrorsMu.Lock()
// Include more context in the error message
restoreErrors = multierror.Append(restoreErrors, fmt.Errorf("%s: restore failed: %w", dbName, restoreErr))
restoreErrorsMu.Unlock()
atomic.AddInt32(&failCount, 1)
return
}
// Track completed database restore duration for ETA calculation
dbRestoreDuration := time.Since(dbRestoreStart)
completedDBTimesMu.Lock()
completedDBTimes = append(completedDBTimes, dbRestoreDuration)
completedDBTimesMu.Unlock()
// Update bytes completed for weighted progress
dbSize := dbSizes[dbName]
bytesCompletedMu.Lock()
bytesCompleted += dbSize
currentBytesCompleted := bytesCompleted
currentSuccessCount := int(atomic.LoadInt32(&successCount)) + 1 // +1 because we're about to increment
bytesCompletedMu.Unlock()
// Report weighted progress (bytes-based)
e.reportDatabaseProgressByBytes(currentBytesCompleted, totalBytes, dbName, currentSuccessCount, totalDBs)
atomic.AddInt32(&successCount, 1)
// Small delay to ensure PostgreSQL fully closes connections before next restore
time.Sleep(100 * time.Millisecond)
}(dbIndex, entry.Name())
dbIndex++
}
// Wait for all restores to complete
wg.Wait()
successCountFinal := int(atomic.LoadInt32(&successCount))
failCountFinal := int(atomic.LoadInt32(&failCount))
// SANITY CHECK: Verify all databases were accounted for
// This catches any goroutine that exited without updating counters
accountedFor := successCountFinal + failCountFinal
if accountedFor != totalDBs {
missingCount := totalDBs - accountedFor
e.log.Error("INTERNAL ERROR: Some database restore goroutines did not report status",
"expected", totalDBs,
"success", successCountFinal,
"failed", failCountFinal,
"unaccounted", missingCount)
// Treat unaccounted databases as failures
failCountFinal += missingCount
restoreErrorsMu.Lock()
restoreErrors = multierror.Append(restoreErrors, fmt.Errorf("%d database(s) did not complete (possible goroutine crash or deadlock)", missingCount))
restoreErrorsMu.Unlock()
}
// CRITICAL: Check if no databases were restored at all
if successCountFinal == 0 {
e.progress.Fail(fmt.Sprintf("Cluster restore FAILED: 0 of %d databases restored", totalDBs))
operation.Fail("No databases were restored")
if failCountFinal > 0 && restoreErrors != nil {
return fmt.Errorf("cluster restore failed: all %d database(s) failed:\n%s", failCountFinal, restoreErrors.Error())
}
return fmt.Errorf("cluster restore failed: no databases were restored (0 of %d total). Check PostgreSQL logs for details", totalDBs)
}
if failCountFinal > 0 {
// Format multi-error with detailed output
restoreErrors.ErrorFormat = func(errs []error) string {
if len(errs) == 1 {
return errs[0].Error()
}
points := make([]string, len(errs))
for i, err := range errs {
points[i] = fmt.Sprintf(" • %s", err.Error())
}
return fmt.Sprintf("%d database(s) failed:\n%s", len(errs), strings.Join(points, "\n"))
}
// Log summary
e.log.Info("Cluster restore completed with failures",
"succeeded", successCountFinal,
"failed", failCountFinal,
"total", totalDBs)
e.progress.Fail(fmt.Sprintf("Cluster restore: %d succeeded, %d failed out of %d total", successCountFinal, failCountFinal, totalDBs))
operation.Complete(fmt.Sprintf("Partial restore: %d/%d databases succeeded", successCountFinal, totalDBs))
// Record cluster restore metrics (partial failure)
e.recordClusterRestoreMetrics(clusterStartTime, archivePath, totalDBs, successCountFinal, false, restoreErrors.Error())
return fmt.Errorf("cluster restore completed with %d failures:\n%s", failCountFinal, restoreErrors.Error())
}
e.progress.Complete(fmt.Sprintf("Cluster restored successfully: %d databases", successCountFinal))
operation.Complete(fmt.Sprintf("Restored %d databases from cluster archive", successCountFinal))
// Record cluster restore metrics (success)
e.recordClusterRestoreMetrics(clusterStartTime, archivePath, totalDBs, successCountFinal, true, "")
return nil
}
// restoreClusterFromSQL restores a pg_dumpall SQL file using the native engine
// This handles .sql and .sql.gz files containing full cluster dumps
func (e *Engine) restoreClusterFromSQL(ctx context.Context, archivePath string, operation logger.OperationLogger) error {
e.log.Info("Restoring cluster from SQL file (pg_dumpall format)",
"file", filepath.Base(archivePath),
"native_engine", true)
clusterStartTime := time.Now()
// Determine if compressed
compressed := strings.HasSuffix(strings.ToLower(archivePath), ".gz")
// Use native engine to restore directly to postgres database (globals + all databases)
e.log.Info("Restoring SQL dump using native engine...",
"compressed", compressed,
"size", FormatBytes(getFileSize(archivePath)))
e.progress.Start("Restoring cluster from SQL dump...")
// For pg_dumpall, we restore to the 'postgres' database which then creates other databases
targetDB := "postgres"
err := e.restoreWithNativeEngine(ctx, archivePath, targetDB, compressed)
if err != nil {
operation.Fail(fmt.Sprintf("SQL cluster restore failed: %v", err))
e.recordClusterRestoreMetrics(clusterStartTime, archivePath, 0, 0, false, err.Error())
return fmt.Errorf("SQL cluster restore failed: %w", err)
}
duration := time.Since(clusterStartTime)
e.progress.Complete(fmt.Sprintf("Cluster restored successfully from SQL in %s", duration.Round(time.Second)))
operation.Complete("SQL cluster restore completed")
// Record metrics
e.recordClusterRestoreMetrics(clusterStartTime, archivePath, 1, 1, true, "")
return nil
}
// recordClusterRestoreMetrics records metrics for cluster restore operations
func (e *Engine) recordClusterRestoreMetrics(startTime time.Time, archivePath string, totalDBs, successCount int, success bool, errorMsg string) {
duration := time.Since(startTime)
// Get archive size
var archiveSize int64
if fi, err := os.Stat(archivePath); err == nil {
archiveSize = fi.Size()
}
record := RestoreRecord{
Database: "cluster",
Engine: "postgresql",
StartedAt: startTime,
CompletedAt: time.Now(),
Duration: duration,
SizeBytes: archiveSize,
ParallelJobs: e.cfg.Jobs,
Profile: e.cfg.ResourceProfile,
Success: success,
SourceFile: filepath.Base(archivePath),
IsCluster: true,
ErrorMessage: errorMsg,
}
if recordErr := RecordRestore(record); recordErr != nil {
e.log.Warn("Failed to record cluster restore metrics", "error", recordErr)
}
// Log performance summary (guard against division by zero when totalDBs is 0,
// e.g. when a SQL cluster restore fails before any database is counted)
avgPerDB := time.Duration(0)
if totalDBs > 0 {
avgPerDB = (duration / time.Duration(totalDBs)).Round(time.Second)
}
e.log.Info("📊 RESTORE PERFORMANCE SUMMARY",
"total_duration", duration.Round(time.Second),
"databases", totalDBs,
"successful", successCount,
"parallel_jobs", e.cfg.Jobs,
"profile", e.cfg.ResourceProfile,
"avg_per_db", avgPerDB)
}
// extractArchive extracts a tar.gz archive with progress reporting
func (e *Engine) extractArchive(ctx context.Context, archivePath, destDir string) error {
// If progress callback is set, use Go's archive/tar for progress tracking
if e.progressCallback != nil {
return e.extractArchiveWithProgress(ctx, archivePath, destDir)
}
// Otherwise use fast shell tar (no progress)
return e.extractArchiveShell(ctx, archivePath, destDir)
}
// extractArchiveWithProgress extracts using Go's archive/tar with detailed progress reporting
func (e *Engine) extractArchiveWithProgress(ctx context.Context, archivePath, destDir string) error {
// Get archive size for progress calculation
archiveInfo, err := os.Stat(archivePath)
if err != nil {
return fmt.Errorf("failed to stat archive: %w", err)
}
totalSize := archiveInfo.Size()
// Open the archive file
file, err := os.Open(archivePath)
if err != nil {
return fmt.Errorf("failed to open archive: %w", err)
}
defer file.Close()
// Wrap with progress reader
progressReader := &progressReader{
reader: file,
totalSize: totalSize,
callback: e.progressCallback,
desc: "Extracting archive",
}
// Create parallel gzip reader for faster decompression
gzReader, err := pgzip.NewReader(progressReader)
if err != nil {
return fmt.Errorf("failed to create gzip reader: %w", err)
}
defer gzReader.Close()
// Create tar reader
tarReader := tar.NewReader(gzReader)
// Extract files
for {
select {
case <-ctx.Done():
return ctx.Err()
default:
}
header, err := tarReader.Next()
if err == io.EOF {
break // End of archive
}
if err != nil {
return fmt.Errorf("failed to read tar header: %w", err)
}
// Sanitize and validate path
targetPath := filepath.Join(destDir, header.Name)
// Security check: ensure path stays within destDir (prevent path traversal)
// filepath.Rel is stricter than a plain prefix check, which would accept
// e.g. "/dest-evil" as being inside "/dest"
if rel, relErr := filepath.Rel(destDir, targetPath); relErr != nil || rel == ".." || strings.HasPrefix(rel, ".."+string(os.PathSeparator)) {
e.log.Warn("Skipping potentially malicious path in archive", "path", header.Name)
continue
}
switch header.Typeflag {
case tar.TypeDir:
if err := os.MkdirAll(targetPath, 0755); err != nil {
return fmt.Errorf("failed to create directory %s: %w", targetPath, err)
}
case tar.TypeReg:
// Ensure parent directory exists
if err := os.MkdirAll(filepath.Dir(targetPath), 0755); err != nil {
return fmt.Errorf("failed to create parent directory: %w", err)
}
// Create the file
outFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(header.Mode))
if err != nil {
return fmt.Errorf("failed to create file %s: %w", targetPath, err)
}
// Copy file contents with context awareness for Ctrl+C interruption
// Use buffered I/O for turbo mode (32KB buffer)
if e.cfg.BufferedIO {
bufferedWriter := bufio.NewWriterSize(outFile, 32*1024) // 32KB buffer for faster writes
if _, err := fs.CopyWithContext(ctx, bufferedWriter, tarReader); err != nil {
outFile.Close()
os.Remove(targetPath) // Clean up partial file
return fmt.Errorf("failed to write file %s: %w", targetPath, err)
}
if err := bufferedWriter.Flush(); err != nil {
outFile.Close()
os.Remove(targetPath)
return fmt.Errorf("failed to flush buffer for %s: %w", targetPath, err)
}
} else {
if _, err := fs.CopyWithContext(ctx, outFile, tarReader); err != nil {
outFile.Close()
os.Remove(targetPath) // Clean up partial file
return fmt.Errorf("failed to write file %s: %w", targetPath, err)
}
}
outFile.Close()
case tar.TypeSymlink:
// Handle symlinks (common in some archives)
if err := os.Symlink(header.Linkname, targetPath); err != nil {
// Ignore symlink errors (may already exist or not supported)
e.log.Debug("Could not create symlink", "path", targetPath, "target", header.Linkname)
}
}
}
// Final progress update
e.reportProgress(totalSize, totalSize, "Extraction complete")
return nil
}
// progressReader wraps an io.Reader to report read progress
type progressReader struct {
reader io.Reader
totalSize int64
bytesRead int64
callback ProgressCallback
desc string
lastReport time.Time
reportEvery time.Duration
}
func (pr *progressReader) Read(p []byte) (n int, err error) {
n, err = pr.reader.Read(p)
pr.bytesRead += int64(n)
// Throttle progress reporting to every 50ms for smoother updates
if pr.reportEvery == 0 {
pr.reportEvery = 50 * time.Millisecond
}
if time.Since(pr.lastReport) > pr.reportEvery {
if pr.callback != nil {
pr.callback(pr.bytesRead, pr.totalSize, pr.desc)
}
pr.lastReport = time.Now()
}
return n, err
}
// extractArchiveShell extracts using pgzip (parallel gzip, 2-4x faster on multi-core)
func (e *Engine) extractArchiveShell(ctx context.Context, archivePath, destDir string) error {
// Start heartbeat ticker for extraction progress
extractionStart := time.Now()
e.log.Info("Extracting archive with pgzip (parallel gzip)",
"archive", archivePath,
"dest", destDir,
"method", "pgzip")
// Use parallel extraction
err := fs.ExtractTarGzParallel(ctx, archivePath, destDir, func(progress fs.ExtractProgress) {
if progress.TotalBytes > 0 {
elapsed := time.Since(extractionStart)
pct := float64(progress.BytesRead) / float64(progress.TotalBytes) * 100
e.progress.Update(fmt.Sprintf("Extracting archive... %.1f%% (elapsed: %s)", pct, formatDuration(elapsed)))
}
})
if err != nil {
return fmt.Errorf("parallel extraction failed: %w", err)
}
elapsed := time.Since(extractionStart)
e.log.Info("Archive extraction complete", "duration", formatDuration(elapsed))
return nil
}
// restoreGlobals restores global objects (roles, tablespaces)
// Note: psql returns 0 even when some statements fail (e.g., role already exists)
// We track errors but only fail on FATAL errors that would prevent restore
func (e *Engine) restoreGlobals(ctx context.Context, globalsFile string) error {
args := []string{
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", "postgres",
"-f", globalsFile,
}
// Only add -h flag if host is not localhost (to use Unix socket for peer auth)
if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" {
args = append([]string{"-h", e.cfg.Host}, args...)
}
cmd := cleanup.SafeCommand(ctx, "psql", args...)
cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password))
// Stream output to avoid memory issues with large globals.sql files
stderr, err := cmd.StderrPipe()
if err != nil {
return fmt.Errorf("failed to create stderr pipe: %w", err)
}
if err := cmd.Start(); err != nil {
return fmt.Errorf("failed to start psql: %w", err)
}
// Read stderr in chunks in goroutine
var lastError string
var errorCount int
var fatalError bool
stderrDone := make(chan struct{})
go func() {
defer close(stderrDone)
buf := make([]byte, 4096)
for {
n, err := stderr.Read(buf)
if n > 0 {
chunk := string(buf[:n])
// Track different error types
if strings.Contains(chunk, "FATAL") {
fatalError = true
lastError = chunk
e.log.Error("Globals restore FATAL error", "output", chunk)
} else if strings.Contains(chunk, "ERROR") {
errorCount++
lastError = chunk
// Only log first few errors to avoid spam
if errorCount <= 5 {
// Check if it's an ignorable "already exists" error
if strings.Contains(chunk, "already exists") {
e.log.Debug("Globals restore: object already exists (expected)", "output", chunk)
} else {
e.log.Warn("Globals restore error", "output", chunk)
}
}
}
}
if err != nil {
break
}
}
}()
// Wait for command with proper context handling
cmdDone := make(chan error, 1)
go func() {
cmdDone <- cmd.Wait()
}()
var cmdErr error
select {
case cmdErr = <-cmdDone:
// Command completed
case <-ctx.Done():
e.log.Warn("Globals restore cancelled - killing process group")
cleanup.KillCommandGroup(cmd)
<-cmdDone
cmdErr = ctx.Err()
}
// Wait for stderr reader with timeout to prevent indefinite hang
// if the process doesn't fully terminate
select {
case <-stderrDone:
// Normal completion
case <-time.After(5 * time.Second):
e.log.Warn("Stderr reader timeout - forcefully continuing")
}
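// If the timeout above fired, the reader goroutine may still be writing
// lastError/fatalError; the values read below are best-effort.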
// Only fail on actual command errors or FATAL PostgreSQL errors
// Regular ERROR messages (like "role already exists") are expected
if cmdErr != nil {
return fmt.Errorf("failed to restore globals: %w (last error: %s)", cmdErr, lastError)
}
// If we had FATAL errors, those are real problems
if fatalError {
return fmt.Errorf("globals restore had FATAL error: %s", lastError)
}
// Log summary if there were errors (but don't fail)
if errorCount > 0 {
e.log.Info("Globals restore completed with some errors (usually 'already exists' - expected)",
"error_count", errorCount)
}
return nil
}
// checkSuperuser verifies if the current user has superuser privileges
func (e *Engine) checkSuperuser(ctx context.Context) (bool, error) {
args := []string{
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", "postgres",
"-tAc", "SELECT usesuper FROM pg_user WHERE usename = current_user",
}
// Only add -h flag if host is not localhost (to use Unix socket for peer auth)
if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" {
args = append([]string{"-h", e.cfg.Host}, args...)
}
cmd := cleanup.SafeCommand(ctx, "psql", args...)
// Always set PGPASSWORD (empty string is fine for peer/ident auth)
cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password))
output, err := cmd.CombinedOutput()
if err != nil {
return false, fmt.Errorf("failed to check superuser status: %w", err)
}
isSuperuser := strings.TrimSpace(string(output)) == "t"
return isSuperuser, nil
}
// terminateConnections kills all active connections to a database
func (e *Engine) terminateConnections(ctx context.Context, dbName string) error {
// Escape single quotes in the database name so the SQL literal stays well-formed
safeName := strings.ReplaceAll(dbName, "'", "''")
query := fmt.Sprintf(`
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE datname = '%s'
AND pid <> pg_backend_pid()
`, safeName)
args := []string{
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", "postgres",
"-tAc", query,
}
// Only add -h flag if host is not localhost (to use Unix socket for peer auth)
if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" {
args = append([]string{"-h", e.cfg.Host}, args...)
}
cmd := cleanup.SafeCommand(ctx, "psql", args...)
// Always set PGPASSWORD (empty string is fine for peer/ident auth)
cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password))
output, err := cmd.CombinedOutput()
if err != nil {
e.log.Warn("Failed to terminate connections", "database", dbName, "error", err, "output", string(output))
// Don't fail - database might not exist or have no connections
}
return nil
}
// dropDatabaseIfExists drops a database completely (clean slate)
// Uses PostgreSQL 13+ WITH (FORCE) option to forcefully drop even with active connections
func (e *Engine) dropDatabaseIfExists(ctx context.Context, dbName string) error {
// First terminate all connections
if err := e.terminateConnections(ctx, dbName); err != nil {
e.log.Warn("Could not terminate connections", "database", dbName, "error", err)
}
// Wait a moment for connections to terminate
time.Sleep(500 * time.Millisecond)
// Try to revoke new connections (prevents race condition)
// This only works if we have the privilege to do so
revokeArgs := []string{
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", "postgres",
"-c", fmt.Sprintf("REVOKE CONNECT ON DATABASE \"%s\" FROM PUBLIC", dbName),
}
if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" {
revokeArgs = append([]string{"-h", e.cfg.Host}, revokeArgs...)
}
revokeCmd := cleanup.SafeCommand(ctx, "psql", revokeArgs...)
revokeCmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password))
revokeCmd.Run() // Ignore errors - database might not exist
// Terminate connections again after revoking connect privilege
e.terminateConnections(ctx, dbName)
time.Sleep(200 * time.Millisecond)
// Try DROP DATABASE WITH (FORCE) first (PostgreSQL 13+)
// This forcefully terminates connections and drops the database atomically
forceArgs := []string{
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", "postgres",
"-c", fmt.Sprintf("DROP DATABASE IF EXISTS \"%s\" WITH (FORCE)", dbName),
}
if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" {
forceArgs = append([]string{"-h", e.cfg.Host}, forceArgs...)
}
forceCmd := cleanup.SafeCommand(ctx, "psql", forceArgs...)
forceCmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password))
output, err := forceCmd.CombinedOutput()
if err == nil {
e.log.Info("Dropped existing database (with FORCE)", "name", dbName)
return nil
}
// If FORCE option failed (PostgreSQL < 13), try regular drop
if strings.Contains(string(output), "syntax error") || strings.Contains(string(output), "WITH (FORCE)") {
e.log.Debug("WITH (FORCE) not supported, using standard DROP", "name", dbName)
args := []string{
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", "postgres",
"-c", fmt.Sprintf("DROP DATABASE IF EXISTS \"%s\"", dbName),
}
if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" {
args = append([]string{"-h", e.cfg.Host}, args...)
}
cmd := cleanup.SafeCommand(ctx, "psql", args...)
cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password))
output, err = cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("failed to drop database '%s': %w\nOutput: %s", dbName, err, string(output))
}
} else if err != nil {
return fmt.Errorf("failed to drop database '%s': %w\nOutput: %s", dbName, err, string(output))
}
e.log.Info("Dropped existing database", "name", dbName)
return nil
}
// ensureDatabaseExists checks if a database exists and creates it if not
func (e *Engine) ensureDatabaseExists(ctx context.Context, dbName string) error {
// Route to appropriate implementation based on database type
if e.cfg.DatabaseType == "mysql" || e.cfg.DatabaseType == "mariadb" {
return e.ensureMySQLDatabaseExists(ctx, dbName)
}
return e.ensurePostgresDatabaseExists(ctx, dbName)
}
// ensureMySQLDatabaseExists checks if a MySQL database exists and creates it if not
func (e *Engine) ensureMySQLDatabaseExists(ctx context.Context, dbName string) error {
// Build mysql command - use environment variable for password (security: avoid process list exposure)
args := []string{
"-h", e.cfg.Host,
"-P", fmt.Sprintf("%d", e.cfg.Port),
"-u", e.cfg.User,
"-e", fmt.Sprintf("CREATE DATABASE IF NOT EXISTS `%s`", dbName),
}
cmd := cleanup.SafeCommand(ctx, "mysql", args...)
cmd.Env = os.Environ()
if e.cfg.Password != "" {
cmd.Env = append(cmd.Env, "MYSQL_PWD="+e.cfg.Password)
}
output, err := cmd.CombinedOutput()
if err != nil {
e.log.Warn("MySQL database creation failed", "name", dbName, "error", err, "output", string(output))
return fmt.Errorf("failed to create database '%s': %w (output: %s)", dbName, err, strings.TrimSpace(string(output)))
}
e.log.Info("Successfully ensured MySQL database exists", "name", dbName)
return nil
}
// ensurePostgresDatabaseExists checks if a PostgreSQL database exists and creates it if not
// It attempts to extract encoding/locale from the dump file to preserve original settings
func (e *Engine) ensurePostgresDatabaseExists(ctx context.Context, dbName string) error {
// Skip creation for postgres and template databases - they should already exist
if dbName == "postgres" || dbName == "template0" || dbName == "template1" {
e.log.Info("Skipping create for system database (assume exists)", "name", dbName)
return nil
}
// Build psql command with authentication
buildPsqlCmd := func(ctx context.Context, database, query string) *exec.Cmd {
args := []string{
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", database,
"-tAc", query,
}
// Only add -h flag if host is not localhost (to use Unix socket for peer auth)
if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" {
args = append([]string{"-h", e.cfg.Host}, args...)
}
cmd := cleanup.SafeCommand(ctx, "psql", args...)
// Always set PGPASSWORD (empty string is fine for peer/ident auth)
cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password))
return cmd
}
// Check if database exists
checkCmd := buildPsqlCmd(ctx, "postgres", fmt.Sprintf("SELECT 1 FROM pg_database WHERE datname = '%s'", dbName))
output, err := checkCmd.CombinedOutput()
if err != nil {
e.log.Warn("Database existence check failed", "name", dbName, "error", err, "output", string(output))
// Continue anyway - maybe we can create it
}
// If database exists, we're done
if strings.TrimSpace(string(output)) == "1" {
e.log.Info("Database already exists", "name", dbName)
return nil
}
// Database doesn't exist, create it
// IMPORTANT: Use template0 to avoid duplicate definition errors from local additions to template1
// Also use UTF8 encoding explicitly as it's the most common and safest choice
// See PostgreSQL docs: https://www.postgresql.org/docs/current/app-pgrestore.html#APP-PGRESTORE-NOTES
e.log.Info("Creating database from template0 with UTF8 encoding", "name", dbName)
// Get server's default locale for LC_COLLATE and LC_CTYPE
// This ensures compatibility while using the correct encoding
localeCmd := buildPsqlCmd(ctx, "postgres", "SHOW lc_collate")
localeOutput, _ := localeCmd.CombinedOutput()
serverLocale := strings.TrimSpace(string(localeOutput))
if serverLocale == "" {
serverLocale = "en_US.UTF-8" // Fallback to common default
}
// Build CREATE DATABASE command with encoding and locale
// Using ENCODING 'UTF8' explicitly ensures the dump can be restored
createSQL := fmt.Sprintf(
"CREATE DATABASE \"%s\" WITH TEMPLATE template0 ENCODING 'UTF8' LC_COLLATE '%s' LC_CTYPE '%s'",
dbName, serverLocale, serverLocale,
)
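// Example of the generated statement (hypothetical database name):
//   CREATE DATABASE "mydb" WITH TEMPLATE template0 ENCODING 'UTF8'
//     LC_COLLATE 'en_US.UTF-8' LC_CTYPE 'en_US.UTF-8'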
createArgs := []string{
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", "postgres",
"-c", createSQL,
}
// Only add -h flag if host is not localhost (to use Unix socket for peer auth)
if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" {
createArgs = append([]string{"-h", e.cfg.Host}, createArgs...)
}
createCmd := cleanup.SafeCommand(ctx, "psql", createArgs...)
// Always set PGPASSWORD (empty string is fine for peer/ident auth)
createCmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password))
createOutput, createErr := createCmd.CombinedOutput()
if createErr != nil {
// If encoding/locale fails, try simpler CREATE DATABASE
e.log.Warn("Database creation with encoding failed, trying simple create", "name", dbName, "error", createErr, "output", string(createOutput))
simpleArgs := []string{
"-p", fmt.Sprintf("%d", e.cfg.Port),
"-U", e.cfg.User,
"-d", "postgres",
"-c", fmt.Sprintf("CREATE DATABASE \"%s\" WITH TEMPLATE template0", dbName),
}
if e.cfg.Host != "localhost" && e.cfg.Host != "127.0.0.1" && e.cfg.Host != "" {
simpleArgs = append([]string{"-h", e.cfg.Host}, simpleArgs...)
}
simpleCmd := cleanup.SafeCommand(ctx, "psql", simpleArgs...)
simpleCmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSWORD=%s", e.cfg.Password))
output, err = simpleCmd.CombinedOutput()
if err != nil {
e.log.Warn("Database creation failed", "name", dbName, "error", err, "output", string(output))
return fmt.Errorf("failed to create database '%s': %w (output: %s)", dbName, err, strings.TrimSpace(string(output)))
}
}
e.log.Info("Successfully created database from template0", "name", dbName)
return nil
}
// previewClusterRestore shows cluster restore preview
func (e *Engine) previewClusterRestore(archivePath string) error {
fmt.Println("\n" + strings.Repeat("=", 60))
fmt.Println(" CLUSTER RESTORE PREVIEW (DRY RUN)")
fmt.Println(strings.Repeat("=", 60))
stat, _ := os.Stat(archivePath)
fmt.Printf("\nArchive: %s\n", filepath.Base(archivePath))
if stat != nil {
fmt.Printf("Size: %s\n", FormatBytes(stat.Size()))
fmt.Printf("Modified: %s\n", stat.ModTime().Format("2006-01-02 15:04:05"))
}
fmt.Printf("Target Host: %s:%d\n", e.cfg.Host, e.cfg.Port)
fmt.Println("\nOperations that would be performed:")
fmt.Println(" 1. Extract cluster archive to temporary directory")
fmt.Println(" 2. Restore global objects (roles, tablespaces)")
fmt.Println(" 3. Restore all databases found in archive")
fmt.Println(" 4. Cleanup temporary files")
fmt.Println("\n[WARN] WARNING: This will restore multiple databases.")
fmt.Println(" Existing databases may be overwritten or merged.")
fmt.Println("\nTo execute this restore, add the --confirm flag.")
fmt.Println(strings.Repeat("=", 60) + "\n")
return nil
}
// detectLargeObjectsInDumps checks if any dump files contain large objects
func (e *Engine) detectLargeObjectsInDumps(dumpsDir string, entries []os.DirEntry) bool {
hasLargeObjects := false
checkedCount := 0
maxChecks := 5 // Only check first 5 dumps to avoid slowdown
for _, entry := range entries {
if checkedCount >= maxChecks {
break // Checked enough samples - avoid slowing down large clusters
}
if entry.IsDir() {
continue
}
dumpFile := filepath.Join(dumpsDir, entry.Name())
// Skip compressed SQL files (can't easily check without decompressing)
if strings.HasSuffix(dumpFile, ".sql.gz") {
continue
}
// Use pg_restore -l to list contents (fast, doesn't restore data)
// 2 minutes for large dumps with many objects
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
cmd := cleanup.SafeCommand(ctx, "pg_restore", "-l", dumpFile)
output, err := cmd.Output()
cancel() // Release the timeout context now; defer inside a loop would pile up until function return
if err != nil {
// If pg_restore -l fails, it might not be custom format - skip
continue
}
checkedCount++
// Check if output contains "BLOB" or "LARGE OBJECT" entries
outputStr := string(output)
if strings.Contains(outputStr, "BLOB") ||
strings.Contains(outputStr, "LARGE OBJECT") ||
strings.Contains(outputStr, " BLOBS ") {
e.log.Info("Large objects detected in dump file", "file", entry.Name())
hasLargeObjects = true
// Don't break - log all files with large objects
}
}
if hasLargeObjects {
e.log.Warn("Cluster contains databases with large objects - parallel restore may cause lock contention")
}
return hasLargeObjects
}
// isIgnorableError checks if an error message represents an ignorable PostgreSQL restore error
func (e *Engine) isIgnorableError(errorMsg string) bool {
// Convert to lowercase for case-insensitive matching
lowerMsg := strings.ToLower(errorMsg)
// CRITICAL: Syntax errors are NOT ignorable - indicates corrupted dump
if strings.Contains(lowerMsg, "syntax error") {
e.log.Error("CRITICAL: Syntax error in dump file - dump may be corrupted", "error", errorMsg)
return false
}
// CRITICAL: If error count is extremely high (>100k), dump is likely corrupted
if strings.Contains(errorMsg, "total errors:") {
// Extract error count if present in message
parts := strings.Split(errorMsg, "total errors:")
if len(parts) > 1 {
errorCountStr := strings.TrimSpace(strings.Split(parts[1], ")")[0])
// Try to parse as number
var count int
if _, err := fmt.Sscanf(errorCountStr, "%d", &count); err == nil && count > 100000 {
e.log.Error("CRITICAL: Excessive errors indicate corrupted dump", "error_count", count)
return false
}
}
}
// List of ignorable error patterns (objects that already exist or don't exist)
ignorablePatterns := []string{
"already exists",
"duplicate key",
"does not exist, skipping", // For DROP IF EXISTS
"no pg_hba.conf entry", // Permission warnings (not fatal)
}
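// Example: `ERROR: role "app_user" already exists` matches "already exists"
// and is safe to ignore; syntax errors never reach this list because they
// are rejected above.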
for _, pattern := range ignorablePatterns {
if strings.Contains(lowerMsg, pattern) {
return true
}
}
// Special handling for "role does not exist" - this is a warning, not fatal
// Happens when globals.sql didn't contain a role that the dump references
// The restore can continue, but ownership won't be preserved for that role
if strings.Contains(lowerMsg, "role") && strings.Contains(lowerMsg, "does not exist") {
e.log.Warn("Role referenced in dump does not exist - ownership won't be preserved",
"error", errorMsg,
"hint", "The role may not have been in globals.sql or globals restore failed")
return true // Treat as ignorable - restore can continue
}
return false
}
// getFileSize returns the size of a file, or 0 if it can't be read
func getFileSize(path string) int64 {
info, err := os.Stat(path)
if err != nil {
return 0
}
return info.Size()
}
// FormatBytes formats bytes to human readable format
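// Example: FormatBytes(1536) returns "1.5 KB"; FormatBytes(1048576) returns "1.0 MB".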
func FormatBytes(bytes int64) string {
const unit = 1024
if bytes < unit {
return fmt.Sprintf("%d B", bytes)
}
div, exp := int64(unit), 0
for n := bytes / unit; n >= unit; n /= unit {
div *= unit
exp++
}
return fmt.Sprintf("%.1f %cB", float64(bytes)/float64(div), "KMGTPE"[exp])
}
// formatDuration formats a duration to human readable format (e.g., "3m 45s", "1h 23m", "45s")
func formatDuration(d time.Duration) string {
if d < time.Second {
return "0s"
}
hours := int(d.Hours())
minutes := int(d.Minutes()) % 60
seconds := int(d.Seconds()) % 60
if hours > 0 {
return fmt.Sprintf("%dh %dm", hours, minutes)
}
if minutes > 0 {
return fmt.Sprintf("%dm %ds", minutes, seconds)
}
return fmt.Sprintf("%ds", seconds)
}
// quickValidateSQLDump performs a fast validation of SQL dump files
// by checking for truncated COPY blocks. This catches corrupted dumps
// BEFORE attempting a full restore (which could waste 49+ minutes).
func (e *Engine) quickValidateSQLDump(archivePath string, compressed bool) error {
e.log.Debug("Pre-validating SQL dump file", "path", archivePath, "compressed", compressed)
diagnoser := NewDiagnoser(e.log, false) // non-verbose for speed
result, err := diagnoser.DiagnoseFile(archivePath)
if err != nil {
return fmt.Errorf("diagnosis error: %w", err)
}
// Check for critical issues that would cause restore failure
if result.IsTruncated {
errMsg := "SQL dump file is TRUNCATED"
if result.Details != nil && result.Details.UnterminatedCopy {
errMsg = fmt.Sprintf("%s - unterminated COPY block for table '%s' at line %d",
errMsg, result.Details.LastCopyTable, result.Details.LastCopyLineNumber)
if len(result.Details.SampleCopyData) > 0 {
errMsg = fmt.Sprintf("%s (sample orphaned data: %s)", errMsg, result.Details.SampleCopyData[0])
}
}
return fmt.Errorf("%s", errMsg)
}
if result.IsCorrupted {
return fmt.Errorf("SQL dump file is corrupted: %v", result.Errors)
}
if !result.IsValid {
if len(result.Errors) > 0 {
return fmt.Errorf("dump validation failed: %s", result.Errors[0])
}
return fmt.Errorf("dump file is invalid (unknown reason)")
}
// Log any warnings but don't fail
for _, warning := range result.Warnings {
e.log.Warn("Dump validation warning", "warning", warning)
}
e.log.Debug("SQL dump validation passed", "path", archivePath)
return nil
}
// OriginalSettings stores PostgreSQL settings to restore after operation
type OriginalSettings struct {
MaxLocks int
MaintenanceWorkMem string
}
// boostPostgreSQLSettings boosts multiple PostgreSQL settings for large restores
// NOTE: max_locks_per_transaction requires a PostgreSQL RESTART to take effect!
// maintenance_work_mem can be changed with pg_reload_conf().
func (e *Engine) boostPostgreSQLSettings(ctx context.Context, lockBoostValue int) (*OriginalSettings, error) {
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] boostPostgreSQLSettings: Starting lock boost procedure",
"target_lock_value", lockBoostValue)
}
connStr := e.buildConnString()
db, err := sql.Open("pgx", connStr)
if err != nil {
if e.cfg.DebugLocks {
e.log.Error("🔍 [LOCK-DEBUG] Failed to connect to PostgreSQL",
"error", err)
}
return nil, fmt.Errorf("failed to connect: %w", err)
}
defer db.Close()
original := &OriginalSettings{}
// Get current max_locks_per_transaction
var maxLocksStr string
if err := db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&maxLocksStr); err == nil {
original.MaxLocks, _ = strconv.Atoi(maxLocksStr)
}
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] Current PostgreSQL lock configuration",
"current_max_locks", original.MaxLocks,
"target_max_locks", lockBoostValue,
"boost_required", original.MaxLocks < lockBoostValue)
}
// Get current maintenance_work_mem
db.QueryRowContext(ctx, "SHOW maintenance_work_mem").Scan(&original.MaintenanceWorkMem)
// CRITICAL: max_locks_per_transaction requires a PostgreSQL RESTART!
// pg_reload_conf() is NOT sufficient for this parameter.
needsRestart := false
if original.MaxLocks < lockBoostValue {
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] Executing ALTER SYSTEM to boost locks",
"from", original.MaxLocks,
"to", lockBoostValue)
}
_, err = db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET max_locks_per_transaction = %d", lockBoostValue))
if err != nil {
e.log.Warn("Could not set max_locks_per_transaction", "error", err)
if e.cfg.DebugLocks {
e.log.Error("🔍 [LOCK-DEBUG] ALTER SYSTEM failed",
"error", err)
}
} else {
needsRestart = true
e.log.Warn("max_locks_per_transaction requires PostgreSQL restart to take effect",
"current", original.MaxLocks,
"target", lockBoostValue)
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] ALTER SYSTEM succeeded - restart required",
"setting_saved_to", "postgresql.auto.conf",
"active_after", "PostgreSQL restart")
}
}
}
// Boost maintenance_work_mem to 2GB for faster index creation
// (this one CAN be applied via pg_reload_conf)
_, err = db.ExecContext(ctx, "ALTER SYSTEM SET maintenance_work_mem = '2GB'")
if err != nil {
e.log.Warn("Could not boost maintenance_work_mem", "error", err)
}
// Reload config to apply maintenance_work_mem
_, err = db.ExecContext(ctx, "SELECT pg_reload_conf()")
if err != nil {
return original, fmt.Errorf("failed to reload config: %w", err)
}
// If max_locks_per_transaction needs a restart, try to do it
if needsRestart {
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] Attempting PostgreSQL restart to activate new lock setting")
}
if restarted := e.tryRestartPostgreSQL(ctx); restarted {
e.log.Info("PostgreSQL restarted successfully - max_locks_per_transaction now active")
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] PostgreSQL restart SUCCEEDED")
}
// Wait for PostgreSQL to be ready
time.Sleep(3 * time.Second)
// Update original.MaxLocks to reflect the new value after restart
var newMaxLocksStr string
if err := db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&newMaxLocksStr); err == nil {
original.MaxLocks, _ = strconv.Atoi(newMaxLocksStr)
e.log.Info("Verified new max_locks_per_transaction after restart", "value", original.MaxLocks)
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] Post-restart verification",
"new_max_locks", original.MaxLocks,
"target_was", lockBoostValue,
"verification", "PASS")
}
}
} else {
// Cannot restart - this is now a CRITICAL failure
// We tried to boost locks but can't apply them without restart
e.log.Error("CRITICAL: max_locks_per_transaction boost requires PostgreSQL restart")
e.log.Error("Current value: " + strconv.Itoa(original.MaxLocks) + ", required: " + strconv.Itoa(lockBoostValue))
e.log.Error("The setting has been saved to postgresql.auto.conf but is NOT ACTIVE")
e.log.Error("Restore will ABORT to prevent 'out of shared memory' failure")
e.log.Error("Action required: Ask DBA to restart PostgreSQL, then retry restore")
if e.cfg.DebugLocks {
e.log.Error("🔍 [LOCK-DEBUG] PostgreSQL restart FAILED",
"current_locks", original.MaxLocks,
"required_locks", lockBoostValue,
"setting_saved", true,
"setting_active", false,
"verdict", "ABORT - Manual restart required")
}
// Return original settings so the caller can inspect MaxLocks and decide how to proceed
return original, nil
}
}
if e.cfg.DebugLocks {
e.log.Info("🔍 [LOCK-DEBUG] boostPostgreSQLSettings: Complete",
"final_max_locks", original.MaxLocks,
"target_was", lockBoostValue,
"boost_successful", original.MaxLocks >= lockBoostValue)
}
return original, nil
}
// canRestartPostgreSQL checks if we have the ability to restart PostgreSQL
// Returns false if running in a restricted environment (e.g., su postgres on enterprise systems)
func (e *Engine) canRestartPostgreSQL() bool {
// Check if we're running as postgres user - if so, we likely can't restart
// because PostgreSQL is managed by init/systemd, not directly by pg_ctl
currentUser := os.Getenv("USER")
if currentUser == "" {
currentUser = os.Getenv("LOGNAME")
}
// If we're the postgres user, check if we have sudo access
if currentUser == "postgres" {
// Try a quick sudo check - if this fails, we can't restart
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
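// "sudo -n true" exits non-zero when a password would be required,
// without prompting - a cheap, non-blocking privilege probe.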
cmd := cleanup.SafeCommand(ctx, "sudo", "-n", "true")
cmd.Stdin = nil
if err := cmd.Run(); err != nil {
e.log.Info("Running as postgres user without sudo access - cannot restart PostgreSQL",
"user", currentUser,
"hint", "Ask system administrator to restart PostgreSQL if needed")
return false
}
}
return true
}
// tryRestartPostgreSQL attempts to restart PostgreSQL using various methods
// Returns true if restart was successful
// IMPORTANT: Uses short timeouts and non-interactive sudo to avoid blocking on password prompts
// NOTE: This function will return false immediately if running as postgres without sudo
func (e *Engine) tryRestartPostgreSQL(ctx context.Context) bool {
// First check if we can even attempt a restart
if !e.canRestartPostgreSQL() {
e.log.Info("Skipping PostgreSQL restart attempt (no privileges)")
return false
}
e.progress.Update("Attempting PostgreSQL restart for lock settings...")
// Use short timeout for each restart attempt (don't block on sudo password prompts)
runWithTimeout := func(args ...string) bool {
cmdCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := cleanup.SafeCommand(cmdCtx, args[0], args[1:]...)
// Set stdin to /dev/null to prevent sudo from waiting for password
cmd.Stdin = nil
return cmd.Run() == nil
}
// Method 1: systemctl (most common on modern Linux) - use sudo -n for non-interactive
if runWithTimeout("sudo", "-n", "systemctl", "restart", "postgresql") {
return true
}
// Method 2: systemctl with version suffix (e.g., postgresql-15)
for _, ver := range []string{"17", "16", "15", "14", "13", "12"} {
if runWithTimeout("sudo", "-n", "systemctl", "restart", "postgresql-"+ver) {
return true
}
}
// Method 3: service command (older systems)
if runWithTimeout("sudo", "-n", "service", "postgresql", "restart") {
return true
}
// Method 4: pg_ctl as postgres user (if we ARE postgres user, no sudo needed)
if runWithTimeout("pg_ctl", "restart", "-D", "/var/lib/postgresql/data", "-m", "fast") {
return true
}
// Method 5: Try common PGDATA paths with pg_ctl directly (for postgres user)
pgdataPaths := []string{
"/var/lib/pgsql/data",
"/var/lib/pgsql/17/data",
"/var/lib/pgsql/16/data",
"/var/lib/pgsql/15/data",
"/var/lib/postgresql/17/main",
"/var/lib/postgresql/16/main",
"/var/lib/postgresql/15/main",
}
for _, pgdata := range pgdataPaths {
if runWithTimeout("pg_ctl", "restart", "-D", pgdata, "-m", "fast") {
return true
}
}
return false
}
// resetPostgreSQLSettings restores original PostgreSQL settings
// NOTE: max_locks_per_transaction changes are written but require restart to take effect.
// We don't restart here since we're done with the restore.
func (e *Engine) resetPostgreSQLSettings(ctx context.Context, original *OriginalSettings) error {
connStr := e.buildConnString()
db, err := sql.Open("pgx", connStr)
if err != nil {
return fmt.Errorf("failed to connect: %w", err)
}
defer db.Close()
// Reset max_locks_per_transaction (will take effect on next restart)
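// 64 is PostgreSQL's compiled-in default for max_locks_per_transaction.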
if original.MaxLocks == 64 { // Default
db.ExecContext(ctx, "ALTER SYSTEM RESET max_locks_per_transaction")
} else if original.MaxLocks > 0 {
db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET max_locks_per_transaction = %d", original.MaxLocks))
}
// Reset maintenance_work_mem (takes effect immediately with reload)
if original.MaintenanceWorkMem == "64MB" { // Default
db.ExecContext(ctx, "ALTER SYSTEM RESET maintenance_work_mem")
} else if original.MaintenanceWorkMem != "" {
db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET maintenance_work_mem = '%s'", original.MaintenanceWorkMem))
}
// Reload config (only maintenance_work_mem will take effect immediately)
_, err = db.ExecContext(ctx, "SELECT pg_reload_conf()")
if err != nil {
return fmt.Errorf("failed to reload config: %w", err)
}
e.log.Info("PostgreSQL settings reset queued",
"note", "max_locks_per_transaction will revert on next PostgreSQL restart")
return nil
}