feat: comprehensive preflight checks for cluster restore

- Linux system checks (read-only from /proc, no auth needed):
  * shmmax, shmall kernel limits
  * Available RAM check

- PostgreSQL auto-tuning:
  * max_locks_per_transaction scaled by BLOB count
  * maintenance_work_mem boosted to 2GB for faster indexes
  * All settings auto-reset after restore (even on failure)

- Archive analysis:
  * Count BLOBs per database (pg_restore -l or zgrep)
  * Scale lock boost: 2048 (default) → 4096/8192/16384 based on count

- Nice TUI preflight summary display with ✓/⚠ indicators
This commit is contained in:
2026-01-14 15:30:41 +01:00
parent 22a7b9e81e
commit 8695823ebc
3 changed files with 541 additions and 13 deletions

View File

@@ -4,8 +4,8 @@ This directory contains pre-compiled binaries for the DB Backup Tool across mult
## Build Information
- **Version**: 3.42.10
- **Build Time**: 2026-01-14_07:16:09_UTC
- **Git Commit**: c71889b
- **Build Time**: 2026-01-14_14:06:01_UTC
- **Git Commit**: 22a7b9e
## Recent Updates (v1.1.0)
- ✅ Fixed TUI progress display with line-by-line output

View File

@@ -7,6 +7,7 @@ import (
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"sync"
"sync/atomic"
@@ -928,23 +929,34 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
}
e.log.Info("All dump files passed validation")
// AUTO-TUNE: Boost PostgreSQL lock capacity for large restores
// This prevents "out of shared memory" / max_locks_per_transaction errors
// Run comprehensive preflight checks (Linux system + PostgreSQL + Archive analysis)
preflight, preflightErr := e.RunPreflightChecks(ctx, dumpsDir, entries)
if preflightErr != nil {
e.log.Warn("Preflight checks failed", "error", preflightErr)
}
// Calculate optimal lock boost based on BLOB count
lockBoostValue := 2048 // Default
if preflight != nil && preflight.Archive.RecommendedLockBoost > 0 {
lockBoostValue = preflight.Archive.RecommendedLockBoost
}
// AUTO-TUNE: Boost PostgreSQL settings for large restores
e.progress.Update("Tuning PostgreSQL for large restore...")
originalLockValue, tuneErr := e.boostLockCapacity(ctx)
originalSettings, tuneErr := e.boostPostgreSQLSettings(ctx, lockBoostValue)
if tuneErr != nil {
e.log.Warn("Could not boost lock capacity - restore may fail on BLOB-heavy databases",
e.log.Warn("Could not boost PostgreSQL settings - restore may fail on BLOB-heavy databases",
"error", tuneErr)
} else {
e.log.Info("Boosted max_locks_per_transaction for restore",
"original", originalLockValue,
"boosted", 2048)
// Ensure we reset lock capacity when done (even on failure)
e.log.Info("Boosted PostgreSQL settings for restore",
"max_locks_per_transaction", fmt.Sprintf("%d → %d", originalSettings.MaxLocks, lockBoostValue),
"maintenance_work_mem", fmt.Sprintf("%s → 2GB", originalSettings.MaintenanceWorkMem))
// Ensure we reset settings when done (even on failure)
defer func() {
if resetErr := e.resetLockCapacity(ctx, originalLockValue); resetErr != nil {
e.log.Warn("Could not reset lock capacity", "error", resetErr)
if resetErr := e.resetPostgreSQLSettings(ctx, originalSettings); resetErr != nil {
e.log.Warn("Could not reset PostgreSQL settings", "error", resetErr)
} else {
e.log.Info("Reset max_locks_per_transaction to original value", "value", originalLockValue)
e.log.Info("Reset PostgreSQL settings to original values")
}
}()
}
@@ -1730,3 +1742,84 @@ func (e *Engine) resetLockCapacity(ctx context.Context, originalValue int) error
return nil
}
// OriginalSettings stores PostgreSQL settings to restore after operation
type OriginalSettings struct {
MaxLocks int
MaintenanceWorkMem string
}
// boostPostgreSQLSettings boosts multiple PostgreSQL settings for large restores
func (e *Engine) boostPostgreSQLSettings(ctx context.Context, lockBoostValue int) (*OriginalSettings, error) {
connStr := e.buildConnString()
db, err := sql.Open("pgx", connStr)
if err != nil {
return nil, fmt.Errorf("failed to connect: %w", err)
}
defer db.Close()
original := &OriginalSettings{}
// Get current max_locks_per_transaction
var maxLocksStr string
if err := db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&maxLocksStr); err == nil {
original.MaxLocks, _ = strconv.Atoi(maxLocksStr)
}
// Get current maintenance_work_mem
db.QueryRowContext(ctx, "SHOW maintenance_work_mem").Scan(&original.MaintenanceWorkMem)
// Boost max_locks_per_transaction (if not already high enough)
if original.MaxLocks < lockBoostValue {
_, err = db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET max_locks_per_transaction = %d", lockBoostValue))
if err != nil {
e.log.Warn("Could not boost max_locks_per_transaction", "error", err)
}
}
// Boost maintenance_work_mem to 2GB for faster index creation
_, err = db.ExecContext(ctx, "ALTER SYSTEM SET maintenance_work_mem = '2GB'")
if err != nil {
e.log.Warn("Could not boost maintenance_work_mem", "error", err)
}
// Reload config to apply changes (no restart needed for these settings)
_, err = db.ExecContext(ctx, "SELECT pg_reload_conf()")
if err != nil {
return original, fmt.Errorf("failed to reload config: %w", err)
}
return original, nil
}
// resetPostgreSQLSettings restores original PostgreSQL settings
func (e *Engine) resetPostgreSQLSettings(ctx context.Context, original *OriginalSettings) error {
connStr := e.buildConnString()
db, err := sql.Open("pgx", connStr)
if err != nil {
return fmt.Errorf("failed to connect: %w", err)
}
defer db.Close()
// Reset max_locks_per_transaction
if original.MaxLocks == 64 { // Default
db.ExecContext(ctx, "ALTER SYSTEM RESET max_locks_per_transaction")
} else if original.MaxLocks > 0 {
db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET max_locks_per_transaction = %d", original.MaxLocks))
}
// Reset maintenance_work_mem
if original.MaintenanceWorkMem == "64MB" { // Default
db.ExecContext(ctx, "ALTER SYSTEM RESET maintenance_work_mem")
} else if original.MaintenanceWorkMem != "" {
db.ExecContext(ctx, fmt.Sprintf("ALTER SYSTEM SET maintenance_work_mem = '%s'", original.MaintenanceWorkMem))
}
// Reload config
_, err = db.ExecContext(ctx, "SELECT pg_reload_conf()")
if err != nil {
return fmt.Errorf("failed to reload config: %w", err)
}
return nil
}

View File

@@ -0,0 +1,435 @@
package restore
import (
"bufio"
"context"
"database/sql"
"fmt"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
)
// PreflightResult contains all preflight check results
type PreflightResult struct {
// Linux system checks
Linux LinuxChecks
// PostgreSQL checks
PostgreSQL PostgreSQLChecks
// Archive analysis
Archive ArchiveChecks
// Overall status
CanProceed bool
Warnings []string
Errors []string
}
// LinuxChecks contains Linux kernel/system checks
type LinuxChecks struct {
ShmMax int64 // /proc/sys/kernel/shmmax
ShmAll int64 // /proc/sys/kernel/shmall
MemTotal int64 // Total RAM in bytes
MemAvailable int64 // Available RAM in bytes
ShmMaxOK bool // Is shmmax sufficient?
ShmAllOK bool // Is shmall sufficient?
MemAvailableOK bool // Is available RAM sufficient?
IsLinux bool // Are we running on Linux?
}
// PostgreSQLChecks contains PostgreSQL configuration checks
type PostgreSQLChecks struct {
MaxLocksPerTransaction int // Current setting
MaintenanceWorkMem string // Current setting
SharedBuffers string // Current setting (info only)
MaxConnections int // Current setting
Version string // PostgreSQL version
IsSuperuser bool // Can we modify settings?
}
// ArchiveChecks contains analysis of the backup archive
type ArchiveChecks struct {
TotalDatabases int
TotalBlobCount int // Estimated total BLOBs across all databases
BlobsByDB map[string]int // BLOBs per database
HasLargeBlobs bool // Any DB with >1000 BLOBs?
RecommendedLockBoost int // Calculated lock boost value
}
// RunPreflightChecks performs all preflight checks before a cluster restore
func (e *Engine) RunPreflightChecks(ctx context.Context, dumpsDir string, entries []os.DirEntry) (*PreflightResult, error) {
result := &PreflightResult{
CanProceed: true,
Archive: ArchiveChecks{
BlobsByDB: make(map[string]int),
},
}
e.progress.Update("[PREFLIGHT] Running system checks...")
e.log.Info("Starting preflight checks for cluster restore")
// 1. Linux system checks (read-only from /proc)
e.checkLinuxSystem(result)
// 2. PostgreSQL checks (via existing connection)
e.checkPostgreSQL(ctx, result)
// 3. Archive analysis (count BLOBs to scale lock boost)
e.analyzeArchive(ctx, dumpsDir, entries, result)
// 4. Calculate recommended settings
e.calculateRecommendations(result)
// 5. Print summary
e.printPreflightSummary(result)
return result, nil
}
// checkLinuxSystem reads kernel limits from /proc (no auth needed)
func (e *Engine) checkLinuxSystem(result *PreflightResult) {
result.Linux.IsLinux = runtime.GOOS == "linux"
if !result.Linux.IsLinux {
e.log.Info("Not running on Linux - skipping kernel checks", "os", runtime.GOOS)
return
}
// Read shmmax
if data, err := os.ReadFile("/proc/sys/kernel/shmmax"); err == nil {
val, _ := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
result.Linux.ShmMax = val
// 8GB minimum for large restores
result.Linux.ShmMaxOK = val >= 8*1024*1024*1024
}
// Read shmall (in pages, typically 4KB each)
if data, err := os.ReadFile("/proc/sys/kernel/shmall"); err == nil {
val, _ := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
result.Linux.ShmAll = val
// 2M pages = 8GB minimum
result.Linux.ShmAllOK = val >= 2*1024*1024
}
// Read memory info
if file, err := os.Open("/proc/meminfo"); err == nil {
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
if strings.HasPrefix(line, "MemTotal:") {
parts := strings.Fields(line)
if len(parts) >= 2 {
val, _ := strconv.ParseInt(parts[1], 10, 64)
result.Linux.MemTotal = val * 1024 // Convert KB to bytes
}
}
if strings.HasPrefix(line, "MemAvailable:") {
parts := strings.Fields(line)
if len(parts) >= 2 {
val, _ := strconv.ParseInt(parts[1], 10, 64)
result.Linux.MemAvailable = val * 1024 // Convert KB to bytes
// 4GB minimum available for large restores
result.Linux.MemAvailableOK = result.Linux.MemAvailable >= 4*1024*1024*1024
}
}
}
}
// Add warnings for insufficient resources
if !result.Linux.ShmMaxOK && result.Linux.ShmMax > 0 {
result.Warnings = append(result.Warnings,
fmt.Sprintf("Linux shmmax is low: %s (recommend 8GB+). Fix: sudo sysctl -w kernel.shmmax=17179869184",
formatBytesLong(result.Linux.ShmMax)))
}
if !result.Linux.ShmAllOK && result.Linux.ShmAll > 0 {
result.Warnings = append(result.Warnings,
fmt.Sprintf("Linux shmall is low: %d pages (recommend 2M+). Fix: sudo sysctl -w kernel.shmall=4194304",
result.Linux.ShmAll))
}
if !result.Linux.MemAvailableOK && result.Linux.MemAvailable > 0 {
result.Warnings = append(result.Warnings,
fmt.Sprintf("Available RAM is low: %s (recommend 4GB+ for large restores)",
formatBytesLong(result.Linux.MemAvailable)))
}
}
// checkPostgreSQL checks PostgreSQL configuration via SQL
func (e *Engine) checkPostgreSQL(ctx context.Context, result *PreflightResult) {
connStr := e.buildConnString()
db, err := sql.Open("pgx", connStr)
if err != nil {
e.log.Warn("Could not connect to PostgreSQL for preflight checks", "error", err)
return
}
defer db.Close()
// Check max_locks_per_transaction
var maxLocks string
if err := db.QueryRowContext(ctx, "SHOW max_locks_per_transaction").Scan(&maxLocks); err == nil {
result.PostgreSQL.MaxLocksPerTransaction, _ = strconv.Atoi(maxLocks)
}
// Check maintenance_work_mem
db.QueryRowContext(ctx, "SHOW maintenance_work_mem").Scan(&result.PostgreSQL.MaintenanceWorkMem)
// Check shared_buffers (info only, can't change without restart)
db.QueryRowContext(ctx, "SHOW shared_buffers").Scan(&result.PostgreSQL.SharedBuffers)
// Check max_connections
var maxConn string
if err := db.QueryRowContext(ctx, "SHOW max_connections").Scan(&maxConn); err == nil {
result.PostgreSQL.MaxConnections, _ = strconv.Atoi(maxConn)
}
// Check version
db.QueryRowContext(ctx, "SHOW server_version").Scan(&result.PostgreSQL.Version)
// Check if superuser
var isSuperuser bool
if err := db.QueryRowContext(ctx, "SELECT current_setting('is_superuser') = 'on'").Scan(&isSuperuser); err == nil {
result.PostgreSQL.IsSuperuser = isSuperuser
}
// Add info/warnings
if result.PostgreSQL.MaxLocksPerTransaction < 256 {
e.log.Info("PostgreSQL max_locks_per_transaction is low - will auto-boost",
"current", result.PostgreSQL.MaxLocksPerTransaction)
}
// Parse shared_buffers and warn if very low
sharedBuffersMB := parseMemoryToMB(result.PostgreSQL.SharedBuffers)
if sharedBuffersMB > 0 && sharedBuffersMB < 256 {
result.Warnings = append(result.Warnings,
fmt.Sprintf("PostgreSQL shared_buffers is low: %s (recommend 1GB+, requires restart)",
result.PostgreSQL.SharedBuffers))
}
}
// analyzeArchive counts BLOBs in dump files to calculate optimal lock boost
func (e *Engine) analyzeArchive(ctx context.Context, dumpsDir string, entries []os.DirEntry, result *PreflightResult) {
e.progress.Update("[PREFLIGHT] Analyzing archive for large objects...")
for _, entry := range entries {
if entry.IsDir() {
continue
}
result.Archive.TotalDatabases++
dumpFile := filepath.Join(dumpsDir, entry.Name())
dbName := strings.TrimSuffix(entry.Name(), ".dump")
dbName = strings.TrimSuffix(dbName, ".sql.gz")
// For custom format dumps, use pg_restore -l to count BLOBs
if strings.HasSuffix(entry.Name(), ".dump") {
blobCount := e.countBlobsInDump(ctx, dumpFile)
if blobCount > 0 {
result.Archive.BlobsByDB[dbName] = blobCount
result.Archive.TotalBlobCount += blobCount
if blobCount > 1000 {
result.Archive.HasLargeBlobs = true
}
}
}
// For SQL format, try to estimate from file content (sample check)
if strings.HasSuffix(entry.Name(), ".sql.gz") {
// Check for lo_create patterns in compressed SQL
blobCount := e.estimateBlobsInSQL(dumpFile)
if blobCount > 0 {
result.Archive.BlobsByDB[dbName] = blobCount
result.Archive.TotalBlobCount += blobCount
if blobCount > 1000 {
result.Archive.HasLargeBlobs = true
}
}
}
}
}
// countBlobsInDump uses pg_restore -l to count BLOB entries
func (e *Engine) countBlobsInDump(ctx context.Context, dumpFile string) int {
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, "pg_restore", "-l", dumpFile)
output, err := cmd.Output()
if err != nil {
return 0
}
// Count lines containing BLOB/LARGE OBJECT
count := 0
for _, line := range strings.Split(string(output), "\n") {
if strings.Contains(line, "BLOB") || strings.Contains(line, "LARGE OBJECT") {
count++
}
}
return count
}
// estimateBlobsInSQL samples compressed SQL for lo_create patterns
func (e *Engine) estimateBlobsInSQL(sqlFile string) int {
// Use zgrep for efficient searching in gzipped files
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Count lo_create calls (each = one large object)
cmd := exec.CommandContext(ctx, "zgrep", "-c", "lo_create", sqlFile)
output, err := cmd.Output()
if err != nil {
// Also try SELECT lo_create pattern
cmd2 := exec.CommandContext(ctx, "zgrep", "-c", "SELECT.*lo_create", sqlFile)
output, err = cmd2.Output()
if err != nil {
return 0
}
}
count, _ := strconv.Atoi(strings.TrimSpace(string(output)))
return count
}
// calculateRecommendations determines optimal settings based on analysis
func (e *Engine) calculateRecommendations(result *PreflightResult) {
// Base lock boost
lockBoost := 2048
// Scale up based on BLOB count
if result.Archive.TotalBlobCount > 5000 {
lockBoost = 4096
}
if result.Archive.TotalBlobCount > 10000 {
lockBoost = 8192
}
if result.Archive.TotalBlobCount > 50000 {
lockBoost = 16384
}
// Cap at reasonable maximum
if lockBoost > 16384 {
lockBoost = 16384
}
result.Archive.RecommendedLockBoost = lockBoost
// Log recommendation
e.log.Info("Calculated recommended lock boost",
"total_blobs", result.Archive.TotalBlobCount,
"recommended_locks", lockBoost)
}
// printPreflightSummary prints a nice summary of all checks
func (e *Engine) printPreflightSummary(result *PreflightResult) {
fmt.Println()
fmt.Println(strings.Repeat("─", 60))
fmt.Println(" PREFLIGHT CHECKS")
fmt.Println(strings.Repeat("─", 60))
// Linux checks
if result.Linux.IsLinux {
fmt.Println("\n Linux System:")
printCheck("shmmax", formatBytesLong(result.Linux.ShmMax), result.Linux.ShmMaxOK || result.Linux.ShmMax == 0)
printCheck("shmall", fmt.Sprintf("%d pages", result.Linux.ShmAll), result.Linux.ShmAllOK || result.Linux.ShmAll == 0)
printCheck("Available RAM", formatBytesLong(result.Linux.MemAvailable), result.Linux.MemAvailableOK || result.Linux.MemAvailable == 0)
}
// PostgreSQL checks
fmt.Println("\n PostgreSQL:")
printCheck("Version", result.PostgreSQL.Version, true)
printCheck("max_locks_per_transaction", fmt.Sprintf("%d → %d (auto-boost)",
result.PostgreSQL.MaxLocksPerTransaction, result.Archive.RecommendedLockBoost),
true)
printCheck("maintenance_work_mem", fmt.Sprintf("%s → 2GB (auto-boost)",
result.PostgreSQL.MaintenanceWorkMem), true)
printInfo("shared_buffers", result.PostgreSQL.SharedBuffers)
printCheck("Superuser", fmt.Sprintf("%v", result.PostgreSQL.IsSuperuser), result.PostgreSQL.IsSuperuser)
// Archive analysis
fmt.Println("\n Archive Analysis:")
printInfo("Total databases", fmt.Sprintf("%d", result.Archive.TotalDatabases))
printInfo("Total BLOBs detected", fmt.Sprintf("%d", result.Archive.TotalBlobCount))
if len(result.Archive.BlobsByDB) > 0 {
fmt.Println(" Databases with BLOBs:")
for db, count := range result.Archive.BlobsByDB {
status := "✓"
if count > 1000 {
status = "⚠"
}
fmt.Printf(" %s %s: %d BLOBs\n", status, db, count)
}
}
// Warnings
if len(result.Warnings) > 0 {
fmt.Println("\n ⚠ Warnings:")
for _, w := range result.Warnings {
fmt.Printf(" • %s\n", w)
}
}
fmt.Println(strings.Repeat("─", 60))
fmt.Println()
}
func printCheck(name, value string, ok bool) {
status := "✓"
if !ok {
status = "⚠"
}
fmt.Printf(" %s %s: %s\n", status, name, value)
}
func printInfo(name, value string) {
fmt.Printf(" %s: %s\n", name, value)
}
// formatBytesLong is a local formatting helper for preflight display
func formatBytesLong(bytes int64) string {
if bytes == 0 {
return "unknown"
}
const unit = 1024
if bytes < unit {
return fmt.Sprintf("%d B", bytes)
}
div, exp := int64(unit), 0
for n := bytes / unit; n >= unit; n /= unit {
div *= unit
exp++
}
return fmt.Sprintf("%.1f %cB", float64(bytes)/float64(div), "KMGTPE"[exp])
}
func parseMemoryToMB(memStr string) int {
memStr = strings.ToUpper(strings.TrimSpace(memStr))
var value int
var unit string
fmt.Sscanf(memStr, "%d%s", &value, &unit)
switch {
case strings.HasPrefix(unit, "G"):
return value * 1024
case strings.HasPrefix(unit, "M"):
return value
case strings.HasPrefix(unit, "K"):
return value / 1024
default:
return value / (1024 * 1024) // Assume bytes
}
}
func (e *Engine) buildConnString() string {
if e.cfg.Host == "localhost" || e.cfg.Host == "" {
return fmt.Sprintf("user=%s password=%s dbname=postgres sslmode=disable",
e.cfg.User, e.cfg.Password)
}
return fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=postgres sslmode=disable",
e.cfg.Host, e.cfg.Port, e.cfg.User, e.cfg.Password)
}