v3.40.0: Restore diagnostics and error reporting

Features:
- restore diagnose command for backup file analysis
- Deep COPY block verification for truncated dump detection
- PGDMP signature and gzip integrity validation
- Detailed error reports with --save-debug-log flag
- Ring buffer stderr capture (prevents OOM on 2M+ errors)
- Error classification with actionable recommendations

TUI Enhancements:
- Automatic dump validity safety check before restore
- Press 'd' in archive browser to diagnose backups
- Press 'd' in restore preview for debug log toggle
- Debug logs saved to /tmp on failure when enabled

Documentation:
- Updated README with diagnose command and examples
- Updated CHANGELOG with full feature list
- Updated restore preview screenshots
This commit is contained in:
2026-01-05 15:17:54 +01:00
parent e7f0a9f5eb
commit 4c171c0e44
16 changed files with 2271 additions and 26 deletions

View File

@@ -33,6 +33,13 @@ var (
restoreNoProgress bool
restoreWorkdir string
restoreCleanCluster bool
restoreDiagnose bool // Run diagnosis before restore
restoreSaveDebugLog string // Path to save debug log on failure
// Diagnose flags
diagnoseJSON bool
diagnoseDeep bool
diagnoseKeepTemp bool
// Encryption flags
restoreEncryptionKeyFile string
@@ -214,12 +221,53 @@ Examples:
RunE: runRestorePITR,
}
// restoreDiagnoseCmd diagnoses backup files before restore
var restoreDiagnoseCmd = &cobra.Command{
Use: "diagnose [archive-file]",
Short: "Diagnose backup file integrity and format",
Long: `Perform deep analysis of backup files to detect issues before restore.
This command validates backup archives and provides detailed diagnostics
including truncation detection, format verification, and COPY block integrity.
Use this when:
- Restore fails with syntax errors
- You suspect backup corruption or truncation
- You want to verify backup integrity before restore
- Restore reports millions of errors
Checks performed:
- File format detection (custom dump vs SQL)
- PGDMP signature verification
- Gzip integrity validation
- COPY block termination check
- pg_restore --list verification
- Cluster archive structure validation
Examples:
# Diagnose a single dump file
dbbackup restore diagnose mydb.dump.gz
# Diagnose with verbose output
dbbackup restore diagnose mydb.sql.gz --verbose
# Diagnose cluster archive and all contained dumps
dbbackup restore diagnose cluster_backup.tar.gz --deep
# Output as JSON for scripting
dbbackup restore diagnose mydb.dump --json
`,
Args: cobra.ExactArgs(1),
RunE: runRestoreDiagnose,
}
func init() {
rootCmd.AddCommand(restoreCmd)
restoreCmd.AddCommand(restoreSingleCmd)
restoreCmd.AddCommand(restoreClusterCmd)
restoreCmd.AddCommand(restoreListCmd)
restoreCmd.AddCommand(restorePITRCmd)
restoreCmd.AddCommand(restoreDiagnoseCmd)
// Single restore flags
restoreSingleCmd.Flags().BoolVar(&restoreConfirm, "confirm", false, "Confirm and execute restore (required)")
@@ -232,6 +280,8 @@ func init() {
restoreSingleCmd.Flags().BoolVar(&restoreNoProgress, "no-progress", false, "Disable progress indicators")
restoreSingleCmd.Flags().StringVar(&restoreEncryptionKeyFile, "encryption-key-file", "", "Path to encryption key file (required for encrypted backups)")
restoreSingleCmd.Flags().StringVar(&restoreEncryptionKeyEnv, "encryption-key-env", "DBBACKUP_ENCRYPTION_KEY", "Environment variable containing encryption key")
restoreSingleCmd.Flags().BoolVar(&restoreDiagnose, "diagnose", false, "Run deep diagnosis before restore to detect corruption/truncation")
restoreSingleCmd.Flags().StringVar(&restoreSaveDebugLog, "save-debug-log", "", "Save detailed error report to file on failure (e.g., /tmp/restore-debug.json)")
// Cluster restore flags
restoreClusterCmd.Flags().BoolVar(&restoreConfirm, "confirm", false, "Confirm and execute restore (required)")
@@ -244,6 +294,8 @@ func init() {
restoreClusterCmd.Flags().BoolVar(&restoreNoProgress, "no-progress", false, "Disable progress indicators")
restoreClusterCmd.Flags().StringVar(&restoreEncryptionKeyFile, "encryption-key-file", "", "Path to encryption key file (required for encrypted backups)")
restoreClusterCmd.Flags().StringVar(&restoreEncryptionKeyEnv, "encryption-key-env", "DBBACKUP_ENCRYPTION_KEY", "Environment variable containing encryption key")
restoreClusterCmd.Flags().BoolVar(&restoreDiagnose, "diagnose", false, "Run deep diagnosis on all dumps before restore")
restoreClusterCmd.Flags().StringVar(&restoreSaveDebugLog, "save-debug-log", "", "Save detailed error report to file on failure (e.g., /tmp/restore-debug.json)")
// PITR restore flags
restorePITRCmd.Flags().StringVar(&pitrBaseBackup, "base-backup", "", "Path to base backup file (.tar.gz) (required)")
@@ -264,6 +316,117 @@ func init() {
restorePITRCmd.MarkFlagRequired("base-backup")
restorePITRCmd.MarkFlagRequired("wal-archive")
restorePITRCmd.MarkFlagRequired("target-dir")
// Diagnose flags
restoreDiagnoseCmd.Flags().BoolVar(&diagnoseJSON, "json", false, "Output diagnosis as JSON")
restoreDiagnoseCmd.Flags().BoolVar(&diagnoseDeep, "deep", false, "For cluster archives, extract and diagnose all contained dumps")
restoreDiagnoseCmd.Flags().BoolVar(&diagnoseKeepTemp, "keep-temp", false, "Keep temporary extraction directory (for debugging)")
restoreDiagnoseCmd.Flags().BoolVar(&restoreVerbose, "verbose", false, "Show detailed analysis progress")
}
// runRestoreDiagnose diagnoses backup files
func runRestoreDiagnose(cmd *cobra.Command, args []string) error {
archivePath := args[0]
// Convert to absolute path
if !filepath.IsAbs(archivePath) {
absPath, err := filepath.Abs(archivePath)
if err != nil {
return fmt.Errorf("invalid archive path: %w", err)
}
archivePath = absPath
}
// Check if file exists
if _, err := os.Stat(archivePath); err != nil {
return fmt.Errorf("archive not found: %s", archivePath)
}
log.Info("🔍 Diagnosing backup file", "path", archivePath)
diagnoser := restore.NewDiagnoser(log, restoreVerbose)
// Check if it's a cluster archive that needs deep analysis
format := restore.DetectArchiveFormat(archivePath)
if format.IsClusterBackup() && diagnoseDeep {
// Create temp directory for extraction
tempDir, err := os.MkdirTemp("", "dbbackup-diagnose-*")
if err != nil {
return fmt.Errorf("failed to create temp directory: %w", err)
}
if !diagnoseKeepTemp {
defer os.RemoveAll(tempDir)
} else {
log.Info("Temp directory preserved", "path", tempDir)
}
log.Info("Extracting cluster archive for deep analysis...")
// Extract and diagnose all dumps
results, err := diagnoser.DiagnoseClusterDumps(archivePath, tempDir)
if err != nil {
return fmt.Errorf("cluster diagnosis failed: %w", err)
}
// Output results
var hasErrors bool
for _, result := range results {
if diagnoseJSON {
diagnoser.PrintDiagnosisJSON(result)
} else {
diagnoser.PrintDiagnosis(result)
}
if !result.IsValid {
hasErrors = true
}
}
// Summary
if !diagnoseJSON {
fmt.Println("\n" + strings.Repeat("=", 70))
fmt.Printf("📊 CLUSTER SUMMARY: %d databases analyzed\n", len(results))
validCount := 0
for _, r := range results {
if r.IsValid {
validCount++
}
}
if validCount == len(results) {
fmt.Println("✅ All dumps are valid")
} else {
fmt.Printf("❌ %d/%d dumps have issues\n", len(results)-validCount, len(results))
}
fmt.Println(strings.Repeat("=", 70))
}
if hasErrors {
return fmt.Errorf("one or more dumps have validation errors")
}
return nil
}
// Single file diagnosis
result, err := diagnoser.DiagnoseFile(archivePath)
if err != nil {
return fmt.Errorf("diagnosis failed: %w", err)
}
if diagnoseJSON {
diagnoser.PrintDiagnosisJSON(result)
} else {
diagnoser.PrintDiagnosis(result)
}
if !result.IsValid {
return fmt.Errorf("backup file has validation errors")
}
log.Info("✅ Backup file appears valid")
return nil
}
// runRestoreSingle restores a single database
@@ -401,6 +564,12 @@ func runRestoreSingle(cmd *cobra.Command, args []string) error {
// Create restore engine
engine := restore.New(cfg, log, db)
// Enable debug logging if requested
if restoreSaveDebugLog != "" {
engine.SetDebugLogPath(restoreSaveDebugLog)
log.Info("Debug logging enabled", "output", restoreSaveDebugLog)
}
// Setup signal handling
ctx, cancel := context.WithCancel(context.Background())
@@ -416,6 +585,37 @@ func runRestoreSingle(cmd *cobra.Command, args []string) error {
cancel()
}()
// Run pre-restore diagnosis if requested
if restoreDiagnose {
log.Info("🔍 Running pre-restore diagnosis...")
diagnoser := restore.NewDiagnoser(log, restoreVerbose)
result, err := diagnoser.DiagnoseFile(archivePath)
if err != nil {
return fmt.Errorf("diagnosis failed: %w", err)
}
diagnoser.PrintDiagnosis(result)
if !result.IsValid {
log.Error("❌ Pre-restore diagnosis found issues")
if result.IsTruncated {
log.Error(" The backup file appears to be TRUNCATED")
}
if result.IsCorrupted {
log.Error(" The backup file appears to be CORRUPTED")
}
fmt.Println("\nUse --force to attempt restore anyway.")
if !restoreForce {
return fmt.Errorf("aborting restore due to backup file issues")
}
log.Warn("Continuing despite diagnosis errors (--force enabled)")
} else {
log.Info("✅ Backup file passed diagnosis")
}
}
// Execute restore
log.Info("Starting restore...", "database", targetDB)
@@ -584,6 +784,12 @@ func runRestoreCluster(cmd *cobra.Command, args []string) error {
// Create restore engine
engine := restore.New(cfg, log, db)
// Enable debug logging if requested
if restoreSaveDebugLog != "" {
engine.SetDebugLogPath(restoreSaveDebugLog)
log.Info("Debug logging enabled", "output", restoreSaveDebugLog)
}
// Setup signal handling
ctx, cancel := context.WithCancel(context.Background())
@@ -620,6 +826,52 @@ func runRestoreCluster(cmd *cobra.Command, args []string) error {
log.Info("Database cleanup completed")
}
// Run pre-restore diagnosis if requested
if restoreDiagnose {
log.Info("🔍 Running pre-restore diagnosis...")
// Create temp directory for extraction
diagTempDir, err := os.MkdirTemp("", "dbbackup-diagnose-*")
if err != nil {
return fmt.Errorf("failed to create temp directory for diagnosis: %w", err)
}
defer os.RemoveAll(diagTempDir)
diagnoser := restore.NewDiagnoser(log, restoreVerbose)
results, err := diagnoser.DiagnoseClusterDumps(archivePath, diagTempDir)
if err != nil {
return fmt.Errorf("diagnosis failed: %w", err)
}
// Check for any invalid dumps
var invalidDumps []string
for _, result := range results {
if !result.IsValid {
invalidDumps = append(invalidDumps, result.FileName)
diagnoser.PrintDiagnosis(result)
}
}
if len(invalidDumps) > 0 {
log.Error("❌ Pre-restore diagnosis found issues",
"invalid_dumps", len(invalidDumps),
"total_dumps", len(results))
fmt.Println("\n⚠ The following dumps have issues and will likely fail during restore:")
for _, name := range invalidDumps {
fmt.Printf(" - %s\n", name)
}
fmt.Println("\nRun 'dbbackup restore diagnose <archive> --deep' for full details.")
fmt.Println("Use --force to attempt restore anyway.")
if !restoreForce {
return fmt.Errorf("aborting restore due to %d invalid dump(s)", len(invalidDumps))
}
log.Warn("Continuing despite diagnosis errors (--force enabled)")
} else {
log.Info("✅ All dumps passed diagnosis", "count", len(results))
}
}
// Execute cluster restore
log.Info("Starting cluster restore...")