v3.40.0: Restore diagnostics and error reporting

Features: - restore diagnose command for backup file analysis - Deep COPY block verification for truncated dump detection - PGDMP signature and gzip integrity validation - Detailed error reports with --save-debug-log flag - Ring buffer stderr capture (prevents OOM on 2M+ errors) - Error classification with actionable recommendations TUI Enhancements: - Automatic dump validity safety check before restore - Press 'd' in archive browser to diagnose backups - Press 'd' in restore preview for debug log toggle - Debug logs saved to /tmp on failure when enabled Documentation: - Updated README with diagnose command and examples - Updated CHANGELOG with full feature list - Updated restore preview screenshots
2026-01-05 15:17:54 +01:00
parent e7f0a9f5eb
commit 4c171c0e44
16 changed files with 2271 additions and 26 deletions
--- a/cmd/restore.go
+++ b/cmd/restore.go
@@ -33,6 +33,13 @@ var (
 	restoreNoProgress   bool
 	restoreWorkdir      string
 	restoreCleanCluster bool
+	restoreDiagnose     bool   // Run diagnosis before restore
+	restoreSaveDebugLog string // Path to save debug log on failure
+
+	// Diagnose flags
+	diagnoseJSON       bool
+	diagnoseDeep       bool
+	diagnoseKeepTemp   bool

 	// Encryption flags
 	restoreEncryptionKeyFile string
@@ -214,12 +221,53 @@ Examples:
 	RunE: runRestorePITR,
 }

+// restoreDiagnoseCmd diagnoses backup files before restore
+var restoreDiagnoseCmd = &cobra.Command{
+	Use:   "diagnose [archive-file]",
+	Short: "Diagnose backup file integrity and format",
+	Long: `Perform deep analysis of backup files to detect issues before restore.
+
+This command validates backup archives and provides detailed diagnostics
+including truncation detection, format verification, and COPY block integrity.
+
+Use this when:
+  - Restore fails with syntax errors
+  - You suspect backup corruption or truncation
+  - You want to verify backup integrity before restore
+  - Restore reports millions of errors
+
+Checks performed:
+  - File format detection (custom dump vs SQL)
+  - PGDMP signature verification
+  - Gzip integrity validation
+  - COPY block termination check
+  - pg_restore --list verification
+  - Cluster archive structure validation
+
+Examples:
+  # Diagnose a single dump file
+  dbbackup restore diagnose mydb.dump.gz
+
+  # Diagnose with verbose output
+  dbbackup restore diagnose mydb.sql.gz --verbose
+
+  # Diagnose cluster archive and all contained dumps
+  dbbackup restore diagnose cluster_backup.tar.gz --deep
+
+  # Output as JSON for scripting
+  dbbackup restore diagnose mydb.dump --json
+`,
+	Args: cobra.ExactArgs(1),
+	RunE: runRestoreDiagnose,
+}
+
 func init() {
 	rootCmd.AddCommand(restoreCmd)
 	restoreCmd.AddCommand(restoreSingleCmd)
 	restoreCmd.AddCommand(restoreClusterCmd)
 	restoreCmd.AddCommand(restoreListCmd)
 	restoreCmd.AddCommand(restorePITRCmd)
+	restoreCmd.AddCommand(restoreDiagnoseCmd)

 	// Single restore flags
 	restoreSingleCmd.Flags().BoolVar(&restoreConfirm, "confirm", false, "Confirm and execute restore (required)")
@@ -232,6 +280,8 @@ func init() {
 	restoreSingleCmd.Flags().BoolVar(&restoreNoProgress, "no-progress", false, "Disable progress indicators")
 	restoreSingleCmd.Flags().StringVar(&restoreEncryptionKeyFile, "encryption-key-file", "", "Path to encryption key file (required for encrypted backups)")
 	restoreSingleCmd.Flags().StringVar(&restoreEncryptionKeyEnv, "encryption-key-env", "DBBACKUP_ENCRYPTION_KEY", "Environment variable containing encryption key")
+	restoreSingleCmd.Flags().BoolVar(&restoreDiagnose, "diagnose", false, "Run deep diagnosis before restore to detect corruption/truncation")
+	restoreSingleCmd.Flags().StringVar(&restoreSaveDebugLog, "save-debug-log", "", "Save detailed error report to file on failure (e.g., /tmp/restore-debug.json)")

 	// Cluster restore flags
 	restoreClusterCmd.Flags().BoolVar(&restoreConfirm, "confirm", false, "Confirm and execute restore (required)")
@@ -244,6 +294,8 @@ func init() {
 	restoreClusterCmd.Flags().BoolVar(&restoreNoProgress, "no-progress", false, "Disable progress indicators")
 	restoreClusterCmd.Flags().StringVar(&restoreEncryptionKeyFile, "encryption-key-file", "", "Path to encryption key file (required for encrypted backups)")
 	restoreClusterCmd.Flags().StringVar(&restoreEncryptionKeyEnv, "encryption-key-env", "DBBACKUP_ENCRYPTION_KEY", "Environment variable containing encryption key")
+	restoreClusterCmd.Flags().BoolVar(&restoreDiagnose, "diagnose", false, "Run deep diagnosis on all dumps before restore")
+	restoreClusterCmd.Flags().StringVar(&restoreSaveDebugLog, "save-debug-log", "", "Save detailed error report to file on failure (e.g., /tmp/restore-debug.json)")

 	// PITR restore flags
 	restorePITRCmd.Flags().StringVar(&pitrBaseBackup, "base-backup", "", "Path to base backup file (.tar.gz) (required)")
@@ -264,6 +316,117 @@ func init() {
 	restorePITRCmd.MarkFlagRequired("base-backup")
 	restorePITRCmd.MarkFlagRequired("wal-archive")
 	restorePITRCmd.MarkFlagRequired("target-dir")
+
+	// Diagnose flags
+	restoreDiagnoseCmd.Flags().BoolVar(&diagnoseJSON, "json", false, "Output diagnosis as JSON")
+	restoreDiagnoseCmd.Flags().BoolVar(&diagnoseDeep, "deep", false, "For cluster archives, extract and diagnose all contained dumps")
+	restoreDiagnoseCmd.Flags().BoolVar(&diagnoseKeepTemp, "keep-temp", false, "Keep temporary extraction directory (for debugging)")
+	restoreDiagnoseCmd.Flags().BoolVar(&restoreVerbose, "verbose", false, "Show detailed analysis progress")
+}
+
+// runRestoreDiagnose diagnoses backup files
+func runRestoreDiagnose(cmd *cobra.Command, args []string) error {
+	archivePath := args[0]
+
+	// Convert to absolute path
+	if !filepath.IsAbs(archivePath) {
+		absPath, err := filepath.Abs(archivePath)
+		if err != nil {
+			return fmt.Errorf("invalid archive path: %w", err)
+		}
+		archivePath = absPath
+	}
+
+	// Check if file exists
+	if _, err := os.Stat(archivePath); err != nil {
+		return fmt.Errorf("archive not found: %s", archivePath)
+	}
+
+	log.Info("🔍 Diagnosing backup file", "path", archivePath)
+
+	diagnoser := restore.NewDiagnoser(log, restoreVerbose)
+
+	// Check if it's a cluster archive that needs deep analysis
+	format := restore.DetectArchiveFormat(archivePath)
+
+	if format.IsClusterBackup() && diagnoseDeep {
+		// Create temp directory for extraction
+		tempDir, err := os.MkdirTemp("", "dbbackup-diagnose-*")
+		if err != nil {
+			return fmt.Errorf("failed to create temp directory: %w", err)
+		}
+
+		if !diagnoseKeepTemp {
+			defer os.RemoveAll(tempDir)
+		} else {
+			log.Info("Temp directory preserved", "path", tempDir)
+		}
+
+		log.Info("Extracting cluster archive for deep analysis...")
+
+		// Extract and diagnose all dumps
+		results, err := diagnoser.DiagnoseClusterDumps(archivePath, tempDir)
+		if err != nil {
+			return fmt.Errorf("cluster diagnosis failed: %w", err)
+		}
+
+		// Output results
+		var hasErrors bool
+		for _, result := range results {
+			if diagnoseJSON {
+				diagnoser.PrintDiagnosisJSON(result)
+			} else {
+				diagnoser.PrintDiagnosis(result)
+			}
+			if !result.IsValid {
+				hasErrors = true
+			}
+		}
+
+		// Summary
+		if !diagnoseJSON {
+			fmt.Println("\n" + strings.Repeat("=", 70))
+			fmt.Printf("📊 CLUSTER SUMMARY: %d databases analyzed\n", len(results))
+
+			validCount := 0
+			for _, r := range results {
+				if r.IsValid {
+					validCount++
+				}
+			}
+
+			if validCount == len(results) {
+				fmt.Println("✅ All dumps are valid")
+			} else {
+				fmt.Printf("❌ %d/%d dumps have issues\n", len(results)-validCount, len(results))
+			}
+			fmt.Println(strings.Repeat("=", 70))
+		}
+
+		if hasErrors {
+			return fmt.Errorf("one or more dumps have validation errors")
+		}
+		return nil
+	}
+
+	// Single file diagnosis
+	result, err := diagnoser.DiagnoseFile(archivePath)
+	if err != nil {
+		return fmt.Errorf("diagnosis failed: %w", err)
+	}
+
+	if diagnoseJSON {
+		diagnoser.PrintDiagnosisJSON(result)
+	} else {
+		diagnoser.PrintDiagnosis(result)
+	}
+
+	if !result.IsValid {
+		return fmt.Errorf("backup file has validation errors")
+	}
+
+	log.Info("✅ Backup file appears valid")
+	return nil
 }

 // runRestoreSingle restores a single database
@@ -401,6 +564,12 @@ func runRestoreSingle(cmd *cobra.Command, args []string) error {

 	// Create restore engine
 	engine := restore.New(cfg, log, db)
+	
+	// Enable debug logging if requested
+	if restoreSaveDebugLog != "" {
+		engine.SetDebugLogPath(restoreSaveDebugLog)
+		log.Info("Debug logging enabled", "output", restoreSaveDebugLog)
+	}

 	// Setup signal handling
 	ctx, cancel := context.WithCancel(context.Background())
@@ -416,6 +585,37 @@ func runRestoreSingle(cmd *cobra.Command, args []string) error {
 		cancel()
 	}()

+	// Run pre-restore diagnosis if requested
+	if restoreDiagnose {
+		log.Info("🔍 Running pre-restore diagnosis...")
+		
+		diagnoser := restore.NewDiagnoser(log, restoreVerbose)
+		result, err := diagnoser.DiagnoseFile(archivePath)
+		if err != nil {
+			return fmt.Errorf("diagnosis failed: %w", err)
+		}
+		
+		diagnoser.PrintDiagnosis(result)
+		
+		if !result.IsValid {
+			log.Error("❌ Pre-restore diagnosis found issues")
+			if result.IsTruncated {
+				log.Error("   The backup file appears to be TRUNCATED")
+			}
+			if result.IsCorrupted {
+				log.Error("   The backup file appears to be CORRUPTED")
+			}
+			fmt.Println("\nUse --force to attempt restore anyway.")
+			
+			if !restoreForce {
+				return fmt.Errorf("aborting restore due to backup file issues")
+			}
+			log.Warn("Continuing despite diagnosis errors (--force enabled)")
+		} else {
+			log.Info("✅ Backup file passed diagnosis")
+		}
+	}
+
 	// Execute restore
 	log.Info("Starting restore...", "database", targetDB)

@@ -584,6 +784,12 @@ func runRestoreCluster(cmd *cobra.Command, args []string) error {

 	// Create restore engine
 	engine := restore.New(cfg, log, db)
+	
+	// Enable debug logging if requested
+	if restoreSaveDebugLog != "" {
+		engine.SetDebugLogPath(restoreSaveDebugLog)
+		log.Info("Debug logging enabled", "output", restoreSaveDebugLog)
+	}

 	// Setup signal handling
 	ctx, cancel := context.WithCancel(context.Background())
@@ -620,6 +826,52 @@ func runRestoreCluster(cmd *cobra.Command, args []string) error {
 		log.Info("Database cleanup completed")
 	}

+	// Run pre-restore diagnosis if requested
+	if restoreDiagnose {
+		log.Info("🔍 Running pre-restore diagnosis...")
+		
+		// Create temp directory for extraction
+		diagTempDir, err := os.MkdirTemp("", "dbbackup-diagnose-*")
+		if err != nil {
+			return fmt.Errorf("failed to create temp directory for diagnosis: %w", err)
+		}
+		defer os.RemoveAll(diagTempDir)
+		
+		diagnoser := restore.NewDiagnoser(log, restoreVerbose)
+		results, err := diagnoser.DiagnoseClusterDumps(archivePath, diagTempDir)
+		if err != nil {
+			return fmt.Errorf("diagnosis failed: %w", err)
+		}
+		
+		// Check for any invalid dumps
+		var invalidDumps []string
+		for _, result := range results {
+			if !result.IsValid {
+				invalidDumps = append(invalidDumps, result.FileName)
+				diagnoser.PrintDiagnosis(result)
+			}
+		}
+		
+		if len(invalidDumps) > 0 {
+			log.Error("❌ Pre-restore diagnosis found issues",
+				"invalid_dumps", len(invalidDumps),
+				"total_dumps", len(results))
+			fmt.Println("\n⚠️  The following dumps have issues and will likely fail during restore:")
+			for _, name := range invalidDumps {
+				fmt.Printf("    - %s\n", name)
+			}
+			fmt.Println("\nRun 'dbbackup restore diagnose <archive> --deep' for full details.")
+			fmt.Println("Use --force to attempt restore anyway.")
+			
+			if !restoreForce {
+				return fmt.Errorf("aborting restore due to %d invalid dump(s)", len(invalidDumps))
+			}
+			log.Warn("Continuing despite diagnosis errors (--force enabled)")
+		} else {
+			log.Info("✅ All dumps passed diagnosis", "count", len(results))
+		}
+	}
+
 	// Execute cluster restore
 	log.Info("Starting cluster restore...")