v3.40.0: Restore diagnostics and error reporting

Features:
- `restore diagnose` command for backup file analysis
- Deep COPY block verification for truncated dump detection
- PGDMP signature and gzip integrity validation
- Detailed error reports with --save-debug-log flag
- Ring buffer stderr capture (prevents OOM on 2M+ errors)
- Error classification with actionable recommendations

TUI Enhancements:
- Automatic dump validity safety check before restore
- Press 'd' in archive browser to diagnose backups
- Press 'd' in restore preview for debug log toggle
- Debug logs saved to /tmp on failure when enabled

Documentation:
- Updated README with diagnose command and examples
- Updated CHANGELOG with full feature list
- Updated restore preview screenshots
2026-01-05 15:17:54 +01:00
parent e7f0a9f5eb
commit 4c171c0e44
16 changed files with 2271 additions and 26 deletions
--- a/internal/restore/error_report.go
+++ b/internal/restore/error_report.go
@@ -0,0 +1,569 @@
+package restore
+
+import (
+	"bufio"
+	"compress/gzip"
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"time"
+
+	"dbbackup/internal/config"
+	"dbbackup/internal/logger"
+)
+
+// RestoreErrorReport is the JSON-serializable record of a restore failure: build/host metadata, archive and target details, captured stderr samples, failure context, diagnosis output and recommendations.
+type RestoreErrorReport struct {
+	// Metadata: when the report was generated and on which build/platform
+	Timestamp     time.Time `json:"timestamp"`
+	Version       string    `json:"version"` // GenerateReport currently sets the literal "1.0.0"
+	GoVersion     string    `json:"go_version"` // runtime.Version()
+	OS            string    `json:"os"` // runtime.GOOS
+	Arch          string    `json:"arch"` // runtime.GOARCH
+	
+	// Archive info
+	ArchivePath   string `json:"archive_path"`
+	ArchiveSize   int64  `json:"archive_size"` // size from os.Stat; stays 0 when the stat fails
+	ArchiveFormat string `json:"archive_format"` // String() of the collector's ArchiveFormat
+	
+	// Database info
+	TargetDB      string `json:"target_db"`
+	DatabaseType  string `json:"database_type"` // "mysql" or "postgresql", as returned by getDatabaseType
+	
+	// Error details: message, type and hint are passed in by the GenerateReport caller
+	ExitCode      int      `json:"exit_code"`
+	ErrorMessage  string   `json:"error_message"`
+	ErrorType     string   `json:"error_type"`
+	ErrorHint     string   `json:"error_hint"`
+	TotalErrors   int      `json:"total_errors"` // number of stderr lines matched by isErrorLine
+	
+	// Captured output: bounded samples taken from the ErrorCollector
+	LastStderr    []string `json:"last_stderr"` // most recent non-empty stderr lines
+	FirstErrors   []string `json:"first_errors"` // earliest error lines
+	
+	// Context around failure (GenerateReport always sets a non-nil value, so omitempty does not drop it there)
+	FailureContext *FailureContext `json:"failure_context,omitempty"`
+	
+	// Diagnosis results (nil when DiagnoseFile returned an error)
+	DiagnosisResult *DiagnoseResult `json:"diagnosis_result,omitempty"`
+	
+	// Environment: first line of each tool's --version output, empty when the tool could not be run. NOTE(review): the original comment said "(sanitized)", but no sanitization step is visible in this file.
+	PostgresVersion string `json:"postgres_version,omitempty"`
+	PgRestoreVersion string `json:"pg_restore_version,omitempty"`
+	PsqlVersion     string `json:"psql_version,omitempty"`
+	
+	// Recommendations: human-readable advice lines produced by generateRecommendations (never empty when built by GenerateReport)
+	Recommendations []string `json:"recommendations"`
+}
+
+// FailureContext captures context around where the failure occurred. In this file only FailedLine, SurroundingLines, InCopyBlock and CopyTableName are ever populated (by analyzeFailureContext).
+type FailureContext struct {
+	// For SQL/COPY errors
+	FailedLine       int      `json:"failed_line,omitempty"` // number parsed from an error line by extractLineNumber
+	FailedStatement  string   `json:"failed_statement,omitempty"` // not set by the code in this file
+	SurroundingLines []string `json:"surrounding_lines,omitempty"` // formatted dump lines around FailedLine, the failed one prefixed with "> "
+	
+	// For COPY block errors
+	InCopyBlock      bool   `json:"in_copy_block,omitempty"` // set when an error line contains "COPY" or "syntax error"
+	CopyTableName    string `json:"copy_table_name,omitempty"` // result of extractTableName on that error line
+	CopyStartLine    int    `json:"copy_start_line,omitempty"` // not set by the code in this file
+	SampleCopyData   []string `json:"sample_copy_data,omitempty"` // not set by the code in this file
+	
+	// File position info (neither field is set by the code in this file)
+	BytePosition     int64 `json:"byte_position,omitempty"`
+	PercentComplete  float64 `json:"percent_complete,omitempty"`
+}
+
+// ErrorCollector accumulates bounded samples of restore stderr plus the exit code and turns them into a RestoreErrorReport. It contains no locking of its own.
+type ErrorCollector struct {
+	log            logger.Logger // handed to NewDiagnoser in GenerateReport
+	cfg            *config.Config // stored by NewErrorCollector; not read anywhere in this file
+	archivePath    string
+	targetDB       string
+	format         ArchiveFormat
+	
+	// Captured data
+	stderrLines    []string // most recent non-empty stderr lines, at most maxStderrLines
+	firstErrors    []string // earliest error lines, at most maxErrorCapture
+	lastErrors     []string // most recent error lines, at most maxErrorCapture
+	totalErrors    int // every line matched by isErrorLine, including ones no longer retained
+	exitCode       int
+	
+	// Limits
+	maxStderrLines  int // NewErrorCollector sets 100
+	maxErrorCapture int // NewErrorCollector sets 10
+	
+	// State
+	startTime      time.Time // time.Now() at construction; not read anywhere in this file
+	enabled        bool // when false, CaptureStderr discards its input
+}
+
+// NewErrorCollector creates a collector for one restore of archivePath into targetDB, retaining at most 100 stderr lines and 10 first/last error lines. With enabled=false, CaptureStderr is a no-op.
+func NewErrorCollector(cfg *config.Config, log logger.Logger, archivePath, targetDB string, format ArchiveFormat, enabled bool) *ErrorCollector {
+	return &ErrorCollector{
+		log:             log,
+		cfg:             cfg,
+		archivePath:     archivePath,
+		targetDB:        targetDB,
+		format:          format,
+		stderrLines:     make([]string, 0, 100), // capacities mirror the limits set below
+		firstErrors:     make([]string, 0, 10),
+		lastErrors:      make([]string, 0, 10),
+		maxStderrLines:  100,
+		maxErrorCapture: 10,
+		startTime:       time.Now(),
+		enabled:         enabled,
+	}
+}
+
+// CaptureStderr splits chunk on newlines, trims each line and records the non-empty ones: the most recent lines are kept in stderrLines, and lines matched by isErrorLine are counted and sampled into firstErrors/lastErrors. It does nothing when the collector is disabled.
+func (ec *ErrorCollector) CaptureStderr(chunk string) {
+	if !ec.enabled {
+		return
+	}
+	
+	lines := strings.Split(chunk, "\n") // each chunk is split on its own; a line spanning two chunks is seen as two fragments
+	for _, line := range lines {
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+		
+		// Store last N lines of stderr (N = maxStderrLines)
+		if len(ec.stderrLines) >= ec.maxStderrLines {
+			// Drop the oldest line by re-slicing so the append below keeps the length at the limit
+			ec.stderrLines = ec.stderrLines[1:]
+		}
+		ec.stderrLines = append(ec.stderrLines, line)
+		
+		// Check if this is an error line
+		if isErrorLine(line) {
+			ec.totalErrors++ // counts every error line, not only the retained samples
+			
+			// Capture first N errors; once full, later errors are no longer added here
+			if len(ec.firstErrors) < ec.maxErrorCapture {
+				ec.firstErrors = append(ec.firstErrors, line)
+			}
+			
+			// Keep last N errors (ring buffer style): drop the oldest before appending the newest
+			if len(ec.lastErrors) >= ec.maxErrorCapture {
+				ec.lastErrors = ec.lastErrors[1:]
+			}
+			ec.lastErrors = append(ec.lastErrors, line)
+		}
+	}
+}
+
+// SetExitCode records the exit code that GenerateReport later copies into the report. Unlike CaptureStderr, it is not gated by the enabled flag.
+func (ec *ErrorCollector) SetExitCode(code int) {
+	ec.exitCode = code
+}
+
+// GenerateReport builds a RestoreErrorReport from the caller-supplied message, type and hint plus everything the collector captured. It stats the archive, runs the tool --version commands, analyzes the failure context, runs a diagnosis of the archive and derives recommendations. It never returns nil.
+func (ec *ErrorCollector) GenerateReport(errMessage string, errType string, errHint string) *RestoreErrorReport {
+	report := &RestoreErrorReport{
+		Timestamp:     time.Now(),
+		Version:       "1.0.0", // TODO: inject actual version
+		GoVersion:     runtime.Version(),
+		OS:            runtime.GOOS,
+		Arch:          runtime.GOARCH,
+		ArchivePath:   ec.archivePath,
+		ArchiveFormat: ec.format.String(),
+		TargetDB:      ec.targetDB,
+		DatabaseType:  getDatabaseType(ec.format),
+		ExitCode:      ec.exitCode,
+		ErrorMessage:  errMessage,
+		ErrorType:     errType,
+		ErrorHint:     errHint,
+		TotalErrors:   ec.totalErrors,
+		LastStderr:    ec.stderrLines, // shares the collector's slice (no copy is made)
+		FirstErrors:   ec.firstErrors, // shares the collector's slice (no copy is made)
+	}
+	
+	// Get archive size; a failed stat is ignored and leaves ArchiveSize at 0
+	if stat, err := os.Stat(ec.archivePath); err == nil {
+		report.ArchiveSize = stat.Size()
+	}
+	
+	// Get tool versions: each call runs the binary with --version and yields "" when that fails
+	report.PostgresVersion = getCommandVersion("postgres", "--version")
+	report.PgRestoreVersion = getCommandVersion("pg_restore", "--version")
+	report.PsqlVersion = getCommandVersion("psql", "--version")
+	
+	// Analyze failure context (always non-nil)
+	report.FailureContext = ec.analyzeFailureContext()
+	
+	// Run diagnosis on the archive. This always runs here, whatever was diagnosed earlier; a diagnosis error is dropped and leaves DiagnosisResult nil
+	diagnoser := NewDiagnoser(ec.log, false)
+	if diagResult, err := diagnoser.DiagnoseFile(ec.archivePath); err == nil {
+		report.DiagnosisResult = diagResult
+	}
+	
+	// Generate recommendations last, because they read DiagnosisResult, TotalErrors and ErrorMessage from the report
+	report.Recommendations = ec.generateRecommendations(report)
+	
+	return report
+}
+
+// analyzeFailureContext derives a FailureContext from the retained last error lines: the first line number found, a COPY/syntax-error flag with an optional table name, and nearby dump lines. It always returns a non-nil value.
+func (ec *ErrorCollector) analyzeFailureContext() *FailureContext {
+	ctx := &FailureContext{}
+	
+	// Use the first retained error line that yields a number. NOTE(review): confirm this number is a line of the dump file rather than a line within the failing statement.
+	for _, errLine := range ec.lastErrors {
+		if lineNum := extractLineNumber(errLine); lineNum > 0 {
+			ctx.FailedLine = lineNum
+			break
+		}
+	}
+	
+	// Look for COPY-related errors; a line containing "syntax error" is also treated as being inside a COPY block
+	for _, errLine := range ec.lastErrors {
+		if strings.Contains(errLine, "COPY") || strings.Contains(errLine, "syntax error") {
+			ctx.InCopyBlock = true
+			// Try to extract table name (CopyTableName stays empty when none is found)
+			if tableName := extractTableName(errLine); tableName != "" {
+				ctx.CopyTableName = tableName
+			}
+			break // only the first matching error line is considered
+		}
+	}
+	
+	// If we have a line number, try to get surrounding context from the dump (5 lines on either side)
+	if ctx.FailedLine > 0 && ec.archivePath != "" {
+		ctx.SurroundingLines = ec.getSurroundingLines(ctx.FailedLine, 5)
+	}
+	
+	return ctx
+}
+
+// getSurroundingLines reads the archive line by line (through gzip when the path ends in ".gz") and returns lines lineNum-context through lineNum+context, each formatted as "<prefix><n>: <text>" with "> " marking lineNum. It returns nil when the file or gzip stream cannot be opened.
+func (ec *ErrorCollector) getSurroundingLines(lineNum int, context int) []string {
+	var reader io.Reader
+	var lines []string
+	
+	file, err := os.Open(ec.archivePath)
+	if err != nil {
+		return nil
+	}
+	defer file.Close()
+	
+	// Handle compressed files: compression is detected by the ".gz" suffix only
+	if strings.HasSuffix(ec.archivePath, ".gz") {
+		gz, err := gzip.NewReader(file)
+		if err != nil {
+			return nil
+		}
+		defer gz.Close()
+		reader = gz
+	} else {
+		reader = file
+	}
+	
+	scanner := bufio.NewScanner(reader)
+	buf := make([]byte, 0, 1024*1024) // 1 MiB initial scan buffer
+	scanner.Buffer(buf, 10*1024*1024) // lines of up to 10 MiB can be scanned
+	
+	currentLine := 0
+	startLine := lineNum - context
+	endLine := lineNum + context
+	
+	if startLine < 1 { // clamp the window to the start of the file
+		startLine = 1
+	}
+	
+	for scanner.Scan() { // scanner.Err() is never checked: a read error or over-long line just ends the loop with what was collected
+		currentLine++
+		if currentLine >= startLine && currentLine <= endLine {
+			prefix := "  "
+			if currentLine == lineNum {
+				prefix = "> " // marks the failing line
+			}
+			lines = append(lines, fmt.Sprintf("%s%d: %s", prefix, currentLine, truncateString(scanner.Text(), 100)))
+		}
+		if currentLine > endLine { // stop after the window; one line past endLine has been read by then
+			break
+		}
+	}
+	
+	return lines
+}
+
+// generateRecommendations builds the advice list from the diagnosis flags, the total error count and case-insensitive substrings of report.ErrorMessage. When nothing matches it returns three generic suggestions, so the result is never empty.
+func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []string {
+	var recs []string
+	
+	// Check diagnosis results (skipped entirely when no diagnosis is attached)
+	if report.DiagnosisResult != nil {
+		if report.DiagnosisResult.IsTruncated {
+			recs = append(recs, 
+				"CRITICAL: Backup file is truncated/incomplete",
+				"Action: Re-run the backup for the affected database",
+				"Check: Verify disk space was available during backup",
+				"Check: Verify network was stable during backup transfer",
+			)
+		}
+		if report.DiagnosisResult.IsCorrupted {
+			recs = append(recs,
+				"CRITICAL: Backup file appears corrupted",
+				"Action: Restore from a previous backup",
+				"Action: Verify backup file checksum if available",
+			)
+		}
+		if report.DiagnosisResult.Details != nil && report.DiagnosisResult.Details.UnterminatedCopy {
+			recs = append(recs,
+				fmt.Sprintf("ISSUE: COPY block for table '%s' was not terminated", 
+					report.DiagnosisResult.Details.LastCopyTable),
+				"Cause: Backup was interrupted during data export",
+				"Action: Re-run backup ensuring it completes fully",
+			)
+		}
+	}
+	
+	// Check error patterns: more than one million error lines is treated as a structural problem
+	if report.TotalErrors > 1000000 {
+		recs = append(recs,
+			"ISSUE: Millions of errors indicate structural problem, not individual data issues",
+			"Cause: Likely wrong restore method or truncated dump",
+			"Check: Verify dump format matches restore command",
+		)
+	}
+	
+	// Check for common error types; only the caller-supplied ErrorMessage is inspected, not the captured stderr
+	errLower := strings.ToLower(report.ErrorMessage)
+	if strings.Contains(errLower, "syntax error") {
+		recs = append(recs,
+			"ISSUE: SQL syntax errors during restore",
+			"Cause: COPY data being interpreted as SQL commands",
+			"Check: Run 'dbbackup restore diagnose <archive>' for detailed analysis",
+		)
+	}
+	
+	if strings.Contains(errLower, "permission denied") {
+		recs = append(recs,
+			"ISSUE: Permission denied",
+			"Action: Check database user has sufficient privileges",
+			"Action: For ownership preservation, use a superuser account",
+		)
+	}
+	
+	if strings.Contains(errLower, "does not exist") {
+		recs = append(recs,
+			"ISSUE: Missing object reference",
+			"Action: Ensure globals.sql was restored first (for roles/tablespaces)",
+			"Action: Check if target database was created",
+		)
+	}
+	
+	if len(recs) == 0 { // nothing matched above: fall back to generic guidance
+		recs = append(recs,
+			"Run 'dbbackup restore diagnose <archive>' for detailed analysis",
+			"Check the stderr output above for specific error messages",
+			"Review the PostgreSQL/MySQL logs on the target server",
+		)
+	}
+	
+	return recs
+}
+
+// SaveReport writes report to outputPath as indented JSON, creating the parent directory if needed. It returns a wrapped error if directory creation, marshalling or the write fails.
+func (ec *ErrorCollector) SaveReport(report *RestoreErrorReport, outputPath string) error {
+	// Create directory if needed (mode 0755, including any missing parents)
+	dir := filepath.Dir(outputPath)
+	if err := os.MkdirAll(dir, 0755); err != nil {
+		return fmt.Errorf("failed to create directory: %w", err)
+	}
+	
+	// Marshal to JSON with two-space indentation
+	data, err := json.MarshalIndent(report, "", "  ")
+	if err != nil {
+		return fmt.Errorf("failed to marshal report: %w", err)
+	}
+	
+	// Write file with mode 0644; os.WriteFile truncates an existing file
+	if err := os.WriteFile(outputPath, data, 0644); err != nil {
+		return fmt.Errorf("failed to write report: %w", err)
+	}
+	
+	return nil
+}
+
+// PrintReport writes a human-readable summary of report to stdout: header, error details, failure context (only when a failed line is known), up to five first errors, diagnosis (only when the result is not valid), recommendations and environment.
+func (ec *ErrorCollector) PrintReport(report *RestoreErrorReport) {
+	fmt.Println()
+	fmt.Println(strings.Repeat("═", 70))
+	fmt.Println("  🔴 RESTORE ERROR REPORT")
+	fmt.Println(strings.Repeat("═", 70))
+	
+	fmt.Printf("\n📅 Timestamp:    %s\n", report.Timestamp.Format("2006-01-02 15:04:05"))
+	fmt.Printf("📦 Archive:      %s\n", filepath.Base(report.ArchivePath)) // file name only, not the full path
+	fmt.Printf("📊 Format:       %s\n", report.ArchiveFormat)
+	fmt.Printf("🎯 Target DB:    %s\n", report.TargetDB)
+	fmt.Printf("⚠️  Exit Code:    %d\n", report.ExitCode)
+	fmt.Printf("❌ Total Errors: %d\n", report.TotalErrors)
+	
+	fmt.Println("\n" + strings.Repeat("─", 70))
+	fmt.Println("ERROR DETAILS:")
+	fmt.Println(strings.Repeat("─", 70))
+	
+	fmt.Printf("\nType: %s\n", report.ErrorType)
+	fmt.Printf("Message: %s\n", report.ErrorMessage)
+	if report.ErrorHint != "" {
+		fmt.Printf("Hint: %s\n", report.ErrorHint)
+	}
+	
+	// Show failure context; the whole section is skipped when no line number was extracted
+	if report.FailureContext != nil && report.FailureContext.FailedLine > 0 {
+		fmt.Println("\n" + strings.Repeat("─", 70))
+		fmt.Println("FAILURE CONTEXT:")
+		fmt.Println(strings.Repeat("─", 70))
+		
+		fmt.Printf("\nFailed at line: %d\n", report.FailureContext.FailedLine)
+		if report.FailureContext.InCopyBlock {
+			fmt.Printf("Inside COPY block for table: %s\n", report.FailureContext.CopyTableName) // the table name may be empty
+		}
+		
+		if len(report.FailureContext.SurroundingLines) > 0 {
+			fmt.Println("\nSurrounding lines:")
+			for _, line := range report.FailureContext.SurroundingLines {
+				fmt.Println(line)
+			}
+		}
+	}
+	
+	// Show first few errors (at most five are printed)
+	if len(report.FirstErrors) > 0 {
+		fmt.Println("\n" + strings.Repeat("─", 70))
+		fmt.Println("FIRST ERRORS:")
+		fmt.Println(strings.Repeat("─", 70))
+		
+		for i, err := range report.FirstErrors {
+			if i >= 5 {
+				fmt.Printf("... and %d more\n", len(report.FirstErrors)-5) // counts only the captured sample, not TotalErrors
+				break
+			}
+			fmt.Printf("  %d. %s\n", i+1, truncateString(err, 100))
+		}
+	}
+	
+	// Show diagnosis summary, only when a diagnosis exists and reports the file as not valid
+	if report.DiagnosisResult != nil && !report.DiagnosisResult.IsValid {
+		fmt.Println("\n" + strings.Repeat("─", 70))
+		fmt.Println("DIAGNOSIS:")
+		fmt.Println(strings.Repeat("─", 70))
+		
+		if report.DiagnosisResult.IsTruncated {
+			fmt.Println("  ❌ File is TRUNCATED")
+		}
+		if report.DiagnosisResult.IsCorrupted {
+			fmt.Println("  ❌ File is CORRUPTED")
+		}
+		for i, err := range report.DiagnosisResult.Errors {
+			if i >= 3 { // print at most three diagnosis errors
+				break
+			}
+			fmt.Printf("  • %s\n", err)
+		}
+	}
+	
+	// Show recommendations (section header is printed even if the list is empty)
+	fmt.Println("\n" + strings.Repeat("─", 70))
+	fmt.Println("💡 RECOMMENDATIONS:")
+	fmt.Println(strings.Repeat("─", 70))
+	
+	for _, rec := range report.Recommendations {
+		fmt.Printf("  • %s\n", rec)
+	}
+	
+	// Show tool versions; PostgresVersion is collected in the report but not printed here
+	fmt.Println("\n" + strings.Repeat("─", 70))
+	fmt.Println("ENVIRONMENT:")
+	fmt.Println(strings.Repeat("─", 70))
+	
+	fmt.Printf("  OS: %s/%s\n", report.OS, report.Arch)
+	fmt.Printf("  Go: %s\n", report.GoVersion)
+	if report.PgRestoreVersion != "" {
+		fmt.Printf("  pg_restore: %s\n", report.PgRestoreVersion)
+	}
+	if report.PsqlVersion != "" {
+		fmt.Printf("  psql: %s\n", report.PsqlVersion)
+	}
+	
+	fmt.Println(strings.Repeat("═", 70))
+}
+
+// Helper functions
+
+func isErrorLine(line string) bool {
+	return strings.Contains(line, "ERROR:") || // reports whether line contains any of four markers; matching is case-sensitive and anywhere in the line
+		strings.Contains(line, "FATAL:") || 
+		strings.Contains(line, "error:") ||
+		strings.Contains(line, "PANIC:")
+}
+
+func extractLineNumber(errLine string) int {
+	// Returns the decimal number directly following the first "LINE " (tried first) or "line " in errLine, e.g. "LINE 1:" -> 1 or "line 123" -> 123; returns 0 when no digits follow either pattern.
+	patterns := []string{"LINE ", "line "}
+	for _, pattern := range patterns {
+		if idx := strings.Index(errLine, pattern); idx >= 0 { // NOTE(review): only the first occurrence of each pattern is examined, and "line " also matches inside longer words
+			numStart := idx + len(pattern)
+			numEnd := numStart
+			for numEnd < len(errLine) && errLine[numEnd] >= '0' && errLine[numEnd] <= '9' { // advance over the run of ASCII digits
+				numEnd++
+			}
+			if numEnd > numStart {
+				var num int
+				fmt.Sscanf(errLine[numStart:numEnd], "%d", &num) // the scan error is ignored
+				return num
+			}
+		}
+	}
+	return 0
+}
+
+func extractTableName(errLine string) string {
+	// Returns the identifier following the first "COPY " (tried first) or "table " in errLine, for patterns like 'COPY "tablename"' or 'table "tablename"'; returns "" when neither pattern is followed by a name.
+	patterns := []string{"COPY ", "table "}
+	for _, pattern := range patterns {
+		if idx := strings.Index(errLine, pattern); idx >= 0 { // only the first occurrence of each pattern is examined
+			start := idx + len(pattern)
+			// Skip optional opening double quote
+			if start < len(errLine) && errLine[start] == '"' {
+				start++
+			}
+			end := start
+			for end < len(errLine) && errLine[end] != '"' && errLine[end] != ' ' && errLine[end] != '(' { // the name ends at the next quote, space or '(' (so a quoted name containing a space is cut short)
+				end++
+			}
+			if end > start {
+				return errLine[start:end]
+			}
+		}
+	}
+	return ""
+}
+
+func getDatabaseType(format ArchiveFormat) string {
+	if format.IsMySQL() { // ArchiveFormat and IsMySQL are declared outside this file
+		return "mysql"
+	}
+	return "postgresql" // every format that is not MySQL is reported as postgresql
+}
+
+func getCommandVersion(cmd string, arg string) string {
+	output, err := exec.Command(cmd, arg).CombinedOutput() // runs cmd with the single argument arg and captures stdout and stderr together; no timeout is applied
+	if err != nil {
+		return "" // command missing or failed: the version is reported as empty
+	}
+	// Return first line only, trimmed of surrounding whitespace
+	lines := strings.Split(string(output), "\n")
+	if len(lines) > 0 { // always true here: strings.Split with a non-empty separator returns at least one element
+		return strings.TrimSpace(lines[0])
+	}
+	return ""
+}