v3.42.9: Fix all timeout bugs and deadlocks

CRITICAL FIXES: - Encryption detection false positive (IsBackupEncrypted returned true for ALL files) - 12 cmd.Wait() deadlocks fixed with channel-based context handling - TUI timeout bugs: 60s->10min for safety checks, 15s->60s for DB listing - diagnose.go timeouts: 60s->5min for tar/pg_restore operations - Panic recovery added to parallel backup/restore goroutines - Variable shadowing fix in restore/engine.go These bugs caused pg_dump backups to fail through TUI for months.
2026-01-08 05:56:31 +01:00
parent 627061cdbb
commit 9c65821250
22 changed files with 1099 additions and 304 deletions
--- a/internal/restore/error_report.go
+++ b/internal/restore/error_report.go
@@ -3,6 +3,7 @@ package restore
 import (
 	"bufio"
 	"compress/gzip"
+	"context"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -20,43 +21,43 @@ import (
 // RestoreErrorReport contains comprehensive information about a restore failure
 type RestoreErrorReport struct {
 	// Metadata
-	Timestamp     time.Time `json:"timestamp"`
-	Version       string    `json:"version"`
-	GoVersion     string    `json:"go_version"`
-	OS            string    `json:"os"`
-	Arch          string    `json:"arch"`
-	
+	Timestamp time.Time `json:"timestamp"`
+	Version   string    `json:"version"`
+	GoVersion string    `json:"go_version"`
+	OS        string    `json:"os"`
+	Arch      string    `json:"arch"`
+
 	// Archive info
 	ArchivePath   string `json:"archive_path"`
 	ArchiveSize   int64  `json:"archive_size"`
 	ArchiveFormat string `json:"archive_format"`
-	
+
 	// Database info
-	TargetDB      string `json:"target_db"`
-	DatabaseType  string `json:"database_type"`
-	
+	TargetDB     string `json:"target_db"`
+	DatabaseType string `json:"database_type"`
+
 	// Error details
-	ExitCode      int      `json:"exit_code"`
-	ErrorMessage  string   `json:"error_message"`
-	ErrorType     string   `json:"error_type"`
-	ErrorHint     string   `json:"error_hint"`
-	TotalErrors   int      `json:"total_errors"`
-	
+	ExitCode     int    `json:"exit_code"`
+	ErrorMessage string `json:"error_message"`
+	ErrorType    string `json:"error_type"`
+	ErrorHint    string `json:"error_hint"`
+	TotalErrors  int    `json:"total_errors"`
+
 	// Captured output
-	LastStderr    []string `json:"last_stderr"`
-	FirstErrors   []string `json:"first_errors"`
-	
+	LastStderr  []string `json:"last_stderr"`
+	FirstErrors []string `json:"first_errors"`
+
 	// Context around failure
 	FailureContext *FailureContext `json:"failure_context,omitempty"`
-	
+
 	// Diagnosis results
 	DiagnosisResult *DiagnoseResult `json:"diagnosis_result,omitempty"`
-	
+
 	// Environment (sanitized)
-	PostgresVersion string `json:"postgres_version,omitempty"`
+	PostgresVersion  string `json:"postgres_version,omitempty"`
 	PgRestoreVersion string `json:"pg_restore_version,omitempty"`
-	PsqlVersion     string `json:"psql_version,omitempty"`
-	
+	PsqlVersion      string `json:"psql_version,omitempty"`
+
 	// Recommendations
 	Recommendations []string `json:"recommendations"`
 }
@@ -67,40 +68,40 @@ type FailureContext struct {
 	FailedLine       int      `json:"failed_line,omitempty"`
 	FailedStatement  string   `json:"failed_statement,omitempty"`
 	SurroundingLines []string `json:"surrounding_lines,omitempty"`
-	
+
 	// For COPY block errors
-	InCopyBlock      bool   `json:"in_copy_block,omitempty"`
-	CopyTableName    string `json:"copy_table_name,omitempty"`
-	CopyStartLine    int    `json:"copy_start_line,omitempty"`
-	SampleCopyData   []string `json:"sample_copy_data,omitempty"`
-	
+	InCopyBlock    bool     `json:"in_copy_block,omitempty"`
+	CopyTableName  string   `json:"copy_table_name,omitempty"`
+	CopyStartLine  int      `json:"copy_start_line,omitempty"`
+	SampleCopyData []string `json:"sample_copy_data,omitempty"`
+
 	// File position info
-	BytePosition     int64 `json:"byte_position,omitempty"`
-	PercentComplete  float64 `json:"percent_complete,omitempty"`
+	BytePosition    int64   `json:"byte_position,omitempty"`
+	PercentComplete float64 `json:"percent_complete,omitempty"`
 }

 // ErrorCollector captures detailed error information during restore
 type ErrorCollector struct {
-	log            logger.Logger
-	cfg            *config.Config
-	archivePath    string
-	targetDB       string
-	format         ArchiveFormat
-	
+	log         logger.Logger
+	cfg         *config.Config
+	archivePath string
+	targetDB    string
+	format      ArchiveFormat
+
 	// Captured data
-	stderrLines    []string
-	firstErrors    []string
-	lastErrors     []string
-	totalErrors    int
-	exitCode       int
-	
+	stderrLines []string
+	firstErrors []string
+	lastErrors  []string
+	totalErrors int
+	exitCode    int
+
 	// Limits
 	maxStderrLines  int
 	maxErrorCapture int
-	
+
 	// State
-	startTime      time.Time
-	enabled        bool
+	startTime time.Time
+	enabled   bool
 }

 // NewErrorCollector creates a new error collector
@@ -126,30 +127,30 @@ func (ec *ErrorCollector) CaptureStderr(chunk string) {
 	if !ec.enabled {
 		return
 	}
-	
+
 	lines := strings.Split(chunk, "\n")
 	for _, line := range lines {
 		line = strings.TrimSpace(line)
 		if line == "" {
 			continue
 		}
-		
+
 		// Store last N lines of stderr
 		if len(ec.stderrLines) >= ec.maxStderrLines {
 			// Shift array, drop oldest
 			ec.stderrLines = ec.stderrLines[1:]
 		}
 		ec.stderrLines = append(ec.stderrLines, line)
-		
+
 		// Check if this is an error line
 		if isErrorLine(line) {
 			ec.totalErrors++
-			
+
 			// Capture first N errors
 			if len(ec.firstErrors) < ec.maxErrorCapture {
 				ec.firstErrors = append(ec.firstErrors, line)
 			}
-			
+
 			// Keep last N errors (ring buffer style)
 			if len(ec.lastErrors) >= ec.maxErrorCapture {
 				ec.lastErrors = ec.lastErrors[1:]
@@ -184,36 +185,36 @@ func (ec *ErrorCollector) GenerateReport(errMessage string, errType string, errH
 		LastStderr:    ec.stderrLines,
 		FirstErrors:   ec.firstErrors,
 	}
-	
+
 	// Get archive size
 	if stat, err := os.Stat(ec.archivePath); err == nil {
 		report.ArchiveSize = stat.Size()
 	}
-	
+
 	// Get tool versions
 	report.PostgresVersion = getCommandVersion("postgres", "--version")
 	report.PgRestoreVersion = getCommandVersion("pg_restore", "--version")
 	report.PsqlVersion = getCommandVersion("psql", "--version")
-	
+
 	// Analyze failure context
 	report.FailureContext = ec.analyzeFailureContext()
-	
+
 	// Run diagnosis if not already done
 	diagnoser := NewDiagnoser(ec.log, false)
 	if diagResult, err := diagnoser.DiagnoseFile(ec.archivePath); err == nil {
 		report.DiagnosisResult = diagResult
 	}
-	
+
 	// Generate recommendations
 	report.Recommendations = ec.generateRecommendations(report)
-	
+
 	return report
 }

 // analyzeFailureContext extracts context around the failure
 func (ec *ErrorCollector) analyzeFailureContext() *FailureContext {
 	ctx := &FailureContext{}
-	
+
 	// Look for line number in errors
 	for _, errLine := range ec.lastErrors {
 		if lineNum := extractLineNumber(errLine); lineNum > 0 {
@@ -221,7 +222,7 @@ func (ec *ErrorCollector) analyzeFailureContext() *FailureContext {
 			break
 		}
 	}
-	
+
 	// Look for COPY-related errors
 	for _, errLine := range ec.lastErrors {
 		if strings.Contains(errLine, "COPY") || strings.Contains(errLine, "syntax error") {
@@ -233,12 +234,12 @@ func (ec *ErrorCollector) analyzeFailureContext() *FailureContext {
 			break
 		}
 	}
-	
+
 	// If we have a line number, try to get surrounding context from the dump
 	if ctx.FailedLine > 0 && ec.archivePath != "" {
 		ctx.SurroundingLines = ec.getSurroundingLines(ctx.FailedLine, 5)
 	}
-	
+
 	return ctx
 }

@@ -246,13 +247,13 @@ func (ec *ErrorCollector) analyzeFailureContext() *FailureContext {
 func (ec *ErrorCollector) getSurroundingLines(lineNum int, context int) []string {
 	var reader io.Reader
 	var lines []string
-	
+
 	file, err := os.Open(ec.archivePath)
 	if err != nil {
 		return nil
 	}
 	defer file.Close()
-	
+
 	// Handle compressed files
 	if strings.HasSuffix(ec.archivePath, ".gz") {
 		gz, err := gzip.NewReader(file)
@@ -264,19 +265,19 @@ func (ec *ErrorCollector) getSurroundingLines(lineNum int, context int) []string
 	} else {
 		reader = file
 	}
-	
+
 	scanner := bufio.NewScanner(reader)
 	buf := make([]byte, 0, 1024*1024)
 	scanner.Buffer(buf, 10*1024*1024)
-	
+
 	currentLine := 0
 	startLine := lineNum - context
 	endLine := lineNum + context
-	
+
 	if startLine < 1 {
 		startLine = 1
 	}
-	
+
 	for scanner.Scan() {
 		currentLine++
 		if currentLine >= startLine && currentLine <= endLine {
@@ -290,18 +291,18 @@ func (ec *ErrorCollector) getSurroundingLines(lineNum int, context int) []string
 			break
 		}
 	}
-	
+
 	return lines
 }

 // generateRecommendations provides actionable recommendations based on the error
 func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []string {
 	var recs []string
-	
+
 	// Check diagnosis results
 	if report.DiagnosisResult != nil {
 		if report.DiagnosisResult.IsTruncated {
-			recs = append(recs, 
+			recs = append(recs,
 				"CRITICAL: Backup file is truncated/incomplete",
 				"Action: Re-run the backup for the affected database",
 				"Check: Verify disk space was available during backup",
@@ -317,14 +318,14 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
 		}
 		if report.DiagnosisResult.Details != nil && report.DiagnosisResult.Details.UnterminatedCopy {
 			recs = append(recs,
-				fmt.Sprintf("ISSUE: COPY block for table '%s' was not terminated", 
+				fmt.Sprintf("ISSUE: COPY block for table '%s' was not terminated",
 					report.DiagnosisResult.Details.LastCopyTable),
 				"Cause: Backup was interrupted during data export",
 				"Action: Re-run backup ensuring it completes fully",
 			)
 		}
 	}
-	
+
 	// Check error patterns
 	if report.TotalErrors > 1000000 {
 		recs = append(recs,
@@ -333,7 +334,7 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
 			"Check: Verify dump format matches restore command",
 		)
 	}
-	
+
 	// Check for common error types
 	errLower := strings.ToLower(report.ErrorMessage)
 	if strings.Contains(errLower, "syntax error") {
@@ -343,7 +344,7 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
 			"Check: Run 'dbbackup restore diagnose <archive>' for detailed analysis",
 		)
 	}
-	
+
 	if strings.Contains(errLower, "permission denied") {
 		recs = append(recs,
 			"ISSUE: Permission denied",
@@ -351,7 +352,7 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
 			"Action: For ownership preservation, use a superuser account",
 		)
 	}
-	
+
 	if strings.Contains(errLower, "does not exist") {
 		recs = append(recs,
 			"ISSUE: Missing object reference",
@@ -359,7 +360,7 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
 			"Action: Check if target database was created",
 		)
 	}
-	
+
 	if len(recs) == 0 {
 		recs = append(recs,
 			"Run 'dbbackup restore diagnose <archive>' for detailed analysis",
@@ -367,7 +368,7 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
 			"Review the PostgreSQL/MySQL logs on the target server",
 		)
 	}
-	
+
 	return recs
 }

@@ -378,18 +379,18 @@ func (ec *ErrorCollector) SaveReport(report *RestoreErrorReport, outputPath stri
 	if err := os.MkdirAll(dir, 0755); err != nil {
 		return fmt.Errorf("failed to create directory: %w", err)
 	}
-	
+
 	// Marshal to JSON with indentation
 	data, err := json.MarshalIndent(report, "", "  ")
 	if err != nil {
 		return fmt.Errorf("failed to marshal report: %w", err)
 	}
-	
+
 	// Write file
 	if err := os.WriteFile(outputPath, data, 0644); err != nil {
 		return fmt.Errorf("failed to write report: %w", err)
 	}
-	
+
 	return nil
 }

@@ -399,35 +400,35 @@ func (ec *ErrorCollector) PrintReport(report *RestoreErrorReport) {
 	fmt.Println(strings.Repeat("═", 70))
 	fmt.Println("  🔴 RESTORE ERROR REPORT")
 	fmt.Println(strings.Repeat("═", 70))
-	
+
 	fmt.Printf("\n📅 Timestamp:    %s\n", report.Timestamp.Format("2006-01-02 15:04:05"))
 	fmt.Printf("📦 Archive:      %s\n", filepath.Base(report.ArchivePath))
 	fmt.Printf("📊 Format:       %s\n", report.ArchiveFormat)
 	fmt.Printf("🎯 Target DB:    %s\n", report.TargetDB)
 	fmt.Printf("⚠️  Exit Code:    %d\n", report.ExitCode)
 	fmt.Printf("❌ Total Errors: %d\n", report.TotalErrors)
-	
+
 	fmt.Println("\n" + strings.Repeat("─", 70))
 	fmt.Println("ERROR DETAILS:")
 	fmt.Println(strings.Repeat("─", 70))
-	
+
 	fmt.Printf("\nType: %s\n", report.ErrorType)
 	fmt.Printf("Message: %s\n", report.ErrorMessage)
 	if report.ErrorHint != "" {
 		fmt.Printf("Hint: %s\n", report.ErrorHint)
 	}
-	
+
 	// Show failure context
 	if report.FailureContext != nil && report.FailureContext.FailedLine > 0 {
 		fmt.Println("\n" + strings.Repeat("─", 70))
 		fmt.Println("FAILURE CONTEXT:")
 		fmt.Println(strings.Repeat("─", 70))
-		
+
 		fmt.Printf("\nFailed at line: %d\n", report.FailureContext.FailedLine)
 		if report.FailureContext.InCopyBlock {
 			fmt.Printf("Inside COPY block for table: %s\n", report.FailureContext.CopyTableName)
 		}
-		
+
 		if len(report.FailureContext.SurroundingLines) > 0 {
 			fmt.Println("\nSurrounding lines:")
 			for _, line := range report.FailureContext.SurroundingLines {
@@ -435,13 +436,13 @@ func (ec *ErrorCollector) PrintReport(report *RestoreErrorReport) {
 			}
 		}
 	}
-	
+
 	// Show first few errors
 	if len(report.FirstErrors) > 0 {
 		fmt.Println("\n" + strings.Repeat("─", 70))
 		fmt.Println("FIRST ERRORS:")
 		fmt.Println(strings.Repeat("─", 70))
-		
+
 		for i, err := range report.FirstErrors {
 			if i >= 5 {
 				fmt.Printf("... and %d more\n", len(report.FirstErrors)-5)
@@ -450,13 +451,13 @@ func (ec *ErrorCollector) PrintReport(report *RestoreErrorReport) {
 			fmt.Printf("  %d. %s\n", i+1, truncateString(err, 100))
 		}
 	}
-	
+
 	// Show diagnosis summary
 	if report.DiagnosisResult != nil && !report.DiagnosisResult.IsValid {
 		fmt.Println("\n" + strings.Repeat("─", 70))
 		fmt.Println("DIAGNOSIS:")
 		fmt.Println(strings.Repeat("─", 70))
-		
+
 		if report.DiagnosisResult.IsTruncated {
 			fmt.Println("  ❌ File is TRUNCATED")
 		}
@@ -470,21 +471,21 @@ func (ec *ErrorCollector) PrintReport(report *RestoreErrorReport) {
 			fmt.Printf("  • %s\n", err)
 		}
 	}
-	
+
 	// Show recommendations
 	fmt.Println("\n" + strings.Repeat("─", 70))
 	fmt.Println("💡 RECOMMENDATIONS:")
 	fmt.Println(strings.Repeat("─", 70))
-	
+
 	for _, rec := range report.Recommendations {
 		fmt.Printf("  • %s\n", rec)
 	}
-	
+
 	// Show tool versions
 	fmt.Println("\n" + strings.Repeat("─", 70))
 	fmt.Println("ENVIRONMENT:")
 	fmt.Println(strings.Repeat("─", 70))
-	
+
 	fmt.Printf("  OS: %s/%s\n", report.OS, report.Arch)
 	fmt.Printf("  Go: %s\n", report.GoVersion)
 	if report.PgRestoreVersion != "" {
@@ -493,15 +494,15 @@ func (ec *ErrorCollector) PrintReport(report *RestoreErrorReport) {
 	if report.PsqlVersion != "" {
 		fmt.Printf("  psql: %s\n", report.PsqlVersion)
 	}
-	
+
 	fmt.Println(strings.Repeat("═", 70))
 }

 // Helper functions

 func isErrorLine(line string) bool {
-	return strings.Contains(line, "ERROR:") || 
-		strings.Contains(line, "FATAL:") || 
+	return strings.Contains(line, "ERROR:") ||
+		strings.Contains(line, "FATAL:") ||
 		strings.Contains(line, "error:") ||
 		strings.Contains(line, "PANIC:")
 }
@@ -556,7 +557,11 @@ func getDatabaseType(format ArchiveFormat) string {
 }

 func getCommandVersion(cmd string, arg string) string {
-	output, err := exec.Command(cmd, arg).CombinedOutput()
+	// Use timeout to prevent blocking if command hangs
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	output, err := exec.CommandContext(ctx, cmd, arg).CombinedOutput()
 	if err != nil {
 		return ""
 	}