v3.42.9: Fix all timeout bugs and deadlocks
CRITICAL FIXES: - Encryption detection false positive (IsBackupEncrypted returned true for ALL files) - 12 cmd.Wait() deadlocks fixed with channel-based context handling - TUI timeout bugs: 60s->10min for safety checks, 15s->60s for DB listing - diagnose.go timeouts: 60s->5min for tar/pg_restore operations - Panic recovery added to parallel backup/restore goroutines - Variable shadowing fix in restore/engine.go These bugs caused pg_dump backups to fail through TUI for months.
This commit is contained in:
@@ -3,6 +3,7 @@ package restore
|
||||
import (
|
||||
"bufio"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -20,43 +21,43 @@ import (
|
||||
// RestoreErrorReport contains comprehensive information about a restore failure
|
||||
type RestoreErrorReport struct {
|
||||
// Metadata
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Version string `json:"version"`
|
||||
GoVersion string `json:"go_version"`
|
||||
OS string `json:"os"`
|
||||
Arch string `json:"arch"`
|
||||
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Version string `json:"version"`
|
||||
GoVersion string `json:"go_version"`
|
||||
OS string `json:"os"`
|
||||
Arch string `json:"arch"`
|
||||
|
||||
// Archive info
|
||||
ArchivePath string `json:"archive_path"`
|
||||
ArchiveSize int64 `json:"archive_size"`
|
||||
ArchiveFormat string `json:"archive_format"`
|
||||
|
||||
|
||||
// Database info
|
||||
TargetDB string `json:"target_db"`
|
||||
DatabaseType string `json:"database_type"`
|
||||
|
||||
TargetDB string `json:"target_db"`
|
||||
DatabaseType string `json:"database_type"`
|
||||
|
||||
// Error details
|
||||
ExitCode int `json:"exit_code"`
|
||||
ErrorMessage string `json:"error_message"`
|
||||
ErrorType string `json:"error_type"`
|
||||
ErrorHint string `json:"error_hint"`
|
||||
TotalErrors int `json:"total_errors"`
|
||||
|
||||
ExitCode int `json:"exit_code"`
|
||||
ErrorMessage string `json:"error_message"`
|
||||
ErrorType string `json:"error_type"`
|
||||
ErrorHint string `json:"error_hint"`
|
||||
TotalErrors int `json:"total_errors"`
|
||||
|
||||
// Captured output
|
||||
LastStderr []string `json:"last_stderr"`
|
||||
FirstErrors []string `json:"first_errors"`
|
||||
|
||||
LastStderr []string `json:"last_stderr"`
|
||||
FirstErrors []string `json:"first_errors"`
|
||||
|
||||
// Context around failure
|
||||
FailureContext *FailureContext `json:"failure_context,omitempty"`
|
||||
|
||||
|
||||
// Diagnosis results
|
||||
DiagnosisResult *DiagnoseResult `json:"diagnosis_result,omitempty"`
|
||||
|
||||
|
||||
// Environment (sanitized)
|
||||
PostgresVersion string `json:"postgres_version,omitempty"`
|
||||
PostgresVersion string `json:"postgres_version,omitempty"`
|
||||
PgRestoreVersion string `json:"pg_restore_version,omitempty"`
|
||||
PsqlVersion string `json:"psql_version,omitempty"`
|
||||
|
||||
PsqlVersion string `json:"psql_version,omitempty"`
|
||||
|
||||
// Recommendations
|
||||
Recommendations []string `json:"recommendations"`
|
||||
}
|
||||
@@ -67,40 +68,40 @@ type FailureContext struct {
|
||||
FailedLine int `json:"failed_line,omitempty"`
|
||||
FailedStatement string `json:"failed_statement,omitempty"`
|
||||
SurroundingLines []string `json:"surrounding_lines,omitempty"`
|
||||
|
||||
|
||||
// For COPY block errors
|
||||
InCopyBlock bool `json:"in_copy_block,omitempty"`
|
||||
CopyTableName string `json:"copy_table_name,omitempty"`
|
||||
CopyStartLine int `json:"copy_start_line,omitempty"`
|
||||
SampleCopyData []string `json:"sample_copy_data,omitempty"`
|
||||
|
||||
InCopyBlock bool `json:"in_copy_block,omitempty"`
|
||||
CopyTableName string `json:"copy_table_name,omitempty"`
|
||||
CopyStartLine int `json:"copy_start_line,omitempty"`
|
||||
SampleCopyData []string `json:"sample_copy_data,omitempty"`
|
||||
|
||||
// File position info
|
||||
BytePosition int64 `json:"byte_position,omitempty"`
|
||||
PercentComplete float64 `json:"percent_complete,omitempty"`
|
||||
BytePosition int64 `json:"byte_position,omitempty"`
|
||||
PercentComplete float64 `json:"percent_complete,omitempty"`
|
||||
}
|
||||
|
||||
// ErrorCollector captures detailed error information during restore
|
||||
type ErrorCollector struct {
|
||||
log logger.Logger
|
||||
cfg *config.Config
|
||||
archivePath string
|
||||
targetDB string
|
||||
format ArchiveFormat
|
||||
|
||||
log logger.Logger
|
||||
cfg *config.Config
|
||||
archivePath string
|
||||
targetDB string
|
||||
format ArchiveFormat
|
||||
|
||||
// Captured data
|
||||
stderrLines []string
|
||||
firstErrors []string
|
||||
lastErrors []string
|
||||
totalErrors int
|
||||
exitCode int
|
||||
|
||||
stderrLines []string
|
||||
firstErrors []string
|
||||
lastErrors []string
|
||||
totalErrors int
|
||||
exitCode int
|
||||
|
||||
// Limits
|
||||
maxStderrLines int
|
||||
maxErrorCapture int
|
||||
|
||||
|
||||
// State
|
||||
startTime time.Time
|
||||
enabled bool
|
||||
startTime time.Time
|
||||
enabled bool
|
||||
}
|
||||
|
||||
// NewErrorCollector creates a new error collector
|
||||
@@ -126,30 +127,30 @@ func (ec *ErrorCollector) CaptureStderr(chunk string) {
|
||||
if !ec.enabled {
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
lines := strings.Split(chunk, "\n")
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
|
||||
// Store last N lines of stderr
|
||||
if len(ec.stderrLines) >= ec.maxStderrLines {
|
||||
// Shift array, drop oldest
|
||||
ec.stderrLines = ec.stderrLines[1:]
|
||||
}
|
||||
ec.stderrLines = append(ec.stderrLines, line)
|
||||
|
||||
|
||||
// Check if this is an error line
|
||||
if isErrorLine(line) {
|
||||
ec.totalErrors++
|
||||
|
||||
|
||||
// Capture first N errors
|
||||
if len(ec.firstErrors) < ec.maxErrorCapture {
|
||||
ec.firstErrors = append(ec.firstErrors, line)
|
||||
}
|
||||
|
||||
|
||||
// Keep last N errors (ring buffer style)
|
||||
if len(ec.lastErrors) >= ec.maxErrorCapture {
|
||||
ec.lastErrors = ec.lastErrors[1:]
|
||||
@@ -184,36 +185,36 @@ func (ec *ErrorCollector) GenerateReport(errMessage string, errType string, errH
|
||||
LastStderr: ec.stderrLines,
|
||||
FirstErrors: ec.firstErrors,
|
||||
}
|
||||
|
||||
|
||||
// Get archive size
|
||||
if stat, err := os.Stat(ec.archivePath); err == nil {
|
||||
report.ArchiveSize = stat.Size()
|
||||
}
|
||||
|
||||
|
||||
// Get tool versions
|
||||
report.PostgresVersion = getCommandVersion("postgres", "--version")
|
||||
report.PgRestoreVersion = getCommandVersion("pg_restore", "--version")
|
||||
report.PsqlVersion = getCommandVersion("psql", "--version")
|
||||
|
||||
|
||||
// Analyze failure context
|
||||
report.FailureContext = ec.analyzeFailureContext()
|
||||
|
||||
|
||||
// Run diagnosis if not already done
|
||||
diagnoser := NewDiagnoser(ec.log, false)
|
||||
if diagResult, err := diagnoser.DiagnoseFile(ec.archivePath); err == nil {
|
||||
report.DiagnosisResult = diagResult
|
||||
}
|
||||
|
||||
|
||||
// Generate recommendations
|
||||
report.Recommendations = ec.generateRecommendations(report)
|
||||
|
||||
|
||||
return report
|
||||
}
|
||||
|
||||
// analyzeFailureContext extracts context around the failure
|
||||
func (ec *ErrorCollector) analyzeFailureContext() *FailureContext {
|
||||
ctx := &FailureContext{}
|
||||
|
||||
|
||||
// Look for line number in errors
|
||||
for _, errLine := range ec.lastErrors {
|
||||
if lineNum := extractLineNumber(errLine); lineNum > 0 {
|
||||
@@ -221,7 +222,7 @@ func (ec *ErrorCollector) analyzeFailureContext() *FailureContext {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Look for COPY-related errors
|
||||
for _, errLine := range ec.lastErrors {
|
||||
if strings.Contains(errLine, "COPY") || strings.Contains(errLine, "syntax error") {
|
||||
@@ -233,12 +234,12 @@ func (ec *ErrorCollector) analyzeFailureContext() *FailureContext {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// If we have a line number, try to get surrounding context from the dump
|
||||
if ctx.FailedLine > 0 && ec.archivePath != "" {
|
||||
ctx.SurroundingLines = ec.getSurroundingLines(ctx.FailedLine, 5)
|
||||
}
|
||||
|
||||
|
||||
return ctx
|
||||
}
|
||||
|
||||
@@ -246,13 +247,13 @@ func (ec *ErrorCollector) analyzeFailureContext() *FailureContext {
|
||||
func (ec *ErrorCollector) getSurroundingLines(lineNum int, context int) []string {
|
||||
var reader io.Reader
|
||||
var lines []string
|
||||
|
||||
|
||||
file, err := os.Open(ec.archivePath)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
|
||||
// Handle compressed files
|
||||
if strings.HasSuffix(ec.archivePath, ".gz") {
|
||||
gz, err := gzip.NewReader(file)
|
||||
@@ -264,19 +265,19 @@ func (ec *ErrorCollector) getSurroundingLines(lineNum int, context int) []string
|
||||
} else {
|
||||
reader = file
|
||||
}
|
||||
|
||||
|
||||
scanner := bufio.NewScanner(reader)
|
||||
buf := make([]byte, 0, 1024*1024)
|
||||
scanner.Buffer(buf, 10*1024*1024)
|
||||
|
||||
|
||||
currentLine := 0
|
||||
startLine := lineNum - context
|
||||
endLine := lineNum + context
|
||||
|
||||
|
||||
if startLine < 1 {
|
||||
startLine = 1
|
||||
}
|
||||
|
||||
|
||||
for scanner.Scan() {
|
||||
currentLine++
|
||||
if currentLine >= startLine && currentLine <= endLine {
|
||||
@@ -290,18 +291,18 @@ func (ec *ErrorCollector) getSurroundingLines(lineNum int, context int) []string
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return lines
|
||||
}
|
||||
|
||||
// generateRecommendations provides actionable recommendations based on the error
|
||||
func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []string {
|
||||
var recs []string
|
||||
|
||||
|
||||
// Check diagnosis results
|
||||
if report.DiagnosisResult != nil {
|
||||
if report.DiagnosisResult.IsTruncated {
|
||||
recs = append(recs,
|
||||
recs = append(recs,
|
||||
"CRITICAL: Backup file is truncated/incomplete",
|
||||
"Action: Re-run the backup for the affected database",
|
||||
"Check: Verify disk space was available during backup",
|
||||
@@ -317,14 +318,14 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
|
||||
}
|
||||
if report.DiagnosisResult.Details != nil && report.DiagnosisResult.Details.UnterminatedCopy {
|
||||
recs = append(recs,
|
||||
fmt.Sprintf("ISSUE: COPY block for table '%s' was not terminated",
|
||||
fmt.Sprintf("ISSUE: COPY block for table '%s' was not terminated",
|
||||
report.DiagnosisResult.Details.LastCopyTable),
|
||||
"Cause: Backup was interrupted during data export",
|
||||
"Action: Re-run backup ensuring it completes fully",
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Check error patterns
|
||||
if report.TotalErrors > 1000000 {
|
||||
recs = append(recs,
|
||||
@@ -333,7 +334,7 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
|
||||
"Check: Verify dump format matches restore command",
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
// Check for common error types
|
||||
errLower := strings.ToLower(report.ErrorMessage)
|
||||
if strings.Contains(errLower, "syntax error") {
|
||||
@@ -343,7 +344,7 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
|
||||
"Check: Run 'dbbackup restore diagnose <archive>' for detailed analysis",
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
if strings.Contains(errLower, "permission denied") {
|
||||
recs = append(recs,
|
||||
"ISSUE: Permission denied",
|
||||
@@ -351,7 +352,7 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
|
||||
"Action: For ownership preservation, use a superuser account",
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
if strings.Contains(errLower, "does not exist") {
|
||||
recs = append(recs,
|
||||
"ISSUE: Missing object reference",
|
||||
@@ -359,7 +360,7 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
|
||||
"Action: Check if target database was created",
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
if len(recs) == 0 {
|
||||
recs = append(recs,
|
||||
"Run 'dbbackup restore diagnose <archive>' for detailed analysis",
|
||||
@@ -367,7 +368,7 @@ func (ec *ErrorCollector) generateRecommendations(report *RestoreErrorReport) []
|
||||
"Review the PostgreSQL/MySQL logs on the target server",
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
return recs
|
||||
}
|
||||
|
||||
@@ -378,18 +379,18 @@ func (ec *ErrorCollector) SaveReport(report *RestoreErrorReport, outputPath stri
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create directory: %w", err)
|
||||
}
|
||||
|
||||
|
||||
// Marshal to JSON with indentation
|
||||
data, err := json.MarshalIndent(report, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal report: %w", err)
|
||||
}
|
||||
|
||||
|
||||
// Write file
|
||||
if err := os.WriteFile(outputPath, data, 0644); err != nil {
|
||||
return fmt.Errorf("failed to write report: %w", err)
|
||||
}
|
||||
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -399,35 +400,35 @@ func (ec *ErrorCollector) PrintReport(report *RestoreErrorReport) {
|
||||
fmt.Println(strings.Repeat("═", 70))
|
||||
fmt.Println(" 🔴 RESTORE ERROR REPORT")
|
||||
fmt.Println(strings.Repeat("═", 70))
|
||||
|
||||
|
||||
fmt.Printf("\n📅 Timestamp: %s\n", report.Timestamp.Format("2006-01-02 15:04:05"))
|
||||
fmt.Printf("📦 Archive: %s\n", filepath.Base(report.ArchivePath))
|
||||
fmt.Printf("📊 Format: %s\n", report.ArchiveFormat)
|
||||
fmt.Printf("🎯 Target DB: %s\n", report.TargetDB)
|
||||
fmt.Printf("⚠️ Exit Code: %d\n", report.ExitCode)
|
||||
fmt.Printf("❌ Total Errors: %d\n", report.TotalErrors)
|
||||
|
||||
|
||||
fmt.Println("\n" + strings.Repeat("─", 70))
|
||||
fmt.Println("ERROR DETAILS:")
|
||||
fmt.Println(strings.Repeat("─", 70))
|
||||
|
||||
|
||||
fmt.Printf("\nType: %s\n", report.ErrorType)
|
||||
fmt.Printf("Message: %s\n", report.ErrorMessage)
|
||||
if report.ErrorHint != "" {
|
||||
fmt.Printf("Hint: %s\n", report.ErrorHint)
|
||||
}
|
||||
|
||||
|
||||
// Show failure context
|
||||
if report.FailureContext != nil && report.FailureContext.FailedLine > 0 {
|
||||
fmt.Println("\n" + strings.Repeat("─", 70))
|
||||
fmt.Println("FAILURE CONTEXT:")
|
||||
fmt.Println(strings.Repeat("─", 70))
|
||||
|
||||
|
||||
fmt.Printf("\nFailed at line: %d\n", report.FailureContext.FailedLine)
|
||||
if report.FailureContext.InCopyBlock {
|
||||
fmt.Printf("Inside COPY block for table: %s\n", report.FailureContext.CopyTableName)
|
||||
}
|
||||
|
||||
|
||||
if len(report.FailureContext.SurroundingLines) > 0 {
|
||||
fmt.Println("\nSurrounding lines:")
|
||||
for _, line := range report.FailureContext.SurroundingLines {
|
||||
@@ -435,13 +436,13 @@ func (ec *ErrorCollector) PrintReport(report *RestoreErrorReport) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Show first few errors
|
||||
if len(report.FirstErrors) > 0 {
|
||||
fmt.Println("\n" + strings.Repeat("─", 70))
|
||||
fmt.Println("FIRST ERRORS:")
|
||||
fmt.Println(strings.Repeat("─", 70))
|
||||
|
||||
|
||||
for i, err := range report.FirstErrors {
|
||||
if i >= 5 {
|
||||
fmt.Printf("... and %d more\n", len(report.FirstErrors)-5)
|
||||
@@ -450,13 +451,13 @@ func (ec *ErrorCollector) PrintReport(report *RestoreErrorReport) {
|
||||
fmt.Printf(" %d. %s\n", i+1, truncateString(err, 100))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Show diagnosis summary
|
||||
if report.DiagnosisResult != nil && !report.DiagnosisResult.IsValid {
|
||||
fmt.Println("\n" + strings.Repeat("─", 70))
|
||||
fmt.Println("DIAGNOSIS:")
|
||||
fmt.Println(strings.Repeat("─", 70))
|
||||
|
||||
|
||||
if report.DiagnosisResult.IsTruncated {
|
||||
fmt.Println(" ❌ File is TRUNCATED")
|
||||
}
|
||||
@@ -470,21 +471,21 @@ func (ec *ErrorCollector) PrintReport(report *RestoreErrorReport) {
|
||||
fmt.Printf(" • %s\n", err)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Show recommendations
|
||||
fmt.Println("\n" + strings.Repeat("─", 70))
|
||||
fmt.Println("💡 RECOMMENDATIONS:")
|
||||
fmt.Println(strings.Repeat("─", 70))
|
||||
|
||||
|
||||
for _, rec := range report.Recommendations {
|
||||
fmt.Printf(" • %s\n", rec)
|
||||
}
|
||||
|
||||
|
||||
// Show tool versions
|
||||
fmt.Println("\n" + strings.Repeat("─", 70))
|
||||
fmt.Println("ENVIRONMENT:")
|
||||
fmt.Println(strings.Repeat("─", 70))
|
||||
|
||||
|
||||
fmt.Printf(" OS: %s/%s\n", report.OS, report.Arch)
|
||||
fmt.Printf(" Go: %s\n", report.GoVersion)
|
||||
if report.PgRestoreVersion != "" {
|
||||
@@ -493,15 +494,15 @@ func (ec *ErrorCollector) PrintReport(report *RestoreErrorReport) {
|
||||
if report.PsqlVersion != "" {
|
||||
fmt.Printf(" psql: %s\n", report.PsqlVersion)
|
||||
}
|
||||
|
||||
|
||||
fmt.Println(strings.Repeat("═", 70))
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
|
||||
func isErrorLine(line string) bool {
|
||||
return strings.Contains(line, "ERROR:") ||
|
||||
strings.Contains(line, "FATAL:") ||
|
||||
return strings.Contains(line, "ERROR:") ||
|
||||
strings.Contains(line, "FATAL:") ||
|
||||
strings.Contains(line, "error:") ||
|
||||
strings.Contains(line, "PANIC:")
|
||||
}
|
||||
@@ -556,7 +557,11 @@ func getDatabaseType(format ArchiveFormat) string {
|
||||
}
|
||||
|
||||
func getCommandVersion(cmd string, arg string) string {
|
||||
output, err := exec.Command(cmd, arg).CombinedOutput()
|
||||
// Use timeout to prevent blocking if command hangs
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
output, err := exec.CommandContext(ctx, cmd, arg).CombinedOutput()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user