Compare commits

1 Commit

f153e61dbf fix: dynamic timeouts for large archives + use WorkDir for disk checks
All checks were successful
CI/CD / Test (push) Successful in 1m21s
CI/CD / Lint (push) Successful in 1m34s
CI/CD / Build & Release (push) Successful in 3m22s
- CheckDiskSpace now uses GetEffectiveWorkDir() instead of BackupDir
- Dynamic timeout calculation based on file size:
  - diagnoseClusterArchive: 5 + (GB/3) min, max 60 min
  - verifyWithPgRestore: 5 + (GB/5) min, max 30 min
  - DiagnoseClusterDumps: 10 + (GB/3) min, max 120 min
  - TUI safety checks: 10 + (GB/5) min, max 120 min
- Timeout vs corruption differentiation (no false CORRUPTED on timeout)
- Streaming tar listing to avoid OOM on large archives

For 119 GB archives: a ~45 min timeout instead of a false CORRUPTED verdict after 5 min (worked example below)
2026-01-13 08:22:20 +01:00
4 changed files with 144 additions and 21 deletions
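
For reference, here is the timeout arithmetic from the commit message in runnable form. This is a sketch: dynamicTimeout is a hypothetical helper; the actual call sites inline this logic, as the diffs below show.

package main

import "fmt"

// dynamicTimeout mirrors the inlined logic in the diffs below:
// base minutes plus one minute per N GB, clamped to a maximum.
func dynamicTimeout(sizeBytes int64, baseMin, gbPerMinute, maxMin int) int {
	timeout := baseMin
	if sizeBytes > 0 {
		sizeGB := sizeBytes / (1024 * 1024 * 1024) // integer GB, rounds down
		if est := int(sizeGB)/gbPerMinute + baseMin; est > timeout {
			timeout = est
		}
		if timeout > maxMin {
			timeout = maxMin
		}
	}
	return timeout
}

func main() {
	size := int64(119) * 1024 * 1024 * 1024       // the 119 GB archive from the message
	fmt.Println(dynamicTimeout(size, 5, 3, 60))   // diagnoseClusterArchive: 44 (~45 min)
	fmt.Println(dynamicTimeout(size, 5, 5, 30))   // verifyWithPgRestore: 28
	fmt.Println(dynamicTimeout(size, 10, 3, 120)) // DiagnoseClusterDumps: 49
	fmt.Println(dynamicTimeout(size, 10, 5, 120)) // TUI safety checks: 33
}

Integer division rounds down, so the "~45 min" in the message is exactly 44 minutes for a 119 GB archive.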

View File

@@ -4,8 +4,8 @@ This directory contains pre-compiled binaries for the DB Backup Tool across mult
 ## Build Information
 - **Version**: 3.42.10
-- **Build Time**: 2026-01-12_08:50:35_UTC
-- **Git Commit**: b1f8c6d
+- **Build Time**: 2026-01-12_14:25:53_UTC
+- **Git Commit**: d19c065
 ## Recent Updates (v1.1.0)
 - ✅ Fixed TUI progress display with line-by-line output

View File

@@ -414,14 +414,42 @@ func (d *Diagnoser) diagnoseSQLScript(filePath string, compressed bool, result *
 // diagnoseClusterArchive analyzes a cluster tar.gz archive
 func (d *Diagnoser) diagnoseClusterArchive(filePath string, result *DiagnoseResult) {
-	// First verify tar.gz integrity with timeout
-	// 5 minutes for large archives (multi-GB archives need more time)
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	// Calculate dynamic timeout based on file size
+	// Assume minimum 50 MB/s throughput for compressed archive listing
+	// Minimum 5 minutes, scales with file size
+	timeoutMinutes := 5
+	if result.FileSize > 0 {
+		// 1 minute per 3 GB, minimum 5 minutes, max 60 minutes
+		sizeGB := result.FileSize / (1024 * 1024 * 1024)
+		estimatedMinutes := int(sizeGB/3) + 5
+		if estimatedMinutes > timeoutMinutes {
+			timeoutMinutes = estimatedMinutes
+		}
+		if timeoutMinutes > 60 {
+			timeoutMinutes = 60
+		}
+	}
+	d.log.Info("Verifying cluster archive integrity",
+		"size", fmt.Sprintf("%.1f GB", float64(result.FileSize)/(1024*1024*1024)),
+		"timeout", fmt.Sprintf("%d min", timeoutMinutes))
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutMinutes)*time.Minute)
 	defer cancel()
 	cmd := exec.CommandContext(ctx, "tar", "-tzf", filePath)
 	output, err := cmd.Output()
 	if err != nil {
+		// Check if it was a timeout
+		if ctx.Err() == context.DeadlineExceeded {
+			result.IsValid = false
+			result.Errors = append(result.Errors,
+				fmt.Sprintf("Verification timed out after %d minutes - archive is very large", timeoutMinutes),
+				"This does not necessarily mean the archive is corrupted",
+				"Manual verification: tar -tzf "+filePath+" | wc -l")
+			// Don't mark as corrupted on timeout
+			return
+		}
 		result.IsValid = false
 		result.IsCorrupted = true
 		result.Errors = append(result.Errors,
@@ -497,9 +525,22 @@ func (d *Diagnoser) diagnoseUnknown(filePath string, result *DiagnoseResult) {
 // verifyWithPgRestore uses pg_restore --list to verify dump integrity
 func (d *Diagnoser) verifyWithPgRestore(filePath string, result *DiagnoseResult) {
-	// Use timeout to prevent blocking on very large dump files
-	// 5 minutes for large dumps (multi-GB dumps with many tables)
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	// Calculate dynamic timeout based on file size
+	// pg_restore --list is usually faster than tar -tzf for same size
+	timeoutMinutes := 5
+	if result.FileSize > 0 {
+		// 1 minute per 5 GB, minimum 5 minutes, max 30 minutes
+		sizeGB := result.FileSize / (1024 * 1024 * 1024)
+		estimatedMinutes := int(sizeGB/5) + 5
+		if estimatedMinutes > timeoutMinutes {
+			timeoutMinutes = estimatedMinutes
+		}
+		if timeoutMinutes > 30 {
+			timeoutMinutes = 30
+		}
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutMinutes)*time.Minute)
 	defer cancel()
 	cmd := exec.CommandContext(ctx, "pg_restore", "--list", filePath)
@@ -554,14 +595,72 @@ func (d *Diagnoser) verifyWithPgRestore(filePath string, result *DiagnoseResult)
 // DiagnoseClusterDumps extracts and diagnoses all dumps in a cluster archive
 func (d *Diagnoser) DiagnoseClusterDumps(archivePath, tempDir string) ([]*DiagnoseResult, error) {
-	// First, try to list archive contents without extracting (fast check)
-	// 10 minutes for very large archives
-	listCtx, listCancel := context.WithTimeout(context.Background(), 10*time.Minute)
+	// Get archive size for dynamic timeout calculation
+	archiveInfo, err := os.Stat(archivePath)
+	if err != nil {
+		return nil, fmt.Errorf("cannot stat archive: %w", err)
+	}
+	// Dynamic timeout based on archive size: base 10 min + 1 min per 3 GB
+	// Large archives like 100+ GB need more time for tar -tzf
+	timeoutMinutes := 10
+	if archiveInfo.Size() > 0 {
+		sizeGB := archiveInfo.Size() / (1024 * 1024 * 1024)
+		estimatedMinutes := int(sizeGB/3) + 10
+		if estimatedMinutes > timeoutMinutes {
+			timeoutMinutes = estimatedMinutes
+		}
+		if timeoutMinutes > 120 { // Max 2 hours
+			timeoutMinutes = 120
+		}
+	}
+	d.log.Info("Listing cluster archive contents",
+		"size", fmt.Sprintf("%.1f GB", float64(archiveInfo.Size())/(1024*1024*1024)),
+		"timeout", fmt.Sprintf("%d min", timeoutMinutes))
+	listCtx, listCancel := context.WithTimeout(context.Background(), time.Duration(timeoutMinutes)*time.Minute)
 	defer listCancel()
 	listCmd := exec.CommandContext(listCtx, "tar", "-tzf", archivePath)
-	listOutput, listErr := listCmd.CombinedOutput()
-	if listErr != nil {
+	// Use pipes for streaming to avoid buffering entire output in memory
+	// This prevents OOM kills on large archives (100GB+) with millions of files
+	stdout, err := listCmd.StdoutPipe()
+	if err != nil {
+		return nil, fmt.Errorf("failed to create stdout pipe: %w", err)
+	}
+	var stderrBuf bytes.Buffer
+	listCmd.Stderr = &stderrBuf
+	if err := listCmd.Start(); err != nil {
+		return nil, fmt.Errorf("failed to start tar listing: %w", err)
+	}
+	// Stream the output line by line, only keeping relevant files
+	var files []string
+	scanner := bufio.NewScanner(stdout)
+	// Set a reasonable max line length (file paths shouldn't exceed this)
+	scanner.Buffer(make([]byte, 0, 4096), 1024*1024)
+	fileCount := 0
+	for scanner.Scan() {
+		fileCount++
+		line := scanner.Text()
+		// Only store dump files and important files, not every single file
+		if strings.HasSuffix(line, ".dump") || strings.HasSuffix(line, ".sql") ||
+			strings.HasSuffix(line, ".sql.gz") || strings.HasSuffix(line, ".json") ||
+			strings.Contains(line, "globals") || strings.Contains(line, "manifest") ||
+			strings.Contains(line, "metadata") || strings.HasSuffix(line, "/") {
+			files = append(files, line)
+		}
+	}
+	scanErr := scanner.Err()
+	listErr := listCmd.Wait()
+	if listErr != nil || scanErr != nil {
 		// Archive listing failed - likely corrupted
 		errResult := &DiagnoseResult{
 			FilePath: archivePath,
@@ -573,7 +672,12 @@ func (d *Diagnoser) DiagnoseClusterDumps(archivePath, tempDir string) ([]*Diagno
 			Details: &DiagnoseDetails{},
 		}
-		errOutput := string(listOutput)
+		errOutput := stderrBuf.String()
+		actualErr := listErr
+		if scanErr != nil {
+			actualErr = scanErr
+		}
 		if strings.Contains(errOutput, "unexpected end of file") ||
 			strings.Contains(errOutput, "Unexpected EOF") ||
 			strings.Contains(errOutput, "truncated") {
@@ -585,7 +689,7 @@ func (d *Diagnoser) DiagnoseClusterDumps(archivePath, tempDir string) ([]*Diagno
"Solution: Re-create the backup from source database") "Solution: Re-create the backup from source database")
} else { } else {
errResult.Errors = append(errResult.Errors, errResult.Errors = append(errResult.Errors,
fmt.Sprintf("Cannot list archive contents: %v", listErr), fmt.Sprintf("Cannot list archive contents: %v", actualErr),
fmt.Sprintf("tar error: %s", truncateString(errOutput, 300)), fmt.Sprintf("tar error: %s", truncateString(errOutput, 300)),
"Run manually: tar -tzf "+archivePath+" 2>&1 | tail -50") "Run manually: tar -tzf "+archivePath+" 2>&1 | tail -50")
} }
@@ -593,11 +697,10 @@ func (d *Diagnoser) DiagnoseClusterDumps(archivePath, tempDir string) ([]*Diagno
 		return []*DiagnoseResult{errResult}, nil
 	}
-	// Archive is listable - now check disk space before extraction
-	files := strings.Split(strings.TrimSpace(string(listOutput)), "\n")
+	d.log.Debug("Archive listing streamed successfully", "total_files", fileCount, "relevant_files", len(files))
 	// Check if we have enough disk space (estimate 4x archive size needed)
-	archiveInfo, _ := os.Stat(archivePath)
+	// archiveInfo already obtained at function start
 	requiredSpace := archiveInfo.Size() * 4
 	// Check temp directory space - try to extract metadata first
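
For context, the streaming pattern above in self-contained form: pipe tar's stdout through a line scanner instead of buffering everything with CombinedOutput, so memory stays flat even with millions of entries. This is an illustrative sketch; the function name, archive path, and filter are not from the diff.

package main

import (
	"bufio"
	"bytes"
	"context"
	"fmt"
	"os/exec"
	"strings"
	"time"
)

// listDumpFiles streams `tar -tzf` output line by line and keeps only
// the entries the caller cares about.
func listDumpFiles(ctx context.Context, archivePath string) ([]string, error) {
	cmd := exec.CommandContext(ctx, "tar", "-tzf", archivePath)
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return nil, fmt.Errorf("stdout pipe: %w", err)
	}
	var stderr bytes.Buffer
	cmd.Stderr = &stderr // stderr is small, buffering it is safe
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("start tar: %w", err)
	}
	var files []string
	scanner := bufio.NewScanner(stdout)
	scanner.Buffer(make([]byte, 0, 4096), 1024*1024) // cap line length at 1 MiB
	for scanner.Scan() {
		if line := scanner.Text(); strings.HasSuffix(line, ".dump") {
			files = append(files, line)
		}
	}
	scanErr := scanner.Err()
	if err := cmd.Wait(); err != nil {
		return nil, fmt.Errorf("tar failed: %v (stderr: %s)", err, stderr.String())
	}
	if scanErr != nil {
		return nil, fmt.Errorf("reading tar output: %w", scanErr)
	}
	return files, nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
	defer cancel()
	files, err := listDumpFiles(ctx, "/backups/cluster.tar.gz") // hypothetical path
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Printf("found %d dump files\n", len(files))
}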

View File

@@ -229,8 +229,14 @@ func containsSQLKeywords(content string) bool {
 }
 // CheckDiskSpace verifies sufficient disk space for restore
+// Uses the effective work directory (WorkDir if set, otherwise BackupDir) since
+// that's where extraction actually happens for large databases
 func (s *Safety) CheckDiskSpace(archivePath string, multiplier float64) error {
-	return s.CheckDiskSpaceAt(archivePath, s.cfg.BackupDir, multiplier)
+	checkDir := s.cfg.GetEffectiveWorkDir()
+	if checkDir == "" {
+		checkDir = s.cfg.BackupDir
+	}
+	return s.CheckDiskSpaceAt(archivePath, checkDir, multiplier)
 }
 // CheckDiskSpaceAt verifies sufficient disk space at a specific directory
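
GetEffectiveWorkDir is defined in the config package and is not part of this diff; one plausible shape, given how CheckDiskSpace uses it above (hypothetical sketch, including the assumed fields):

package config

// Hypothetical sketch - the real method lives elsewhere in the config
// package. Assumes Config carries WorkDir and BackupDir string fields,
// matching how CheckDiskSpace uses the result.
type Config struct {
	WorkDir   string // optional scratch directory for large extractions
	BackupDir string // where backup archives live
}

// GetEffectiveWorkDir returns the dedicated work directory when one is
// configured. It can still return "" (e.g. nothing configured at all),
// which is why CheckDiskSpace falls back to BackupDir explicitly.
func (c *Config) GetEffectiveWorkDir() string {
	if c.WorkDir != "" {
		return c.WorkDir
	}
	return c.BackupDir
}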

View File

@@ -106,9 +106,23 @@ type safetyCheckCompleteMsg struct {
 func runSafetyChecks(cfg *config.Config, log logger.Logger, archive ArchiveInfo, targetDB string) tea.Cmd {
 	return func() tea.Msg {
-		// 10 minutes for safety checks - large archives can take a long time to diagnose
-		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+		// Dynamic timeout based on archive size for large database support
+		// Base: 10 minutes + 1 minute per 5 GB, max 120 minutes
+		timeoutMinutes := 10
+		if archive.Size > 0 {
+			sizeGB := archive.Size / (1024 * 1024 * 1024)
+			estimatedMinutes := int(sizeGB/5) + 10
+			if estimatedMinutes > timeoutMinutes {
+				timeoutMinutes = estimatedMinutes
+			}
+			if timeoutMinutes > 120 {
+				timeoutMinutes = 120
+			}
+		}
+		ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutMinutes)*time.Minute)
 		defer cancel()
+		_ = ctx // Used by database checks below
 		safety := restore.NewSafety(cfg, log)
 		checks := []SafetyCheck{}