fix: streaming tar verification for large cluster archives (100GB+)

- Raise the maximum verification timeout from 60 to 180 minutes and scale it at 1 minute per 2 GB (previously 1 minute per 3 GB) for very large archives
- Use streaming pipes instead of buffering entire tar listing
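- Mark an archive as corrupted only on clear corruption signals in tar's stderr (unexpected EOF, not in gzip format, invalid compressed data); timeouts and other errors are reported as warnings instead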
- Prevents false CORRUPTED errors on valid large archives
This commit is contained in:
2026-01-13 14:40:18 +01:00
parent f7e9fa64f0
commit 5e96ac3be1
3 changed files with 105 additions and 36 deletions

View File

@@ -4,8 +4,8 @@ This directory contains pre-compiled binaries for the DB Backup Tool across mult
## Build Information ## Build Information
- **Version**: 3.42.10 - **Version**: 3.42.10
- **Build Time**: 2026-01-12_14:25:53_UTC - **Build Time**: 2026-01-13_07:23:20_UTC
- **Git Commit**: d19c065 - **Git Commit**: f153e61
## Recent Updates (v1.1.0) ## Recent Updates (v1.1.0)
- ✅ Fixed TUI progress display with line-by-line output - ✅ Fixed TUI progress display with line-by-line output

View File

@@ -415,18 +415,18 @@ func (d *Diagnoser) diagnoseSQLScript(filePath string, compressed bool, result *
// diagnoseClusterArchive analyzes a cluster tar.gz archive // diagnoseClusterArchive analyzes a cluster tar.gz archive
func (d *Diagnoser) diagnoseClusterArchive(filePath string, result *DiagnoseResult) { func (d *Diagnoser) diagnoseClusterArchive(filePath string, result *DiagnoseResult) {
// Calculate dynamic timeout based on file size // Calculate dynamic timeout based on file size
// Assume minimum 50 MB/s throughput for compressed archive listing // Large archives (100GB+) can take significant time to list
// Minimum 5 minutes, scales with file size // Minimum 5 minutes, scales with file size, max 180 minutes for very large archives
timeoutMinutes := 5 timeoutMinutes := 5
if result.FileSize > 0 { if result.FileSize > 0 {
// 1 minute per 3 GB, minimum 5 minutes, max 60 minutes // 1 minute per 2 GB, minimum 5 minutes, max 180 minutes
sizeGB := result.FileSize / (1024 * 1024 * 1024) sizeGB := result.FileSize / (1024 * 1024 * 1024)
estimatedMinutes := int(sizeGB/3) + 5 estimatedMinutes := int(sizeGB/2) + 5
if estimatedMinutes > timeoutMinutes { if estimatedMinutes > timeoutMinutes {
timeoutMinutes = estimatedMinutes timeoutMinutes = estimatedMinutes
} }
if timeoutMinutes > 60 { if timeoutMinutes > 180 {
timeoutMinutes = 60 timeoutMinutes = 180
} }
} }
@@ -437,29 +437,98 @@ func (d *Diagnoser) diagnoseClusterArchive(filePath string, result *DiagnoseResu
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutMinutes)*time.Minute) ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutMinutes)*time.Minute)
defer cancel() defer cancel()
// Use streaming approach with pipes to avoid memory issues with large archives
cmd := exec.CommandContext(ctx, "tar", "-tzf", filePath) cmd := exec.CommandContext(ctx, "tar", "-tzf", filePath)
output, err := cmd.Output() stdout, pipeErr := cmd.StdoutPipe()
if err != nil { if pipeErr != nil {
// Pipe creation failed - not a corruption issue
result.Warnings = append(result.Warnings,
fmt.Sprintf("Cannot create pipe for verification: %v", pipeErr),
"Archive integrity cannot be verified but may still be valid")
return
}
var stderrBuf bytes.Buffer
cmd.Stderr = &stderrBuf
if startErr := cmd.Start(); startErr != nil {
result.Warnings = append(result.Warnings,
fmt.Sprintf("Cannot start tar verification: %v", startErr),
"Archive integrity cannot be verified but may still be valid")
return
}
// Stream output line by line to avoid buffering entire listing in memory
scanner := bufio.NewScanner(stdout)
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) // Allow long paths
var files []string
fileCount := 0
for scanner.Scan() {
fileCount++
line := scanner.Text()
// Only store dump/metadata files, not every file
if strings.HasSuffix(line, ".dump") || strings.HasSuffix(line, ".sql.gz") ||
strings.HasSuffix(line, ".sql") || strings.HasSuffix(line, ".json") ||
strings.Contains(line, "globals") || strings.Contains(line, "manifest") ||
strings.Contains(line, "metadata") {
files = append(files, line)
}
}
scanErr := scanner.Err()
waitErr := cmd.Wait()
stderrOutput := stderrBuf.String()
// Handle errors - distinguish between actual corruption and resource/timeout issues
if waitErr != nil || scanErr != nil {
// Check if it was a timeout // Check if it was a timeout
if ctx.Err() == context.DeadlineExceeded { if ctx.Err() == context.DeadlineExceeded {
result.IsValid = false result.Warnings = append(result.Warnings,
result.Errors = append(result.Errors,
fmt.Sprintf("Verification timed out after %d minutes - archive is very large", timeoutMinutes), fmt.Sprintf("Verification timed out after %d minutes - archive is very large", timeoutMinutes),
"This does not necessarily mean the archive is corrupted", "This does not necessarily mean the archive is corrupted",
"Manual verification: tar -tzf "+filePath+" | wc -l") "Manual verification: tar -tzf "+filePath+" | wc -l")
// Don't mark as corrupted on timeout // Don't mark as corrupted or invalid on timeout - archive may be fine
if fileCount > 0 {
result.Details.TableCount = len(files)
result.Details.TableList = files
}
return return
} }
// Check for specific gzip/tar corruption indicators
if strings.Contains(stderrOutput, "unexpected end of file") ||
strings.Contains(stderrOutput, "Unexpected EOF") ||
strings.Contains(stderrOutput, "gzip: stdin: unexpected end of file") ||
strings.Contains(stderrOutput, "not in gzip format") ||
strings.Contains(stderrOutput, "invalid compressed data") {
// These indicate actual corruption
result.IsValid = false result.IsValid = false
result.IsCorrupted = true result.IsCorrupted = true
result.Errors = append(result.Errors, result.Errors = append(result.Errors,
fmt.Sprintf("Tar archive is invalid or corrupted: %v", err), "Tar archive appears truncated or corrupted",
fmt.Sprintf("Error: %s", truncateString(stderrOutput, 200)),
"Run: tar -tzf "+filePath+" 2>&1 | tail -20") "Run: tar -tzf "+filePath+" 2>&1 | tail -20")
return return
} }
// Parse tar listing // Other errors (signal killed, memory, etc.) - not necessarily corruption
files := strings.Split(strings.TrimSpace(string(output)), "\n") // If we read some files successfully, the archive structure is likely OK
if fileCount > 0 {
result.Warnings = append(result.Warnings,
fmt.Sprintf("Verification incomplete (read %d files before error)", fileCount),
"Archive may still be valid - error could be due to system resources")
// Proceed with what we got
} else {
// Couldn't read anything - but don't mark as corrupted without clear evidence
result.Warnings = append(result.Warnings,
fmt.Sprintf("Cannot verify archive: %v", waitErr),
"Archive integrity is uncertain - proceed with caution or verify manually")
return
}
}
// Parse the collected file list
var dumpFiles []string var dumpFiles []string
hasGlobals := false hasGlobals := false
hasMetadata := false hasMetadata := false