fix: streaming tar verification for large cluster archives (100GB+)

- Increase timeout from 60 to 180 minutes for very large archives
- Use streaming pipes instead of buffering entire tar listing
- Only mark as corrupted for clear corruption signals (unexpected EOF, invalid gzip)
- Prevents false CORRUPTED errors on valid large archives
2026-01-13 14:40:18 +01:00
parent f7e9fa64f0
commit 5e96ac3be1
3 changed files with 105 additions and 36 deletions

View File

@@ -4,8 +4,8 @@ This directory contains pre-compiled binaries for the DB Backup Tool across mult
 ## Build Information
 - **Version**: 3.42.10
-- **Build Time**: 2026-01-12_14:25:53_UTC
-- **Git Commit**: d19c065
+- **Build Time**: 2026-01-13_07:23:20_UTC
+- **Git Commit**: f153e61
 ## Recent Updates (v1.1.0)
 - ✅ Fixed TUI progress display with line-by-line output

View File

@@ -185,15 +185,15 @@ Examples:
 // Flags
 var (
 	dedupDir        string
 	dedupIndexDB    string // Separate path for SQLite index (for NFS/CIFS support)
 	dedupCompress   bool
 	dedupEncrypt    bool
 	dedupKey        string
 	dedupName       string
 	dedupDBType     string
 	dedupDBName     string
 	dedupDBHost     string
 	dedupDecompress bool // Auto-decompress gzip input
 )

View File

@@ -415,18 +415,18 @@ func (d *Diagnoser) diagnoseSQLScript(filePath string, compressed bool, result *
 // diagnoseClusterArchive analyzes a cluster tar.gz archive
 func (d *Diagnoser) diagnoseClusterArchive(filePath string, result *DiagnoseResult) {
 	// Calculate dynamic timeout based on file size
-	// Assume minimum 50 MB/s throughput for compressed archive listing
-	// Minimum 5 minutes, scales with file size
+	// Large archives (100GB+) can take significant time to list
+	// Minimum 5 minutes, scales with file size, max 180 minutes for very large archives
 	timeoutMinutes := 5
 	if result.FileSize > 0 {
-		// 1 minute per 3 GB, minimum 5 minutes, max 60 minutes
+		// 1 minute per 2 GB, minimum 5 minutes, max 180 minutes
 		sizeGB := result.FileSize / (1024 * 1024 * 1024)
-		estimatedMinutes := int(sizeGB/3) + 5
+		estimatedMinutes := int(sizeGB/2) + 5
 		if estimatedMinutes > timeoutMinutes {
 			timeoutMinutes = estimatedMinutes
 		}
-		if timeoutMinutes > 60 {
-			timeoutMinutes = 60
+		if timeoutMinutes > 180 {
+			timeoutMinutes = 180
 		}
 	}
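The new scaling is easy to sanity-check by hand: a 100 GB archive gets 100/2 + 5 = 55 minutes, and anything from about 350 GB up hits the 180-minute cap. A minimal standalone restatement of that arithmetic (clusterTimeoutMinutes is an illustrative name, not a function in this codebase):

package main

import "fmt"

// clusterTimeoutMinutes restates the scaling above: 1 minute per 2 GB,
// floor of 5 minutes, ceiling of 180 minutes. (Hypothetical helper name.)
func clusterTimeoutMinutes(fileSize int64) int {
	timeoutMinutes := 5
	if fileSize > 0 {
		sizeGB := fileSize / (1024 * 1024 * 1024)
		if est := int(sizeGB/2) + 5; est > timeoutMinutes {
			timeoutMinutes = est
		}
		if timeoutMinutes > 180 {
			timeoutMinutes = 180
		}
	}
	return timeoutMinutes
}

func main() {
	const gb = int64(1024 * 1024 * 1024)
	fmt.Println(clusterTimeoutMinutes(1 * gb))   // 5   (floor)
	fmt.Println(clusterTimeoutMinutes(100 * gb)) // 55  (100/2 + 5)
	fmt.Println(clusterTimeoutMinutes(500 * gb)) // 180 (cap)
}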
@@ -437,29 +437,98 @@ func (d *Diagnoser) diagnoseClusterArchive(filePath string, result *DiagnoseResu
 	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutMinutes)*time.Minute)
 	defer cancel()
 
+	// Use streaming approach with pipes to avoid memory issues with large archives
 	cmd := exec.CommandContext(ctx, "tar", "-tzf", filePath)
-	output, err := cmd.Output()
-	if err != nil {
-		// Check if it was a timeout
-		if ctx.Err() == context.DeadlineExceeded {
-			result.IsValid = false
-			result.Errors = append(result.Errors,
-				fmt.Sprintf("Verification timed out after %d minutes - archive is very large", timeoutMinutes),
-				"This does not necessarily mean the archive is corrupted",
-				"Manual verification: tar -tzf "+filePath+" | wc -l")
-			// Don't mark as corrupted on timeout
-			return
-		}
-		result.IsValid = false
-		result.IsCorrupted = true
-		result.Errors = append(result.Errors,
-			fmt.Sprintf("Tar archive is invalid or corrupted: %v", err),
-			"Run: tar -tzf "+filePath+" 2>&1 | tail -20")
+	stdout, pipeErr := cmd.StdoutPipe()
+	if pipeErr != nil {
+		// Pipe creation failed - not a corruption issue
+		result.Warnings = append(result.Warnings,
+			fmt.Sprintf("Cannot create pipe for verification: %v", pipeErr),
+			"Archive integrity cannot be verified but may still be valid")
 		return
 	}
 
-	// Parse tar listing
-	files := strings.Split(strings.TrimSpace(string(output)), "\n")
+	var stderrBuf bytes.Buffer
+	cmd.Stderr = &stderrBuf
+
+	if startErr := cmd.Start(); startErr != nil {
+		result.Warnings = append(result.Warnings,
+			fmt.Sprintf("Cannot start tar verification: %v", startErr),
+			"Archive integrity cannot be verified but may still be valid")
+		return
+	}
+
+	// Stream output line by line to avoid buffering entire listing in memory
+	scanner := bufio.NewScanner(stdout)
+	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) // Allow long paths
+	var files []string
+	fileCount := 0
+	for scanner.Scan() {
+		fileCount++
+		line := scanner.Text()
+		// Only store dump/metadata files, not every file
+		if strings.HasSuffix(line, ".dump") || strings.HasSuffix(line, ".sql.gz") ||
+			strings.HasSuffix(line, ".sql") || strings.HasSuffix(line, ".json") ||
+			strings.Contains(line, "globals") || strings.Contains(line, "manifest") ||
+			strings.Contains(line, "metadata") {
+			files = append(files, line)
+		}
+	}
+	scanErr := scanner.Err()
+	waitErr := cmd.Wait()
+	stderrOutput := stderrBuf.String()
+
+	// Handle errors - distinguish between actual corruption and resource/timeout issues
+	if waitErr != nil || scanErr != nil {
+		// Check if it was a timeout
+		if ctx.Err() == context.DeadlineExceeded {
+			result.Warnings = append(result.Warnings,
+				fmt.Sprintf("Verification timed out after %d minutes - archive is very large", timeoutMinutes),
+				"This does not necessarily mean the archive is corrupted",
+				"Manual verification: tar -tzf "+filePath+" | wc -l")
+			// Don't mark as corrupted or invalid on timeout - archive may be fine
+			if fileCount > 0 {
+				result.Details.TableCount = len(files)
+				result.Details.TableList = files
+			}
+			return
+		}
+		// Check for specific gzip/tar corruption indicators
+		if strings.Contains(stderrOutput, "unexpected end of file") ||
+			strings.Contains(stderrOutput, "Unexpected EOF") ||
+			strings.Contains(stderrOutput, "gzip: stdin: unexpected end of file") ||
+			strings.Contains(stderrOutput, "not in gzip format") ||
+			strings.Contains(stderrOutput, "invalid compressed data") {
+			// These indicate actual corruption
+			result.IsValid = false
+			result.IsCorrupted = true
+			result.Errors = append(result.Errors,
+				"Tar archive appears truncated or corrupted",
+				fmt.Sprintf("Error: %s", truncateString(stderrOutput, 200)),
+				"Run: tar -tzf "+filePath+" 2>&1 | tail -20")
+			return
+		}
+		// Other errors (signal killed, memory, etc.) - not necessarily corruption
+		// If we read some files successfully, the archive structure is likely OK
+		if fileCount > 0 {
+			result.Warnings = append(result.Warnings,
+				fmt.Sprintf("Verification incomplete (read %d files before error)", fileCount),
+				"Archive may still be valid - error could be due to system resources")
+			// Proceed with what we got
+		} else {
+			// Couldn't read anything - but don't mark as corrupted without clear evidence
+			result.Warnings = append(result.Warnings,
+				fmt.Sprintf("Cannot verify archive: %v", waitErr),
+				"Archive integrity is uncertain - proceed with caution or verify manually")
+			return
+		}
+	}
+
+	// Parse the collected file list
 	var dumpFiles []string
 	hasGlobals := false
 	hasMetadata := false
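The stderr substring matching above is what decides "corrupted" versus "merely unverifiable", so it is worth pinning down with a table-driven test. A sketch, under the assumption that the checks were factored into a standalone helper (isCorruptionSignal and the package name are hypothetical, not part of this diff):

package diagnose

import (
	"strings"
	"testing"
)

// isCorruptionSignal mirrors the stderr checks in the hunk above,
// factored out for testability. (Hypothetical helper, not in the diff.)
func isCorruptionSignal(stderr string) bool {
	for _, sig := range []string{
		"unexpected end of file",
		"Unexpected EOF",
		"gzip: stdin: unexpected end of file",
		"not in gzip format",
		"invalid compressed data",
	} {
		if strings.Contains(stderr, sig) {
			return true
		}
	}
	return false
}

func TestIsCorruptionSignal(t *testing.T) {
	cases := map[string]bool{
		"gzip: stdin: unexpected end of file":     true,  // truncated archive
		"tar: archive.tar.gz: not in gzip format": true,  // wrong or damaged header
		"signal: killed":                          false, // OOM kill is not corruption
		"":                                        false, // clean run
	}
	for in, want := range cases {
		if got := isCorruptionSignal(in); got != want {
			t.Errorf("isCorruptionSignal(%q) = %v, want %v", in, got, want)
		}
	}
}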