fix: streaming tar verification for large cluster archives (100GB+)

- Increase timeout from 60 to 180 minutes for very large archives - Use streaming pipes instead of buffering entire tar listing - Only mark as corrupted for clear corruption signals (unexpected EOF, invalid gzip) - Prevents false CORRUPTED errors on valid large archives
2026-01-13 14:40:18 +01:00
parent f7e9fa64f0
commit 222bdbef58
3 changed files with 105 additions and 36 deletions
--- a/bin/README.md
+++ b/bin/README.md
@@ -4,8 +4,8 @@ This directory contains pre-compiled binaries for the DB Backup Tool across mult

 ## Build Information
 - **Version**: 3.42.10
- **Build Time**: 2026-01-12_14:25:53_UTC
- **Git Commit**: d19c065
+- **Build Time**: 2026-01-13_07:23:20_UTC
+- **Git Commit**: f153e61

 ## Recent Updates (v1.1.0)
 - ✅ Fixed TUI progress display with line-by-line output
--- a/cmd/dedup.go
+++ b/cmd/dedup.go
@@ -185,15 +185,15 @@ Examples:

 // Flags
 var (
-	dedupDir       string
-	dedupIndexDB   string // Separate path for SQLite index (for NFS/CIFS support)
-	dedupCompress  bool
-	dedupEncrypt   bool
-	dedupKey       string
-	dedupName      string
-	dedupDBType    string
-	dedupDBName    string
-	dedupDBHost    string
+	dedupDir        string
+	dedupIndexDB    string // Separate path for SQLite index (for NFS/CIFS support)
+	dedupCompress   bool
+	dedupEncrypt    bool
+	dedupKey        string
+	dedupName       string
+	dedupDBType     string
+	dedupDBName     string
+	dedupDBHost     string
 	dedupDecompress bool // Auto-decompress gzip input
 )

--- a/internal/restore/diagnose.go
+++ b/internal/restore/diagnose.go
@@ -415,18 +415,18 @@ func (d *Diagnoser) diagnoseSQLScript(filePath string, compressed bool, result *
 // diagnoseClusterArchive analyzes a cluster tar.gz archive
 func (d *Diagnoser) diagnoseClusterArchive(filePath string, result *DiagnoseResult) {
 	// Calculate dynamic timeout based on file size
-	// Assume minimum 50 MB/s throughput for compressed archive listing
-	// Minimum 5 minutes, scales with file size
+	// Large archives (100GB+) can take significant time to list
+	// Minimum 5 minutes, scales with file size, max 180 minutes for very large archives
 	timeoutMinutes := 5
 	if result.FileSize > 0 {
-		// 1 minute per 3 GB, minimum 5 minutes, max 60 minutes
+		// 1 minute per 2 GB, minimum 5 minutes, max 180 minutes
 		sizeGB := result.FileSize / (1024 * 1024 * 1024)
-		estimatedMinutes := int(sizeGB/3) + 5
+		estimatedMinutes := int(sizeGB/2) + 5
 		if estimatedMinutes > timeoutMinutes {
 			timeoutMinutes = estimatedMinutes
 		}
-		if timeoutMinutes > 60 {
-			timeoutMinutes = 60
+		if timeoutMinutes > 180 {
+			timeoutMinutes = 180
 		}
 	}

@@ -437,29 +437,98 @@ func (d *Diagnoser) diagnoseClusterArchive(filePath string, result *DiagnoseResu
 	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutMinutes)*time.Minute)
 	defer cancel()

+	// Use streaming approach with pipes to avoid memory issues with large archives
 	cmd := exec.CommandContext(ctx, "tar", "-tzf", filePath)
-	output, err := cmd.Output()
-	if err != nil {
-		// Check if it was a timeout
-		if ctx.Err() == context.DeadlineExceeded {
-			result.IsValid = false
-			result.Errors = append(result.Errors,
-				fmt.Sprintf("Verification timed out after %d minutes - archive is very large", timeoutMinutes),
-				"This does not necessarily mean the archive is corrupted",
-				"Manual verification: tar -tzf "+filePath+" | wc -l")
-			// Don't mark as corrupted on timeout
-			return
-		}
-		result.IsValid = false
-		result.IsCorrupted = true
-		result.Errors = append(result.Errors,
-			fmt.Sprintf("Tar archive is invalid or corrupted: %v", err),
-			"Run: tar -tzf "+filePath+" 2>&1 | tail -20")
+	stdout, pipeErr := cmd.StdoutPipe()
+	if pipeErr != nil {
+		// Pipe creation failed - not a corruption issue
+		result.Warnings = append(result.Warnings,
+			fmt.Sprintf("Cannot create pipe for verification: %v", pipeErr),
+			"Archive integrity cannot be verified but may still be valid")
 		return
 	}

-	// Parse tar listing
-	files := strings.Split(strings.TrimSpace(string(output)), "\n")
+	var stderrBuf bytes.Buffer
+	cmd.Stderr = &stderrBuf
+
+	if startErr := cmd.Start(); startErr != nil {
+		result.Warnings = append(result.Warnings,
+			fmt.Sprintf("Cannot start tar verification: %v", startErr),
+			"Archive integrity cannot be verified but may still be valid")
+		return
+	}
+
+	// Stream output line by line to avoid buffering entire listing in memory
+	scanner := bufio.NewScanner(stdout)
+	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) // Allow long paths
+
+	var files []string
+	fileCount := 0
+	for scanner.Scan() {
+		fileCount++
+		line := scanner.Text()
+		// Only store dump/metadata files, not every file
+		if strings.HasSuffix(line, ".dump") || strings.HasSuffix(line, ".sql.gz") ||
+			strings.HasSuffix(line, ".sql") || strings.HasSuffix(line, ".json") ||
+			strings.Contains(line, "globals") || strings.Contains(line, "manifest") ||
+			strings.Contains(line, "metadata") {
+			files = append(files, line)
+		}
+	}
+
+	scanErr := scanner.Err()
+	waitErr := cmd.Wait()
+	stderrOutput := stderrBuf.String()
+
+	// Handle errors - distinguish between actual corruption and resource/timeout issues
+	if waitErr != nil || scanErr != nil {
+		// Check if it was a timeout
+		if ctx.Err() == context.DeadlineExceeded {
+			result.Warnings = append(result.Warnings,
+				fmt.Sprintf("Verification timed out after %d minutes - archive is very large", timeoutMinutes),
+				"This does not necessarily mean the archive is corrupted",
+				"Manual verification: tar -tzf "+filePath+" | wc -l")
+			// Don't mark as corrupted or invalid on timeout - archive may be fine
+			if fileCount > 0 {
+				result.Details.TableCount = len(files)
+				result.Details.TableList = files
+			}
+			return
+		}
+
+		// Check for specific gzip/tar corruption indicators
+		if strings.Contains(stderrOutput, "unexpected end of file") ||
+			strings.Contains(stderrOutput, "Unexpected EOF") ||
+			strings.Contains(stderrOutput, "gzip: stdin: unexpected end of file") ||
+			strings.Contains(stderrOutput, "not in gzip format") ||
+			strings.Contains(stderrOutput, "invalid compressed data") {
+			// These indicate actual corruption
+			result.IsValid = false
+			result.IsCorrupted = true
+			result.Errors = append(result.Errors,
+				"Tar archive appears truncated or corrupted",
+				fmt.Sprintf("Error: %s", truncateString(stderrOutput, 200)),
+				"Run: tar -tzf "+filePath+" 2>&1 | tail -20")
+			return
+		}
+
+		// Other errors (signal killed, memory, etc.) - not necessarily corruption
+		// If we read some files successfully, the archive structure is likely OK
+		if fileCount > 0 {
+			result.Warnings = append(result.Warnings,
+				fmt.Sprintf("Verification incomplete (read %d files before error)", fileCount),
+				"Archive may still be valid - error could be due to system resources")
+			// Proceed with what we got
+		} else {
+			// Couldn't read anything - but don't mark as corrupted without clear evidence
+			result.Warnings = append(result.Warnings,
+				fmt.Sprintf("Cannot verify archive: %v", waitErr),
+				"Archive integrity is uncertain - proceed with caution or verify manually")
+			return
+		}
+	}
+
+	// Parse the collected file list
 	var dumpFiles []string
 	hasGlobals := false
 	hasMetadata := false