Add Phase 2 TUI improvements: disk space checks and error hints
- Created internal/checks package for disk space and error classification
- CheckDiskSpace(): real-time disk usage detection (80% warning, 95% critical)
- CheckDiskSpaceForRestore(): 4x archive size requirement calculation
- ClassifyError(): smart error classification (ignorable/warning/critical/fatal)
- FormatErrorWithHint(): user-friendly error messages with actionable solutions
- Integrated disk checks into backup/restore workflows with pre-flight validation
- Error hints for: lock exhaustion, disk full, syntax errors, permissions, connections
- Blocks operations at 95% disk usage, warns at 80%
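End to end, the pieces are meant to compose like this. A minimal sketch using only the API added in this commit; it would have to live inside the dbbackup module (internal/ packages are not importable elsewhere), and the backup path and error string are illustrative:

package main

import (
	"fmt"
	"os"

	"dbbackup/internal/checks"
)

func main() {
	// Pre-flight: blocks at >=95% disk usage, warns at >=80%
	check := checks.CheckDiskSpace("/var/backups") // illustrative path
	fmt.Println(checks.FormatDiskSpaceMessage(check))
	if check.Critical {
		os.Exit(1) // mirror the engine's hard stop below
	}

	// Post-mortem: turn raw tool stderr into an actionable hint
	fmt.Println(checks.FormatErrorWithHint("no space left on device"))
}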
@@ -16,6 +16,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	"dbbackup/internal/checks"
 	"dbbackup/internal/config"
 	"dbbackup/internal/database"
 	"dbbackup/internal/logger"
@@ -303,6 +304,27 @@ func (e *Engine) BackupCluster(ctx context.Context) error {
 		return fmt.Errorf("failed to create backup directory: %w", err)
 	}
 
+	// Check disk space before starting backup
+	e.log.Info("Checking disk space availability")
+	spaceCheck := checks.CheckDiskSpace(e.cfg.BackupDir)
+
+	if !e.silent {
+		// Show disk space status in CLI mode
+		fmt.Println("\n" + checks.FormatDiskSpaceMessage(spaceCheck))
+	}
+
+	if spaceCheck.Critical {
+		operation.Fail("Insufficient disk space")
+		quietProgress.Fail("Insufficient disk space - free up space and try again")
+		return fmt.Errorf("insufficient disk space: %.1f%% used, operation blocked", spaceCheck.UsedPercent)
+	}
+
+	if spaceCheck.Warning {
+		e.log.Warn("Low disk space - backup may fail if database is large",
+			"available_gb", float64(spaceCheck.AvailableBytes)/(1024*1024*1024),
+			"used_percent", spaceCheck.UsedPercent)
+	}
+
 	// Generate timestamp and filename
 	timestamp := time.Now().Format("20060102_150405")
 	outputFile := filepath.Join(e.cfg.BackupDir, fmt.Sprintf("cluster_%s.tar.gz", timestamp))
internal/checks/disk_check.go (new file, 161 lines)
@@ -0,0 +1,161 @@
package checks

import (
	"fmt"
	"path/filepath"
	"syscall"
)

// DiskSpaceCheck represents disk space information
type DiskSpaceCheck struct {
	Path           string
	TotalBytes     uint64
	AvailableBytes uint64
	UsedBytes      uint64
	UsedPercent    float64
	Sufficient     bool
	Warning        bool
	Critical       bool
}

// CheckDiskSpace checks available disk space for a given path
func CheckDiskSpace(path string) *DiskSpaceCheck {
	// Get absolute path
	absPath, err := filepath.Abs(path)
	if err != nil {
		absPath = path
	}

	// Get filesystem stats
	var stat syscall.Statfs_t
	if err := syscall.Statfs(absPath, &stat); err != nil {
		// Return error state
		return &DiskSpaceCheck{
			Path:       absPath,
			Critical:   true,
			Sufficient: false,
		}
	}

	// Calculate space
	totalBytes := stat.Blocks * uint64(stat.Bsize)
	availableBytes := stat.Bavail * uint64(stat.Bsize)
	usedBytes := totalBytes - availableBytes
	usedPercent := float64(usedBytes) / float64(totalBytes) * 100

	check := &DiskSpaceCheck{
		Path:           absPath,
		TotalBytes:     totalBytes,
		AvailableBytes: availableBytes,
		UsedBytes:      usedBytes,
		UsedPercent:    usedPercent,
	}

	// Determine status thresholds
	check.Critical = usedPercent >= 95
	check.Warning = usedPercent >= 80 && !check.Critical
	check.Sufficient = !check.Critical && !check.Warning

	return check
}

// CheckDiskSpaceForRestore checks if there's enough space for restore (needs 4x archive size)
func CheckDiskSpaceForRestore(path string, archiveSize int64) *DiskSpaceCheck {
	check := CheckDiskSpace(path)
	requiredBytes := uint64(archiveSize) * 4 // Account for decompression

	// Override status based on required space
	if check.AvailableBytes < requiredBytes {
		check.Critical = true
		check.Sufficient = false
		check.Warning = false
	} else if check.AvailableBytes < requiredBytes*2 {
		check.Warning = true
		check.Sufficient = false
	}

	return check
}

// FormatDiskSpaceMessage creates a user-friendly disk space message
func FormatDiskSpaceMessage(check *DiskSpaceCheck) string {
	var status string
	var icon string

	if check.Critical {
		status = "CRITICAL"
		icon = "❌"
	} else if check.Warning {
		status = "WARNING"
		icon = "⚠️ "
	} else {
		status = "OK"
		icon = "✓"
	}

	msg := fmt.Sprintf(`📊 Disk Space Check (%s):
   Path: %s
   Total: %s
   Available: %s (%.1f%% used)
   %s Status: %s`,
		status,
		check.Path,
		formatBytes(check.TotalBytes),
		formatBytes(check.AvailableBytes),
		check.UsedPercent,
		icon,
		status)

	if check.Critical {
		msg += "\n \n ⚠️ CRITICAL: Insufficient disk space!"
		msg += "\n Operation blocked. Free up space before continuing."
	} else if check.Warning {
		msg += "\n \n ⚠️ WARNING: Low disk space!"
		msg += "\n Backup may fail if database is larger than estimated."
	} else {
		msg += "\n \n ✓ Sufficient space available"
	}

	return msg
}

// EstimateBackupSize estimates backup size based on database size
func EstimateBackupSize(databaseSize uint64, compressionLevel int) uint64 {
	// Typical compression ratios:
	//   Level 0 (no compression): 1.0x
	//   Level 1-3 (fast):         0.4-0.6x
	//   Level 4-6 (balanced):     0.3-0.4x
	//   Level 7-9 (best):         0.2-0.3x
	var compressionRatio float64
	if compressionLevel == 0 {
		compressionRatio = 1.0
	} else if compressionLevel <= 3 {
		compressionRatio = 0.5
	} else if compressionLevel <= 6 {
		compressionRatio = 0.35
	} else {
		compressionRatio = 0.25
	}

	estimated := uint64(float64(databaseSize) * compressionRatio)

	// Add 10% buffer for metadata, indexes, etc.
	return uint64(float64(estimated) * 1.1)
}

// formatBytes formats bytes to human-readable format
func formatBytes(bytes uint64) string {
	const unit = 1024
	if bytes < unit {
		return fmt.Sprintf("%d B", bytes)
	}
	div, exp := uint64(unit), 0
	for n := bytes / unit; n >= unit; n /= unit {
		div *= unit
		exp++
	}
	return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp])
}
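To make CheckDiskSpaceForRestore's thresholds concrete, a hypothetical worked example: for a 2 GiB archive the requirement is 4 x 2 GiB = 8 GiB free, so the check comes back Critical below 8 GiB and Warning between 8 GiB and 16 GiB (2x the requirement); above that, only the percentage thresholds inherited from CheckDiskSpace still apply. A fragment, assuming the checks package above is imported and the path is illustrative:

	check := checks.CheckDiskSpaceForRestore("/var/backups", 2<<30) // hypothetical 2 GiB archive
	// Critical: < 8 GiB free (or >= 95% disk usage)
	// Warning:  8-16 GiB free (or >= 80% disk usage)
	if check.Critical {
		// refuse to start the restore
	}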
internal/checks/error_hints.go (new file, 221 lines)
@@ -0,0 +1,221 @@
package checks

import (
	"fmt"
	"strings"
)

// ErrorClassification represents the severity and type of error
type ErrorClassification struct {
	Type     string // "ignorable", "warning", "critical", "fatal", or "error" for unclassified
	Category string // "duplicate", "disk_space", "locks", "corruption", "permissions", "network", "version", "unknown"
	Message  string
	Hint     string
	Action   string // Suggested command or action
	Severity int    // 0=info, 1=warning, 2=error, 3=fatal
}

// ClassifyError analyzes an error message and provides actionable hints
func ClassifyError(errorMsg string) *ErrorClassification {
	lowerMsg := strings.ToLower(errorMsg)

	// Ignorable errors (objects already exist)
	if strings.Contains(lowerMsg, "already exists") {
		return &ErrorClassification{
			Type:     "ignorable",
			Category: "duplicate",
			Message:  errorMsg,
			Hint:     "Object already exists in target database - this is normal during restore",
			Action:   "No action needed - restore will continue",
			Severity: 0,
		}
	}

	// Disk space errors
	if strings.Contains(lowerMsg, "no space left") || strings.Contains(lowerMsg, "disk full") {
		return &ErrorClassification{
			Type:     "critical",
			Category: "disk_space",
			Message:  errorMsg,
			Hint:     "Insufficient disk space to complete operation",
			Action:   "Free up disk space: rm old_backups/* or increase storage",
			Severity: 3,
		}
	}

	// Lock exhaustion errors
	if strings.Contains(lowerMsg, "max_locks_per_transaction") ||
		strings.Contains(lowerMsg, "out of shared memory") ||
		strings.Contains(lowerMsg, "could not open large object") {
		return &ErrorClassification{
			Type:     "critical",
			Category: "locks",
			Message:  errorMsg,
			Hint:     "Lock table exhausted - typically caused by large objects in parallel restore",
			Action:   "Increase max_locks_per_transaction in postgresql.conf to 512 or higher",
			Severity: 2,
		}
	}

	// Syntax errors (corrupted dump)
	if strings.Contains(lowerMsg, "syntax error") {
		return &ErrorClassification{
			Type:     "critical",
			Category: "corruption",
			Message:  errorMsg,
			Hint:     "Syntax error in dump file - backup may be corrupted or incomplete",
			Action:   "Re-create backup with: dbbackup backup single <database>",
			Severity: 3,
		}
	}

	// Permission errors
	if strings.Contains(lowerMsg, "permission denied") || strings.Contains(lowerMsg, "must be owner") {
		return &ErrorClassification{
			Type:     "critical",
			Category: "permissions",
			Message:  errorMsg,
			Hint:     "Insufficient permissions to perform operation",
			Action:   "Run as superuser or use --no-owner flag for restore",
			Severity: 2,
		}
	}

	// Connection errors
	if strings.Contains(lowerMsg, "connection refused") ||
		strings.Contains(lowerMsg, "could not connect") ||
		strings.Contains(lowerMsg, "no pg_hba.conf entry") {
		return &ErrorClassification{
			Type:     "critical",
			Category: "network",
			Message:  errorMsg,
			Hint:     "Cannot connect to database server",
			Action:   "Check database is running and pg_hba.conf allows connection",
			Severity: 2,
		}
	}

	// Version compatibility warnings
	if strings.Contains(lowerMsg, "version mismatch") || strings.Contains(lowerMsg, "incompatible") {
		return &ErrorClassification{
			Type:     "warning",
			Category: "version",
			Message:  errorMsg,
			Hint:     "PostgreSQL version mismatch between backup and restore target",
			Action:   "Review release notes for compatibility: https://www.postgresql.org/docs/",
			Severity: 1,
		}
	}

	// Excessive errors (corrupted dump)
	if strings.Contains(errorMsg, "total errors:") {
		parts := strings.Split(errorMsg, "total errors:")
		if len(parts) > 1 {
			var count int
			if _, err := fmt.Sscanf(parts[1], "%d", &count); err == nil && count > 100000 {
				return &ErrorClassification{
					Type:     "fatal",
					Category: "corruption",
					Message:  errorMsg,
					Hint:     fmt.Sprintf("Excessive errors (%d) indicate severely corrupted dump file", count),
					Action:   "Re-create backup from source database",
					Severity: 3,
				}
			}
		}
	}

	// Default: unclassified error
	return &ErrorClassification{
		Type:     "error",
		Category: "unknown",
		Message:  errorMsg,
		Hint:     "An error occurred during operation",
		Action:   "Check logs for details or contact support",
		Severity: 2,
	}
}

// FormatErrorWithHint creates a user-friendly error message with hints
func FormatErrorWithHint(errorMsg string) string {
	classification := ClassifyError(errorMsg)

	var icon string
	switch classification.Type {
	case "ignorable":
		icon = "ℹ️ "
	case "warning":
		icon = "⚠️ "
	case "critical":
		icon = "❌"
	case "fatal":
		icon = "🛑"
	default:
		icon = "⚠️ "
	}

	output := fmt.Sprintf("%s %s Error\n\n", icon, strings.ToUpper(classification.Type))
	output += fmt.Sprintf("Category: %s\n", classification.Category)
	output += fmt.Sprintf("Message: %s\n\n", classification.Message)
	output += fmt.Sprintf("💡 Hint: %s\n\n", classification.Hint)
	output += fmt.Sprintf("🔧 Action: %s\n", classification.Action)

	return output
}

// FormatMultipleErrors formats multiple errors with classification
func FormatMultipleErrors(errors []string) string {
	if len(errors) == 0 {
		return "✓ No errors"
	}

	ignorable := 0
	warnings := 0
	critical := 0
	fatal := 0

	var criticalErrors []string

	for _, err := range errors {
		class := ClassifyError(err)
		switch class.Type {
		case "ignorable":
			ignorable++
		case "warning":
			warnings++
		case "critical":
			critical++
			if len(criticalErrors) < 3 { // Keep first 3 critical errors
				criticalErrors = append(criticalErrors, err)
			}
		case "fatal":
			fatal++
			criticalErrors = append(criticalErrors, err)
		}
	}

	output := "📊 Error Summary:\n\n"
	if ignorable > 0 {
		output += fmt.Sprintf("  ℹ️  %d ignorable (objects already exist)\n", ignorable)
	}
	if warnings > 0 {
		output += fmt.Sprintf("  ⚠️  %d warnings\n", warnings)
	}
	if critical > 0 {
		output += fmt.Sprintf("  ❌ %d critical errors\n", critical)
	}
	if fatal > 0 {
		output += fmt.Sprintf("  🛑 %d fatal errors\n", fatal)
	}

	if len(criticalErrors) > 0 {
		output += "\n📝 Critical Issues:\n\n"
		for i, err := range criticalErrors {
			class := ClassifyError(err)
			output += fmt.Sprintf("%d. %s\n", i+1, class.Hint)
			output += fmt.Sprintf("   Action: %s\n\n", class.Action)
		}
	}

	return output
}
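An illustrative round trip through the classifier, using a typical pg_restore failure line (the message text is an example, not from this commit; assumes the checks package above is imported):

	msg := "ERROR: out of shared memory HINT: You might need to increase max_locks_per_transaction."
	class := checks.ClassifyError(msg)
	// class.Type == "critical", class.Category == "locks", class.Severity == 2
	fmt.Println(checks.FormatErrorWithHint(msg))
	// Prints "❌ CRITICAL Error" with the lock-exhaustion hint and the
	// max_locks_per_transaction action from the table above.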
@@ -11,6 +11,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	"dbbackup/internal/checks"
 	"dbbackup/internal/config"
 	"dbbackup/internal/database"
 	"dbbackup/internal/logger"
@@ -341,10 +342,21 @@ func (e *Engine) executeRestoreCommand(ctx context.Context, cmdArgs []string) er
 		return nil // Success despite ignorable errors
 	}
 
-	e.log.Error("Restore command failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
+	// Classify error and provide helpful hints
 	if lastError != "" {
-		return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d)", err, lastError, errorCount)
+		classification := checks.ClassifyError(lastError)
+		e.log.Error("Restore command failed",
+			"error", err,
+			"last_stderr", lastError,
+			"error_count", errorCount,
+			"error_type", classification.Type,
+			"hint", classification.Hint,
+			"action", classification.Action)
+		return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d) - %s",
+			err, lastError, errorCount, classification.Hint)
 	}
+
+	e.log.Error("Restore command failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
 	return fmt.Errorf("restore failed: %w", err)
 }
@@ -412,10 +424,21 @@ func (e *Engine) executeRestoreWithDecompression(ctx context.Context, archivePat
 		return nil // Success despite ignorable errors
 	}
 
-	e.log.Error("Restore with decompression failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
+	// Classify error and provide helpful hints
 	if lastError != "" {
-		return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d)", err, lastError, errorCount)
+		classification := checks.ClassifyError(lastError)
+		e.log.Error("Restore with decompression failed",
+			"error", err,
+			"last_stderr", lastError,
+			"error_count", errorCount,
+			"error_type", classification.Type,
+			"hint", classification.Hint,
+			"action", classification.Action)
+		return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d) - %s",
+			err, lastError, errorCount, classification.Hint)
 	}
+
+	e.log.Error("Restore with decompression failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
 	return fmt.Errorf("restore failed: %w", err)
 }
@@ -474,6 +497,24 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
 		operation.Fail("Invalid cluster archive format")
 		return fmt.Errorf("not a cluster archive: %s (detected format: %s)", archivePath, format)
 	}
 
+	// Check disk space before starting restore
+	e.log.Info("Checking disk space for restore")
+	archiveInfo, err := os.Stat(archivePath)
+	if err == nil {
+		spaceCheck := checks.CheckDiskSpaceForRestore(e.cfg.BackupDir, archiveInfo.Size())
+
+		if spaceCheck.Critical {
+			operation.Fail("Insufficient disk space")
+			return fmt.Errorf("insufficient disk space for restore: %.1f%% used - need at least 4x archive size", spaceCheck.UsedPercent)
+		}
+
+		if spaceCheck.Warning {
+			e.log.Warn("Low disk space - restore may fail",
+				"available_gb", float64(spaceCheck.AvailableBytes)/(1024*1024*1024),
+				"used_percent", spaceCheck.UsedPercent)
+		}
+	}
+
 	if e.dryRun {
 		e.log.Info("DRY RUN: Would restore cluster", "archive", archivePath)