Add Phase 2 TUI improvements: disk space checks and error hints
- Created internal/checks package for disk space and error classification
- CheckDiskSpace(): real-time disk usage detection (80% warning, 95% critical)
- CheckDiskSpaceForRestore(): 4x archive size requirement calculation
- ClassifyError(): smart error classification (ignorable/warning/critical/fatal)
- FormatErrorWithHint(): user-friendly error messages with actionable solutions
- Integrated disk checks into backup/restore workflows with pre-flight validation
- Error hints for: lock exhaustion, disk full, syntax errors, permissions, connections
- Blocks operations at 95% disk usage, warns at 80%
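End to end, the pieces are meant to compose like this. A minimal sketch using only the API added in this commit; it would have to live inside the dbbackup module (internal/ packages are not importable elsewhere), and the backup path and error string are illustrative:

package main

import (
	"fmt"
	"os"

	"dbbackup/internal/checks"
)

func main() {
	// Pre-flight: blocks at >=95% disk usage, warns at >=80%
	check := checks.CheckDiskSpace("/var/backups") // illustrative path
	fmt.Println(checks.FormatDiskSpaceMessage(check))
	if check.Critical {
		os.Exit(1) // mirror the engine's hard stop below
	}

	// Post-mortem: turn raw tool stderr into an actionable hint
	fmt.Println(checks.FormatErrorWithHint("no space left on device"))
}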
@@ -16,6 +16,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	"dbbackup/internal/checks"
 	"dbbackup/internal/config"
 	"dbbackup/internal/database"
 	"dbbackup/internal/logger"
@@ -303,6 +304,27 @@ func (e *Engine) BackupCluster(ctx context.Context) error {
 		return fmt.Errorf("failed to create backup directory: %w", err)
 	}
 
+	// Check disk space before starting backup
+	e.log.Info("Checking disk space availability")
+	spaceCheck := checks.CheckDiskSpace(e.cfg.BackupDir)
+
+	if !e.silent {
+		// Show disk space status in CLI mode
+		fmt.Println("\n" + checks.FormatDiskSpaceMessage(spaceCheck))
+	}
+
+	if spaceCheck.Critical {
+		operation.Fail("Insufficient disk space")
+		quietProgress.Fail("Insufficient disk space - free up space and try again")
+		return fmt.Errorf("insufficient disk space: %.1f%% used, operation blocked", spaceCheck.UsedPercent)
+	}
+
+	if spaceCheck.Warning {
+		e.log.Warn("Low disk space - backup may fail if database is large",
+			"available_gb", float64(spaceCheck.AvailableBytes)/(1024*1024*1024),
+			"used_percent", spaceCheck.UsedPercent)
+	}
+
 	// Generate timestamp and filename
 	timestamp := time.Now().Format("20060102_150405")
 	outputFile := filepath.Join(e.cfg.BackupDir, fmt.Sprintf("cluster_%s.tar.gz", timestamp))
internal/checks/disk_check.go (new file, 161 lines)
@@ -0,0 +1,161 @@
package checks

import (
	"fmt"
	"path/filepath"
	"syscall"
)

// DiskSpaceCheck represents disk space information
type DiskSpaceCheck struct {
	Path           string
	TotalBytes     uint64
	AvailableBytes uint64
	UsedBytes      uint64
	UsedPercent    float64
	Sufficient     bool
	Warning        bool
	Critical       bool
}

// CheckDiskSpace checks available disk space for a given path
func CheckDiskSpace(path string) *DiskSpaceCheck {
	// Get absolute path
	absPath, err := filepath.Abs(path)
	if err != nil {
		absPath = path
	}

	// Get filesystem stats
	var stat syscall.Statfs_t
	if err := syscall.Statfs(absPath, &stat); err != nil {
		// Return error state
		return &DiskSpaceCheck{
			Path:       absPath,
			Critical:   true,
			Sufficient: false,
		}
	}

	// Calculate space
	totalBytes := stat.Blocks * uint64(stat.Bsize)
	availableBytes := stat.Bavail * uint64(stat.Bsize)
	usedBytes := totalBytes - availableBytes
	usedPercent := float64(usedBytes) / float64(totalBytes) * 100

	check := &DiskSpaceCheck{
		Path:           absPath,
		TotalBytes:     totalBytes,
		AvailableBytes: availableBytes,
		UsedBytes:      usedBytes,
		UsedPercent:    usedPercent,
	}

	// Determine status thresholds
	check.Critical = usedPercent >= 95
	check.Warning = usedPercent >= 80 && !check.Critical
	check.Sufficient = !check.Critical && !check.Warning

	return check
}

// CheckDiskSpaceForRestore checks if there's enough space for restore (needs 4x archive size)
func CheckDiskSpaceForRestore(path string, archiveSize int64) *DiskSpaceCheck {
	check := CheckDiskSpace(path)
	requiredBytes := uint64(archiveSize) * 4 // Account for decompression

	// Override status based on required space
	if check.AvailableBytes < requiredBytes {
		check.Critical = true
		check.Sufficient = false
		check.Warning = false
	} else if check.AvailableBytes < requiredBytes*2 {
		check.Warning = true
		check.Sufficient = false
	}

	return check
}

// FormatDiskSpaceMessage creates a user-friendly disk space message
func FormatDiskSpaceMessage(check *DiskSpaceCheck) string {
	var status string
	var icon string

	if check.Critical {
		status = "CRITICAL"
		icon = "❌"
	} else if check.Warning {
		status = "WARNING"
		icon = "⚠️ "
	} else {
		status = "OK"
		icon = "✓"
	}

	msg := fmt.Sprintf(`📊 Disk Space Check (%s):
   Path: %s
   Total: %s
   Available: %s (%.1f%% used)
   %s Status: %s`,
		status,
		check.Path,
		formatBytes(check.TotalBytes),
		formatBytes(check.AvailableBytes),
		check.UsedPercent,
		icon,
		status)

	if check.Critical {
		msg += "\n \n ⚠️ CRITICAL: Insufficient disk space!"
		msg += "\n Operation blocked. Free up space before continuing."
	} else if check.Warning {
		msg += "\n \n ⚠️ WARNING: Low disk space!"
		msg += "\n Backup may fail if database is larger than estimated."
	} else {
		msg += "\n \n ✓ Sufficient space available"
	}

	return msg
}

// EstimateBackupSize estimates backup size based on database size
func EstimateBackupSize(databaseSize uint64, compressionLevel int) uint64 {
	// Typical compression ratios:
	//   Level 0 (no compression): 1.0x
	//   Level 1-3 (fast):         0.4-0.6x
	//   Level 4-6 (balanced):     0.3-0.4x
	//   Level 7-9 (best):         0.2-0.3x
	var compressionRatio float64
	if compressionLevel == 0 {
		compressionRatio = 1.0
	} else if compressionLevel <= 3 {
		compressionRatio = 0.5
	} else if compressionLevel <= 6 {
		compressionRatio = 0.35
	} else {
		compressionRatio = 0.25
	}

	estimated := uint64(float64(databaseSize) * compressionRatio)

	// Add 10% buffer for metadata, indexes, etc.
	return uint64(float64(estimated) * 1.1)
}

// formatBytes formats bytes to human-readable format
func formatBytes(bytes uint64) string {
	const unit = 1024
	if bytes < unit {
		return fmt.Sprintf("%d B", bytes)
	}
	div, exp := uint64(unit), 0
	for n := bytes / unit; n >= unit; n /= unit {
		div *= unit
		exp++
	}
	return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp])
}
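To make CheckDiskSpaceForRestore's thresholds concrete, a hypothetical worked example: for a 2 GiB archive the requirement is 4 x 2 GiB = 8 GiB free, so the check comes back Critical below 8 GiB and Warning between 8 GiB and 16 GiB (2x the requirement); above that, only the percentage thresholds inherited from CheckDiskSpace still apply. A fragment, assuming the checks package above is imported and the path is illustrative:

	check := checks.CheckDiskSpaceForRestore("/var/backups", 2<<30) // hypothetical 2 GiB archive
	// Critical: < 8 GiB free (or >= 95% disk usage)
	// Warning:  8-16 GiB free (or >= 80% disk usage)
	if check.Critical {
		// refuse to start the restore
	}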
internal/checks/error_hints.go (new file, 221 lines)
@@ -0,0 +1,221 @@
package checks

import (
	"fmt"
	"strings"
)

// ErrorClassification represents the severity and type of error
type ErrorClassification struct {
	Type     string // "ignorable", "warning", "critical", "fatal", or "error" for unclassified
	Category string // "duplicate", "disk_space", "locks", "corruption", "permissions", "network", "version", "unknown"
	Message  string
	Hint     string
	Action   string // Suggested command or action
	Severity int    // 0=info, 1=warning, 2=error, 3=fatal
}

// ClassifyError analyzes an error message and provides actionable hints
func ClassifyError(errorMsg string) *ErrorClassification {
	lowerMsg := strings.ToLower(errorMsg)

	// Ignorable errors (objects already exist)
	if strings.Contains(lowerMsg, "already exists") {
		return &ErrorClassification{
			Type:     "ignorable",
			Category: "duplicate",
			Message:  errorMsg,
			Hint:     "Object already exists in target database - this is normal during restore",
			Action:   "No action needed - restore will continue",
			Severity: 0,
		}
	}

	// Disk space errors
	if strings.Contains(lowerMsg, "no space left") || strings.Contains(lowerMsg, "disk full") {
		return &ErrorClassification{
			Type:     "critical",
			Category: "disk_space",
			Message:  errorMsg,
			Hint:     "Insufficient disk space to complete operation",
			Action:   "Free up disk space: rm old_backups/* or increase storage",
			Severity: 3,
		}
	}

	// Lock exhaustion errors
	if strings.Contains(lowerMsg, "max_locks_per_transaction") ||
		strings.Contains(lowerMsg, "out of shared memory") ||
		strings.Contains(lowerMsg, "could not open large object") {
		return &ErrorClassification{
			Type:     "critical",
			Category: "locks",
			Message:  errorMsg,
			Hint:     "Lock table exhausted - typically caused by large objects in parallel restore",
			Action:   "Increase max_locks_per_transaction in postgresql.conf to 512 or higher",
			Severity: 2,
		}
	}

	// Syntax errors (corrupted dump)
	if strings.Contains(lowerMsg, "syntax error") {
		return &ErrorClassification{
			Type:     "critical",
			Category: "corruption",
			Message:  errorMsg,
			Hint:     "Syntax error in dump file - backup may be corrupted or incomplete",
			Action:   "Re-create backup with: dbbackup backup single <database>",
			Severity: 3,
		}
	}

	// Permission errors
	if strings.Contains(lowerMsg, "permission denied") || strings.Contains(lowerMsg, "must be owner") {
		return &ErrorClassification{
			Type:     "critical",
			Category: "permissions",
			Message:  errorMsg,
			Hint:     "Insufficient permissions to perform operation",
			Action:   "Run as superuser or use --no-owner flag for restore",
			Severity: 2,
		}
	}

	// Connection errors
	if strings.Contains(lowerMsg, "connection refused") ||
		strings.Contains(lowerMsg, "could not connect") ||
		strings.Contains(lowerMsg, "no pg_hba.conf entry") {
		return &ErrorClassification{
			Type:     "critical",
			Category: "network",
			Message:  errorMsg,
			Hint:     "Cannot connect to database server",
			Action:   "Check database is running and pg_hba.conf allows connection",
			Severity: 2,
		}
	}

	// Version compatibility warnings
	if strings.Contains(lowerMsg, "version mismatch") || strings.Contains(lowerMsg, "incompatible") {
		return &ErrorClassification{
			Type:     "warning",
			Category: "version",
			Message:  errorMsg,
			Hint:     "PostgreSQL version mismatch between backup and restore target",
			Action:   "Review release notes for compatibility: https://www.postgresql.org/docs/",
			Severity: 1,
		}
	}

	// Excessive errors (corrupted dump)
	if strings.Contains(errorMsg, "total errors:") {
		parts := strings.Split(errorMsg, "total errors:")
		if len(parts) > 1 {
			var count int
			if _, err := fmt.Sscanf(parts[1], "%d", &count); err == nil && count > 100000 {
				return &ErrorClassification{
					Type:     "fatal",
					Category: "corruption",
					Message:  errorMsg,
					Hint:     fmt.Sprintf("Excessive errors (%d) indicate severely corrupted dump file", count),
					Action:   "Re-create backup from source database",
					Severity: 3,
				}
			}
		}
	}

	// Default: unclassified error
	return &ErrorClassification{
		Type:     "error",
		Category: "unknown",
		Message:  errorMsg,
		Hint:     "An error occurred during operation",
		Action:   "Check logs for details or contact support",
		Severity: 2,
	}
}

// FormatErrorWithHint creates a user-friendly error message with hints
func FormatErrorWithHint(errorMsg string) string {
	classification := ClassifyError(errorMsg)

	var icon string
	switch classification.Type {
	case "ignorable":
		icon = "ℹ️ "
	case "warning":
		icon = "⚠️ "
	case "critical":
		icon = "❌"
	case "fatal":
		icon = "🛑"
	default:
		icon = "⚠️ "
	}

	output := fmt.Sprintf("%s %s Error\n\n", icon, strings.ToUpper(classification.Type))
	output += fmt.Sprintf("Category: %s\n", classification.Category)
	output += fmt.Sprintf("Message: %s\n\n", classification.Message)
	output += fmt.Sprintf("💡 Hint: %s\n\n", classification.Hint)
	output += fmt.Sprintf("🔧 Action: %s\n", classification.Action)

	return output
}

// FormatMultipleErrors formats multiple errors with classification
func FormatMultipleErrors(errors []string) string {
	if len(errors) == 0 {
		return "✓ No errors"
	}

	ignorable := 0
	warnings := 0
	critical := 0
	fatal := 0

	var criticalErrors []string

	for _, err := range errors {
		class := ClassifyError(err)
		switch class.Type {
		case "ignorable":
			ignorable++
		case "warning":
			warnings++
		case "critical":
			critical++
			if len(criticalErrors) < 3 { // Keep first 3 critical errors
				criticalErrors = append(criticalErrors, err)
			}
		case "fatal":
			fatal++
			criticalErrors = append(criticalErrors, err)
		}
	}

	output := "📊 Error Summary:\n\n"
	if ignorable > 0 {
		output += fmt.Sprintf("  ℹ️  %d ignorable (objects already exist)\n", ignorable)
	}
	if warnings > 0 {
		output += fmt.Sprintf("  ⚠️  %d warnings\n", warnings)
	}
	if critical > 0 {
		output += fmt.Sprintf("  ❌ %d critical errors\n", critical)
	}
	if fatal > 0 {
		output += fmt.Sprintf("  🛑 %d fatal errors\n", fatal)
	}

	if len(criticalErrors) > 0 {
		output += "\n📝 Critical Issues:\n\n"
		for i, err := range criticalErrors {
			class := ClassifyError(err)
			output += fmt.Sprintf("%d. %s\n", i+1, class.Hint)
			output += fmt.Sprintf("   Action: %s\n\n", class.Action)
		}
	}

	return output
}
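An illustrative round trip through the classifier, using a typical pg_restore failure line (the message text is an example, not from this commit; assumes the checks package above is imported):

	msg := "ERROR: out of shared memory HINT: You might need to increase max_locks_per_transaction."
	class := checks.ClassifyError(msg)
	// class.Type == "critical", class.Category == "locks", class.Severity == 2
	fmt.Println(checks.FormatErrorWithHint(msg))
	// Prints "❌ CRITICAL Error" with the lock-exhaustion hint and the
	// max_locks_per_transaction action from the table above.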
@@ -11,6 +11,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	"dbbackup/internal/checks"
 	"dbbackup/internal/config"
 	"dbbackup/internal/database"
 	"dbbackup/internal/logger"
@@ -341,10 +342,21 @@ func (e *Engine) executeRestoreCommand(ctx context.Context, cmdArgs []string) er
 		return nil // Success despite ignorable errors
 	}
 
-	e.log.Error("Restore command failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
+	// Classify error and provide helpful hints
 	if lastError != "" {
-		return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d)", err, lastError, errorCount)
+		classification := checks.ClassifyError(lastError)
+		e.log.Error("Restore command failed",
+			"error", err,
+			"last_stderr", lastError,
+			"error_count", errorCount,
+			"error_type", classification.Type,
+			"hint", classification.Hint,
+			"action", classification.Action)
+		return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d) - %s",
+			err, lastError, errorCount, classification.Hint)
 	}
+
+	e.log.Error("Restore command failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
 	return fmt.Errorf("restore failed: %w", err)
 }
@@ -412,10 +424,21 @@ func (e *Engine) executeRestoreWithDecompression(ctx context.Context, archivePat
 		return nil // Success despite ignorable errors
 	}
 
-	e.log.Error("Restore with decompression failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
+	// Classify error and provide helpful hints
 	if lastError != "" {
-		return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d)", err, lastError, errorCount)
+		classification := checks.ClassifyError(lastError)
+		e.log.Error("Restore with decompression failed",
+			"error", err,
+			"last_stderr", lastError,
+			"error_count", errorCount,
+			"error_type", classification.Type,
+			"hint", classification.Hint,
+			"action", classification.Action)
+		return fmt.Errorf("restore failed: %w (last error: %s, total errors: %d) - %s",
+			err, lastError, errorCount, classification.Hint)
 	}
+
+	e.log.Error("Restore with decompression failed", "error", err, "last_stderr", lastError, "error_count", errorCount)
 	return fmt.Errorf("restore failed: %w", err)
 }
@@ -474,6 +497,24 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
 		operation.Fail("Invalid cluster archive format")
 		return fmt.Errorf("not a cluster archive: %s (detected format: %s)", archivePath, format)
 	}
 
+	// Check disk space before starting restore
+	e.log.Info("Checking disk space for restore")
+	archiveInfo, err := os.Stat(archivePath)
+	if err == nil {
+		spaceCheck := checks.CheckDiskSpaceForRestore(e.cfg.BackupDir, archiveInfo.Size())
+
+		if spaceCheck.Critical {
+			operation.Fail("Insufficient disk space")
+			return fmt.Errorf("insufficient disk space for restore: %.1f%% used - need at least 4x archive size", spaceCheck.UsedPercent)
+		}
+
+		if spaceCheck.Warning {
+			e.log.Warn("Low disk space - restore may fail",
+				"available_gb", float64(spaceCheck.AvailableBytes)/(1024*1024*1024),
+				"used_percent", spaceCheck.UsedPercent)
+		}
+	}
+
 	if e.dryRun {
 		e.log.Info("DRY RUN: Would restore cluster", "archive", archivePath)