feat: Add enterprise DBA features for production reliability
New features implemented:

1. Backup Catalog (internal/catalog/)
   - SQLite-based backup tracking
   - Gap detection and RPO monitoring
   - Search and statistics
   - Filesystem sync

2. DR Drill Testing (internal/drill/)
   - Automated restore testing in Docker containers
   - Database validation with custom queries
   - Catalog integration for drill-tested status

3. Smart Notifications (internal/notify/)
   - Event batching with configurable intervals
   - Time-based escalation policies
   - HTML/text/Slack templates

4. Compliance Reports (internal/report/)
   - SOC2, GDPR, HIPAA, PCI-DSS, ISO27001 frameworks
   - Evidence collection from catalog
   - JSON, Markdown, HTML output formats

5. RTO/RPO Calculator (internal/rto/)
   - Recovery objective analysis
   - RTO breakdown by phase
   - Recommendations for improvement

6. Replica-Aware Backup (internal/replica/)
   - Topology detection for PostgreSQL/MySQL
   - Automatic replica selection
   - Configurable selection strategies

7. Parallel Table Backup (internal/parallel/)
   - Concurrent table dumps
   - Worker pool with progress tracking
   - Large table optimization

8. MySQL/MariaDB PITR (internal/pitr/)
   - Binary log parsing and replay
   - Point-in-time recovery support
   - Transaction filtering

CLI commands added: catalog, drill, report, rto

All changes support the goal: reliable 3 AM database recovery.
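For orientation, a minimal sketch of how the new rto API composes (illustrative only; how a catalog.Catalog is obtained is not shown in this commit, so it is passed in as a parameter, and printRecoveryStatus is a hypothetical helper):

import (
	"context"
	"fmt"

	"dbbackup/internal/catalog"
	"dbbackup/internal/rto"
)

// printRecoveryStatus analyzes every cataloged database and prints
// one aggregate compliance line.
func printRecoveryStatus(ctx context.Context, cat catalog.Catalog) error {
	calc := rto.NewCalculator(cat, rto.DefaultConfig())
	analyses, err := calc.AnalyzeAll(ctx)
	if err != nil {
		return err
	}
	s := rto.Summarize(analyses)
	fmt.Printf("%d/%d databases fully compliant; worst RPO %s (%s)\n",
		s.FullyCompliant, s.TotalDatabases, s.WorstRPO, s.WorstRPODatabase)
	return nil
}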
internal/rto/calculator.go (new file, 481 lines)
@@ -0,0 +1,481 @@
// Package rto provides RTO/RPO calculation and analysis
package rto

import (
	"context"
	"fmt"
	"sort"
	"time"

	"dbbackup/internal/catalog"
)

// Calculator calculates RTO and RPO metrics
type Calculator struct {
	catalog catalog.Catalog
	config  Config
}

// Config configures RTO/RPO calculations
type Config struct {
	TargetRTO time.Duration `json:"target_rto"` // Target Recovery Time Objective
	TargetRPO time.Duration `json:"target_rpo"` // Target Recovery Point Objective

	// Assumptions for calculation
	NetworkSpeedMbps       float64 `json:"network_speed_mbps"`    // Network speed for cloud restores
	DiskReadSpeedMBps      float64 `json:"disk_read_speed_mbps"`  // Disk read speed
	DiskWriteSpeedMBps     float64 `json:"disk_write_speed_mbps"` // Disk write speed
	CloudDownloadSpeedMbps float64 `json:"cloud_download_speed_mbps"`

	// Time estimates for various operations
	StartupTimeMinutes    int `json:"startup_time_minutes"`    // DB startup time
	ValidationTimeMinutes int `json:"validation_time_minutes"` // Post-restore validation
	SwitchoverTimeMinutes int `json:"switchover_time_minutes"` // Application switchover time
}

// DefaultConfig returns sensible defaults
func DefaultConfig() Config {
	return Config{
		TargetRTO:              4 * time.Hour,
		TargetRPO:              1 * time.Hour,
		NetworkSpeedMbps:       100,
		DiskReadSpeedMBps:      100,
		DiskWriteSpeedMBps:     50,
		CloudDownloadSpeedMbps: 100,
		StartupTimeMinutes:     2,
		ValidationTimeMinutes:  5,
		SwitchoverTimeMinutes:  5,
	}
}
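
// Unit note: the *Mbps fields above are megabits per second (1 Mbps =
// 125,000 bytes/s), while the *MBps fields are megabytes per second
// (1 MB/s = 1,000,000 bytes/s). Illustrative arithmetic with the defaults:
// downloading a 10 GB backup at 100 Mbps takes 10e9 / 12.5e6 = 800 s
// (~13.3 min); writing it back at 50 MB/s takes 10e9 / 50e6 = 200 s (~3.3 min).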

// Analysis contains RTO/RPO analysis results
type Analysis struct {
	Database  string    `json:"database"`
	Timestamp time.Time `json:"timestamp"`

	// Current state
	CurrentRPO time.Duration `json:"current_rpo"`
	CurrentRTO time.Duration `json:"current_rto"`

	// Target state
	TargetRPO time.Duration `json:"target_rpo"`
	TargetRTO time.Duration `json:"target_rto"`

	// Compliance
	RPOCompliant bool `json:"rpo_compliant"`
	RTOCompliant bool `json:"rto_compliant"`

	// Details
	LastBackup     *time.Time    `json:"last_backup,omitempty"`
	NextScheduled  *time.Time    `json:"next_scheduled,omitempty"`
	BackupInterval time.Duration `json:"backup_interval"`

	// RTO breakdown
	RTOBreakdown RTOBreakdown `json:"rto_breakdown"`

	// Recommendations
	Recommendations []Recommendation `json:"recommendations,omitempty"`

	// Historical
	History []HistoricalPoint `json:"history,omitempty"`
}

// RTOBreakdown shows components of RTO calculation
type RTOBreakdown struct {
	DetectionTime  time.Duration `json:"detection_time"`
	DecisionTime   time.Duration `json:"decision_time"`
	DownloadTime   time.Duration `json:"download_time"`
	RestoreTime    time.Duration `json:"restore_time"`
	StartupTime    time.Duration `json:"startup_time"`
	ValidationTime time.Duration `json:"validation_time"`
	SwitchoverTime time.Duration `json:"switchover_time"`
	TotalTime      time.Duration `json:"total_time"`
}

// Recommendation suggests improvements
type Recommendation struct {
	Type        RecommendationType `json:"type"`
	Priority    Priority           `json:"priority"`
	Title       string             `json:"title"`
	Description string             `json:"description"`
	Impact      string             `json:"impact"`
	Effort      Effort             `json:"effort"`
}

// RecommendationType categorizes recommendations
type RecommendationType string

const (
	RecommendBackupFrequency   RecommendationType = "backup_frequency"
	RecommendIncrementalBackup RecommendationType = "incremental_backup"
	RecommendCompression       RecommendationType = "compression"
	RecommendLocalCache        RecommendationType = "local_cache"
	RecommendParallelRestore   RecommendationType = "parallel_restore"
	RecommendWALArchiving      RecommendationType = "wal_archiving"
	RecommendReplication       RecommendationType = "replication"
)

// Priority levels
type Priority string

const (
	PriorityCritical Priority = "critical"
	PriorityHigh     Priority = "high"
	PriorityMedium   Priority = "medium"
	PriorityLow      Priority = "low"
)

// Effort levels
type Effort string

const (
	EffortLow    Effort = "low"
	EffortMedium Effort = "medium"
	EffortHigh   Effort = "high"
)

// HistoricalPoint tracks RTO/RPO over time
type HistoricalPoint struct {
	Timestamp time.Time     `json:"timestamp"`
	RPO       time.Duration `json:"rpo"`
	RTO       time.Duration `json:"rto"`
}

// NewCalculator creates a new RTO/RPO calculator
func NewCalculator(cat catalog.Catalog, config Config) *Calculator {
	return &Calculator{
		catalog: cat,
		config:  config,
	}
}

// Analyze performs RTO/RPO analysis for a database
func (c *Calculator) Analyze(ctx context.Context, database string) (*Analysis, error) {
	analysis := &Analysis{
		Database:  database,
		Timestamp: time.Now(),
		TargetRPO: c.config.TargetRPO,
		TargetRTO: c.config.TargetRTO,
	}

	// Get recent backups
	entries, err := c.catalog.List(ctx, database, 100)
	if err != nil {
		return nil, fmt.Errorf("failed to list backups: %w", err)
	}

	if len(entries) == 0 {
		// No backups - worst case scenario
		analysis.CurrentRPO = 0 // undefined
		analysis.CurrentRTO = 0 // undefined
		analysis.Recommendations = append(analysis.Recommendations, Recommendation{
			Type:        RecommendBackupFrequency,
			Priority:    PriorityCritical,
			Title:       "No Backups Found",
			Description: "No backups exist for this database",
			Impact:      "Cannot recover in case of failure",
			Effort:      EffortLow,
		})
		return analysis, nil
	}

	// Calculate current RPO (time since last backup)
	lastBackup := entries[0].CreatedAt
	analysis.LastBackup = &lastBackup
	analysis.CurrentRPO = time.Since(lastBackup)
	analysis.RPOCompliant = analysis.CurrentRPO <= c.config.TargetRPO

	// Calculate backup interval
	if len(entries) >= 2 {
		analysis.BackupInterval = calculateAverageInterval(entries)
	}

	// Calculate RTO
	latestEntry := entries[0]
	analysis.RTOBreakdown = c.calculateRTOBreakdown(latestEntry)
	analysis.CurrentRTO = analysis.RTOBreakdown.TotalTime
	analysis.RTOCompliant = analysis.CurrentRTO <= c.config.TargetRTO

	// Generate recommendations
	analysis.Recommendations = c.generateRecommendations(analysis, entries)

	// Calculate history
	analysis.History = c.calculateHistory(entries)

	return analysis, nil
}
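
// Note: when no backups exist, Analyze leaves CurrentRPO and CurrentRTO at
// zero as an "undefined" sentinel and LastBackup stays nil; callers should
// not read that zero as a perfect RPO.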

// AnalyzeAll analyzes all databases
func (c *Calculator) AnalyzeAll(ctx context.Context) ([]*Analysis, error) {
	databases, err := c.catalog.ListDatabases(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to list databases: %w", err)
	}

	var analyses []*Analysis
	for _, db := range databases {
		analysis, err := c.Analyze(ctx, db)
		if err != nil {
			continue // Skip errors for individual databases
		}
		analyses = append(analyses, analysis)
	}

	return analyses, nil
}

// calculateRTOBreakdown calculates RTO components
func (c *Calculator) calculateRTOBreakdown(entry *catalog.Entry) RTOBreakdown {
	breakdown := RTOBreakdown{
		// Detection time - assume monitoring catches issues quickly
		DetectionTime: 5 * time.Minute,

		// Decision time - human decision making
		DecisionTime: 10 * time.Minute,

		// Startup time
		StartupTime: time.Duration(c.config.StartupTimeMinutes) * time.Minute,

		// Validation time
		ValidationTime: time.Duration(c.config.ValidationTimeMinutes) * time.Minute,

		// Switchover time
		SwitchoverTime: time.Duration(c.config.SwitchoverTimeMinutes) * time.Minute,
	}

	// Calculate download time (if cloud backup)
	if entry.CloudLocation != "" {
		// Cloud download
		bytesPerSecond := c.config.CloudDownloadSpeedMbps * 125000 // Mbps to bytes/sec
		downloadSeconds := float64(entry.SizeBytes) / bytesPerSecond
		breakdown.DownloadTime = time.Duration(downloadSeconds * float64(time.Second))
	}

	// Calculate restore time
	// Estimate based on disk write speed
	bytesPerSecond := c.config.DiskWriteSpeedMBps * 1000000 // MB/s to bytes/sec
	restoreSeconds := float64(entry.SizeBytes) / bytesPerSecond

	// Add overhead for decompression if compressed
	if entry.Compression != "" && entry.Compression != "none" {
		restoreSeconds *= 1.3 // 30% overhead for decompression
	}

	// Add overhead for decryption if encrypted
	if entry.Encrypted {
		restoreSeconds *= 1.1 // 10% overhead for decryption
	}

	breakdown.RestoreTime = time.Duration(restoreSeconds * float64(time.Second))

	// Calculate total
	breakdown.TotalTime = breakdown.DetectionTime +
		breakdown.DecisionTime +
		breakdown.DownloadTime +
		breakdown.RestoreTime +
		breakdown.StartupTime +
		breakdown.ValidationTime +
		breakdown.SwitchoverTime

	return breakdown
}
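
// Illustrative total with DefaultConfig for a 10 GB compressed cloud backup:
// 5m detection + 10m decision + ~13.3m download (10e9 B / 12.5e6 B/s)
// + ~4.3m restore (200 s at 50 MB/s, x1.3 for decompression)
// + 2m startup + 5m validation + 5m switchover ~= 44.6 minutes.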

// calculateAverageInterval calculates average time between backups
func calculateAverageInterval(entries []*catalog.Entry) time.Duration {
	if len(entries) < 2 {
		return 0
	}

	var totalInterval time.Duration
	for i := 0; i < len(entries)-1; i++ {
		interval := entries[i].CreatedAt.Sub(entries[i+1].CreatedAt)
		totalInterval += interval
	}

	return totalInterval / time.Duration(len(entries)-1)
}
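
// This assumes entries are ordered newest-first (Analyze treats entries[0]
// as the most recent backup), so each CreatedAt.Sub() above is positive.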

// generateRecommendations creates recommendations based on analysis
func (c *Calculator) generateRecommendations(analysis *Analysis, entries []*catalog.Entry) []Recommendation {
	var recommendations []Recommendation

	// RPO violations
	if !analysis.RPOCompliant {
		gap := analysis.CurrentRPO - c.config.TargetRPO
		recommendations = append(recommendations, Recommendation{
			Type:     RecommendBackupFrequency,
			Priority: PriorityCritical,
			Title:    "RPO Target Not Met",
			Description: fmt.Sprintf("Current RPO (%s) exceeds target (%s) by %s",
				formatDuration(analysis.CurrentRPO),
				formatDuration(c.config.TargetRPO),
				formatDuration(gap)),
			Impact: "Potential data loss exceeds acceptable threshold",
			Effort: EffortLow,
		})
	}

	// RTO violations
	if !analysis.RTOCompliant {
		recommendations = append(recommendations, Recommendation{
			Type:     RecommendParallelRestore,
			Priority: PriorityHigh,
			Title:    "RTO Target Not Met",
			Description: fmt.Sprintf("Estimated recovery time (%s) exceeds target (%s)",
				formatDuration(analysis.CurrentRTO),
				formatDuration(c.config.TargetRTO)),
			Impact: "Recovery may take longer than acceptable",
			Effort: EffortMedium,
		})
	}

	// Large download time
	if analysis.RTOBreakdown.DownloadTime > 30*time.Minute {
		recommendations = append(recommendations, Recommendation{
			Type:     RecommendLocalCache,
			Priority: PriorityMedium,
			Title:    "Consider Local Backup Cache",
			Description: fmt.Sprintf("Cloud download takes %s, local cache would reduce this",
				formatDuration(analysis.RTOBreakdown.DownloadTime)),
			Impact: "Faster recovery from local storage",
			Effort: EffortMedium,
		})
	}

	// No incremental backups
	hasIncremental := false
	for _, e := range entries {
		if e.BackupType == "incremental" {
			hasIncremental = true
			break
		}
	}
	if !hasIncremental && analysis.BackupInterval > 6*time.Hour {
		recommendations = append(recommendations, Recommendation{
			Type:        RecommendIncrementalBackup,
			Priority:    PriorityMedium,
			Title:       "Enable Incremental Backups",
			Description: "Incremental backups can reduce backup time and storage",
			Impact:      "Better RPO with less resource usage",
			Effort:      EffortLow,
		})
	}

	// WAL archiving for PostgreSQL
	if len(entries) > 0 && entries[0].DatabaseType == "postgresql" {
		recommendations = append(recommendations, Recommendation{
			Type:        RecommendWALArchiving,
			Priority:    PriorityMedium,
			Title:       "Consider WAL Archiving",
			Description: "Enable WAL archiving for point-in-time recovery",
			Impact:      "Achieve near-zero RPO with PITR",
			Effort:      EffortMedium,
		})
	}

	return recommendations
}

// calculateHistory generates historical RTO/RPO points
func (c *Calculator) calculateHistory(entries []*catalog.Entry) []HistoricalPoint {
	var history []HistoricalPoint

	// Sort entries by date (oldest first)
	sorted := make([]*catalog.Entry, len(entries))
	copy(sorted, entries)
	sort.Slice(sorted, func(i, j int) bool {
		return sorted[i].CreatedAt.Before(sorted[j].CreatedAt)
	})

	for i, entry := range sorted {
		point := HistoricalPoint{
			Timestamp: entry.CreatedAt,
			RTO:       c.calculateRTOBreakdown(entry).TotalTime,
		}

		// Calculate RPO at that point (time until next backup)
		if i < len(sorted)-1 {
			point.RPO = sorted[i+1].CreatedAt.Sub(entry.CreatedAt)
		} else {
			point.RPO = time.Since(entry.CreatedAt)
		}

		history = append(history, point)
	}

	return history
}

// Summary provides aggregate RTO/RPO status
type Summary struct {
	TotalDatabases   int           `json:"total_databases"`
	RPOCompliant     int           `json:"rpo_compliant"`
	RTOCompliant     int           `json:"rto_compliant"`
	FullyCompliant   int           `json:"fully_compliant"`
	CriticalIssues   int           `json:"critical_issues"`
	WorstRPO         time.Duration `json:"worst_rpo"`
	WorstRTO         time.Duration `json:"worst_rto"`
	WorstRPODatabase string        `json:"worst_rpo_database"`
	WorstRTODatabase string        `json:"worst_rto_database"`
	AverageRPO       time.Duration `json:"average_rpo"`
	AverageRTO       time.Duration `json:"average_rto"`
}

// Summarize creates a summary from analyses
func Summarize(analyses []*Analysis) *Summary {
	summary := &Summary{}

	var totalRPO, totalRTO time.Duration

	for _, a := range analyses {
		summary.TotalDatabases++

		if a.RPOCompliant {
			summary.RPOCompliant++
		}
		if a.RTOCompliant {
			summary.RTOCompliant++
		}
		if a.RPOCompliant && a.RTOCompliant {
			summary.FullyCompliant++
		}

		for _, r := range a.Recommendations {
			if r.Priority == PriorityCritical {
				summary.CriticalIssues++
				break
			}
		}

		if a.CurrentRPO > summary.WorstRPO {
			summary.WorstRPO = a.CurrentRPO
			summary.WorstRPODatabase = a.Database
		}
		if a.CurrentRTO > summary.WorstRTO {
			summary.WorstRTO = a.CurrentRTO
			summary.WorstRTODatabase = a.Database
		}

		totalRPO += a.CurrentRPO
		totalRTO += a.CurrentRTO
	}

	if len(analyses) > 0 {
		summary.AverageRPO = totalRPO / time.Duration(len(analyses))
		summary.AverageRTO = totalRTO / time.Duration(len(analyses))
	}

	return summary
}
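
// Note: CriticalIssues counts databases with at least one critical
// recommendation (the break stops after the first match), not the total
// number of critical recommendations across all databases.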

func formatDuration(d time.Duration) string {
	if d < time.Minute {
		return fmt.Sprintf("%.0fs", d.Seconds())
	}
	if d < time.Hour {
		return fmt.Sprintf("%.0fm", d.Minutes())
	}
	hours := int(d.Hours())
	mins := int(d.Minutes()) - hours*60
	return fmt.Sprintf("%dh %dm", hours, mins)
}