Files
dbbackup/internal/rto/calculator.go
Alexander Renz f69bfe7071 feat: Add enterprise DBA features for production reliability
New features implemented:

1. Backup Catalog (internal/catalog/)
   - SQLite-based backup tracking
   - Gap detection and RPO monitoring
   - Search and statistics
   - Filesystem sync

2. DR Drill Testing (internal/drill/)
   - Automated restore testing in Docker containers
   - Database validation with custom queries
   - Catalog integration for drill-tested status

3. Smart Notifications (internal/notify/)
   - Event batching with configurable intervals
   - Time-based escalation policies
   - HTML/text/Slack templates

4. Compliance Reports (internal/report/)
   - SOC2, GDPR, HIPAA, PCI-DSS, ISO27001 frameworks
   - Evidence collection from catalog
   - JSON, Markdown, HTML output formats

5. RTO/RPO Calculator (internal/rto/)
   - Recovery objective analysis
   - RTO breakdown by phase
   - Recommendations for improvement

6. Replica-Aware Backup (internal/replica/)
   - Topology detection for PostgreSQL/MySQL
   - Automatic replica selection
   - Configurable selection strategies

7. Parallel Table Backup (internal/parallel/)
   - Concurrent table dumps
   - Worker pool with progress tracking
   - Large table optimization

8. MySQL/MariaDB PITR (internal/pitr/)
   - Binary log parsing and replay
   - Point-in-time recovery support
   - Transaction filtering

CLI commands added: catalog, drill, report, rto

All changes support the goal: reliable 3 AM database recovery.
2025-12-13 20:28:55 +01:00


// Package rto provides RTO/RPO calculation and analysis
package rto

import (
	"context"
	"fmt"
	"sort"
	"time"

	"dbbackup/internal/catalog"
)
// Calculator calculates RTO and RPO metrics
type Calculator struct {
	catalog catalog.Catalog
	config  Config
}

// Config configures RTO/RPO calculations
type Config struct {
	TargetRTO time.Duration `json:"target_rto"` // Target Recovery Time Objective
	TargetRPO time.Duration `json:"target_rpo"` // Target Recovery Point Objective

	// Assumptions for calculation
	NetworkSpeedMbps       float64 `json:"network_speed_mbps"`      // Network speed for cloud restores
	DiskReadSpeedMBps      float64 `json:"disk_read_speed_mbps"`    // Disk read speed
	DiskWriteSpeedMBps     float64 `json:"disk_write_speed_mbps"`   // Disk write speed
	CloudDownloadSpeedMbps float64 `json:"cloud_download_speed_mbps"`

	// Time estimates for various operations
	StartupTimeMinutes    int `json:"startup_time_minutes"`    // DB startup time
	ValidationTimeMinutes int `json:"validation_time_minutes"` // Post-restore validation
	SwitchoverTimeMinutes int `json:"switchover_time_minutes"` // Application switchover time
}
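
// Note on units: *Mbps fields are megabits per second (converted below at
// 125,000 bytes/s per Mbps), while *MBps fields are megabytes per second
// (1,000,000 bytes/s). The JSON tags spell both as "mbps", so take care
// when loading these values from config files.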
// DefaultConfig returns sensible defaults
func DefaultConfig() Config {
	return Config{
		TargetRTO:              4 * time.Hour,
		TargetRPO:              1 * time.Hour,
		NetworkSpeedMbps:       100,
		DiskReadSpeedMBps:      100,
		DiskWriteSpeedMBps:     50,
		CloudDownloadSpeedMbps: 100,
		StartupTimeMinutes:     2,
		ValidationTimeMinutes:  5,
		SwitchoverTimeMinutes:  5,
	}
}
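
// A minimal sketch of tightening the defaults for a hypothetical tier-1
// database (the 15-minute RPO and gigabit link are illustrative values,
// not project recommendations):
//
//	cfg := DefaultConfig()
//	cfg.TargetRPO = 15 * time.Minute
//	cfg.CloudDownloadSpeedMbps = 1000 // gigabit link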
// Analysis contains RTO/RPO analysis results
type Analysis struct {
	Database  string    `json:"database"`
	Timestamp time.Time `json:"timestamp"`

	// Current state
	CurrentRPO time.Duration `json:"current_rpo"`
	CurrentRTO time.Duration `json:"current_rto"`

	// Target state
	TargetRPO time.Duration `json:"target_rpo"`
	TargetRTO time.Duration `json:"target_rto"`

	// Compliance
	RPOCompliant bool `json:"rpo_compliant"`
	RTOCompliant bool `json:"rto_compliant"`

	// Details
	LastBackup     *time.Time    `json:"last_backup,omitempty"`
	NextScheduled  *time.Time    `json:"next_scheduled,omitempty"`
	BackupInterval time.Duration `json:"backup_interval"`

	// RTO breakdown
	RTOBreakdown RTOBreakdown `json:"rto_breakdown"`

	// Recommendations
	Recommendations []Recommendation `json:"recommendations,omitempty"`

	// Historical
	History []HistoricalPoint `json:"history,omitempty"`
}

// RTOBreakdown shows components of RTO calculation
type RTOBreakdown struct {
	DetectionTime  time.Duration `json:"detection_time"`
	DecisionTime   time.Duration `json:"decision_time"`
	DownloadTime   time.Duration `json:"download_time"`
	RestoreTime    time.Duration `json:"restore_time"`
	StartupTime    time.Duration `json:"startup_time"`
	ValidationTime time.Duration `json:"validation_time"`
	SwitchoverTime time.Duration `json:"switchover_time"`
	TotalTime      time.Duration `json:"total_time"`
}

// Recommendation suggests improvements
type Recommendation struct {
	Type        RecommendationType `json:"type"`
	Priority    Priority           `json:"priority"`
	Title       string             `json:"title"`
	Description string             `json:"description"`
	Impact      string             `json:"impact"`
	Effort      Effort             `json:"effort"`
}

// RecommendationType categorizes recommendations
type RecommendationType string

const (
	RecommendBackupFrequency   RecommendationType = "backup_frequency"
	RecommendIncrementalBackup RecommendationType = "incremental_backup"
	RecommendCompression       RecommendationType = "compression"
	RecommendLocalCache        RecommendationType = "local_cache"
	RecommendParallelRestore   RecommendationType = "parallel_restore"
	RecommendWALArchiving      RecommendationType = "wal_archiving"
	RecommendReplication       RecommendationType = "replication"
)

// Priority levels
type Priority string

const (
	PriorityCritical Priority = "critical"
	PriorityHigh     Priority = "high"
	PriorityMedium   Priority = "medium"
	PriorityLow      Priority = "low"
)

// Effort levels
type Effort string

const (
	EffortLow    Effort = "low"
	EffortMedium Effort = "medium"
	EffortHigh   Effort = "high"
)

// HistoricalPoint tracks RTO/RPO over time
type HistoricalPoint struct {
	Timestamp time.Time     `json:"timestamp"`
	RPO       time.Duration `json:"rpo"`
	RTO       time.Duration `json:"rto"`
}
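
// Note: encoding/json marshals time.Duration fields as integer nanoseconds,
// so consumers of these JSON payloads should divide by 1e9 to get seconds.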
// NewCalculator creates a new RTO/RPO calculator
func NewCalculator(cat catalog.Catalog, config Config) *Calculator {
	return &Calculator{
		catalog: cat,
		config:  config,
	}
}

// Analyze performs RTO/RPO analysis for a database
func (c *Calculator) Analyze(ctx context.Context, database string) (*Analysis, error) {
	analysis := &Analysis{
		Database:  database,
		Timestamp: time.Now(),
		TargetRPO: c.config.TargetRPO,
		TargetRTO: c.config.TargetRTO,
	}

	// Get recent backups (assumed newest-first; entries[0] is the latest)
	entries, err := c.catalog.List(ctx, database, 100)
	if err != nil {
		return nil, fmt.Errorf("failed to list backups: %w", err)
	}

	if len(entries) == 0 {
		// No backups - worst case scenario
		analysis.CurrentRPO = 0 // undefined
		analysis.CurrentRTO = 0 // undefined
		analysis.Recommendations = append(analysis.Recommendations, Recommendation{
			Type:        RecommendBackupFrequency,
			Priority:    PriorityCritical,
			Title:       "No Backups Found",
			Description: "No backups exist for this database",
			Impact:      "Cannot recover in case of failure",
			Effort:      EffortLow,
		})
		return analysis, nil
	}

	// Calculate current RPO (time since last backup)
	lastBackup := entries[0].CreatedAt
	analysis.LastBackup = &lastBackup
	analysis.CurrentRPO = time.Since(lastBackup)
	analysis.RPOCompliant = analysis.CurrentRPO <= c.config.TargetRPO

	// Calculate backup interval
	if len(entries) >= 2 {
		analysis.BackupInterval = calculateAverageInterval(entries)
	}

	// Calculate RTO
	latestEntry := entries[0]
	analysis.RTOBreakdown = c.calculateRTOBreakdown(latestEntry)
	analysis.CurrentRTO = analysis.RTOBreakdown.TotalTime
	analysis.RTOCompliant = analysis.CurrentRTO <= c.config.TargetRTO

	// Generate recommendations
	analysis.Recommendations = c.generateRecommendations(analysis, entries)

	// Calculate history
	analysis.History = c.calculateHistory(entries)

	return analysis, nil
}
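
// A usage sketch, assuming cat is an already-open catalog.Catalog and ctx a
// context.Context; the database name "orders" is illustrative:
//
//	calc := NewCalculator(cat, DefaultConfig())
//	analysis, err := calc.Analyze(ctx, "orders")
//	if err != nil {
//		// handle error
//	}
//	fmt.Printf("RPO %s (compliant: %v), RTO %s (compliant: %v)\n",
//		analysis.CurrentRPO, analysis.RPOCompliant,
//		analysis.CurrentRTO, analysis.RTOCompliant)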
// AnalyzeAll analyzes all databases
func (c *Calculator) AnalyzeAll(ctx context.Context) ([]*Analysis, error) {
	databases, err := c.catalog.ListDatabases(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to list databases: %w", err)
	}

	var analyses []*Analysis
	for _, db := range databases {
		analysis, err := c.Analyze(ctx, db)
		if err != nil {
			continue // Skip errors for individual databases
		}
		analyses = append(analyses, analysis)
	}

	return analyses, nil
}
// calculateRTOBreakdown calculates RTO components
func (c *Calculator) calculateRTOBreakdown(entry *catalog.Entry) RTOBreakdown {
	breakdown := RTOBreakdown{
		// Detection time - assume monitoring catches issues quickly
		DetectionTime: 5 * time.Minute,
		// Decision time - human decision making
		DecisionTime: 10 * time.Minute,
		// Startup time
		StartupTime: time.Duration(c.config.StartupTimeMinutes) * time.Minute,
		// Validation time
		ValidationTime: time.Duration(c.config.ValidationTimeMinutes) * time.Minute,
		// Switchover time
		SwitchoverTime: time.Duration(c.config.SwitchoverTimeMinutes) * time.Minute,
	}

	// Calculate download time (if cloud backup)
	if entry.CloudLocation != "" {
		bytesPerSecond := c.config.CloudDownloadSpeedMbps * 125000 // Mbps to bytes/sec
		downloadSeconds := float64(entry.SizeBytes) / bytesPerSecond
		breakdown.DownloadTime = time.Duration(downloadSeconds * float64(time.Second))
	}

	// Calculate restore time, estimated from disk write speed
	bytesPerSecond := c.config.DiskWriteSpeedMBps * 1000000 // MB/s to bytes/sec
	restoreSeconds := float64(entry.SizeBytes) / bytesPerSecond

	// Add overhead for decompression if compressed
	if entry.Compression != "" && entry.Compression != "none" {
		restoreSeconds *= 1.3 // 30% overhead for decompression
	}

	// Add overhead for decryption if encrypted
	if entry.Encrypted {
		restoreSeconds *= 1.1 // 10% overhead for decryption
	}
	breakdown.RestoreTime = time.Duration(restoreSeconds * float64(time.Second))

	// Calculate total
	breakdown.TotalTime = breakdown.DetectionTime +
		breakdown.DecisionTime +
		breakdown.DownloadTime +
		breakdown.RestoreTime +
		breakdown.StartupTime +
		breakdown.ValidationTime +
		breakdown.SwitchoverTime

	return breakdown
}
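
// Worked example with DefaultConfig for a hypothetical 100 GB compressed
// cloud backup: download = 100e9 B / (100 Mbps * 125,000 B/s) = 8,000 s
// (~2h13m); restore = 100e9 B / (50 MB/s * 1e6 B/s) = 2,000 s, * 1.3 for
// decompression = 2,600 s (~43m); plus 5+10+2+5+5 = 27m of fixed phases.
// Total RTO is roughly 3h24m, just under the default 4-hour target.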
// calculateAverageInterval calculates average time between backups.
// Entries are expected newest-first, so each adjacent difference is
// non-negative.
func calculateAverageInterval(entries []*catalog.Entry) time.Duration {
	if len(entries) < 2 {
		return 0
	}

	var totalInterval time.Duration
	for i := 0; i < len(entries)-1; i++ {
		interval := entries[i].CreatedAt.Sub(entries[i+1].CreatedAt)
		totalInterval += interval
	}

	return totalInterval / time.Duration(len(entries)-1)
}
// generateRecommendations creates recommendations based on analysis
func (c *Calculator) generateRecommendations(analysis *Analysis, entries []*catalog.Entry) []Recommendation {
	var recommendations []Recommendation

	// RPO violations
	if !analysis.RPOCompliant {
		gap := analysis.CurrentRPO - c.config.TargetRPO
		recommendations = append(recommendations, Recommendation{
			Type:     RecommendBackupFrequency,
			Priority: PriorityCritical,
			Title:    "RPO Target Not Met",
			Description: fmt.Sprintf("Current RPO (%s) exceeds target (%s) by %s",
				formatDuration(analysis.CurrentRPO),
				formatDuration(c.config.TargetRPO),
				formatDuration(gap)),
			Impact: "Potential data loss exceeds acceptable threshold",
			Effort: EffortLow,
		})
	}

	// RTO violations
	if !analysis.RTOCompliant {
		recommendations = append(recommendations, Recommendation{
			Type:     RecommendParallelRestore,
			Priority: PriorityHigh,
			Title:    "RTO Target Not Met",
			Description: fmt.Sprintf("Estimated recovery time (%s) exceeds target (%s)",
				formatDuration(analysis.CurrentRTO),
				formatDuration(c.config.TargetRTO)),
			Impact: "Recovery may take longer than acceptable",
			Effort: EffortMedium,
		})
	}

	// Large download time
	if analysis.RTOBreakdown.DownloadTime > 30*time.Minute {
		recommendations = append(recommendations, Recommendation{
			Type:     RecommendLocalCache,
			Priority: PriorityMedium,
			Title:    "Consider Local Backup Cache",
			Description: fmt.Sprintf("Cloud download takes %s, local cache would reduce this",
				formatDuration(analysis.RTOBreakdown.DownloadTime)),
			Impact: "Faster recovery from local storage",
			Effort: EffortMedium,
		})
	}

	// No incremental backups
	hasIncremental := false
	for _, e := range entries {
		if e.BackupType == "incremental" {
			hasIncremental = true
			break
		}
	}
	if !hasIncremental && analysis.BackupInterval > 6*time.Hour {
		recommendations = append(recommendations, Recommendation{
			Type:        RecommendIncrementalBackup,
			Priority:    PriorityMedium,
			Title:       "Enable Incremental Backups",
			Description: "Incremental backups can reduce backup time and storage",
			Impact:      "Better RPO with less resource usage",
			Effort:      EffortLow,
		})
	}

	// WAL archiving for PostgreSQL
	if len(entries) > 0 && entries[0].DatabaseType == "postgresql" {
		recommendations = append(recommendations, Recommendation{
			Type:        RecommendWALArchiving,
			Priority:    PriorityMedium,
			Title:       "Consider WAL Archiving",
			Description: "Enable WAL archiving for point-in-time recovery",
			Impact:      "Achieve near-zero RPO with PITR",
			Effort:      EffortMedium,
		})
	}

	return recommendations
}
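
// The 30-minute download and 6-hour interval thresholds above are fixed
// heuristics in this version; making them configurable via Config would be
// a natural extension (an observation, not an implemented feature).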
// calculateHistory generates historical RTO/RPO points
func (c *Calculator) calculateHistory(entries []*catalog.Entry) []HistoricalPoint {
	var history []HistoricalPoint

	// Sort entries by date (oldest first)
	sorted := make([]*catalog.Entry, len(entries))
	copy(sorted, entries)
	sort.Slice(sorted, func(i, j int) bool {
		return sorted[i].CreatedAt.Before(sorted[j].CreatedAt)
	})

	for i, entry := range sorted {
		point := HistoricalPoint{
			Timestamp: entry.CreatedAt,
			RTO:       c.calculateRTOBreakdown(entry).TotalTime,
		}
		// Calculate RPO at that point (time until next backup)
		if i < len(sorted)-1 {
			point.RPO = sorted[i+1].CreatedAt.Sub(entry.CreatedAt)
		} else {
			point.RPO = time.Since(entry.CreatedAt)
		}
		history = append(history, point)
	}

	return history
}
// Summary provides aggregate RTO/RPO status
type Summary struct {
	TotalDatabases   int           `json:"total_databases"`
	RPOCompliant     int           `json:"rpo_compliant"`
	RTOCompliant     int           `json:"rto_compliant"`
	FullyCompliant   int           `json:"fully_compliant"`
	CriticalIssues   int           `json:"critical_issues"`
	WorstRPO         time.Duration `json:"worst_rpo"`
	WorstRTO         time.Duration `json:"worst_rto"`
	WorstRPODatabase string        `json:"worst_rpo_database"`
	WorstRTODatabase string        `json:"worst_rto_database"`
	AverageRPO       time.Duration `json:"average_rpo"`
	AverageRTO       time.Duration `json:"average_rto"`
}

// Summarize creates a summary from analyses
func Summarize(analyses []*Analysis) *Summary {
	summary := &Summary{}
	var totalRPO, totalRTO time.Duration

	for _, a := range analyses {
		summary.TotalDatabases++
		if a.RPOCompliant {
			summary.RPOCompliant++
		}
		if a.RTOCompliant {
			summary.RTOCompliant++
		}
		if a.RPOCompliant && a.RTOCompliant {
			summary.FullyCompliant++
		}
		for _, r := range a.Recommendations {
			if r.Priority == PriorityCritical {
				summary.CriticalIssues++
				break
			}
		}
		if a.CurrentRPO > summary.WorstRPO {
			summary.WorstRPO = a.CurrentRPO
			summary.WorstRPODatabase = a.Database
		}
		if a.CurrentRTO > summary.WorstRTO {
			summary.WorstRTO = a.CurrentRTO
			summary.WorstRTODatabase = a.Database
		}
		totalRPO += a.CurrentRPO
		totalRTO += a.CurrentRTO
	}

	if len(analyses) > 0 {
		summary.AverageRPO = totalRPO / time.Duration(len(analyses))
		summary.AverageRTO = totalRTO / time.Duration(len(analyses))
	}

	return summary
}
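
// A fleet-wide sketch combining AnalyzeAll and Summarize (error handling
// elided; calc and ctx as in the earlier example):
//
//	analyses, err := calc.AnalyzeAll(ctx)
//	if err != nil {
//		// handle error
//	}
//	s := Summarize(analyses)
//	fmt.Printf("%d/%d databases fully compliant, worst RPO %s (%s)\n",
//		s.FullyCompliant, s.TotalDatabases, s.WorstRPO, s.WorstRPODatabase)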
// formatDuration renders a duration as seconds, minutes, or hours and
// minutes for human-readable output
func formatDuration(d time.Duration) string {
	if d < time.Minute {
		return fmt.Sprintf("%.0fs", d.Seconds())
	}
	if d < time.Hour {
		return fmt.Sprintf("%.0fm", d.Minutes())
	}
	hours := int(d.Hours())
	mins := int(d.Minutes()) - hours*60
	return fmt.Sprintf("%dh %dm", hours, mins)
}
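
// For reference: formatDuration(45*time.Second) == "45s",
// formatDuration(30*time.Minute) == "30m", and
// formatDuration(200*time.Minute) == "3h 20m".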