feat: Add enterprise DBA features for production reliability
New features implemented:

1. Backup Catalog (internal/catalog/)
   - SQLite-based backup tracking
   - Gap detection and RPO monitoring
   - Search and statistics
   - Filesystem sync

2. DR Drill Testing (internal/drill/)
   - Automated restore testing in Docker containers
   - Database validation with custom queries
   - Catalog integration for drill-tested status

3. Smart Notifications (internal/notify/)
   - Event batching with configurable intervals
   - Time-based escalation policies
   - HTML/text/Slack templates

4. Compliance Reports (internal/report/)
   - SOC2, GDPR, HIPAA, PCI-DSS, ISO27001 frameworks
   - Evidence collection from catalog
   - JSON, Markdown, HTML output formats

5. RTO/RPO Calculator (internal/rto/)
   - Recovery objective analysis
   - RTO breakdown by phase
   - Recommendations for improvement

6. Replica-Aware Backup (internal/replica/)
   - Topology detection for PostgreSQL/MySQL
   - Automatic replica selection
   - Configurable selection strategies

7. Parallel Table Backup (internal/parallel/)
   - Concurrent table dumps
   - Worker pool with progress tracking
   - Large table optimization

8. MySQL/MariaDB PITR (internal/pitr/)
   - Binary log parsing and replay
   - Point-in-time recovery support
   - Transaction filtering

CLI commands added: catalog, drill, report, rto

All changes support the goal: reliable 3 AM database recovery.
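For orientation, a minimal sketch of how the new rto API composes (illustrative only; how a catalog.Catalog is obtained is not shown in this commit, so it is passed in as a parameter, and printRecoveryStatus is a hypothetical helper):

import (
	"context"
	"fmt"

	"dbbackup/internal/catalog"
	"dbbackup/internal/rto"
)

// printRecoveryStatus analyzes every cataloged database and prints
// one aggregate compliance line.
func printRecoveryStatus(ctx context.Context, cat catalog.Catalog) error {
	calc := rto.NewCalculator(cat, rto.DefaultConfig())
	analyses, err := calc.AnalyzeAll(ctx)
	if err != nil {
		return err
	}
	s := rto.Summarize(analyses)
	fmt.Printf("%d/%d databases fully compliant; worst RPO %s (%s)\n",
		s.FullyCompliant, s.TotalDatabases, s.WorstRPO, s.WorstRPODatabase)
	return nil
}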
internal/rto/calculator.go (new file, 481 lines)
@@ -0,0 +1,481 @@
// Package rto provides RTO/RPO calculation and analysis
package rto

import (
	"context"
	"fmt"
	"sort"
	"time"

	"dbbackup/internal/catalog"
)

// Calculator calculates RTO and RPO metrics
type Calculator struct {
	catalog catalog.Catalog
	config  Config
}

// Config configures RTO/RPO calculations
type Config struct {
	TargetRTO time.Duration `json:"target_rto"` // Target Recovery Time Objective
	TargetRPO time.Duration `json:"target_rpo"` // Target Recovery Point Objective

	// Assumptions for calculation
	NetworkSpeedMbps       float64 `json:"network_speed_mbps"`    // Network speed for cloud restores
	DiskReadSpeedMBps      float64 `json:"disk_read_speed_mbps"`  // Disk read speed
	DiskWriteSpeedMBps     float64 `json:"disk_write_speed_mbps"` // Disk write speed
	CloudDownloadSpeedMbps float64 `json:"cloud_download_speed_mbps"`

	// Time estimates for various operations
	StartupTimeMinutes    int `json:"startup_time_minutes"`    // DB startup time
	ValidationTimeMinutes int `json:"validation_time_minutes"` // Post-restore validation
	SwitchoverTimeMinutes int `json:"switchover_time_minutes"` // Application switchover time
}

// DefaultConfig returns sensible defaults
func DefaultConfig() Config {
	return Config{
		TargetRTO:              4 * time.Hour,
		TargetRPO:              1 * time.Hour,
		NetworkSpeedMbps:       100,
		DiskReadSpeedMBps:      100,
		DiskWriteSpeedMBps:     50,
		CloudDownloadSpeedMbps: 100,
		StartupTimeMinutes:     2,
		ValidationTimeMinutes:  5,
		SwitchoverTimeMinutes:  5,
	}
}
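
// Unit note: the *Mbps fields above are megabits per second (1 Mbps =
// 125,000 bytes/s), while the *MBps fields are megabytes per second
// (1 MB/s = 1,000,000 bytes/s). Illustrative arithmetic with the defaults:
// downloading a 10 GB backup at 100 Mbps takes 10e9 / 12.5e6 = 800 s
// (~13.3 min); writing it back at 50 MB/s takes 10e9 / 50e6 = 200 s (~3.3 min).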

// Analysis contains RTO/RPO analysis results
type Analysis struct {
	Database  string    `json:"database"`
	Timestamp time.Time `json:"timestamp"`

	// Current state
	CurrentRPO time.Duration `json:"current_rpo"`
	CurrentRTO time.Duration `json:"current_rto"`

	// Target state
	TargetRPO time.Duration `json:"target_rpo"`
	TargetRTO time.Duration `json:"target_rto"`

	// Compliance
	RPOCompliant bool `json:"rpo_compliant"`
	RTOCompliant bool `json:"rto_compliant"`

	// Details
	LastBackup     *time.Time    `json:"last_backup,omitempty"`
	NextScheduled  *time.Time    `json:"next_scheduled,omitempty"`
	BackupInterval time.Duration `json:"backup_interval"`

	// RTO breakdown
	RTOBreakdown RTOBreakdown `json:"rto_breakdown"`

	// Recommendations
	Recommendations []Recommendation `json:"recommendations,omitempty"`

	// Historical
	History []HistoricalPoint `json:"history,omitempty"`
}

// RTOBreakdown shows components of RTO calculation
type RTOBreakdown struct {
	DetectionTime  time.Duration `json:"detection_time"`
	DecisionTime   time.Duration `json:"decision_time"`
	DownloadTime   time.Duration `json:"download_time"`
	RestoreTime    time.Duration `json:"restore_time"`
	StartupTime    time.Duration `json:"startup_time"`
	ValidationTime time.Duration `json:"validation_time"`
	SwitchoverTime time.Duration `json:"switchover_time"`
	TotalTime      time.Duration `json:"total_time"`
}

// Recommendation suggests improvements
type Recommendation struct {
	Type        RecommendationType `json:"type"`
	Priority    Priority           `json:"priority"`
	Title       string             `json:"title"`
	Description string             `json:"description"`
	Impact      string             `json:"impact"`
	Effort      Effort             `json:"effort"`
}

// RecommendationType categorizes recommendations
type RecommendationType string

const (
	RecommendBackupFrequency   RecommendationType = "backup_frequency"
	RecommendIncrementalBackup RecommendationType = "incremental_backup"
	RecommendCompression       RecommendationType = "compression"
	RecommendLocalCache        RecommendationType = "local_cache"
	RecommendParallelRestore   RecommendationType = "parallel_restore"
	RecommendWALArchiving      RecommendationType = "wal_archiving"
	RecommendReplication       RecommendationType = "replication"
)

// Priority levels
type Priority string

const (
	PriorityCritical Priority = "critical"
	PriorityHigh     Priority = "high"
	PriorityMedium   Priority = "medium"
	PriorityLow      Priority = "low"
)

// Effort levels
type Effort string

const (
	EffortLow    Effort = "low"
	EffortMedium Effort = "medium"
	EffortHigh   Effort = "high"
)

// HistoricalPoint tracks RTO/RPO over time
type HistoricalPoint struct {
	Timestamp time.Time     `json:"timestamp"`
	RPO       time.Duration `json:"rpo"`
	RTO       time.Duration `json:"rto"`
}

// NewCalculator creates a new RTO/RPO calculator
func NewCalculator(cat catalog.Catalog, config Config) *Calculator {
	return &Calculator{
		catalog: cat,
		config:  config,
	}
}

// Analyze performs RTO/RPO analysis for a database
func (c *Calculator) Analyze(ctx context.Context, database string) (*Analysis, error) {
	analysis := &Analysis{
		Database:  database,
		Timestamp: time.Now(),
		TargetRPO: c.config.TargetRPO,
		TargetRTO: c.config.TargetRTO,
	}

	// Get recent backups
	entries, err := c.catalog.List(ctx, database, 100)
	if err != nil {
		return nil, fmt.Errorf("failed to list backups: %w", err)
	}

	if len(entries) == 0 {
		// No backups - worst case scenario
		analysis.CurrentRPO = 0 // undefined
		analysis.CurrentRTO = 0 // undefined
		analysis.Recommendations = append(analysis.Recommendations, Recommendation{
			Type:        RecommendBackupFrequency,
			Priority:    PriorityCritical,
			Title:       "No Backups Found",
			Description: "No backups exist for this database",
			Impact:      "Cannot recover in case of failure",
			Effort:      EffortLow,
		})
		return analysis, nil
	}

	// Calculate current RPO (time since last backup)
	lastBackup := entries[0].CreatedAt
	analysis.LastBackup = &lastBackup
	analysis.CurrentRPO = time.Since(lastBackup)
	analysis.RPOCompliant = analysis.CurrentRPO <= c.config.TargetRPO

	// Calculate backup interval
	if len(entries) >= 2 {
		analysis.BackupInterval = calculateAverageInterval(entries)
	}

	// Calculate RTO
	latestEntry := entries[0]
	analysis.RTOBreakdown = c.calculateRTOBreakdown(latestEntry)
	analysis.CurrentRTO = analysis.RTOBreakdown.TotalTime
	analysis.RTOCompliant = analysis.CurrentRTO <= c.config.TargetRTO

	// Generate recommendations
	analysis.Recommendations = c.generateRecommendations(analysis, entries)

	// Calculate history
	analysis.History = c.calculateHistory(entries)

	return analysis, nil
}
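
// Note: when no backups exist, Analyze leaves CurrentRPO and CurrentRTO at
// zero as an "undefined" sentinel and LastBackup stays nil; callers should
// not read that zero as a perfect RPO.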

// AnalyzeAll analyzes all databases
func (c *Calculator) AnalyzeAll(ctx context.Context) ([]*Analysis, error) {
	databases, err := c.catalog.ListDatabases(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to list databases: %w", err)
	}

	var analyses []*Analysis
	for _, db := range databases {
		analysis, err := c.Analyze(ctx, db)
		if err != nil {
			continue // Skip errors for individual databases
		}
		analyses = append(analyses, analysis)
	}

	return analyses, nil
}

// calculateRTOBreakdown calculates RTO components
func (c *Calculator) calculateRTOBreakdown(entry *catalog.Entry) RTOBreakdown {
	breakdown := RTOBreakdown{
		// Detection time - assume monitoring catches issues quickly
		DetectionTime: 5 * time.Minute,

		// Decision time - human decision making
		DecisionTime: 10 * time.Minute,

		// Startup time
		StartupTime: time.Duration(c.config.StartupTimeMinutes) * time.Minute,

		// Validation time
		ValidationTime: time.Duration(c.config.ValidationTimeMinutes) * time.Minute,

		// Switchover time
		SwitchoverTime: time.Duration(c.config.SwitchoverTimeMinutes) * time.Minute,
	}

	// Calculate download time (if cloud backup)
	if entry.CloudLocation != "" {
		// Cloud download
		bytesPerSecond := c.config.CloudDownloadSpeedMbps * 125000 // Mbps to bytes/sec
		downloadSeconds := float64(entry.SizeBytes) / bytesPerSecond
		breakdown.DownloadTime = time.Duration(downloadSeconds * float64(time.Second))
	}

	// Calculate restore time
	// Estimate based on disk write speed
	bytesPerSecond := c.config.DiskWriteSpeedMBps * 1000000 // MB/s to bytes/sec
	restoreSeconds := float64(entry.SizeBytes) / bytesPerSecond

	// Add overhead for decompression if compressed
	if entry.Compression != "" && entry.Compression != "none" {
		restoreSeconds *= 1.3 // 30% overhead for decompression
	}

	// Add overhead for decryption if encrypted
	if entry.Encrypted {
		restoreSeconds *= 1.1 // 10% overhead for decryption
	}

	breakdown.RestoreTime = time.Duration(restoreSeconds * float64(time.Second))

	// Calculate total
	breakdown.TotalTime = breakdown.DetectionTime +
		breakdown.DecisionTime +
		breakdown.DownloadTime +
		breakdown.RestoreTime +
		breakdown.StartupTime +
		breakdown.ValidationTime +
		breakdown.SwitchoverTime

	return breakdown
}
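
// Illustrative total with DefaultConfig for a 10 GB compressed cloud backup:
// 5m detection + 10m decision + ~13.3m download (10e9 B / 12.5e6 B/s)
// + ~4.3m restore (200 s at 50 MB/s, x1.3 for decompression)
// + 2m startup + 5m validation + 5m switchover ~= 44.6 minutes.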

// calculateAverageInterval calculates average time between backups
func calculateAverageInterval(entries []*catalog.Entry) time.Duration {
	if len(entries) < 2 {
		return 0
	}

	var totalInterval time.Duration
	for i := 0; i < len(entries)-1; i++ {
		interval := entries[i].CreatedAt.Sub(entries[i+1].CreatedAt)
		totalInterval += interval
	}

	return totalInterval / time.Duration(len(entries)-1)
}
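
// This assumes entries are ordered newest-first (Analyze treats entries[0]
// as the most recent backup), so each CreatedAt.Sub() above is positive.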

// generateRecommendations creates recommendations based on analysis
func (c *Calculator) generateRecommendations(analysis *Analysis, entries []*catalog.Entry) []Recommendation {
	var recommendations []Recommendation

	// RPO violations
	if !analysis.RPOCompliant {
		gap := analysis.CurrentRPO - c.config.TargetRPO
		recommendations = append(recommendations, Recommendation{
			Type:     RecommendBackupFrequency,
			Priority: PriorityCritical,
			Title:    "RPO Target Not Met",
			Description: fmt.Sprintf("Current RPO (%s) exceeds target (%s) by %s",
				formatDuration(analysis.CurrentRPO),
				formatDuration(c.config.TargetRPO),
				formatDuration(gap)),
			Impact: "Potential data loss exceeds acceptable threshold",
			Effort: EffortLow,
		})
	}

	// RTO violations
	if !analysis.RTOCompliant {
		recommendations = append(recommendations, Recommendation{
			Type:     RecommendParallelRestore,
			Priority: PriorityHigh,
			Title:    "RTO Target Not Met",
			Description: fmt.Sprintf("Estimated recovery time (%s) exceeds target (%s)",
				formatDuration(analysis.CurrentRTO),
				formatDuration(c.config.TargetRTO)),
			Impact: "Recovery may take longer than acceptable",
			Effort: EffortMedium,
		})
	}

	// Large download time
	if analysis.RTOBreakdown.DownloadTime > 30*time.Minute {
		recommendations = append(recommendations, Recommendation{
			Type:     RecommendLocalCache,
			Priority: PriorityMedium,
			Title:    "Consider Local Backup Cache",
			Description: fmt.Sprintf("Cloud download takes %s, local cache would reduce this",
				formatDuration(analysis.RTOBreakdown.DownloadTime)),
			Impact: "Faster recovery from local storage",
			Effort: EffortMedium,
		})
	}

	// No incremental backups
	hasIncremental := false
	for _, e := range entries {
		if e.BackupType == "incremental" {
			hasIncremental = true
			break
		}
	}
	if !hasIncremental && analysis.BackupInterval > 6*time.Hour {
		recommendations = append(recommendations, Recommendation{
			Type:        RecommendIncrementalBackup,
			Priority:    PriorityMedium,
			Title:       "Enable Incremental Backups",
			Description: "Incremental backups can reduce backup time and storage",
			Impact:      "Better RPO with less resource usage",
			Effort:      EffortLow,
		})
	}

	// WAL archiving for PostgreSQL
	if len(entries) > 0 && entries[0].DatabaseType == "postgresql" {
		recommendations = append(recommendations, Recommendation{
			Type:        RecommendWALArchiving,
			Priority:    PriorityMedium,
			Title:       "Consider WAL Archiving",
			Description: "Enable WAL archiving for point-in-time recovery",
			Impact:      "Achieve near-zero RPO with PITR",
			Effort:      EffortMedium,
		})
	}

	return recommendations
}

// calculateHistory generates historical RTO/RPO points
func (c *Calculator) calculateHistory(entries []*catalog.Entry) []HistoricalPoint {
	var history []HistoricalPoint

	// Sort entries by date (oldest first)
	sorted := make([]*catalog.Entry, len(entries))
	copy(sorted, entries)
	sort.Slice(sorted, func(i, j int) bool {
		return sorted[i].CreatedAt.Before(sorted[j].CreatedAt)
	})

	for i, entry := range sorted {
		point := HistoricalPoint{
			Timestamp: entry.CreatedAt,
			RTO:       c.calculateRTOBreakdown(entry).TotalTime,
		}

		// Calculate RPO at that point (time until next backup)
		if i < len(sorted)-1 {
			point.RPO = sorted[i+1].CreatedAt.Sub(entry.CreatedAt)
		} else {
			point.RPO = time.Since(entry.CreatedAt)
		}

		history = append(history, point)
	}

	return history
}

// Summary provides aggregate RTO/RPO status
type Summary struct {
	TotalDatabases   int           `json:"total_databases"`
	RPOCompliant     int           `json:"rpo_compliant"`
	RTOCompliant     int           `json:"rto_compliant"`
	FullyCompliant   int           `json:"fully_compliant"`
	CriticalIssues   int           `json:"critical_issues"`
	WorstRPO         time.Duration `json:"worst_rpo"`
	WorstRTO         time.Duration `json:"worst_rto"`
	WorstRPODatabase string        `json:"worst_rpo_database"`
	WorstRTODatabase string        `json:"worst_rto_database"`
	AverageRPO       time.Duration `json:"average_rpo"`
	AverageRTO       time.Duration `json:"average_rto"`
}

// Summarize creates a summary from analyses
func Summarize(analyses []*Analysis) *Summary {
	summary := &Summary{}

	var totalRPO, totalRTO time.Duration

	for _, a := range analyses {
		summary.TotalDatabases++

		if a.RPOCompliant {
			summary.RPOCompliant++
		}
		if a.RTOCompliant {
			summary.RTOCompliant++
		}
		if a.RPOCompliant && a.RTOCompliant {
			summary.FullyCompliant++
		}

		for _, r := range a.Recommendations {
			if r.Priority == PriorityCritical {
				summary.CriticalIssues++
				break
			}
		}

		if a.CurrentRPO > summary.WorstRPO {
			summary.WorstRPO = a.CurrentRPO
			summary.WorstRPODatabase = a.Database
		}
		if a.CurrentRTO > summary.WorstRTO {
			summary.WorstRTO = a.CurrentRTO
			summary.WorstRTODatabase = a.Database
		}

		totalRPO += a.CurrentRPO
		totalRTO += a.CurrentRTO
	}

	if len(analyses) > 0 {
		summary.AverageRPO = totalRPO / time.Duration(len(analyses))
		summary.AverageRTO = totalRTO / time.Duration(len(analyses))
	}

	return summary
}
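
// Note: CriticalIssues counts databases with at least one critical
// recommendation (the break stops after the first match), not the total
// number of critical recommendations across all databases.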

func formatDuration(d time.Duration) string {
	if d < time.Minute {
		return fmt.Sprintf("%.0fs", d.Seconds())
	}
	if d < time.Hour {
		return fmt.Sprintf("%.0fm", d.Minutes())
	}
	hours := int(d.Hours())
	mins := int(d.Minutes()) - hours*60
	return fmt.Sprintf("%dh %dm", hours, mins)
}