New features implemented:

1. Backup Catalog (internal/catalog/)
   - SQLite-based backup tracking
   - Gap detection and RPO monitoring
   - Search and statistics
   - Filesystem sync

2. DR Drill Testing (internal/drill/)
   - Automated restore testing in Docker containers
   - Database validation with custom queries
   - Catalog integration for drill-tested status

3. Smart Notifications (internal/notify/)
   - Event batching with configurable intervals
   - Time-based escalation policies
   - HTML/text/Slack templates

4. Compliance Reports (internal/report/)
   - SOC2, GDPR, HIPAA, PCI-DSS, ISO27001 frameworks
   - Evidence collection from catalog
   - JSON, Markdown, HTML output formats

5. RTO/RPO Calculator (internal/rto/)
   - Recovery objective analysis
   - RTO breakdown by phase
   - Recommendations for improvement

6. Replica-Aware Backup (internal/replica/)
   - Topology detection for PostgreSQL/MySQL
   - Automatic replica selection
   - Configurable selection strategies

7. Parallel Table Backup (internal/parallel/)
   - Concurrent table dumps
   - Worker pool with progress tracking
   - Large table optimization

8. MySQL/MariaDB PITR (internal/pitr/)
   - Binary log parsing and replay
   - Point-in-time recovery support
   - Transaction filtering

CLI commands added: catalog, drill, report, rto

All changes support the goal: reliable 3 AM database recovery.
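As orientation for the RTO/RPO calculator source below, here is a minimal sketch of how a caller might drive the package. The rto calls (NewCalculator, DefaultConfig, Analyze) and the compliance fields match the file that follows; catalog.Open and the database name "orders" are placeholders — the real catalog constructor may differ.

package main

import (
    "context"
    "log"

    "dbbackup/internal/catalog"
    "dbbackup/internal/rto"
)

func main() {
    ctx := context.Background()

    // Hypothetical constructor; the actual catalog API may differ.
    cat, err := catalog.Open("backups.db")
    if err != nil {
        log.Fatal(err)
    }

    calc := rto.NewCalculator(cat, rto.DefaultConfig())
    analysis, err := calc.Analyze(ctx, "orders") // "orders" is a placeholder name
    if err != nil {
        log.Fatal(err)
    }
    if !analysis.RPOCompliant || !analysis.RTOCompliant {
        // Page someone now, before 3 AM does.
        log.Printf("orders: RPO %s / RTO %s miss targets",
            analysis.CurrentRPO, analysis.CurrentRTO)
    }
}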
// Package rto provides RTO/RPO calculation and analysis
package rto

import (
    "context"
    "fmt"
    "sort"
    "time"

    "dbbackup/internal/catalog"
)

// Calculator calculates RTO and RPO metrics
type Calculator struct {
    catalog catalog.Catalog
    config  Config
}

// Config configures RTO/RPO calculations
type Config struct {
    TargetRTO time.Duration `json:"target_rto"` // Target Recovery Time Objective
    TargetRPO time.Duration `json:"target_rpo"` // Target Recovery Point Objective

    // Throughput assumptions for the calculation
    NetworkSpeedMbps       float64 `json:"network_speed_mbps"`        // Network speed for cloud restores (Mbit/s)
    DiskReadSpeedMBps      float64 `json:"disk_read_speed_mbps"`      // Disk read speed (MB/s)
    DiskWriteSpeedMBps     float64 `json:"disk_write_speed_mbps"`     // Disk write speed (MB/s)
    CloudDownloadSpeedMbps float64 `json:"cloud_download_speed_mbps"` // Cloud download speed (Mbit/s)

    // Fixed time estimates for the non-transfer recovery phases
    StartupTimeMinutes    int `json:"startup_time_minutes"`    // DB startup time
    ValidationTimeMinutes int `json:"validation_time_minutes"` // Post-restore validation
    SwitchoverTimeMinutes int `json:"switchover_time_minutes"` // Application switchover time
}

// DefaultConfig returns sensible defaults
func DefaultConfig() Config {
    return Config{
        TargetRTO:              4 * time.Hour,
        TargetRPO:              1 * time.Hour,
        NetworkSpeedMbps:       100,
        DiskReadSpeedMBps:      100,
        DiskWriteSpeedMBps:     50,
        CloudDownloadSpeedMbps: 100,
        StartupTimeMinutes:     2,
        ValidationTimeMinutes:  5,
        SwitchoverTimeMinutes:  5,
    }
}
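// A rough worked example of what these defaults imply (assuming the
// conversion factors used in calculateRTOBreakdown below): 100 Mbit/s is
// 12.5 MB/s, so a cloud download moves about 45 GB/hour, while the 50 MB/s
// disk write assumption restores about 180 GB/hour before compression and
// encryption overhead. Measure your environment and override as needed.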

// Analysis contains RTO/RPO analysis results
type Analysis struct {
    Database  string    `json:"database"`
    Timestamp time.Time `json:"timestamp"`

    // Current state
    CurrentRPO time.Duration `json:"current_rpo"`
    CurrentRTO time.Duration `json:"current_rto"`

    // Target state
    TargetRPO time.Duration `json:"target_rpo"`
    TargetRTO time.Duration `json:"target_rto"`

    // Compliance
    RPOCompliant bool `json:"rpo_compliant"`
    RTOCompliant bool `json:"rto_compliant"`

    // Details
    LastBackup     *time.Time    `json:"last_backup,omitempty"`
    NextScheduled  *time.Time    `json:"next_scheduled,omitempty"`
    BackupInterval time.Duration `json:"backup_interval"`

    // RTO breakdown
    RTOBreakdown RTOBreakdown `json:"rto_breakdown"`

    // Recommendations
    Recommendations []Recommendation `json:"recommendations,omitempty"`

    // Historical
    History []HistoricalPoint `json:"history,omitempty"`
}

// RTOBreakdown shows components of RTO calculation
type RTOBreakdown struct {
    DetectionTime  time.Duration `json:"detection_time"`
    DecisionTime   time.Duration `json:"decision_time"`
    DownloadTime   time.Duration `json:"download_time"`
    RestoreTime    time.Duration `json:"restore_time"`
    StartupTime    time.Duration `json:"startup_time"`
    ValidationTime time.Duration `json:"validation_time"`
    SwitchoverTime time.Duration `json:"switchover_time"`
    TotalTime      time.Duration `json:"total_time"`
}

// Recommendation suggests improvements
type Recommendation struct {
    Type        RecommendationType `json:"type"`
    Priority    Priority           `json:"priority"`
    Title       string             `json:"title"`
    Description string             `json:"description"`
    Impact      string             `json:"impact"`
    Effort      Effort             `json:"effort"`
}

// RecommendationType categorizes recommendations
type RecommendationType string

const (
    RecommendBackupFrequency   RecommendationType = "backup_frequency"
    RecommendIncrementalBackup RecommendationType = "incremental_backup"
    RecommendCompression       RecommendationType = "compression"
    RecommendLocalCache        RecommendationType = "local_cache"
    RecommendParallelRestore   RecommendationType = "parallel_restore"
    RecommendWALArchiving      RecommendationType = "wal_archiving"
    RecommendReplication       RecommendationType = "replication"
)

// Priority levels
type Priority string

const (
    PriorityCritical Priority = "critical"
    PriorityHigh     Priority = "high"
    PriorityMedium   Priority = "medium"
    PriorityLow      Priority = "low"
)

// Effort levels
type Effort string

const (
    EffortLow    Effort = "low"
    EffortMedium Effort = "medium"
    EffortHigh   Effort = "high"
)

// HistoricalPoint tracks RTO/RPO over time
type HistoricalPoint struct {
    Timestamp time.Time     `json:"timestamp"`
    RPO       time.Duration `json:"rpo"`
    RTO       time.Duration `json:"rto"`
}

// NewCalculator creates a new RTO/RPO calculator
func NewCalculator(cat catalog.Catalog, config Config) *Calculator {
    return &Calculator{
        catalog: cat,
        config:  config,
    }
}

// Analyze performs RTO/RPO analysis for a database
func (c *Calculator) Analyze(ctx context.Context, database string) (*Analysis, error) {
    analysis := &Analysis{
        Database:  database,
        Timestamp: time.Now(),
        TargetRPO: c.config.TargetRPO,
        TargetRTO: c.config.TargetRTO,
    }

    // Get recent backups (newest first)
    entries, err := c.catalog.List(ctx, database, 100)
    if err != nil {
        return nil, fmt.Errorf("failed to list backups: %w", err)
    }

    if len(entries) == 0 {
        // No backups: both objectives are undefined, the worst-case scenario
        analysis.CurrentRPO = 0 // undefined
        analysis.CurrentRTO = 0 // undefined
        analysis.Recommendations = append(analysis.Recommendations, Recommendation{
            Type:        RecommendBackupFrequency,
            Priority:    PriorityCritical,
            Title:       "No Backups Found",
            Description: "No backups exist for this database",
            Impact:      "Cannot recover in case of failure",
            Effort:      EffortLow,
        })
        return analysis, nil
    }

    // Current RPO is the time since the most recent backup
    lastBackup := entries[0].CreatedAt
    analysis.LastBackup = &lastBackup
    analysis.CurrentRPO = time.Since(lastBackup)
    analysis.RPOCompliant = analysis.CurrentRPO <= c.config.TargetRPO

    // Calculate the average backup interval
    if len(entries) >= 2 {
        analysis.BackupInterval = calculateAverageInterval(entries)
    }

    // Estimate RTO from the latest backup
    latestEntry := entries[0]
    analysis.RTOBreakdown = c.calculateRTOBreakdown(latestEntry)
    analysis.CurrentRTO = analysis.RTOBreakdown.TotalTime
    analysis.RTOCompliant = analysis.CurrentRTO <= c.config.TargetRTO

    // Generate recommendations
    analysis.Recommendations = c.generateRecommendations(analysis, entries)

    // Calculate history
    analysis.History = c.calculateHistory(entries)

    return analysis, nil
}

// AnalyzeAll analyzes all databases
func (c *Calculator) AnalyzeAll(ctx context.Context) ([]*Analysis, error) {
    databases, err := c.catalog.ListDatabases(ctx)
    if err != nil {
        return nil, fmt.Errorf("failed to list databases: %w", err)
    }

    var analyses []*Analysis
    for _, db := range databases {
        analysis, err := c.Analyze(ctx, db)
        if err != nil {
            continue // Skip errors for individual databases
        }
        analyses = append(analyses, analysis)
    }

    return analyses, nil
}

// calculateRTOBreakdown calculates RTO components
func (c *Calculator) calculateRTOBreakdown(entry *catalog.Entry) RTOBreakdown {
    breakdown := RTOBreakdown{
        // Detection time - assume monitoring catches issues quickly
        DetectionTime: 5 * time.Minute,

        // Decision time - human decision making
        DecisionTime: 10 * time.Minute,

        // Fixed phases from configuration
        StartupTime:    time.Duration(c.config.StartupTimeMinutes) * time.Minute,
        ValidationTime: time.Duration(c.config.ValidationTimeMinutes) * time.Minute,
        SwitchoverTime: time.Duration(c.config.SwitchoverTimeMinutes) * time.Minute,
    }

    // Download time applies only to backups stored in the cloud
    if entry.CloudLocation != "" {
        bytesPerSecond := c.config.CloudDownloadSpeedMbps * 125000 // Mbit/s to bytes/sec
        downloadSeconds := float64(entry.SizeBytes) / bytesPerSecond
        breakdown.DownloadTime = time.Duration(downloadSeconds * float64(time.Second))
    }

    // Restore time is estimated from disk write speed
    bytesPerSecond := c.config.DiskWriteSpeedMBps * 1000000 // MB/s to bytes/sec
    restoreSeconds := float64(entry.SizeBytes) / bytesPerSecond

    // Add overhead for decompression if compressed
    if entry.Compression != "" && entry.Compression != "none" {
        restoreSeconds *= 1.3 // 30% overhead for decompression
    }

    // Add overhead for decryption if encrypted
    if entry.Encrypted {
        restoreSeconds *= 1.1 // 10% overhead for decryption
    }

    breakdown.RestoreTime = time.Duration(restoreSeconds * float64(time.Second))

    // Total RTO is the sum of all phases
    breakdown.TotalTime = breakdown.DetectionTime +
        breakdown.DecisionTime +
        breakdown.DownloadTime +
        breakdown.RestoreTime +
        breakdown.StartupTime +
        breakdown.ValidationTime +
        breakdown.SwitchoverTime

    return breakdown
}
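// Worked example (illustrative, using DefaultConfig): a 10 GB compressed,
// encrypted cloud backup yields
//   download: 10e9 B / (100 * 125000 B/s) = 800 s  (~13 min)
//   restore:  10e9 B / (50 * 1e6 B/s) = 200 s, then *1.3 *1.1 = 286 s (~5 min)
// plus the fixed phases (5+10+2+5+5 = 27 min), for a total of roughly 45 min.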

// calculateAverageInterval calculates the average time between backups,
// assuming entries are sorted newest first (as returned by the catalog)
func calculateAverageInterval(entries []*catalog.Entry) time.Duration {
    if len(entries) < 2 {
        return 0
    }

    var totalInterval time.Duration
    for i := 0; i < len(entries)-1; i++ {
        interval := entries[i].CreatedAt.Sub(entries[i+1].CreatedAt)
        totalInterval += interval
    }

    return totalInterval / time.Duration(len(entries)-1)
}
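// For example (illustrative): backups taken at 12:00, 06:00, and 00:00 give
// intervals of 6h and 6h, so the average is 6h. Note the sum telescopes, so
// this is simply (newest - oldest) / (n-1).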

// generateRecommendations creates recommendations based on analysis
func (c *Calculator) generateRecommendations(analysis *Analysis, entries []*catalog.Entry) []Recommendation {
    var recommendations []Recommendation

    // RPO violations
    if !analysis.RPOCompliant {
        gap := analysis.CurrentRPO - c.config.TargetRPO
        recommendations = append(recommendations, Recommendation{
            Type:     RecommendBackupFrequency,
            Priority: PriorityCritical,
            Title:    "RPO Target Not Met",
            Description: fmt.Sprintf("Current RPO (%s) exceeds target (%s) by %s",
                formatDuration(analysis.CurrentRPO),
                formatDuration(c.config.TargetRPO),
                formatDuration(gap)),
            Impact: "Potential data loss exceeds acceptable threshold",
            Effort: EffortLow,
        })
    }

    // RTO violations
    if !analysis.RTOCompliant {
        recommendations = append(recommendations, Recommendation{
            Type:     RecommendParallelRestore,
            Priority: PriorityHigh,
            Title:    "RTO Target Not Met",
            Description: fmt.Sprintf("Estimated recovery time (%s) exceeds target (%s)",
                formatDuration(analysis.CurrentRTO),
                formatDuration(c.config.TargetRTO)),
            Impact: "Recovery may take longer than acceptable",
            Effort: EffortMedium,
        })
    }

    // Long cloud download time
    if analysis.RTOBreakdown.DownloadTime > 30*time.Minute {
        recommendations = append(recommendations, Recommendation{
            Type:     RecommendLocalCache,
            Priority: PriorityMedium,
            Title:    "Consider Local Backup Cache",
            Description: fmt.Sprintf("Cloud download takes %s; a local cache would reduce this",
                formatDuration(analysis.RTOBreakdown.DownloadTime)),
            Impact: "Faster recovery from local storage",
            Effort: EffortMedium,
        })
    }

    // No incremental backups despite a long backup interval
    hasIncremental := false
    for _, e := range entries {
        if e.BackupType == "incremental" {
            hasIncremental = true
            break
        }
    }
    if !hasIncremental && analysis.BackupInterval > 6*time.Hour {
        recommendations = append(recommendations, Recommendation{
            Type:        RecommendIncrementalBackup,
            Priority:    PriorityMedium,
            Title:       "Enable Incremental Backups",
            Description: "Incremental backups can reduce backup time and storage",
            Impact:      "Better RPO with less resource usage",
            Effort:      EffortLow,
        })
    }

    // WAL archiving for PostgreSQL
    if len(entries) > 0 && entries[0].DatabaseType == "postgresql" {
        recommendations = append(recommendations, Recommendation{
            Type:        RecommendWALArchiving,
            Priority:    PriorityMedium,
            Title:       "Consider WAL Archiving",
            Description: "Enable WAL archiving for point-in-time recovery",
            Impact:      "Achieve near-zero RPO with PITR",
            Effort:      EffortMedium,
        })
    }

    return recommendations
}

// calculateHistory generates historical RTO/RPO points
func (c *Calculator) calculateHistory(entries []*catalog.Entry) []HistoricalPoint {
    var history []HistoricalPoint

    // Sort entries by date (oldest first)
    sorted := make([]*catalog.Entry, len(entries))
    copy(sorted, entries)
    sort.Slice(sorted, func(i, j int) bool {
        return sorted[i].CreatedAt.Before(sorted[j].CreatedAt)
    })

    for i, entry := range sorted {
        point := HistoricalPoint{
            Timestamp: entry.CreatedAt,
            RTO:       c.calculateRTOBreakdown(entry).TotalTime,
        }

        // The RPO at each point is the gap until the next backup;
        // for the most recent backup it is the time elapsed since then
        if i < len(sorted)-1 {
            point.RPO = sorted[i+1].CreatedAt.Sub(entry.CreatedAt)
        } else {
            point.RPO = time.Since(entry.CreatedAt)
        }

        history = append(history, point)
    }

    return history
}

// Summary provides aggregate RTO/RPO status
type Summary struct {
    TotalDatabases   int           `json:"total_databases"`
    RPOCompliant     int           `json:"rpo_compliant"`
    RTOCompliant     int           `json:"rto_compliant"`
    FullyCompliant   int           `json:"fully_compliant"`
    CriticalIssues   int           `json:"critical_issues"`
    WorstRPO         time.Duration `json:"worst_rpo"`
    WorstRTO         time.Duration `json:"worst_rto"`
    WorstRPODatabase string        `json:"worst_rpo_database"`
    WorstRTODatabase string        `json:"worst_rto_database"`
    AverageRPO       time.Duration `json:"average_rpo"`
    AverageRTO       time.Duration `json:"average_rto"`
}

// Summarize creates a summary from analyses
func Summarize(analyses []*Analysis) *Summary {
    summary := &Summary{}

    var totalRPO, totalRTO time.Duration

    for _, a := range analyses {
        summary.TotalDatabases++

        if a.RPOCompliant {
            summary.RPOCompliant++
        }
        if a.RTOCompliant {
            summary.RTOCompliant++
        }
        if a.RPOCompliant && a.RTOCompliant {
            summary.FullyCompliant++
        }

        // Count databases with at least one critical recommendation
        for _, r := range a.Recommendations {
            if r.Priority == PriorityCritical {
                summary.CriticalIssues++
                break
            }
        }

        if a.CurrentRPO > summary.WorstRPO {
            summary.WorstRPO = a.CurrentRPO
            summary.WorstRPODatabase = a.Database
        }
        if a.CurrentRTO > summary.WorstRTO {
            summary.WorstRTO = a.CurrentRTO
            summary.WorstRTODatabase = a.Database
        }

        totalRPO += a.CurrentRPO
        totalRTO += a.CurrentRTO
    }

    if len(analyses) > 0 {
        summary.AverageRPO = totalRPO / time.Duration(len(analyses))
        summary.AverageRTO = totalRTO / time.Duration(len(analyses))
    }

    return summary
}

// formatDuration renders a duration as seconds, minutes, or hours and minutes
func formatDuration(d time.Duration) string {
    if d < time.Minute {
        return fmt.Sprintf("%.0fs", d.Seconds())
    }
    if d < time.Hour {
        return fmt.Sprintf("%.0fm", d.Minutes())
    }
    hours := int(d.Hours())
    mins := int(d.Minutes()) - hours*60
    return fmt.Sprintf("%dh %dm", hours, mins)
}
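
// exampleUsage is an illustrative sketch, not part of the original API: it
// shows how a caller might combine AnalyzeAll and Summarize for a fleet-wide
// recovery-readiness check. The catalog implementation is assumed to be
// supplied by the caller.
func exampleUsage(ctx context.Context, cat catalog.Catalog) error {
    calc := NewCalculator(cat, DefaultConfig())

    analyses, err := calc.AnalyzeAll(ctx)
    if err != nil {
        return fmt.Errorf("analysis failed: %w", err)
    }

    summary := Summarize(analyses)
    fmt.Printf("%d/%d databases fully compliant; worst RPO %s (%s), worst RTO %s (%s)\n",
        summary.FullyCompliant, summary.TotalDatabases,
        formatDuration(summary.WorstRPO), summary.WorstRPODatabase,
        formatDuration(summary.WorstRTO), summary.WorstRTODatabase)
    return nil
}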