feat: Add enterprise DBA features for production reliability
New features implemented: 1. Backup Catalog (internal/catalog/) - SQLite-based backup tracking - Gap detection and RPO monitoring - Search and statistics - Filesystem sync 2. DR Drill Testing (internal/drill/) - Automated restore testing in Docker containers - Database validation with custom queries - Catalog integration for drill-tested status 3. Smart Notifications (internal/notify/) - Event batching with configurable intervals - Time-based escalation policies - HTML/text/Slack templates 4. Compliance Reports (internal/report/) - SOC2, GDPR, HIPAA, PCI-DSS, ISO27001 frameworks - Evidence collection from catalog - JSON, Markdown, HTML output formats 5. RTO/RPO Calculator (internal/rto/) - Recovery objective analysis - RTO breakdown by phase - Recommendations for improvement 6. Replica-Aware Backup (internal/replica/) - Topology detection for PostgreSQL/MySQL - Automatic replica selection - Configurable selection strategies 7. Parallel Table Backup (internal/parallel/) - Concurrent table dumps - Worker pool with progress tracking - Large table optimization 8. MySQL/MariaDB PITR (internal/pitr/) - Binary log parsing and replay - Point-in-time recovery support - Transaction filtering CLI commands added: catalog, drill, report, rto All changes support the goal: reliable 3 AM database recovery.
This commit is contained in:
458
cmd/rto.go
Normal file
458
cmd/rto.go
Normal file
@@ -0,0 +1,458 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"dbbackup/internal/catalog"
|
||||
"dbbackup/internal/rto"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
var rtoCmd = &cobra.Command{
|
||||
Use: "rto",
|
||||
Short: "RTO/RPO analysis and monitoring",
|
||||
Long: `Analyze and monitor Recovery Time Objective (RTO) and
|
||||
Recovery Point Objective (RPO) metrics.
|
||||
|
||||
RTO: How long to recover from a failure
|
||||
RPO: How much data you can afford to lose
|
||||
|
||||
Examples:
|
||||
# Analyze RTO/RPO for all databases
|
||||
dbbackup rto analyze
|
||||
|
||||
# Analyze specific database
|
||||
dbbackup rto analyze --database mydb
|
||||
|
||||
# Show summary status
|
||||
dbbackup rto status
|
||||
|
||||
# Set targets and check compliance
|
||||
dbbackup rto check --target-rto 4h --target-rpo 1h`,
|
||||
}
|
||||
|
||||
var rtoAnalyzeCmd = &cobra.Command{
|
||||
Use: "analyze",
|
||||
Short: "Analyze RTO/RPO for databases",
|
||||
Long: "Perform detailed RTO/RPO analysis based on backup history",
|
||||
RunE: runRTOAnalyze,
|
||||
}
|
||||
|
||||
var rtoStatusCmd = &cobra.Command{
|
||||
Use: "status",
|
||||
Short: "Show RTO/RPO status summary",
|
||||
Long: "Display current RTO/RPO compliance status for all databases",
|
||||
RunE: runRTOStatus,
|
||||
}
|
||||
|
||||
var rtoCheckCmd = &cobra.Command{
|
||||
Use: "check",
|
||||
Short: "Check RTO/RPO compliance",
|
||||
Long: "Check if databases meet RTO/RPO targets",
|
||||
RunE: runRTOCheck,
|
||||
}
|
||||
|
||||
// Flag values for the rto subcommands. The same variables are bound to the
// flags of several subcommands in init, so they hold whatever the invoked
// subcommand parsed.
var (
	// rtoDatabase is the --database/-d value; empty means all databases.
	rtoDatabase string

	// rtoTargetRTO is the raw --target-rto string, parsed with
	// time.ParseDuration by the run functions.
	rtoTargetRTO string

	// rtoTargetRPO is the raw --target-rpo string, parsed with
	// time.ParseDuration by the run functions.
	rtoTargetRPO string

	// rtoCatalog is the --catalog path; empty selects the default location
	// chosen by openRTOCatalog.
	rtoCatalog string

	// rtoFormat is the --format/-f value of `rto analyze` ("text" or "json").
	rtoFormat string

	// rtoOutput is the --output/-o file path of `rto analyze`.
	rtoOutput string
)
|
||||
|
||||
func init() {
|
||||
rootCmd.AddCommand(rtoCmd)
|
||||
rtoCmd.AddCommand(rtoAnalyzeCmd)
|
||||
rtoCmd.AddCommand(rtoStatusCmd)
|
||||
rtoCmd.AddCommand(rtoCheckCmd)
|
||||
|
||||
// Analyze command flags
|
||||
rtoAnalyzeCmd.Flags().StringVarP(&rtoDatabase, "database", "d", "", "Database to analyze (all if not specified)")
|
||||
rtoAnalyzeCmd.Flags().StringVar(&rtoTargetRTO, "target-rto", "4h", "Target RTO (e.g., 4h, 30m)")
|
||||
rtoAnalyzeCmd.Flags().StringVar(&rtoTargetRPO, "target-rpo", "1h", "Target RPO (e.g., 1h, 15m)")
|
||||
rtoAnalyzeCmd.Flags().StringVar(&rtoCatalog, "catalog", "", "Path to backup catalog")
|
||||
rtoAnalyzeCmd.Flags().StringVarP(&rtoFormat, "format", "f", "text", "Output format (text, json)")
|
||||
rtoAnalyzeCmd.Flags().StringVarP(&rtoOutput, "output", "o", "", "Output file")
|
||||
|
||||
// Status command flags
|
||||
rtoStatusCmd.Flags().StringVar(&rtoCatalog, "catalog", "", "Path to backup catalog")
|
||||
rtoStatusCmd.Flags().StringVar(&rtoTargetRTO, "target-rto", "4h", "Target RTO")
|
||||
rtoStatusCmd.Flags().StringVar(&rtoTargetRPO, "target-rpo", "1h", "Target RPO")
|
||||
|
||||
// Check command flags
|
||||
rtoCheckCmd.Flags().StringVarP(&rtoDatabase, "database", "d", "", "Database to check")
|
||||
rtoCheckCmd.Flags().StringVar(&rtoTargetRTO, "target-rto", "4h", "Target RTO")
|
||||
rtoCheckCmd.Flags().StringVar(&rtoTargetRPO, "target-rpo", "1h", "Target RPO")
|
||||
rtoCheckCmd.Flags().StringVar(&rtoCatalog, "catalog", "", "Path to backup catalog")
|
||||
}
|
||||
|
||||
func runRTOAnalyze(cmd *cobra.Command, args []string) error {
|
||||
ctx := context.Background()
|
||||
|
||||
// Parse duration targets
|
||||
targetRTO, err := time.ParseDuration(rtoTargetRTO)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid target-rto: %w", err)
|
||||
}
|
||||
targetRPO, err := time.ParseDuration(rtoTargetRPO)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid target-rpo: %w", err)
|
||||
}
|
||||
|
||||
// Get catalog
|
||||
cat, err := openRTOCatalog()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer cat.Close()
|
||||
|
||||
// Create calculator
|
||||
config := rto.DefaultConfig()
|
||||
config.TargetRTO = targetRTO
|
||||
config.TargetRPO = targetRPO
|
||||
calc := rto.NewCalculator(cat, config)
|
||||
|
||||
var analyses []*rto.Analysis
|
||||
|
||||
if rtoDatabase != "" {
|
||||
// Analyze single database
|
||||
analysis, err := calc.Analyze(ctx, rtoDatabase)
|
||||
if err != nil {
|
||||
return fmt.Errorf("analysis failed: %w", err)
|
||||
}
|
||||
analyses = append(analyses, analysis)
|
||||
} else {
|
||||
// Analyze all databases
|
||||
analyses, err = calc.AnalyzeAll(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("analysis failed: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Output
|
||||
if rtoFormat == "json" {
|
||||
return outputJSON(analyses, rtoOutput)
|
||||
}
|
||||
|
||||
return outputAnalysisText(analyses)
|
||||
}
|
||||
|
||||
func runRTOStatus(cmd *cobra.Command, args []string) error {
|
||||
ctx := context.Background()
|
||||
|
||||
// Parse targets
|
||||
targetRTO, err := time.ParseDuration(rtoTargetRTO)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid target-rto: %w", err)
|
||||
}
|
||||
targetRPO, err := time.ParseDuration(rtoTargetRPO)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid target-rpo: %w", err)
|
||||
}
|
||||
|
||||
// Get catalog
|
||||
cat, err := openRTOCatalog()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer cat.Close()
|
||||
|
||||
// Create calculator and analyze all
|
||||
config := rto.DefaultConfig()
|
||||
config.TargetRTO = targetRTO
|
||||
config.TargetRPO = targetRPO
|
||||
calc := rto.NewCalculator(cat, config)
|
||||
|
||||
analyses, err := calc.AnalyzeAll(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("analysis failed: %w", err)
|
||||
}
|
||||
|
||||
// Create summary
|
||||
summary := rto.Summarize(analyses)
|
||||
|
||||
// Display status
|
||||
fmt.Println()
|
||||
fmt.Println("╔═══════════════════════════════════════════════════════════╗")
|
||||
fmt.Println("║ RTO/RPO STATUS SUMMARY ║")
|
||||
fmt.Println("╠═══════════════════════════════════════════════════════════╣")
|
||||
fmt.Printf("║ Target RTO: %-15s Target RPO: %-15s ║\n",
|
||||
formatDuration(config.TargetRTO),
|
||||
formatDuration(config.TargetRPO))
|
||||
fmt.Println("╠═══════════════════════════════════════════════════════════╣")
|
||||
|
||||
// Compliance status
|
||||
rpoRate := 0.0
|
||||
rtoRate := 0.0
|
||||
fullRate := 0.0
|
||||
if summary.TotalDatabases > 0 {
|
||||
rpoRate = float64(summary.RPOCompliant) / float64(summary.TotalDatabases) * 100
|
||||
rtoRate = float64(summary.RTOCompliant) / float64(summary.TotalDatabases) * 100
|
||||
fullRate = float64(summary.FullyCompliant) / float64(summary.TotalDatabases) * 100
|
||||
}
|
||||
|
||||
fmt.Printf("║ Databases: %-5d ║\n", summary.TotalDatabases)
|
||||
fmt.Printf("║ RPO Compliant: %-5d (%.0f%%) ║\n", summary.RPOCompliant, rpoRate)
|
||||
fmt.Printf("║ RTO Compliant: %-5d (%.0f%%) ║\n", summary.RTOCompliant, rtoRate)
|
||||
fmt.Printf("║ Fully Compliant: %-3d (%.0f%%) ║\n", summary.FullyCompliant, fullRate)
|
||||
|
||||
if summary.CriticalIssues > 0 {
|
||||
fmt.Printf("║ ⚠️ Critical Issues: %-3d ║\n", summary.CriticalIssues)
|
||||
}
|
||||
|
||||
fmt.Println("╠═══════════════════════════════════════════════════════════╣")
|
||||
fmt.Printf("║ Average RPO: %-15s Worst: %-15s ║\n",
|
||||
formatDuration(summary.AverageRPO),
|
||||
formatDuration(summary.WorstRPO))
|
||||
fmt.Printf("║ Average RTO: %-15s Worst: %-15s ║\n",
|
||||
formatDuration(summary.AverageRTO),
|
||||
formatDuration(summary.WorstRTO))
|
||||
|
||||
if summary.WorstRPODatabase != "" {
|
||||
fmt.Printf("║ Worst RPO Database: %-38s║\n", summary.WorstRPODatabase)
|
||||
}
|
||||
if summary.WorstRTODatabase != "" {
|
||||
fmt.Printf("║ Worst RTO Database: %-38s║\n", summary.WorstRTODatabase)
|
||||
}
|
||||
|
||||
fmt.Println("╚═══════════════════════════════════════════════════════════╝")
|
||||
fmt.Println()
|
||||
|
||||
// Per-database status
|
||||
if len(analyses) > 0 {
|
||||
fmt.Println("Database Status:")
|
||||
fmt.Println(strings.Repeat("-", 70))
|
||||
fmt.Printf("%-25s %-12s %-12s %-12s\n", "DATABASE", "RPO", "RTO", "STATUS")
|
||||
fmt.Println(strings.Repeat("-", 70))
|
||||
|
||||
for _, a := range analyses {
|
||||
status := "✅"
|
||||
if !a.RPOCompliant || !a.RTOCompliant {
|
||||
status = "❌"
|
||||
}
|
||||
|
||||
rpoStr := formatDuration(a.CurrentRPO)
|
||||
rtoStr := formatDuration(a.CurrentRTO)
|
||||
|
||||
if !a.RPOCompliant {
|
||||
rpoStr = "⚠️ " + rpoStr
|
||||
}
|
||||
if !a.RTOCompliant {
|
||||
rtoStr = "⚠️ " + rtoStr
|
||||
}
|
||||
|
||||
fmt.Printf("%-25s %-12s %-12s %s\n",
|
||||
truncateRTO(a.Database, 24),
|
||||
rpoStr,
|
||||
rtoStr,
|
||||
status)
|
||||
}
|
||||
fmt.Println(strings.Repeat("-", 70))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runRTOCheck(cmd *cobra.Command, args []string) error {
|
||||
ctx := context.Background()
|
||||
|
||||
// Parse targets
|
||||
targetRTO, err := time.ParseDuration(rtoTargetRTO)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid target-rto: %w", err)
|
||||
}
|
||||
targetRPO, err := time.ParseDuration(rtoTargetRPO)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid target-rpo: %w", err)
|
||||
}
|
||||
|
||||
// Get catalog
|
||||
cat, err := openRTOCatalog()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer cat.Close()
|
||||
|
||||
// Create calculator
|
||||
config := rto.DefaultConfig()
|
||||
config.TargetRTO = targetRTO
|
||||
config.TargetRPO = targetRPO
|
||||
calc := rto.NewCalculator(cat, config)
|
||||
|
||||
var analyses []*rto.Analysis
|
||||
|
||||
if rtoDatabase != "" {
|
||||
analysis, err := calc.Analyze(ctx, rtoDatabase)
|
||||
if err != nil {
|
||||
return fmt.Errorf("analysis failed: %w", err)
|
||||
}
|
||||
analyses = append(analyses, analysis)
|
||||
} else {
|
||||
analyses, err = calc.AnalyzeAll(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("analysis failed: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Check compliance
|
||||
exitCode := 0
|
||||
for _, a := range analyses {
|
||||
if !a.RPOCompliant {
|
||||
fmt.Printf("❌ %s: RPO violation - current %s exceeds target %s\n",
|
||||
a.Database,
|
||||
formatDuration(a.CurrentRPO),
|
||||
formatDuration(config.TargetRPO))
|
||||
exitCode = 1
|
||||
}
|
||||
if !a.RTOCompliant {
|
||||
fmt.Printf("❌ %s: RTO violation - estimated %s exceeds target %s\n",
|
||||
a.Database,
|
||||
formatDuration(a.CurrentRTO),
|
||||
formatDuration(config.TargetRTO))
|
||||
exitCode = 1
|
||||
}
|
||||
if a.RPOCompliant && a.RTOCompliant {
|
||||
fmt.Printf("✅ %s: Compliant (RPO: %s, RTO: %s)\n",
|
||||
a.Database,
|
||||
formatDuration(a.CurrentRPO),
|
||||
formatDuration(a.CurrentRTO))
|
||||
}
|
||||
}
|
||||
|
||||
if exitCode != 0 {
|
||||
os.Exit(exitCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func openRTOCatalog() (*catalog.SQLiteCatalog, error) {
|
||||
catalogPath := rtoCatalog
|
||||
if catalogPath == "" {
|
||||
homeDir, _ := os.UserHomeDir()
|
||||
catalogPath = filepath.Join(homeDir, ".dbbackup", "catalog.db")
|
||||
}
|
||||
|
||||
cat, err := catalog.NewSQLiteCatalog(catalogPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open catalog: %w", err)
|
||||
}
|
||||
|
||||
return cat, nil
|
||||
}
|
||||
|
||||
// outputJSON encodes data as indented JSON and delivers it.
//
// With an empty outputPath the JSON is printed to standard output followed by
// a newline. Otherwise the JSON bytes are written to outputPath with mode
// 0644 and no trailing newline. Encoding and write errors are returned
// unchanged.
func outputJSON(data interface{}, outputPath string) error {
	encoded, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return err
	}

	if outputPath == "" {
		fmt.Println(string(encoded))
		return nil
	}

	return os.WriteFile(outputPath, encoded, 0o644)
}
|
||||
|
||||
func outputAnalysisText(analyses []*rto.Analysis) error {
|
||||
for _, a := range analyses {
|
||||
fmt.Println()
|
||||
fmt.Println(strings.Repeat("=", 60))
|
||||
fmt.Printf(" Database: %s\n", a.Database)
|
||||
fmt.Println(strings.Repeat("=", 60))
|
||||
|
||||
// Status
|
||||
rpoStatus := "✅ Compliant"
|
||||
if !a.RPOCompliant {
|
||||
rpoStatus = "❌ Violation"
|
||||
}
|
||||
rtoStatus := "✅ Compliant"
|
||||
if !a.RTOCompliant {
|
||||
rtoStatus = "❌ Violation"
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
fmt.Println(" Recovery Objectives:")
|
||||
fmt.Println(strings.Repeat("-", 50))
|
||||
fmt.Printf(" RPO (Current): %-15s Target: %s\n",
|
||||
formatDuration(a.CurrentRPO), formatDuration(a.TargetRPO))
|
||||
fmt.Printf(" RPO Status: %s\n", rpoStatus)
|
||||
fmt.Printf(" RTO (Estimated): %-14s Target: %s\n",
|
||||
formatDuration(a.CurrentRTO), formatDuration(a.TargetRTO))
|
||||
fmt.Printf(" RTO Status: %s\n", rtoStatus)
|
||||
|
||||
if a.LastBackup != nil {
|
||||
fmt.Printf(" Last Backup: %s\n", a.LastBackup.Format("2006-01-02 15:04:05"))
|
||||
}
|
||||
if a.BackupInterval > 0 {
|
||||
fmt.Printf(" Backup Interval: %s\n", formatDuration(a.BackupInterval))
|
||||
}
|
||||
|
||||
// RTO Breakdown
|
||||
fmt.Println()
|
||||
fmt.Println(" RTO Breakdown:")
|
||||
fmt.Println(strings.Repeat("-", 50))
|
||||
b := a.RTOBreakdown
|
||||
fmt.Printf(" Detection: %s\n", formatDuration(b.DetectionTime))
|
||||
fmt.Printf(" Decision: %s\n", formatDuration(b.DecisionTime))
|
||||
if b.DownloadTime > 0 {
|
||||
fmt.Printf(" Download: %s\n", formatDuration(b.DownloadTime))
|
||||
}
|
||||
fmt.Printf(" Restore: %s\n", formatDuration(b.RestoreTime))
|
||||
fmt.Printf(" Startup: %s\n", formatDuration(b.StartupTime))
|
||||
fmt.Printf(" Validation: %s\n", formatDuration(b.ValidationTime))
|
||||
fmt.Printf(" Switchover: %s\n", formatDuration(b.SwitchoverTime))
|
||||
fmt.Println(strings.Repeat("-", 30))
|
||||
fmt.Printf(" Total: %s\n", formatDuration(b.TotalTime))
|
||||
|
||||
// Recommendations
|
||||
if len(a.Recommendations) > 0 {
|
||||
fmt.Println()
|
||||
fmt.Println(" Recommendations:")
|
||||
fmt.Println(strings.Repeat("-", 50))
|
||||
for _, r := range a.Recommendations {
|
||||
icon := "💡"
|
||||
switch r.Priority {
|
||||
case rto.PriorityCritical:
|
||||
icon = "🔴"
|
||||
case rto.PriorityHigh:
|
||||
icon = "🟠"
|
||||
case rto.PriorityMedium:
|
||||
icon = "🟡"
|
||||
}
|
||||
fmt.Printf(" %s [%s] %s\n", icon, r.Priority, r.Title)
|
||||
fmt.Printf(" %s\n", r.Description)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// formatDuration renders d compactly for the RTO/RPO reports.
//
// Durations below one minute (including negative ones) are shown as rounded
// seconds ("45s"), durations below one hour as rounded minutes ("12m"), and
// anything longer as truncated whole hours and minutes ("4h 30m").
func formatDuration(d time.Duration) string {
	switch {
	case d < time.Minute:
		return fmt.Sprintf("%.0fs", d.Seconds())
	case d < time.Hour:
		return fmt.Sprintf("%.0fm", d.Minutes())
	default:
		wholeHours := int(d.Hours())
		// Minutes left over after removing the whole hours.
		leftover := int(d.Minutes()) - wholeHours*60
		return fmt.Sprintf("%dh %dm", wholeHours, leftover)
	}
}
|
||||
|
||||
// truncateRTO shortens s to at most maxLen characters for table output.
//
// Strings that already fit are returned unchanged. Longer strings are cut and
// end in "..." so that the result is exactly maxLen characters long. Length
// is counted in runes, not bytes, so a multi-byte UTF-8 character is never
// split. When maxLen is below 3 there is no room for the ellipsis and the
// first maxLen characters are returned; a non-positive maxLen yields "".
func truncateRTO(s string, maxLen int) string {
	if maxLen <= 0 {
		return ""
	}

	runes := []rune(s)
	if len(runes) <= maxLen {
		return s
	}

	// Too narrow for an ellipsis: hard cut instead of a negative slice bound.
	if maxLen < 3 {
		return string(runes[:maxLen])
	}

	return string(runes[:maxLen-3]) + "..."
}
|
||||
Reference in New Issue
Block a user