feat: Add enterprise DBA features for production reliability
New features implemented: 1. Backup Catalog (internal/catalog/) - SQLite-based backup tracking - Gap detection and RPO monitoring - Search and statistics - Filesystem sync 2. DR Drill Testing (internal/drill/) - Automated restore testing in Docker containers - Database validation with custom queries - Catalog integration for drill-tested status 3. Smart Notifications (internal/notify/) - Event batching with configurable intervals - Time-based escalation policies - HTML/text/Slack templates 4. Compliance Reports (internal/report/) - SOC2, GDPR, HIPAA, PCI-DSS, ISO27001 frameworks - Evidence collection from catalog - JSON, Markdown, HTML output formats 5. RTO/RPO Calculator (internal/rto/) - Recovery objective analysis - RTO breakdown by phase - Recommendations for improvement 6. Replica-Aware Backup (internal/replica/) - Topology detection for PostgreSQL/MySQL - Automatic replica selection - Configurable selection strategies 7. Parallel Table Backup (internal/parallel/) - Concurrent table dumps - Worker pool with progress tracking - Large table optimization 8. MySQL/MariaDB PITR (internal/pitr/) - Binary log parsing and replay - Point-in-time recovery support - Transaction filtering CLI commands added: catalog, drill, report, rto All changes support the goal: reliable 3 AM database recovery.
This commit is contained in:
299
internal/catalog/gaps.go
Normal file
299
internal/catalog/gaps.go
Normal file
@@ -0,0 +1,299 @@
|
||||
// Package catalog - Gap detection for backup schedules
|
||||
package catalog
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sort"
|
||||
"time"
|
||||
)
|
||||
|
||||
// DetectGaps analyzes backup history and finds gaps in the schedule
|
||||
func (c *SQLiteCatalog) DetectGaps(ctx context.Context, database string, config *GapDetectionConfig) ([]*Gap, error) {
|
||||
if config == nil {
|
||||
config = &GapDetectionConfig{
|
||||
ExpectedInterval: 24 * time.Hour,
|
||||
Tolerance: time.Hour,
|
||||
RPOThreshold: 48 * time.Hour,
|
||||
}
|
||||
}
|
||||
|
||||
// Get all backups for this database, ordered by time
|
||||
query := &SearchQuery{
|
||||
Database: database,
|
||||
Status: string(StatusCompleted),
|
||||
OrderBy: "created_at",
|
||||
OrderDesc: false,
|
||||
}
|
||||
|
||||
if config.StartDate != nil {
|
||||
query.StartDate = config.StartDate
|
||||
}
|
||||
if config.EndDate != nil {
|
||||
query.EndDate = config.EndDate
|
||||
}
|
||||
|
||||
entries, err := c.Search(ctx, query)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(entries) < 2 {
|
||||
return nil, nil // Not enough backups to detect gaps
|
||||
}
|
||||
|
||||
var gaps []*Gap
|
||||
|
||||
for i := 1; i < len(entries); i++ {
|
||||
prev := entries[i-1]
|
||||
curr := entries[i]
|
||||
|
||||
actualInterval := curr.CreatedAt.Sub(prev.CreatedAt)
|
||||
expectedWithTolerance := config.ExpectedInterval + config.Tolerance
|
||||
|
||||
if actualInterval > expectedWithTolerance {
|
||||
gap := &Gap{
|
||||
Database: database,
|
||||
GapStart: prev.CreatedAt,
|
||||
GapEnd: curr.CreatedAt,
|
||||
Duration: actualInterval,
|
||||
ExpectedAt: prev.CreatedAt.Add(config.ExpectedInterval),
|
||||
}
|
||||
|
||||
// Determine severity
|
||||
if actualInterval > config.RPOThreshold {
|
||||
gap.Severity = SeverityCritical
|
||||
gap.Description = "CRITICAL: Gap exceeds RPO threshold"
|
||||
} else if actualInterval > config.ExpectedInterval*2 {
|
||||
gap.Severity = SeverityWarning
|
||||
gap.Description = "WARNING: Gap exceeds 2x expected interval"
|
||||
} else {
|
||||
gap.Severity = SeverityInfo
|
||||
gap.Description = "INFO: Gap exceeds expected interval"
|
||||
}
|
||||
|
||||
gaps = append(gaps, gap)
|
||||
}
|
||||
}
|
||||
|
||||
// Check for gap from last backup to now
|
||||
lastBackup := entries[len(entries)-1]
|
||||
now := time.Now()
|
||||
if config.EndDate != nil {
|
||||
now = *config.EndDate
|
||||
}
|
||||
|
||||
sinceLastBackup := now.Sub(lastBackup.CreatedAt)
|
||||
if sinceLastBackup > config.ExpectedInterval+config.Tolerance {
|
||||
gap := &Gap{
|
||||
Database: database,
|
||||
GapStart: lastBackup.CreatedAt,
|
||||
GapEnd: now,
|
||||
Duration: sinceLastBackup,
|
||||
ExpectedAt: lastBackup.CreatedAt.Add(config.ExpectedInterval),
|
||||
}
|
||||
|
||||
if sinceLastBackup > config.RPOThreshold {
|
||||
gap.Severity = SeverityCritical
|
||||
gap.Description = "CRITICAL: No backup since " + FormatDuration(sinceLastBackup)
|
||||
} else if sinceLastBackup > config.ExpectedInterval*2 {
|
||||
gap.Severity = SeverityWarning
|
||||
gap.Description = "WARNING: No backup since " + FormatDuration(sinceLastBackup)
|
||||
} else {
|
||||
gap.Severity = SeverityInfo
|
||||
gap.Description = "INFO: Backup overdue by " + FormatDuration(sinceLastBackup-config.ExpectedInterval)
|
||||
}
|
||||
|
||||
gaps = append(gaps, gap)
|
||||
}
|
||||
|
||||
return gaps, nil
|
||||
}
|
||||
|
||||
// DetectAllGaps analyzes all databases for backup gaps
|
||||
func (c *SQLiteCatalog) DetectAllGaps(ctx context.Context, config *GapDetectionConfig) (map[string][]*Gap, error) {
|
||||
databases, err := c.ListDatabases(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
allGaps := make(map[string][]*Gap)
|
||||
|
||||
for _, db := range databases {
|
||||
gaps, err := c.DetectGaps(ctx, db, config)
|
||||
if err != nil {
|
||||
continue // Skip errors for individual databases
|
||||
}
|
||||
if len(gaps) > 0 {
|
||||
allGaps[db] = gaps
|
||||
}
|
||||
}
|
||||
|
||||
return allGaps, nil
|
||||
}
|
||||
|
||||
// BackupFrequencyAnalysis provides analysis of backup frequency
|
||||
type BackupFrequencyAnalysis struct {
|
||||
Database string `json:"database"`
|
||||
TotalBackups int `json:"total_backups"`
|
||||
AnalysisPeriod time.Duration `json:"analysis_period"`
|
||||
AverageInterval time.Duration `json:"average_interval"`
|
||||
MinInterval time.Duration `json:"min_interval"`
|
||||
MaxInterval time.Duration `json:"max_interval"`
|
||||
StdDeviation time.Duration `json:"std_deviation"`
|
||||
Regularity float64 `json:"regularity"` // 0-1, higher is more regular
|
||||
GapsDetected int `json:"gaps_detected"`
|
||||
MissedBackups int `json:"missed_backups"` // Estimated based on expected interval
|
||||
}
|
||||
|
||||
// AnalyzeFrequency analyzes backup frequency for a database
|
||||
func (c *SQLiteCatalog) AnalyzeFrequency(ctx context.Context, database string, expectedInterval time.Duration) (*BackupFrequencyAnalysis, error) {
|
||||
query := &SearchQuery{
|
||||
Database: database,
|
||||
Status: string(StatusCompleted),
|
||||
OrderBy: "created_at",
|
||||
OrderDesc: false,
|
||||
}
|
||||
|
||||
entries, err := c.Search(ctx, query)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(entries) < 2 {
|
||||
return &BackupFrequencyAnalysis{
|
||||
Database: database,
|
||||
TotalBackups: len(entries),
|
||||
}, nil
|
||||
}
|
||||
|
||||
analysis := &BackupFrequencyAnalysis{
|
||||
Database: database,
|
||||
TotalBackups: len(entries),
|
||||
}
|
||||
|
||||
// Calculate intervals
|
||||
var intervals []time.Duration
|
||||
for i := 1; i < len(entries); i++ {
|
||||
interval := entries[i].CreatedAt.Sub(entries[i-1].CreatedAt)
|
||||
intervals = append(intervals, interval)
|
||||
}
|
||||
|
||||
analysis.AnalysisPeriod = entries[len(entries)-1].CreatedAt.Sub(entries[0].CreatedAt)
|
||||
|
||||
// Calculate min, max, average
|
||||
sort.Slice(intervals, func(i, j int) bool {
|
||||
return intervals[i] < intervals[j]
|
||||
})
|
||||
|
||||
analysis.MinInterval = intervals[0]
|
||||
analysis.MaxInterval = intervals[len(intervals)-1]
|
||||
|
||||
var total time.Duration
|
||||
for _, interval := range intervals {
|
||||
total += interval
|
||||
}
|
||||
analysis.AverageInterval = total / time.Duration(len(intervals))
|
||||
|
||||
// Calculate standard deviation
|
||||
var sumSquares float64
|
||||
avgNanos := float64(analysis.AverageInterval.Nanoseconds())
|
||||
for _, interval := range intervals {
|
||||
diff := float64(interval.Nanoseconds()) - avgNanos
|
||||
sumSquares += diff * diff
|
||||
}
|
||||
variance := sumSquares / float64(len(intervals))
|
||||
analysis.StdDeviation = time.Duration(int64(variance)) // Simplified
|
||||
|
||||
// Calculate regularity score (lower deviation = higher regularity)
|
||||
if analysis.AverageInterval > 0 {
|
||||
deviationRatio := float64(analysis.StdDeviation) / float64(analysis.AverageInterval)
|
||||
analysis.Regularity = 1.0 - min(deviationRatio, 1.0)
|
||||
}
|
||||
|
||||
// Detect gaps and missed backups
|
||||
config := &GapDetectionConfig{
|
||||
ExpectedInterval: expectedInterval,
|
||||
Tolerance: expectedInterval / 4,
|
||||
RPOThreshold: expectedInterval * 2,
|
||||
}
|
||||
|
||||
gaps, _ := c.DetectGaps(ctx, database, config)
|
||||
analysis.GapsDetected = len(gaps)
|
||||
|
||||
// Estimate missed backups
|
||||
if expectedInterval > 0 {
|
||||
expectedBackups := int(analysis.AnalysisPeriod / expectedInterval)
|
||||
if expectedBackups > analysis.TotalBackups {
|
||||
analysis.MissedBackups = expectedBackups - analysis.TotalBackups
|
||||
}
|
||||
}
|
||||
|
||||
return analysis, nil
|
||||
}
|
||||
|
||||
// RecoveryPointObjective calculates the current RPO status
|
||||
type RPOStatus struct {
|
||||
Database string `json:"database"`
|
||||
LastBackup time.Time `json:"last_backup"`
|
||||
TimeSinceBackup time.Duration `json:"time_since_backup"`
|
||||
TargetRPO time.Duration `json:"target_rpo"`
|
||||
CurrentRPO time.Duration `json:"current_rpo"`
|
||||
RPOMet bool `json:"rpo_met"`
|
||||
NextBackupDue time.Time `json:"next_backup_due"`
|
||||
BackupsIn24Hours int `json:"backups_in_24h"`
|
||||
BackupsIn7Days int `json:"backups_in_7d"`
|
||||
}
|
||||
|
||||
// CalculateRPOStatus calculates RPO status for a database
|
||||
func (c *SQLiteCatalog) CalculateRPOStatus(ctx context.Context, database string, targetRPO time.Duration) (*RPOStatus, error) {
|
||||
status := &RPOStatus{
|
||||
Database: database,
|
||||
TargetRPO: targetRPO,
|
||||
}
|
||||
|
||||
// Get most recent backup
|
||||
entries, err := c.List(ctx, database, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(entries) == 0 {
|
||||
status.RPOMet = false
|
||||
status.CurrentRPO = time.Duration(0)
|
||||
return status, nil
|
||||
}
|
||||
|
||||
status.LastBackup = entries[0].CreatedAt
|
||||
status.TimeSinceBackup = time.Since(entries[0].CreatedAt)
|
||||
status.CurrentRPO = status.TimeSinceBackup
|
||||
status.RPOMet = status.TimeSinceBackup <= targetRPO
|
||||
status.NextBackupDue = entries[0].CreatedAt.Add(targetRPO)
|
||||
|
||||
// Count backups in time windows
|
||||
now := time.Now()
|
||||
last24h := now.Add(-24 * time.Hour)
|
||||
last7d := now.Add(-7 * 24 * time.Hour)
|
||||
|
||||
count24h, _ := c.Count(ctx, &SearchQuery{
|
||||
Database: database,
|
||||
StartDate: &last24h,
|
||||
Status: string(StatusCompleted),
|
||||
})
|
||||
count7d, _ := c.Count(ctx, &SearchQuery{
|
||||
Database: database,
|
||||
StartDate: &last7d,
|
||||
Status: string(StatusCompleted),
|
||||
})
|
||||
|
||||
status.BackupsIn24Hours = int(count24h)
|
||||
status.BackupsIn7Days = int(count7d)
|
||||
|
||||
return status, nil
|
||||
}
|
||||
|
||||
func min(a, b float64) float64 {
|
||||
if a < b {
|
||||
return a
|
||||
}
|
||||
return b
|
||||
}
|
||||
Reference in New Issue
Block a user