Files
dbbackup/internal/catalog/gaps.go
Alexander Renz f69bfe7071 feat: Add enterprise DBA features for production reliability
New features implemented:

1. Backup Catalog (internal/catalog/)
   - SQLite-based backup tracking
   - Gap detection and RPO monitoring
   - Search and statistics
   - Filesystem sync

2. DR Drill Testing (internal/drill/)
   - Automated restore testing in Docker containers
   - Database validation with custom queries
   - Catalog integration for drill-tested status

3. Smart Notifications (internal/notify/)
   - Event batching with configurable intervals
   - Time-based escalation policies
   - HTML/text/Slack templates

4. Compliance Reports (internal/report/)
   - SOC2, GDPR, HIPAA, PCI-DSS, ISO27001 frameworks
   - Evidence collection from catalog
   - JSON, Markdown, HTML output formats

5. RTO/RPO Calculator (internal/rto/)
   - Recovery objective analysis
   - RTO breakdown by phase
   - Recommendations for improvement

6. Replica-Aware Backup (internal/replica/)
   - Topology detection for PostgreSQL/MySQL
   - Automatic replica selection
   - Configurable selection strategies

7. Parallel Table Backup (internal/parallel/)
   - Concurrent table dumps
   - Worker pool with progress tracking
   - Large table optimization

8. MySQL/MariaDB PITR (internal/pitr/)
   - Binary log parsing and replay
   - Point-in-time recovery support
   - Transaction filtering

CLI commands added: catalog, drill, report, rto

All changes support the goal: reliable 3 AM database recovery.
2025-12-13 20:28:55 +01:00

300 lines
8.3 KiB
Go

// Package catalog - Gap detection for backup schedules
package catalog
import (
	"context"
	"math"
	"sort"
	"time"
)
// DetectGaps analyzes the completed-backup history of one database and reports
// every interval that is longer than the expected schedule allows.
//
// When config is nil a default schedule is used: a 24h expected interval, a 1h
// tolerance and a 48h RPO threshold. config.StartDate and config.EndDate are
// forwarded to the search; EndDate, when set, also replaces the current time as
// the reference point for the trailing check.
//
// Two kinds of gaps are reported:
//   - a gap between two consecutive backups that are further apart than
//     ExpectedInterval+Tolerance;
//   - a trailing gap between the most recent backup and the reference time.
//
// Severity is critical above RPOThreshold, warning above twice the expected
// interval, and info otherwise.
//
// It returns nil gaps when the search finds no backups. A single backup cannot
// form an interval gap, but it is still compared against the reference time so
// that a lone, stale backup is reported instead of being silently ignored.
func (c *SQLiteCatalog) DetectGaps(ctx context.Context, database string, config *GapDetectionConfig) ([]*Gap, error) {
	if config == nil {
		config = &GapDetectionConfig{
			ExpectedInterval: 24 * time.Hour,
			Tolerance:        time.Hour,
			RPOThreshold:     48 * time.Hour,
		}
	}

	// Completed backups only, requested in ascending created_at order so that
	// neighbouring entries are consecutive in time.
	entries, err := c.Search(ctx, &SearchQuery{
		Database:  database,
		Status:    string(StatusCompleted),
		OrderBy:   "created_at",
		OrderDesc: false,
		StartDate: config.StartDate,
		EndDate:   config.EndDate,
	})
	if err != nil {
		return nil, err
	}
	if len(entries) == 0 {
		return nil, nil // Nothing to compare against
	}

	maxAllowed := config.ExpectedInterval + config.Tolerance

	var gaps []*Gap
	for i := 1; i < len(entries); i++ {
		prev, curr := entries[i-1], entries[i]
		actual := curr.CreatedAt.Sub(prev.CreatedAt)
		if actual <= maxAllowed {
			continue
		}

		gap := &Gap{
			Database:   database,
			GapStart:   prev.CreatedAt,
			GapEnd:     curr.CreatedAt,
			Duration:   actual,
			ExpectedAt: prev.CreatedAt.Add(config.ExpectedInterval),
		}
		switch {
		case actual > config.RPOThreshold:
			gap.Severity = SeverityCritical
			gap.Description = "CRITICAL: Gap exceeds RPO threshold"
		case actual > config.ExpectedInterval*2:
			gap.Severity = SeverityWarning
			gap.Description = "WARNING: Gap exceeds 2x expected interval"
		default:
			gap.Severity = SeverityInfo
			gap.Description = "INFO: Gap exceeds expected interval"
		}
		gaps = append(gaps, gap)
	}

	// Trailing check: how long has it been since the most recent backup?
	lastBackup := entries[len(entries)-1]
	now := time.Now()
	if config.EndDate != nil {
		now = *config.EndDate
	}
	sinceLastBackup := now.Sub(lastBackup.CreatedAt)
	if sinceLastBackup <= maxAllowed {
		return gaps, nil
	}

	gap := &Gap{
		Database:   database,
		GapStart:   lastBackup.CreatedAt,
		GapEnd:     now,
		Duration:   sinceLastBackup,
		ExpectedAt: lastBackup.CreatedAt.Add(config.ExpectedInterval),
	}
	switch {
	case sinceLastBackup > config.RPOThreshold:
		gap.Severity = SeverityCritical
		gap.Description = "CRITICAL: No backup since " + FormatDuration(sinceLastBackup)
	case sinceLastBackup > config.ExpectedInterval*2:
		gap.Severity = SeverityWarning
		gap.Description = "WARNING: No backup since " + FormatDuration(sinceLastBackup)
	default:
		gap.Severity = SeverityInfo
		gap.Description = "INFO: Backup overdue by " + FormatDuration(sinceLastBackup-config.ExpectedInterval)
	}
	return append(gaps, gap), nil
}
// DetectAllGaps runs DetectGaps for every database returned by ListDatabases
// and collects the results keyed by database name. Databases for which no gaps
// were found are left out of the map.
//
// A failure for an individual database is skipped so that one broken database
// does not hide gaps in the others. The exception is a cancelled or expired
// context: in that case the context error is returned, because an empty or
// partial map with a nil error would be indistinguishable from "no gaps".
func (c *SQLiteCatalog) DetectAllGaps(ctx context.Context, config *GapDetectionConfig) (map[string][]*Gap, error) {
	databases, err := c.ListDatabases(ctx)
	if err != nil {
		return nil, err
	}

	allGaps := make(map[string][]*Gap)
	for _, db := range databases {
		gaps, err := c.DetectGaps(ctx, db, config)
		if err != nil {
			if ctxErr := ctx.Err(); ctxErr != nil {
				return nil, ctxErr
			}
			continue // Skip errors for individual databases
		}
		if len(gaps) > 0 {
			allGaps[db] = gaps
		}
	}
	return allGaps, nil
}
// BackupFrequencyAnalysis summarizes how regularly a database has been backed
// up. It is the result type of AnalyzeFrequency.
type BackupFrequencyAnalysis struct {
	// Database is the name of the analyzed database.
	Database string `json:"database"`
	// TotalBackups is the number of backups the analysis was based on.
	TotalBackups int `json:"total_backups"`
	// AnalysisPeriod is the time between the first and the last backup.
	AnalysisPeriod time.Duration `json:"analysis_period"`
	// AverageInterval is the mean time between consecutive backups.
	AverageInterval time.Duration `json:"average_interval"`
	// MinInterval is the shortest time between consecutive backups.
	MinInterval time.Duration `json:"min_interval"`
	// MaxInterval is the longest time between consecutive backups.
	MaxInterval time.Duration `json:"max_interval"`
	// StdDeviation describes how far the intervals spread around
	// AverageInterval.
	StdDeviation time.Duration `json:"std_deviation"`
	// Regularity is a score in the range 0-1; higher means more regular.
	Regularity float64 `json:"regularity"`
	// GapsDetected is the number of gaps found for the database.
	GapsDetected int `json:"gaps_detected"`
	// MissedBackups is an estimate based on the expected interval.
	MissedBackups int `json:"missed_backups"`
}
// AnalyzeFrequency computes interval statistics over the completed backups of
// a database and compares them with expectedInterval.
//
// With fewer than two backups there are no intervals to measure, so only
// Database and TotalBackups are filled in. Otherwise the result holds the
// analysis period (first to last backup), the minimum, maximum and average
// interval, the population standard deviation of the intervals, a regularity
// score in [0, 1] (1 minus the deviation/average ratio, clamped at 0), the
// number of gaps reported by DetectGaps, and an estimate of missed backups.
//
// An error is returned only when the backup search itself fails.
func (c *SQLiteCatalog) AnalyzeFrequency(ctx context.Context, database string, expectedInterval time.Duration) (*BackupFrequencyAnalysis, error) {
	entries, err := c.Search(ctx, &SearchQuery{
		Database:  database,
		Status:    string(StatusCompleted),
		OrderBy:   "created_at",
		OrderDesc: false,
	})
	if err != nil {
		return nil, err
	}

	analysis := &BackupFrequencyAnalysis{
		Database:     database,
		TotalBackups: len(entries),
	}
	if len(entries) < 2 {
		return analysis, nil
	}

	// Intervals between consecutive backups; n backups yield n-1 intervals.
	intervals := make([]time.Duration, 0, len(entries)-1)
	for i := 1; i < len(entries); i++ {
		intervals = append(intervals, entries[i].CreatedAt.Sub(entries[i-1].CreatedAt))
	}
	analysis.AnalysisPeriod = entries[len(entries)-1].CreatedAt.Sub(entries[0].CreatedAt)

	// After sorting, the extremes sit at both ends of the slice.
	sort.Slice(intervals, func(i, j int) bool {
		return intervals[i] < intervals[j]
	})
	analysis.MinInterval = intervals[0]
	analysis.MaxInterval = intervals[len(intervals)-1]

	var total time.Duration
	for _, interval := range intervals {
		total += interval
	}
	analysis.AverageInterval = total / time.Duration(len(intervals))

	// Population standard deviation. The square root is essential: the
	// variance is in squared nanoseconds and neither fits a time.Duration nor
	// compares meaningfully with the average interval.
	var sumSquares float64
	avgNanos := float64(analysis.AverageInterval.Nanoseconds())
	for _, interval := range intervals {
		diff := float64(interval.Nanoseconds()) - avgNanos
		sumSquares += diff * diff
	}
	variance := sumSquares / float64(len(intervals))
	analysis.StdDeviation = time.Duration(math.Sqrt(variance))

	// Regularity score (lower deviation = higher regularity).
	if analysis.AverageInterval > 0 {
		deviationRatio := float64(analysis.StdDeviation) / float64(analysis.AverageInterval)
		analysis.Regularity = 1.0 - math.Min(deviationRatio, 1.0)
	}

	// Gap detection is best-effort: a failure leaves GapsDetected at zero
	// instead of discarding the statistics computed above.
	config := &GapDetectionConfig{
		ExpectedInterval: expectedInterval,
		Tolerance:        expectedInterval / 4,
		RPOThreshold:     expectedInterval * 2,
	}
	if gaps, err := c.DetectGaps(ctx, database, config); err == nil {
		analysis.GapsDetected = len(gaps)
	}

	// Estimate missed backups. A period spanning k expected intervals should
	// contain k+1 backups, because both the first and the last one count.
	if expectedInterval > 0 {
		expectedBackups := int(analysis.AnalysisPeriod/expectedInterval) + 1
		if expectedBackups > analysis.TotalBackups {
			analysis.MissedBackups = expectedBackups - analysis.TotalBackups
		}
	}
	return analysis, nil
}
// RPOStatus describes the current recovery point objective (RPO) situation of
// a database. It is the result type of CalculateRPOStatus.
type RPOStatus struct {
	// Database is the name of the database the status refers to.
	Database string `json:"database"`
	// LastBackup is the creation time of the most recent backup; it is the
	// zero time when no backup was found.
	LastBackup time.Time `json:"last_backup"`
	// TimeSinceBackup is the time elapsed since LastBackup.
	TimeSinceBackup time.Duration `json:"time_since_backup"`
	// TargetRPO is the RPO the database is measured against.
	TargetRPO time.Duration `json:"target_rpo"`
	// CurrentRPO is the RPO currently achieved.
	CurrentRPO time.Duration `json:"current_rpo"`
	// RPOMet reports whether the target RPO is currently satisfied.
	RPOMet bool `json:"rpo_met"`
	// NextBackupDue is the time by which the next backup is due.
	NextBackupDue time.Time `json:"next_backup_due"`
	// BackupsIn24Hours is the number of backups in the last 24 hours.
	BackupsIn24Hours int `json:"backups_in_24h"`
	// BackupsIn7Days is the number of backups in the last 7 days.
	BackupsIn7Days int `json:"backups_in_7d"`
}
// CalculateRPOStatus reports how a database currently stands against
// targetRPO.
//
// When no backup entry is found, the returned status carries only Database and
// TargetRPO: RPOMet is false and every other field keeps its zero value.
// Otherwise the elapsed time since the backup is compared with targetRPO, the
// next backup is considered due targetRPO after it, and the completed backups
// of the last 24 hours and 7 days are counted.
//
// All time arithmetic uses a single clock reading so the derived fields are
// consistent with each other. An error from listing or counting backups is
// returned as-is; a failed count is never reported as zero backups.
func (c *SQLiteCatalog) CalculateRPOStatus(ctx context.Context, database string, targetRPO time.Duration) (*RPOStatus, error) {
	status := &RPOStatus{
		Database:  database,
		TargetRPO: targetRPO,
	}

	// A single entry is requested and treated as the most recent backup.
	// NOTE(review): confirm that List orders newest-first and whether it also
	// returns backups that did not complete.
	entries, err := c.List(ctx, database, 1)
	if err != nil {
		return nil, err
	}
	if len(entries) == 0 {
		// No backup at all: the RPO cannot be met; CurrentRPO stays zero.
		return status, nil
	}

	now := time.Now()
	lastBackup := entries[0].CreatedAt

	status.LastBackup = lastBackup
	status.TimeSinceBackup = now.Sub(lastBackup)
	status.CurrentRPO = status.TimeSinceBackup
	status.RPOMet = status.TimeSinceBackup <= targetRPO
	status.NextBackupDue = lastBackup.Add(targetRPO)

	// Count completed backups in the trailing time windows.
	last24h := now.Add(-24 * time.Hour)
	last7d := now.Add(-7 * 24 * time.Hour)

	count24h, err := c.Count(ctx, &SearchQuery{
		Database:  database,
		StartDate: &last24h,
		Status:    string(StatusCompleted),
	})
	if err != nil {
		return nil, err
	}
	count7d, err := c.Count(ctx, &SearchQuery{
		Database:  database,
		StartDate: &last7d,
		Status:    string(StatusCompleted),
	})
	if err != nil {
		return nil, err
	}

	status.BackupsIn24Hours = int(count24h)
	status.BackupsIn7Days = int(count7d)
	return status, nil
}
// min returns a when a is strictly less than b, and b in every other case
// (including when the comparison is false because one operand is NaN).
func min(a, b float64) float64 {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}