feat: Add enterprise DBA features for production reliability

New features implemented:

1. Backup Catalog (internal/catalog/)
   - SQLite-based backup tracking
   - Gap detection and RPO monitoring
   - Search and statistics
   - Filesystem sync

2. DR Drill Testing (internal/drill/)
   - Automated restore testing in Docker containers
   - Database validation with custom queries
   - Catalog integration for drill-tested status

3. Smart Notifications (internal/notify/)
   - Event batching with configurable intervals
   - Time-based escalation policies
   - HTML/text/Slack templates

4. Compliance Reports (internal/report/)
   - SOC2, GDPR, HIPAA, PCI-DSS, ISO27001 frameworks
   - Evidence collection from catalog
   - JSON, Markdown, HTML output formats

5. RTO/RPO Calculator (internal/rto/)
   - Recovery objective analysis
   - RTO breakdown by phase
   - Recommendations for improvement

6. Replica-Aware Backup (internal/replica/)
   - Topology detection for PostgreSQL/MySQL
   - Automatic replica selection
   - Configurable selection strategies

7. Parallel Table Backup (internal/parallel/)
   - Concurrent table dumps
   - Worker pool with progress tracking
   - Large table optimization

8. MySQL/MariaDB PITR (internal/pitr/)
   - Binary log parsing and replay
   - Point-in-time recovery support
   - Transaction filtering

CLI commands added: catalog, drill, report, rto

All changes support the goal: reliable 3 AM database recovery.
This commit is contained in:
2025-12-13 20:28:55 +01:00
parent d0d83b61ef
commit f69bfe7071
34 changed files with 13469 additions and 41 deletions

View File

@@ -11,41 +11,66 @@ import (
type EventType string
const (
EventBackupStarted EventType = "backup_started"
EventBackupCompleted EventType = "backup_completed"
EventBackupFailed EventType = "backup_failed"
EventRestoreStarted EventType = "restore_started"
EventRestoreCompleted EventType = "restore_completed"
EventRestoreFailed EventType = "restore_failed"
EventCleanupCompleted EventType = "cleanup_completed"
EventVerifyCompleted EventType = "verify_completed"
EventVerifyFailed EventType = "verify_failed"
EventPITRRecovery EventType = "pitr_recovery"
EventBackupStarted EventType = "backup_started"
EventBackupCompleted EventType = "backup_completed"
EventBackupFailed EventType = "backup_failed"
EventRestoreStarted EventType = "restore_started"
EventRestoreCompleted EventType = "restore_completed"
EventRestoreFailed EventType = "restore_failed"
EventCleanupCompleted EventType = "cleanup_completed"
EventVerifyCompleted EventType = "verify_completed"
EventVerifyFailed EventType = "verify_failed"
EventPITRRecovery EventType = "pitr_recovery"
EventVerificationPassed EventType = "verification_passed"
EventVerificationFailed EventType = "verification_failed"
EventDRDrillPassed EventType = "dr_drill_passed"
EventDRDrillFailed EventType = "dr_drill_failed"
EventGapDetected EventType = "gap_detected"
EventRPOViolation EventType = "rpo_violation"
)
// Severity represents the severity level of a notification
type Severity string
const (
SeverityInfo Severity = "info"
SeverityWarning Severity = "warning"
SeverityError Severity = "error"
SeverityInfo Severity = "info"
SeveritySuccess Severity = "success"
SeverityWarning Severity = "warning"
SeverityError Severity = "error"
SeverityCritical Severity = "critical"
)
// severityOrder returns numeric order for severity comparison
func severityOrder(s Severity) int {
switch s {
case SeverityInfo:
return 0
case SeveritySuccess:
return 1
case SeverityWarning:
return 2
case SeverityError:
return 3
case SeverityCritical:
return 4
default:
return 0
}
}
// Event represents a notification event
type Event struct {
Type EventType `json:"type"`
Severity Severity `json:"severity"`
Timestamp time.Time `json:"timestamp"`
Database string `json:"database,omitempty"`
Message string `json:"message"`
Details map[string]string `json:"details,omitempty"`
Error string `json:"error,omitempty"`
Duration time.Duration `json:"duration,omitempty"`
BackupFile string `json:"backup_file,omitempty"`
BackupSize int64 `json:"backup_size,omitempty"`
Hostname string `json:"hostname,omitempty"`
Type EventType `json:"type"`
Severity Severity `json:"severity"`
Timestamp time.Time `json:"timestamp"`
Database string `json:"database,omitempty"`
Message string `json:"message"`
Details map[string]string `json:"details,omitempty"`
Error string `json:"error,omitempty"`
Duration time.Duration `json:"duration,omitempty"`
BackupFile string `json:"backup_file,omitempty"`
BackupSize int64 `json:"backup_size,omitempty"`
Hostname string `json:"hostname,omitempty"`
}
// NewEvent creates a new notification event
@@ -132,27 +157,27 @@ type Config struct {
WebhookSecret string // For signing payloads
// General settings
OnSuccess bool // Send notifications on successful operations
OnFailure bool // Send notifications on failed operations
OnWarning bool // Send notifications on warnings
MinSeverity Severity
Retries int // Number of retry attempts
RetryDelay time.Duration // Delay between retries
OnSuccess bool // Send notifications on successful operations
OnFailure bool // Send notifications on failed operations
OnWarning bool // Send notifications on warnings
MinSeverity Severity
Retries int // Number of retry attempts
RetryDelay time.Duration // Delay between retries
}
// DefaultConfig returns a configuration with sensible defaults
func DefaultConfig() Config {
return Config{
SMTPPort: 587,
SMTPTLS: false,
SMTPStartTLS: true,
SMTPPort: 587,
SMTPTLS: false,
SMTPStartTLS: true,
WebhookMethod: "POST",
OnSuccess: true,
OnFailure: true,
OnWarning: true,
MinSeverity: SeverityInfo,
Retries: 3,
RetryDelay: 5 * time.Second,
OnSuccess: true,
OnFailure: true,
OnWarning: true,
MinSeverity: SeverityInfo,
Retries: 3,
RetryDelay: 5 * time.Second,
}
}