- Add backup_type label (full/incremental/pitr_base) to core metrics
- Add new dbbackup_backup_by_type metric for backup type distribution
- Add complete PITR metrics: pitr_enabled, pitr_archive_lag_seconds, pitr_chain_valid, pitr_gap_count, pitr_recovery_window_minutes
- Add PITR-specific alerting rules for archive lag and chain integrity
- Update METRICS.md and EXPORTER.md documentation
- Bump version to 4.1.0
221 lines
9.1 KiB
YAML
221 lines
9.1 KiB
YAML
# DBBackup Prometheus Alerting Rules
# Deploy these to your Prometheus server or use Grafana Alerting
#
# Usage with Prometheus:
#   Add to prometheus.yml:
#     rule_files:
#       - /path/to/alerting-rules.yaml
#
# Usage with Grafana Alerting:
#   Import these as Grafana alert rules via the UI or provisioning
groups:
  # Single rule group that holds every DBBackup alert defined in this file.
  - name: dbbackup_alerts
    # Evaluate the rules of this group once per minute.
    interval: 1m
    rules:
# Critical: No backup in 24 hours
# Fires once dbbackup_rpo_seconds has stayed above 86400 s (24 h) for 5 minutes.
      - alert: DBBackupRPOCritical
        expr: dbbackup_rpo_seconds > 86400
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'No backup for {{ $labels.database }} in 24+ hours'
          # $value is the RPO in seconds; humanizeDuration renders it readably.
          description: |
            Database {{ $labels.database }} on {{ $labels.server }} has not been
            backed up in {{ $value | humanizeDuration }}. This exceeds the 24-hour
            RPO threshold. Immediate investigation required.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#rpo-critical'
# Warning: No backup in 12 hours
# Covers the 12 h - 24 h band only; above 86400 s DBBackupRPOCritical takes over.
      - alert: DBBackupRPOWarning
        expr: dbbackup_rpo_seconds > 43200 and dbbackup_rpo_seconds <= 86400
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'No backup for {{ $labels.database }} in 12+ hours'
          description: |
            Database {{ $labels.database }} on {{ $labels.server }} has not been
            backed up in {{ $value | humanizeDuration }}. Check backup schedule.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#rpo-warning'
# Critical: Backup failures detected
# Fires when the failure-status backup counter grew at all during the last hour.
      - alert: DBBackupFailure
        expr: increase(dbbackup_backup_total{status="failure"}[1h]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Backup failure detected for {{ $labels.database }}'
          description: |
            One or more backup attempts failed for {{ $labels.database }} on
            {{ $labels.server }} in the last hour. Check logs for details.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#backup-failure'
# Warning: Backup not verified
# Fires only after dbbackup_backup_verified has been 0 for a full 24 hours.
      - alert: DBBackupNotVerified
        expr: dbbackup_backup_verified == 0
        for: 24h
        labels:
          severity: warning
        annotations:
          summary: 'Backup for {{ $labels.database }} not verified'
          description: |
            The latest backup for {{ $labels.database }} on {{ $labels.server }}
            has not been verified. Consider running verification to ensure
            backup integrity.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#verification'
# Warning: Dedup ratio dropping
# Fires when dbbackup_dedup_ratio has stayed below 0.1 for one hour.
      - alert: DBBackupDedupRatioLow
        expr: dbbackup_dedup_ratio < 0.1
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Low deduplication ratio on {{ $labels.server }}"
          # Assumes the metric is a 0-1 fraction (consistent with the < 0.1
          # threshold) - confirm against the exporter. humanizePercentage
          # scales the fraction by 100; the previous printf "%.1f%%" printed the
          # raw fraction followed by a percent sign (0.05 showed as ~"0.1%").
          description: |
            Deduplication ratio on {{ $labels.server }} is {{ $value | humanizePercentage }}.
            This may indicate changes in data patterns or dedup configuration issues.
          runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#dedup-low"
# Warning: Dedup disk usage growing rapidly
# Fires when the 30-day linear projection (fitted over the last 7 days) of
# dedup disk usage exceeds twice the current usage for one hour.
      - alert: DBBackupDedupDiskGrowth
        expr: |
          predict_linear(dbbackup_dedup_disk_usage_bytes[7d], 30*24*3600) >
          (dbbackup_dedup_disk_usage_bytes * 2)
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Rapid dedup storage growth on {{ $labels.server }}"
          # A PromQL comparison keeps the left-hand sample value, so $value is
          # the predict_linear projection, not the current usage; the text
          # below is labelled accordingly.
          description: |
            Dedup storage on {{ $labels.server }} is growing rapidly.
            At current rate, usage will more than double in 30 days.
            Projected usage in 30 days: {{ $value | humanize1024 }}B
          runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#storage-growth"
# PITR: Archive lag high
# Fires when archive lag has exceeded 600 s (10 min) for 5 minutes.
      - alert: DBBackupPITRArchiveLag
        expr: dbbackup_pitr_archive_lag_seconds > 600
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'PITR archive lag high for {{ $labels.database }}'
          description: |
            WAL/binlog archiving for {{ $labels.database }} on {{ $labels.server }}
            is {{ $value | humanizeDuration }} behind. This reduces the PITR
            recovery point. Check archive process and disk space.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-archive-lag'
# PITR: Archive lag critical
# Fires when archive lag has exceeded 1800 s (30 min) for 5 minutes.
      - alert: DBBackupPITRArchiveLagCritical
        expr: dbbackup_pitr_archive_lag_seconds > 1800
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'PITR archive severely behind for {{ $labels.database }}'
          description: |
            WAL/binlog archiving for {{ $labels.database }} is {{ $value | humanizeDuration }}
            behind. Point-in-time recovery capability is at risk. Immediate action required.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-archive-critical'
# PITR: Chain broken (gaps detected)
# Fires when dbbackup_pitr_chain_valid has reported 0 for one minute.
      - alert: DBBackupPITRChainBroken
        expr: dbbackup_pitr_chain_valid == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'PITR chain broken for {{ $labels.database }}'
          description: |
            The WAL/binlog chain for {{ $labels.database }} on {{ $labels.server }}
            has gaps. Point-in-time recovery to arbitrary points is NOT possible.
            A new base backup is required to restore PITR capability.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-chain-broken'
# PITR: Gaps in chain
# Fires when the gap counter has been non-zero for 5 minutes; $value is the
# gap count and is interpolated into both annotations.
      - alert: DBBackupPITRGapsDetected
        expr: dbbackup_pitr_gap_count > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'PITR chain has {{ $value }} gaps for {{ $labels.database }}'
          description: |
            {{ $value }} gaps detected in WAL/binlog chain for {{ $labels.database }}.
            Recovery to points within gaps will fail. Consider taking a new base backup.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-gaps'
# PITR: Unexpectedly disabled
# Fires when PITR reports disabled while archived logs still exist for the
# same database, sustained for 10 minutes.
      - alert: DBBackupPITRDisabled
        # NOTE(review): series are matched on the `database` label only, so
        # equally named databases on different servers can match each other -
        # confirm whether `server` should be part of the on(...) clause.
        expr: |
          dbbackup_pitr_enabled == 0
          and on(database) dbbackup_pitr_archive_count > 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'PITR unexpectedly disabled for {{ $labels.database }}'
          description: |
            PITR was previously enabled for {{ $labels.database }} (has archived logs)
            but is now disabled. This may indicate a configuration issue or
            database restart without PITR settings.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-disabled'
# Backup type: No full backups recently
# Fires when the last successful full backup is older than 604800 s (7 days),
# sustained for one hour.
      - alert: DBBackupNoRecentFullBackup
        expr: |
          time() - dbbackup_last_success_timestamp{backup_type="full"} > 604800
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: 'No full backup in 7+ days for {{ $labels.database }}'
          description: |
            Database {{ $labels.database }} has not had a full backup in over 7 days.
            Incremental backups depend on a valid full backup base.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#no-full-backup'
# Warning: Exporter not responding
# Fires when the `up` series for job "dbbackup" has been 0 for 5 minutes.
      - alert: DBBackupExporterDown
        expr: up{job="dbbackup"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'DBBackup exporter is down on {{ $labels.instance }}'
          description: |
            The DBBackup Prometheus exporter on {{ $labels.instance }} is not
            responding. Metrics collection is affected.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#exporter-down'
# Warning: Metrics stale (scrape timestamp old)
# Fires when dbbackup_scrape_timestamp is more than 600 s (10 min) behind the
# evaluation time, sustained for 5 minutes.
      - alert: DBBackupMetricsStale
        expr: time() - dbbackup_scrape_timestamp > 600
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'DBBackup metrics are stale on {{ $labels.server }}'
          # $value is the age of the last scrape timestamp in seconds.
          description: |
            Metrics for {{ $labels.server }} haven't been updated in
            {{ $value | humanizeDuration }}. The exporter may be having issues.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#metrics-stale'
# Critical: No successful backups ever
# Fires when the success-status backup counter has been exactly 0 for one hour.
      - alert: DBBackupNeverSucceeded
        expr: dbbackup_backup_total{status="success"} == 0
        for: 1h
        labels:
          severity: critical
        annotations:
          summary: 'No successful backups for {{ $labels.database }}'
          description: |
            Database {{ $labels.database }} on {{ $labels.server }} has never
            had a successful backup. This requires immediate attention.
          runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#never-succeeded'