Files
dbbackup/grafana/alerting-rules.yaml
Alexander Renz 1747365d0d
All checks were successful
CI/CD / Test (push) Successful in 1m54s
CI/CD / Lint (push) Successful in 1m47s
CI/CD / Integration Tests (push) Successful in 1m28s
CI/CD / Build & Release (push) Successful in 10m57s
feat(metrics): add backup_type label and PITR metrics
- Add backup_type label (full/incremental/pitr_base) to core metrics
- Add new dbbackup_backup_by_type metric for backup type distribution
- Add complete PITR metrics: pitr_enabled, pitr_archive_lag_seconds,
  pitr_chain_valid, pitr_gap_count, pitr_recovery_window_minutes
- Add PITR-specific alerting rules for archive lag and chain integrity
- Update METRICS.md and EXPORTER.md documentation
- Bump version to 4.1.0
2026-01-27 14:44:27 +01:00

221 lines
9.1 KiB
YAML

# DBBackup Prometheus Alerting Rules
# Deploy these to your Prometheus server or use Grafana Alerting
#
# Usage with Prometheus:
# Add to prometheus.yml:
# rule_files:
# - /path/to/alerting-rules.yaml
#
# Usage with Grafana Alerting:
# Import these as Grafana alert rules via the UI or provisioning
# Rule group definition: group name, evaluation interval, and the list of
# alert rules that follows.
# NOTE(review): `interval` and `rules` look like they are meant to be fields of
# the `dbbackup_alerts` group item, but they sit at the same column as
# `groups:` here. Restore the nesting before loading this file — TODO confirm
# against the committed original.
groups:
- name: dbbackup_alerts
# Evaluation interval for the rules of this group.
interval: 1m
rules:
# Critical — the reported RPO exceeds 24 hours (86400 s).
- alert: DBBackupRPOCritical
  expr: dbbackup_rpo_seconds > 86400
  # The condition has to hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'No backup for {{ $labels.database }} in 24+ hours'
    # $value is the RPO in seconds; humanizeDuration renders it readably.
    description: |
      Database {{ $labels.database }} on {{ $labels.server }} has not been
      backed up in {{ $value | humanizeDuration }}. This exceeds the 24-hour
      RPO threshold. Immediate investigation required.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#rpo-critical'
# Warning — the reported RPO is above 12 hours (43200 s) but not above
# 24 hours (86400 s); beyond that DBBackupRPOCritical takes over.
- alert: DBBackupRPOWarning
  expr: dbbackup_rpo_seconds > 43200 and dbbackup_rpo_seconds <= 86400
  # The condition has to hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'No backup for {{ $labels.database }} in 12+ hours'
    description: |
      Database {{ $labels.database }} on {{ $labels.server }} has not been
      backed up in {{ $value | humanizeDuration }}. Check backup schedule.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#rpo-warning'
# Critical — the failure counter increased within the last hour.
- alert: DBBackupFailure
  expr: increase(dbbackup_backup_total{status="failure"}[1h]) > 0
  # Short hold time: one minute of a non-zero increase is enough to fire.
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'Backup failure detected for {{ $labels.database }}'
    description: |
      One or more backup attempts failed for {{ $labels.database }} on
      {{ $labels.server }} in the last hour. Check logs for details.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#backup-failure'
# Warning — the verification gauge has read 0 for a full day.
- alert: DBBackupNotVerified
  expr: dbbackup_backup_verified == 0
  # Fires only after the gauge has stayed at 0 for 24 hours.
  for: 24h
  labels:
    severity: warning
  annotations:
    summary: 'Backup for {{ $labels.database }} not verified'
    description: |
      The latest backup for {{ $labels.database }} on {{ $labels.server }}
      has not been verified. Consider running verification to ensure
      backup integrity.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#verification'
# Warning — deduplication ratio has stayed below 0.1 for an hour.
# NOTE(review): the 0.1 threshold implies the metric is a 0–1 fraction
# (0.1 = 10%) — confirm against the exporter's metric documentation.
- alert: DBBackupDedupRatioLow
  expr: dbbackup_dedup_ratio < 0.1
  # The condition has to hold for 1 hour before the alert fires.
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: 'Low deduplication ratio on {{ $labels.server }}'
    # humanizePercentage scales the 0–1 fraction to a percentage (0.05 -> 5%).
    # The previous `printf "%.1f%%"` printed the raw fraction followed by a
    # percent sign, understating the ratio by a factor of 100.
    description: |
      Deduplication ratio on {{ $labels.server }} is {{ $value | humanizePercentage }}.
      This may indicate changes in data patterns or dedup configuration issues.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#dedup-low'
# Warning — a linear projection of the last 7 days of dedup disk usage,
# extended 30 days ahead, exceeds twice the current usage.
- alert: DBBackupDedupDiskGrowth
  # 30*24*3600 is the 30-day prediction horizon in seconds.
  expr: |
    predict_linear(dbbackup_dedup_disk_usage_bytes[7d], 30*24*3600) >
    (dbbackup_dedup_disk_usage_bytes * 2)
  # The condition has to hold for 1 hour before the alert fires.
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: 'Rapid dedup storage growth on {{ $labels.server }}'
    # $value is the left-hand side of the comparison, i.e. the projected usage
    # in 30 days — not the current usage — so the text labels it as such.
    description: |
      Dedup storage on {{ $labels.server }} is growing rapidly.
      At the current rate, usage will more than double within 30 days.
      Projected usage in 30 days: {{ $value | humanize1024 }}B
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#storage-growth'
# PITR warning — archive lag is above 10 minutes (600 s).
# There is no upper bound, so this rule stays active alongside
# DBBackupPITRArchiveLagCritical once the lag passes 1800 s.
- alert: DBBackupPITRArchiveLag
  expr: dbbackup_pitr_archive_lag_seconds > 600
  # The condition has to hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'PITR archive lag high for {{ $labels.database }}'
    description: |
      WAL/binlog archiving for {{ $labels.database }} on {{ $labels.server }}
      is {{ $value | humanizeDuration }} behind. This reduces the PITR
      recovery point. Check archive process and disk space.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-archive-lag'
# PITR critical — archive lag is above 30 minutes (1800 s).
- alert: DBBackupPITRArchiveLagCritical
  expr: dbbackup_pitr_archive_lag_seconds > 1800
  # The condition has to hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'PITR archive severely behind for {{ $labels.database }}'
    description: |
      WAL/binlog archiving for {{ $labels.database }} is {{ $value | humanizeDuration }}
      behind. Point-in-time recovery capability is at risk. Immediate action required.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-archive-critical'
# PITR critical — the chain-validity gauge reads 0.
- alert: DBBackupPITRChainBroken
  expr: dbbackup_pitr_chain_valid == 0
  # Short hold time: one minute at 0 is enough to fire.
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'PITR chain broken for {{ $labels.database }}'
    description: |
      The WAL/binlog chain for {{ $labels.database }} on {{ $labels.server }}
      has gaps. Point-in-time recovery to arbitrary points is NOT possible.
      A new base backup is required to restore PITR capability.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-chain-broken'
# PITR warning — the gap counter is non-zero.
- alert: DBBackupPITRGapsDetected
  expr: dbbackup_pitr_gap_count > 0
  # The condition has to hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    severity: warning
  annotations:
    # $value is the number of gaps reported by the metric.
    summary: 'PITR chain has {{ $value }} gaps for {{ $labels.database }}'
    description: |
      {{ $value }} gaps detected in WAL/binlog chain for {{ $labels.database }}.
      Recovery to points within gaps will fail. Consider taking a new base backup.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-gaps'
# PITR critical — PITR reads as disabled although archived logs exist for the
# same database, which suggests it was enabled earlier.
- alert: DBBackupPITRDisabled
  # Match on both server and database: matching on database alone would let
  # archives of a same-named database on a different server satisfy the
  # right-hand side and raise a false alert.
  # NOTE(review): assumes both metrics carry `server` and `database` labels,
  # as the other rules' annotations suggest — confirm.
  expr: |
    dbbackup_pitr_enabled == 0
    and on(server, database) dbbackup_pitr_archive_count > 0
  # The condition has to hold for 10 minutes before the alert fires.
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: 'PITR unexpectedly disabled for {{ $labels.database }}'
    description: |
      PITR was previously enabled for {{ $labels.database }} (has archived logs)
      but is now disabled. This may indicate a configuration issue or
      database restart without PITR settings.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-disabled'
# Backup type warning — the last successful full backup is older than
# 7 days (604800 s).
- alert: DBBackupNoRecentFullBackup
  # Age = current time minus the last-success timestamp of full backups only.
  expr: |
    time() - dbbackup_last_success_timestamp{backup_type="full"} > 604800
  # The condition has to hold for 1 hour before the alert fires.
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: 'No full backup in 7+ days for {{ $labels.database }}'
    description: |
      Database {{ $labels.database }} has not had a full backup in over 7 days.
      Incremental backups depend on a valid full backup base.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#no-full-backup'
# Warning — the scrape target of job "dbbackup" reports up == 0.
- alert: DBBackupExporterDown
  expr: up{job="dbbackup"} == 0
  # The condition has to hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'DBBackup exporter is down on {{ $labels.instance }}'
    description: |
      The DBBackup Prometheus exporter on {{ $labels.instance }} is not
      responding. Metrics collection is affected.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#exporter-down'
# Warning — the exporter's scrape timestamp is more than 10 minutes (600 s)
# behind the current time.
- alert: DBBackupMetricsStale
  expr: time() - dbbackup_scrape_timestamp > 600
  # The condition has to hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'DBBackup metrics are stale on {{ $labels.server }}'
    # $value is the age of the metrics in seconds.
    description: |
      Metrics for {{ $labels.server }} haven't been updated in
      {{ $value | humanizeDuration }}. The exporter may be having issues.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#metrics-stale'
# Critical — the success counter, summed over all backup types, is still 0.
- alert: DBBackupNeverSucceeded
  # Aggregate away backup_type so that a zero-valued series for one type
  # (e.g. no incremental backups yet) cannot raise "never succeeded" while
  # another type has succeeded. All other labels are kept by `without`.
  # NOTE(review): assumes the counter is split by a `backup_type` label —
  # confirm; the aggregation is harmless if the label is absent.
  expr: sum without (backup_type) (dbbackup_backup_total{status="success"}) == 0
  # The condition has to hold for 1 hour before the alert fires.
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: 'No successful backups for {{ $labels.database }}'
    description: |
      Database {{ $labels.database }} on {{ $labels.server }} has never
      had a successful backup. This requires immediate attention.
    runbook_url: 'https://github.com/your-org/dbbackup/wiki/Runbooks#never-succeeded'