Compare commits
5 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 79f2efaaac | |
| | 19f44749b1 | |
| | c7904c7857 | |
| | 1747365d0d | |
| | 8cf107b8d4 | |
3 .gitignore (vendored)

@@ -37,3 +37,6 @@ CRITICAL_BUGS_FIXED.md
LEGAL_DOCUMENTATION.md
LEGAL_*.md
legal/

# Release binaries (uploaded via gh release, not git)
release/dbbackup_*
46 CHANGELOG.md

@@ -5,6 +5,52 @@ All notable changes to dbbackup will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [4.1.2] - 2026-01-27

### Added
- **`--socket` flag for MySQL/MariaDB** - Connect via Unix socket instead of TCP/IP
  - Usage: `dbbackup backup single mydb --db-type mysql --socket /var/run/mysqld/mysqld.sock`
  - Works for both backup and restore operations
  - Supports socket auth (no password required with proper permissions)

### Fixed
- **Socket path as --host now works** - If `--host` starts with `/`, it is auto-detected as a socket path
  - Example: `--host /var/run/mysqld/mysqld.sock` now works correctly instead of failing with a DNS lookup error
  - Auto-converts to `--socket` internally
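Aside (not part of CHANGELOG.md): from Go, the Unix-socket connection this flag enables looks roughly like the sketch below, using `database/sql` with the `go-sql-driver/mysql` driver. The socket path, user, and database name are placeholders; with the server's socket-authentication plugin enabled, a matching OS user needs no password.

```go
package main

import (
    "database/sql"
    "fmt"
    "log"

    _ "github.com/go-sql-driver/mysql" // MySQL/MariaDB driver
)

func main() {
    // unix(<path>) selects a socket connection instead of tcp(host:port).
    dsn := "backup@unix(/var/run/mysqld/mysqld.sock)/mydb"

    db, err := sql.Open("mysql", dsn)
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    // Ping forces an actual connection, so socket auth is exercised here.
    if err := db.Ping(); err != nil {
        log.Fatal(err)
    }
    fmt.Println("connected via unix socket")
}
```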
## [4.1.1] - 2026-01-25

### Added
- **`dbbackup_build_info` metric** - Exposes version and git commit as Prometheus labels
  - Useful for tracking deployed versions across a fleet
  - Labels: `server`, `version`, `commit`

### Fixed
- **Documentation clarification**: The `pitr_base` value for the `backup_type` label is auto-assigned
  by the `dbbackup pitr base` command. The CLI `--backup-type` flag only accepts `full` or `incremental`.
  This was causing confusion in deployments.

## [4.1.0] - 2026-01-25

### Added
- **Backup Type Tracking**: All backup metrics now include a `backup_type` label
  (`full`, `incremental`, or `pitr_base` for PITR base backups)
- **PITR Metrics**: Complete Point-in-Time Recovery monitoring
  - `dbbackup_pitr_enabled` - Whether PITR is enabled (1/0)
  - `dbbackup_pitr_archive_lag_seconds` - Seconds since last WAL/binlog archived
  - `dbbackup_pitr_chain_valid` - WAL/binlog chain integrity (1=valid)
  - `dbbackup_pitr_gap_count` - Number of gaps in archive chain
  - `dbbackup_pitr_archive_count` - Total archived segments
  - `dbbackup_pitr_archive_size_bytes` - Total archive storage
  - `dbbackup_pitr_recovery_window_minutes` - Estimated PITR coverage
- **PITR Alerting Rules**: 6 new alerts for PITR monitoring
  - PITRArchiveLag, PITRChainBroken, PITRGapsDetected, PITRArchiveStalled,
    PITRStorageGrowing, PITRDisabledUnexpectedly
- **`dbbackup_backup_by_type` metric** - Count of backups by type

### Changed
- `dbbackup_backup_total` type changed from counter to gauge for snapshot-based collection

## [3.42.110] - 2026-01-24

### Improved - Code Quality & Testing
3 QUICK.md

@@ -14,6 +14,9 @@ dbbackup backup single myapp

# MySQL
dbbackup backup single gitea --db-type mysql --host 127.0.0.1 --port 3306

# MySQL/MariaDB with Unix socket
dbbackup backup single myapp --db-type mysql --socket /var/run/mysqld/mysqld.sock

# With compression level (0-9, default 6)
dbbackup backup cluster --compression 9
18 README.md

@@ -1,19 +1,21 @@
```
██╗ ██╗ ██████╗
██║ ██║ ██╔═████╗
███████║ ██║██╔██║
╚════██║ ████╔╝██║
██║██╗╚██████╔╝
╚═╝╚═╝ ╚═════╝
_ _ _ _
| | | | | | |
__| | |__ | |__ __ _ ___| | ___ _ _ __
/ _` | '_ \| '_ \ / _` |/ __| |/ / | | | '_ \
| (_| | |_) | |_) | (_| | (__| <| |_| | |_) |
\__,_|_.__/|_.__/ \__,_|\___|_|\_\\__,_| .__/
| |
|_|
```

# dbbackup v4.0.0
# dbbackup

Database backup and restore utility for PostgreSQL, MySQL, and MariaDB.

[](https://opensource.org/licenses/Apache-2.0)
[](https://golang.org/)
[](https://github.com/PlusOne/dbbackup/releases/tag/v4.0.0)
[](https://github.com/PlusOne/dbbackup/releases/latest)

**Repository:** https://git.uuxo.net/UUXO/dbbackup
**Mirror:** https://github.com/PlusOne/dbbackup
@@ -127,8 +127,8 @@ func runMetricsExport(ctx context.Context) error {
    }
    defer cat.Close()

    // Create metrics writer
    writer := prometheus.NewMetricsWriter(log, cat, server)
    // Create metrics writer with version info
    writer := prometheus.NewMetricsWriterWithVersion(log, cat, server, cfg.Version, cfg.GitCommit)

    // Write textfile
    if err := writer.WriteTextfile(metricsOutput); err != nil {
@@ -162,8 +162,8 @@ func runMetricsServe(ctx context.Context) error {
    }
    defer cat.Close()

    // Create exporter
    exporter := prometheus.NewExporter(log, cat, server, metricsPort)
    // Create exporter with version info
    exporter := prometheus.NewExporterWithVersion(log, cat, server, metricsPort, cfg.Version, cfg.GitCommit)

    // Run server (blocks until context is cancelled)
    return exporter.Serve(ctx)
@@ -3,6 +3,7 @@ package cmd

import (
    "context"
    "fmt"
    "strings"

    "dbbackup/internal/config"
    "dbbackup/internal/logger"
@@ -107,6 +108,12 @@ For help with specific commands, use: dbbackup [command] --help`,
            }
        }

        // Auto-detect socket from --host path (if host starts with /)
        if strings.HasPrefix(cfg.Host, "/") && cfg.Socket == "" {
            cfg.Socket = cfg.Host
            cfg.Host = "localhost" // Reset host for socket connections
        }

        return cfg.SetDatabaseType(cfg.DatabaseType)
    },
}
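A quick way to pin this auto-detection down is a table-driven test. The sketch below is standalone: it re-implements the two relevant fields and the detection rule locally rather than importing `dbbackup/internal/config`, so treat it as an illustration, not project code.

```go
package cmd_test

import (
    "strings"
    "testing"
)

// minimal stand-in for the real config; only the two fields
// involved in the auto-detection are modelled here.
type sockConf struct{ Host, Socket string }

// autoDetectSocket applies the same rule as the diff above: a --host value
// that looks like a filesystem path is treated as a Unix socket path.
func autoDetectSocket(c *sockConf) {
    if strings.HasPrefix(c.Host, "/") && c.Socket == "" {
        c.Socket = c.Host
        c.Host = "localhost"
    }
}

func TestAutoDetectSocket(t *testing.T) {
    cases := []struct {
        name     string
        in, want sockConf
    }{
        {"host is a socket path", sockConf{Host: "/var/run/mysqld/mysqld.sock"}, sockConf{Host: "localhost", Socket: "/var/run/mysqld/mysqld.sock"}},
        {"explicit socket wins", sockConf{Host: "/tmp/x.sock", Socket: "/run/mysqld/mysqld.sock"}, sockConf{Host: "/tmp/x.sock", Socket: "/run/mysqld/mysqld.sock"}},
        {"tcp host untouched", sockConf{Host: "127.0.0.1"}, sockConf{Host: "127.0.0.1"}},
    }
    for _, tc := range cases {
        got := tc.in
        autoDetectSocket(&got)
        if got != tc.want {
            t.Errorf("%s: got %+v, want %+v", tc.name, got, tc.want)
        }
    }
}
```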
@@ -136,6 +143,7 @@ func Execute(ctx context.Context, config *config.Config, logger logger.Logger) e
    // Add persistent flags
    rootCmd.PersistentFlags().StringVar(&cfg.Host, "host", cfg.Host, "Database host")
    rootCmd.PersistentFlags().IntVar(&cfg.Port, "port", cfg.Port, "Database port")
    rootCmd.PersistentFlags().StringVar(&cfg.Socket, "socket", cfg.Socket, "Unix socket path for MySQL/MariaDB (e.g., /var/run/mysqld/mysqld.sock)")
    rootCmd.PersistentFlags().StringVar(&cfg.User, "user", cfg.User, "Database user")
    rootCmd.PersistentFlags().StringVar(&cfg.Database, "database", cfg.Database, "Database name")
    rootCmd.PersistentFlags().StringVar(&cfg.Password, "password", cfg.Password, "Database password")
@@ -90,6 +90,53 @@ groups:
          summary: "Backup not verified for {{ $labels.database }}"
          description: "Last backup was not verified. Run dbbackup verify to check integrity."

      # PITR Alerts
      - alert: DBBackupPITRArchiveLag
        expr: dbbackup_pitr_archive_lag_seconds > 600
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PITR archive lag on {{ $labels.server }}"
          description: "WAL/binlog archiving for {{ $labels.database }} is {{ $value | humanizeDuration }} behind."

      - alert: DBBackupPITRArchiveCritical
        expr: dbbackup_pitr_archive_lag_seconds > 1800
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PITR archive critically behind on {{ $labels.server }}"
          description: "WAL/binlog archiving for {{ $labels.database }} is {{ $value | humanizeDuration }} behind. PITR capability at risk!"

      - alert: DBBackupPITRChainBroken
        expr: dbbackup_pitr_chain_valid == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PITR chain broken for {{ $labels.database }}"
          description: "WAL/binlog chain has gaps. Point-in-time recovery NOT possible. New base backup required."

      - alert: DBBackupPITRGaps
        expr: dbbackup_pitr_gap_count > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PITR chain gaps for {{ $labels.database }}"
          description: "{{ $value }} gaps in WAL/binlog chain. Recovery to points within gaps will fail."

      # Backup Type Alerts
      - alert: DBBackupNoRecentFull
        expr: time() - dbbackup_last_success_timestamp{backup_type="full"} > 604800
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "No full backup in 7+ days for {{ $labels.database }}"
          description: "Consider taking a full backup. Incremental chains depend on valid base."

      # Exporter Health
      - alert: DBBackupExporterDown
        expr: up{job="dbbackup"} == 0
111 docs/EXPORTER.md

@@ -2,6 +2,33 @@

This document provides a complete reference for the DBBackup Prometheus exporter, including all exported metrics, setup instructions, and Grafana dashboard configuration.

## What's New (January 2026)

### New Features
- **Backup Type Tracking**: All backup metrics now include a `backup_type` label (`full`, `incremental`, or `pitr_base` for PITR base backups)
  - **Note**: The CLI `--backup-type` flag only accepts `full` or `incremental`. The `pitr_base` label is auto-assigned when using `dbbackup pitr base`.
- **PITR Metrics**: Complete Point-in-Time Recovery monitoring for PostgreSQL WAL and MySQL binlog archiving
- **New Alerts**: PITR-specific alerts for archive lag, chain integrity, and gap detection

### New Metrics Added

| Metric | Description |
|--------|-------------|
| `dbbackup_build_info` | Build info with version and commit labels |
| `dbbackup_backup_by_type` | Count of backups by type (full/incremental/pitr_base) |
| `dbbackup_pitr_enabled` | Whether PITR is enabled (1/0) |
| `dbbackup_pitr_archive_lag_seconds` | Seconds since last WAL/binlog archived |
| `dbbackup_pitr_chain_valid` | WAL/binlog chain integrity (1=valid) |
| `dbbackup_pitr_gap_count` | Number of gaps in archive chain |
| `dbbackup_pitr_archive_count` | Total archived segments |
| `dbbackup_pitr_archive_size_bytes` | Total archive storage |
| `dbbackup_pitr_recovery_window_minutes` | Estimated PITR coverage |

### Label Changes
- `backup_type` label added to: `dbbackup_rpo_seconds`, `dbbackup_last_success_timestamp`, `dbbackup_last_backup_duration_seconds`, `dbbackup_last_backup_size_bytes`
- `dbbackup_backup_total` type changed from counter to gauge (more accurate for snapshot-based collection)
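If you want to consume the exporter programmatically rather than through Prometheus, the output can be parsed with the standard exposition-format parser. The sketch below is illustrative only: the port and the `/metrics` path are assumptions about a local setup, and `github.com/prometheus/common/expfmt` is the upstream Prometheus library, not part of dbbackup.

```go
package main

import (
    "fmt"
    "log"
    "net/http"
    "strings"

    "github.com/prometheus/common/expfmt"
)

func main() {
    // Scrape the exporter endpoint (port and path are assumptions).
    resp, err := http.Get("http://localhost:9399/metrics")
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()

    // Parse the text exposition format into metric families.
    var parser expfmt.TextParser
    families, err := parser.TextToMetricFamilies(resp.Body)
    if err != nil {
        log.Fatal(err)
    }

    // Print every dbbackup_pitr_* series with its labels and gauge value.
    for name, mf := range families {
        if !strings.HasPrefix(name, "dbbackup_pitr_") {
            continue
        }
        for _, m := range mf.GetMetric() {
            labels := make([]string, 0, len(m.GetLabel()))
            for _, lp := range m.GetLabel() {
                labels = append(labels, lp.GetName()+"="+lp.GetValue())
            }
            fmt.Printf("%s{%s} %v\n", name, strings.Join(labels, ","), m.GetGauge().GetValue())
        }
    }
}
```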
---

## Table of Contents

- [Quick Start](#quick-start)
@@ -112,14 +139,29 @@ All metrics use the `dbbackup_` prefix. Below is the **validated** list of metrics

| Metric Name | Type | Labels | Description |
|-------------|------|--------|-------------|
| `dbbackup_last_success_timestamp` | gauge | `server`, `database`, `engine` | Unix timestamp of last successful backup |
| `dbbackup_last_backup_duration_seconds` | gauge | `server`, `database`, `engine` | Duration of last successful backup in seconds |
| `dbbackup_last_backup_size_bytes` | gauge | `server`, `database`, `engine` | Size of last successful backup in bytes |
| `dbbackup_backup_total` | counter | `server`, `database`, `status` | Total backup attempts (status: `success` or `failure`) |
| `dbbackup_rpo_seconds` | gauge | `server`, `database` | Seconds since last successful backup (RPO) |
| `dbbackup_last_success_timestamp` | gauge | `server`, `database`, `engine`, `backup_type` | Unix timestamp of last successful backup |
| `dbbackup_last_backup_duration_seconds` | gauge | `server`, `database`, `engine`, `backup_type` | Duration of last successful backup in seconds |
| `dbbackup_last_backup_size_bytes` | gauge | `server`, `database`, `engine`, `backup_type` | Size of last successful backup in bytes |
| `dbbackup_backup_total` | gauge | `server`, `database`, `status` | Total backup attempts (status: `success` or `failure`) |
| `dbbackup_backup_by_type` | gauge | `server`, `database`, `backup_type` | Backup count by type (`full`, `incremental`, `pitr_base`) |
| `dbbackup_rpo_seconds` | gauge | `server`, `database`, `backup_type` | Seconds since last successful backup (RPO) |
| `dbbackup_backup_verified` | gauge | `server`, `database` | Whether last backup was verified (1=yes, 0=no) |
| `dbbackup_scrape_timestamp` | gauge | `server` | Unix timestamp when metrics were collected |

### PITR (Point-in-Time Recovery) Metrics

| Metric Name | Type | Labels | Description |
|-------------|------|--------|-------------|
| `dbbackup_pitr_enabled` | gauge | `server`, `database`, `engine` | Whether PITR is enabled (1=yes, 0=no) |
| `dbbackup_pitr_last_archived_timestamp` | gauge | `server`, `database`, `engine` | Unix timestamp of last archived WAL/binlog |
| `dbbackup_pitr_archive_lag_seconds` | gauge | `server`, `database`, `engine` | Seconds since last archive (lower is better) |
| `dbbackup_pitr_archive_count` | gauge | `server`, `database`, `engine` | Total archived WAL segments or binlog files |
| `dbbackup_pitr_archive_size_bytes` | gauge | `server`, `database`, `engine` | Total size of archived logs in bytes |
| `dbbackup_pitr_chain_valid` | gauge | `server`, `database`, `engine` | Whether archive chain is valid (1=yes, 0=gaps) |
| `dbbackup_pitr_gap_count` | gauge | `server`, `database`, `engine` | Number of gaps in archive chain |
| `dbbackup_pitr_recovery_window_minutes` | gauge | `server`, `database`, `engine` | Estimated PITR coverage window in minutes |
| `dbbackup_pitr_scrape_timestamp` | gauge | `server` | PITR metrics collection timestamp |
### Deduplication Metrics

| Metric Name | Type | Labels | Description |
@@ -155,34 +197,55 @@ All metrics use the `dbbackup_` prefix. Below is the **validated** list of metrics

```prometheus
# DBBackup Prometheus Metrics
# Generated at: 2026-01-26T10:30:00Z
# Generated at: 2026-01-27T10:30:00Z
# Server: production

# HELP dbbackup_last_success_timestamp Unix timestamp of last successful backup
# TYPE dbbackup_last_success_timestamp gauge
dbbackup_last_success_timestamp{server="production",database="myapp",engine="postgres"} 1737884600
dbbackup_last_success_timestamp{server="production",database="myapp",engine="postgres",backup_type="full"} 1737884600

# HELP dbbackup_last_backup_duration_seconds Duration of last successful backup in seconds
# TYPE dbbackup_last_backup_duration_seconds gauge
dbbackup_last_backup_duration_seconds{server="production",database="myapp",engine="postgres"} 125.50
dbbackup_last_backup_duration_seconds{server="production",database="myapp",engine="postgres",backup_type="full"} 125.50

# HELP dbbackup_last_backup_size_bytes Size of last successful backup in bytes
# TYPE dbbackup_last_backup_size_bytes gauge
dbbackup_last_backup_size_bytes{server="production",database="myapp",engine="postgres"} 1073741824
dbbackup_last_backup_size_bytes{server="production",database="myapp",engine="postgres",backup_type="full"} 1073741824

# HELP dbbackup_backup_total Total number of backup attempts
# TYPE dbbackup_backup_total counter
# HELP dbbackup_backup_total Total number of backup attempts by type and status
# TYPE dbbackup_backup_total gauge
dbbackup_backup_total{server="production",database="myapp",status="success"} 42
dbbackup_backup_total{server="production",database="myapp",status="failure"} 2

# HELP dbbackup_backup_by_type Total number of backups by backup type
# TYPE dbbackup_backup_by_type gauge
dbbackup_backup_by_type{server="production",database="myapp",backup_type="full"} 30
dbbackup_backup_by_type{server="production",database="myapp",backup_type="incremental"} 12

# HELP dbbackup_rpo_seconds Recovery Point Objective - seconds since last successful backup
# TYPE dbbackup_rpo_seconds gauge
dbbackup_rpo_seconds{server="production",database="myapp"} 3600
dbbackup_rpo_seconds{server="production",database="myapp",backup_type="full"} 3600

# HELP dbbackup_backup_verified Whether the last backup was verified (1=yes, 0=no)
# TYPE dbbackup_backup_verified gauge
dbbackup_backup_verified{server="production",database="myapp"} 1

# HELP dbbackup_pitr_enabled Whether PITR is enabled for database (1=enabled, 0=disabled)
# TYPE dbbackup_pitr_enabled gauge
dbbackup_pitr_enabled{server="production",database="myapp",engine="postgres"} 1

# HELP dbbackup_pitr_archive_lag_seconds Seconds since last WAL/binlog was archived
# TYPE dbbackup_pitr_archive_lag_seconds gauge
dbbackup_pitr_archive_lag_seconds{server="production",database="myapp",engine="postgres"} 45

# HELP dbbackup_pitr_chain_valid Whether the WAL/binlog chain is valid (1=valid, 0=gaps detected)
# TYPE dbbackup_pitr_chain_valid gauge
dbbackup_pitr_chain_valid{server="production",database="myapp",engine="postgres"} 1

# HELP dbbackup_pitr_recovery_window_minutes Estimated recovery window in minutes
# TYPE dbbackup_pitr_recovery_window_minutes gauge
dbbackup_pitr_recovery_window_minutes{server="production",database="myapp",engine="postgres"} 10080

# HELP dbbackup_dedup_ratio Deduplication ratio (0-1, higher is better)
# TYPE dbbackup_dedup_ratio gauge
dbbackup_dedup_ratio{server="production"} 0.6500
@@ -301,6 +364,7 @@ The dashboard includes the following panels:

Import `deploy/prometheus/alerting-rules.yaml` into Prometheus/Alertmanager.

#### Backup Status Alerts
| Alert | Expression | Severity | Description |
|-------|------------|----------|-------------|
| `DBBackupRPOWarning` | `dbbackup_rpo_seconds > 43200` | warning | No backup for 12+ hours |
@@ -311,6 +375,20 @@ Import `deploy/prometheus/alerting-rules.yaml` into Prometheus/Alertmanager.
| `DBBackupSizeZero` | `dbbackup_last_backup_size_bytes == 0` | critical | Empty backup file |
| `DBBackupDurationHigh` | `dbbackup_last_backup_duration_seconds > 3600` | warning | Backup taking > 1 hour |
| `DBBackupNotVerified` | `dbbackup_backup_verified == 0` for 24h | warning | Backup not verified |
| `DBBackupNoRecentFull` | No full backup in 7+ days | warning | Need full backup for incremental chain |

#### PITR Alerts (New)
| Alert | Expression | Severity | Description |
|-------|------------|----------|-------------|
| `DBBackupPITRArchiveLag` | `dbbackup_pitr_archive_lag_seconds > 600` | warning | Archive 10+ min behind |
| `DBBackupPITRArchiveCritical` | `dbbackup_pitr_archive_lag_seconds > 1800` | critical | Archive 30+ min behind |
| `DBBackupPITRChainBroken` | `dbbackup_pitr_chain_valid == 0` | critical | Gaps in WAL/binlog chain |
| `DBBackupPITRGaps` | `dbbackup_pitr_gap_count > 0` | warning | Gaps detected in archive chain |
| `DBBackupPITRDisabled` | PITR unexpectedly disabled | critical | PITR was enabled but now off |

#### Infrastructure Alerts
| Alert | Expression | Severity | Description |
|-------|------------|----------|-------------|
| `DBBackupExporterDown` | `up{job="dbbackup"} == 0` | critical | Exporter unreachable |
| `DBBackupDedupRatioLow` | `dbbackup_dedup_ratio < 0.2` for 24h | info | Low dedup efficiency |
| `DBBackupStorageHigh` | `dbbackup_dedup_disk_usage_bytes > 1TB` | warning | High storage usage |
@@ -329,6 +407,15 @@ groups:
      annotations:
        summary: "No backup for {{ $labels.database }} in 24+ hours"
        description: "RPO violation on {{ $labels.server }}. Last backup: {{ $value | humanizeDuration }} ago."

      - alert: DBBackupPITRChainBroken
        expr: dbbackup_pitr_chain_valid == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PITR chain broken for {{ $labels.database }}"
          description: "WAL/binlog chain has gaps. Point-in-time recovery is NOT possible. New base backup required."
```

---
161 docs/METRICS.md

@@ -6,7 +6,7 @@ This document describes all Prometheus metrics exposed by DBBackup for monitoring

### `dbbackup_rpo_seconds`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`
**Labels:** `server`, `database`, `backup_type`
**Description:** Time in seconds since the last successful backup (Recovery Point Objective).

**Recommended Thresholds:**
@@ -17,19 +17,45 @@ This document describes all Prometheus metrics exposed by DBBackup for monitoring
**Example Query:**
```promql
dbbackup_rpo_seconds{server="prod-db-01"} > 86400

# RPO by backup type
dbbackup_rpo_seconds{backup_type="full"}
dbbackup_rpo_seconds{backup_type="incremental"}
```

---

### `dbbackup_backup_total`
**Type:** Counter
**Labels:** `server`, `database`, `engine`, `status`
**Type:** Gauge
**Labels:** `server`, `database`, `status`
**Description:** Total count of backup attempts, labeled by status (`success` or `failure`).

**Example Query:**
```promql
# Failure rate over last hour
rate(dbbackup_backup_total{status="failure"}[1h])
# Total successful backups
dbbackup_backup_total{status="success"}
```

---

### `dbbackup_backup_by_type`
**Type:** Gauge
**Labels:** `server`, `database`, `backup_type`
**Description:** Total count of backups by backup type (`full`, `incremental`, `pitr_base`).

> **Note:** The `backup_type` label values are:
> - `full` - Created with `--backup-type full` (default)
> - `incremental` - Created with `--backup-type incremental`
> - `pitr_base` - Auto-assigned when using the `dbbackup pitr base` command
>
> The CLI `--backup-type` flag only accepts `full` or `incremental`.

**Example Query:**
```promql
# Count of each backup type
dbbackup_backup_by_type{backup_type="full"}
dbbackup_backup_by_type{backup_type="incremental"}
dbbackup_backup_by_type{backup_type="pitr_base"}
```

---
@@ -43,24 +69,115 @@ rate(dbbackup_backup_total{status="failure"}[1h])

### `dbbackup_last_backup_size_bytes`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`
**Labels:** `server`, `database`, `engine`, `backup_type`
**Description:** Size of the last successful backup in bytes.

**Example Query:**
```promql
# Total backup storage across all databases
sum(dbbackup_last_backup_size_bytes)

# Size by backup type
dbbackup_last_backup_size_bytes{backup_type="full"}
```

---

### `dbbackup_last_backup_duration_seconds`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`
**Labels:** `server`, `database`, `engine`, `backup_type`
**Description:** Duration of the last backup operation in seconds.

---

### `dbbackup_last_success_timestamp`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`, `backup_type`
**Description:** Unix timestamp of the last successful backup.

---

## PITR (Point-in-Time Recovery) Metrics

### `dbbackup_pitr_enabled`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`
**Description:** Whether PITR is enabled for the database (1 = enabled, 0 = disabled).

**Example Query:**
```promql
# Check if PITR is enabled
dbbackup_pitr_enabled{database="production"} == 1
```

---

### `dbbackup_pitr_last_archived_timestamp`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`
**Description:** Unix timestamp of the last archived WAL segment (PostgreSQL) or binlog file (MySQL).

---

### `dbbackup_pitr_archive_lag_seconds`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`
**Description:** Seconds since the last WAL/binlog was archived. High values indicate archiving issues.

**Recommended Thresholds:**
- Green: < 300 (5 minutes)
- Yellow: 300-600 (5-10 minutes)
- Red: > 600 (10+ minutes)

**Example Query:**
```promql
# Alert on high archive lag
dbbackup_pitr_archive_lag_seconds > 600
```

---

### `dbbackup_pitr_archive_count`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`
**Description:** Total number of archived WAL segments or binlog files.

---

### `dbbackup_pitr_archive_size_bytes`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`
**Description:** Total size of archived logs in bytes.

---

### `dbbackup_pitr_chain_valid`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`
**Description:** Whether the WAL/binlog chain is valid (1 = valid, 0 = gaps detected).

**Example Query:**
```promql
# Alert on broken chain
dbbackup_pitr_chain_valid == 0
```

---

### `dbbackup_pitr_gap_count`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`
**Description:** Number of gaps detected in the WAL/binlog chain. Any value > 0 requires investigation.
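To make "gap" concrete: archived PostgreSQL WAL segments (and MySQL binlog files) carry monotonically increasing sequence numbers, so a gap is simply a missing value in that sequence. The sketch below is a simplified, standalone illustration of counting gaps over already-extracted sequence numbers; dbbackup's real chain validation is more involved (timelines, partial segments, compression suffixes).

```go
package main

import (
    "fmt"
    "sort"
)

// countGaps returns how many breaks exist in an otherwise
// contiguous run of archive sequence numbers.
func countGaps(seqs []uint64) int {
    if len(seqs) < 2 {
        return 0
    }
    sorted := append([]uint64(nil), seqs...)
    sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] })

    gaps := 0
    for i := 1; i < len(sorted); i++ {
        if sorted[i] > sorted[i-1]+1 { // one or more segments missing in between
            gaps++
        }
    }
    return gaps
}

func main() {
    // Segments 103 and 104 are missing: one gap, so the chain is invalid
    // and dbbackup_pitr_chain_valid would report 0 for this database.
    fmt.Println(countGaps([]uint64{100, 101, 102, 105, 106})) // prints 1
}
```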
---

### `dbbackup_pitr_recovery_window_minutes`
**Type:** Gauge
**Labels:** `server`, `database`, `engine`
**Description:** Estimated recovery window in minutes - the time span covered by archived logs.
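How the estimate is derived is not spelled out here; a reasonable reading (an assumption about the computation, not a statement about dbbackup's implementation) is "newest archived segment minus oldest archived segment", as sketched below.

```go
package main

import (
    "fmt"
    "time"
)

// recoveryWindowMinutes estimates PITR coverage as the span between the
// oldest and newest archive timestamps. Illustration only; the real metric
// may also account for the base backup time and for gaps in the chain.
func recoveryWindowMinutes(oldest, newest time.Time) float64 {
    if oldest.IsZero() || newest.Before(oldest) {
        return 0
    }
    return newest.Sub(oldest).Minutes()
}

func main() {
    oldest := time.Now().Add(-7 * 24 * time.Hour) // first retained WAL/binlog
    newest := time.Now()                          // most recently archived segment
    // ~10080 minutes, matching the example output shown in docs/EXPORTER.md
    fmt.Printf("%.0f minutes\n", recoveryWindowMinutes(oldest, newest))
}
```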
---

## Deduplication Metrics

### `dbbackup_dedup_ratio`
@@ -119,6 +236,32 @@ sum(dbbackup_last_backup_size_bytes)

---

## Build Information Metrics

### `dbbackup_build_info`
**Type:** Gauge
**Labels:** `server`, `version`, `commit`
**Description:** Build information for the dbbackup exporter. Value is always 1.

This metric is useful for:
- Tracking which version is deployed across your fleet
- Alerting when versions drift between servers
- Correlating behavior changes with deployments

**Example Queries:**
```promql
# Show all deployed versions
group by (version) (dbbackup_build_info)

# Find servers not on latest version
dbbackup_build_info{version!="4.1.1"}

# Alert on version drift
count(count by (version) (dbbackup_build_info)) > 1
```

---

## Alerting Rules

See [alerting-rules.yaml](../grafana/alerting-rules.yaml) for pre-configured Prometheus alerting rules.
@@ -131,6 +274,10 @@ See [alerting-rules.yaml](../grafana/alerting-rules.yaml) for pre-configured Prometheus alerting rules.
| BackupFailed | `increase(dbbackup_backup_total{status="failure"}[1h]) > 0` | Warning |
| BackupNotVerified | `dbbackup_backup_verified == 0` | Warning |
| DedupDegraded | `dbbackup_dedup_ratio < 0.1` | Info |
| PITRArchiveLag | `dbbackup_pitr_archive_lag_seconds > 600` | Warning |
| PITRChainBroken | `dbbackup_pitr_chain_valid == 0` | Critical |
| PITRDisabled | `dbbackup_pitr_enabled == 0` (unexpected) | Critical |
| NoIncrementalBackups | `dbbackup_backup_by_type{backup_type="incremental"} == 0` for 7d | Info |

---
@@ -96,6 +96,90 @@ groups:
            Current usage: {{ $value | humanize1024 }}B
          runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#storage-growth"

      # PITR: Archive lag high
      - alert: DBBackupPITRArchiveLag
        expr: dbbackup_pitr_archive_lag_seconds > 600
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PITR archive lag high for {{ $labels.database }}"
          description: |
            WAL/binlog archiving for {{ $labels.database }} on {{ $labels.server }}
            is {{ $value | humanizeDuration }} behind. This reduces the PITR
            recovery point. Check archive process and disk space.
          runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-archive-lag"

      # PITR: Archive lag critical
      - alert: DBBackupPITRArchiveLagCritical
        expr: dbbackup_pitr_archive_lag_seconds > 1800
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PITR archive severely behind for {{ $labels.database }}"
          description: |
            WAL/binlog archiving for {{ $labels.database }} is {{ $value | humanizeDuration }}
            behind. Point-in-time recovery capability is at risk. Immediate action required.
          runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-archive-critical"

      # PITR: Chain broken (gaps detected)
      - alert: DBBackupPITRChainBroken
        expr: dbbackup_pitr_chain_valid == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PITR chain broken for {{ $labels.database }}"
          description: |
            The WAL/binlog chain for {{ $labels.database }} on {{ $labels.server }}
            has gaps. Point-in-time recovery to arbitrary points is NOT possible.
            A new base backup is required to restore PITR capability.
          runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-chain-broken"

      # PITR: Gaps in chain
      - alert: DBBackupPITRGapsDetected
        expr: dbbackup_pitr_gap_count > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PITR chain has {{ $value }} gaps for {{ $labels.database }}"
          description: |
            {{ $value }} gaps detected in WAL/binlog chain for {{ $labels.database }}.
            Recovery to points within gaps will fail. Consider taking a new base backup.
          runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-gaps"

      # PITR: Unexpectedly disabled
      - alert: DBBackupPITRDisabled
        expr: |
          dbbackup_pitr_enabled == 0
          and on(database) dbbackup_pitr_archive_count > 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "PITR unexpectedly disabled for {{ $labels.database }}"
          description: |
            PITR was previously enabled for {{ $labels.database }} (has archived logs)
            but is now disabled. This may indicate a configuration issue or
            database restart without PITR settings.
          runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-disabled"

      # Backup type: No full backups recently
      - alert: DBBackupNoRecentFullBackup
        expr: |
          time() - dbbackup_last_success_timestamp{backup_type="full"} > 604800
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "No full backup in 7+ days for {{ $labels.database }}"
          description: |
            Database {{ $labels.database }} has not had a full backup in over 7 days.
            Incremental backups depend on a valid full backup base.
          runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#no-full-backup"

      # Info: Exporter not responding
      - alert: DBBackupExporterDown
        expr: up{job="dbbackup"} == 0
@@ -23,6 +23,7 @@ type Config struct {
    User         string
    Database     string
    Password     string
    Socket       string // Unix socket path for MySQL/MariaDB
    DatabaseType string // "postgres" or "mysql"
    SSLMode      string
    Insecure     bool
@@ -278,8 +278,12 @@ func (m *MySQL) GetTableRowCount(ctx context.Context, database, table string) (i
func (m *MySQL) BuildBackupCommand(database, outputFile string, options BackupOptions) []string {
    cmd := []string{"mysqldump"}

    // Connection parameters - handle localhost vs remote differently
    if m.cfg.Host == "" || m.cfg.Host == "localhost" {
    // Connection parameters - socket takes priority, then localhost vs remote
    if m.cfg.Socket != "" {
        // Explicit socket path provided
        cmd = append(cmd, "-S", m.cfg.Socket)
        cmd = append(cmd, "-u", m.cfg.User)
    } else if m.cfg.Host == "" || m.cfg.Host == "localhost" {
        // For localhost, use socket connection (don't specify host/port)
        cmd = append(cmd, "-u", m.cfg.User)
    } else {
@@ -338,8 +342,12 @@ func (m *MySQL) BuildRestoreCommand(database, inputFile string, options RestoreOp
func (m *MySQL) BuildRestoreCommand(database, inputFile string, options RestoreOptions) []string {
    cmd := []string{"mysql"}

    // Connection parameters - handle localhost vs remote differently
    if m.cfg.Host == "" || m.cfg.Host == "localhost" {
    // Connection parameters - socket takes priority, then localhost vs remote
    if m.cfg.Socket != "" {
        // Explicit socket path provided
        cmd = append(cmd, "-S", m.cfg.Socket)
        cmd = append(cmd, "-u", m.cfg.User)
    } else if m.cfg.Host == "" || m.cfg.Host == "localhost" {
        // For localhost, use socket connection (don't specify host/port)
        cmd = append(cmd, "-u", m.cfg.User)
    } else {
@@ -417,8 +425,11 @@ func (m *MySQL) buildDSN() string {

    dsn += "@"

    // Handle localhost with Unix socket vs TCP/IP
    if m.cfg.Host == "" || m.cfg.Host == "localhost" {
    // Explicit socket takes priority
    if m.cfg.Socket != "" {
        dsn += "unix(" + m.cfg.Socket + ")"
    } else if m.cfg.Host == "" || m.cfg.Host == "localhost" {
        // Handle localhost with Unix socket vs TCP/IP
        // Try common socket paths for localhost connections
        socketPaths := []string{
            "/run/mysqld/mysqld.sock",
@@ -345,8 +345,10 @@ func (e *MySQLDumpEngine) Restore(ctx context.Context, opts *RestoreOptions) err
    // Build mysql command
    args := []string{}

    // Connection parameters
    if e.config.Host != "" && e.config.Host != "localhost" {
    // Connection parameters - socket takes priority over host
    if e.config.Socket != "" {
        args = append(args, "-S", e.config.Socket)
    } else if e.config.Host != "" && e.config.Host != "localhost" {
        args = append(args, "-h", e.config.Host)
        args = append(args, "-P", strconv.Itoa(e.config.Port))
    }
@@ -494,8 +496,10 @@ func (e *MySQLDumpEngine) BackupToWriter(ctx context.Context, w io.Writer, opts
func (e *MySQLDumpEngine) buildArgs(database string) []string {
    args := []string{}

    // Connection parameters
    if e.config.Host != "" && e.config.Host != "localhost" {
    // Connection parameters - socket takes priority over host
    if e.config.Socket != "" {
        args = append(args, "-S", e.config.Socket)
    } else if e.config.Host != "" && e.config.Host != "localhost" {
        args = append(args, "-h", e.config.Host)
        args = append(args, "-P", strconv.Itoa(e.config.Port))
    }
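Taken together, the connection-argument precedence after this change is: explicit `--socket`, then a remote `--host`/`--port`, then nothing for localhost (letting the client tool pick its default socket). The standalone sketch below just prints the resulting connection arguments for those three cases; it re-implements the selection locally rather than calling dbbackup's code.

```go
package main

import (
    "fmt"
    "strconv"
)

type conn struct {
    Host   string
    Socket string
    Port   int
}

// connArgs mirrors the precedence used in the engine code above.
func connArgs(c conn) []string {
    switch {
    case c.Socket != "": // explicit socket wins
        return []string{"-S", c.Socket}
    case c.Host != "" && c.Host != "localhost": // remote host via TCP
        return []string{"-h", c.Host, "-P", strconv.Itoa(c.Port)}
    default: // localhost: no host args, the client uses its default socket
        return nil
    }
}

func main() {
    fmt.Println(connArgs(conn{Socket: "/var/run/mysqld/mysqld.sock"})) // [-S /var/run/mysqld/mysqld.sock]
    fmt.Println(connArgs(conn{Host: "db.example.com", Port: 3306}))    // [-h db.example.com -P 3306]
    fmt.Println(connArgs(conn{Host: "localhost"}))                     // []
}
```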
@@ -14,10 +14,12 @@ import (

// Exporter provides an HTTP endpoint for Prometheus metrics
type Exporter struct {
    log logger.Logger
    catalog catalog.Catalog
    instance string
    port int
    log       logger.Logger
    catalog   catalog.Catalog
    instance  string
    port      int
    version   string
    gitCommit string

    mu sync.RWMutex
    cachedData string
@@ -36,6 +38,19 @@ func NewExporter(log logger.Logger, cat catalog.Catalog, instance string, port i
    }
}

// NewExporterWithVersion creates a new Prometheus exporter with version info
func NewExporterWithVersion(log logger.Logger, cat catalog.Catalog, instance string, port int, version, gitCommit string) *Exporter {
    return &Exporter{
        log:        log,
        catalog:    cat,
        instance:   instance,
        port:       port,
        version:    version,
        gitCommit:  gitCommit,
        refreshTTL: 30 * time.Second,
    }
}

// Serve starts the HTTP server and blocks until context is cancelled
func (e *Exporter) Serve(ctx context.Context) error {
    mux := http.NewServeMux()
@@ -158,7 +173,7 @@ func (e *Exporter) refreshLoop(ctx context.Context) {

// refresh updates the cached metrics
func (e *Exporter) refresh() error {
    writer := NewMetricsWriter(e.log, e.catalog, e.instance)
    writer := NewMetricsWriterWithVersion(e.log, e.catalog, e.instance, e.version, e.gitCommit)
    data, err := writer.GenerateMetricsString()
    if err != nil {
        return err
@@ -16,17 +16,32 @@ import (

// MetricsWriter writes metrics in Prometheus text format
type MetricsWriter struct {
    log logger.Logger
    catalog catalog.Catalog
    instance string
    log       logger.Logger
    catalog   catalog.Catalog
    instance  string
    version   string
    gitCommit string
}

// NewMetricsWriter creates a new MetricsWriter
func NewMetricsWriter(log logger.Logger, cat catalog.Catalog, instance string) *MetricsWriter {
    return &MetricsWriter{
        log: log,
        catalog: cat,
        instance: instance,
        log:       log,
        catalog:   cat,
        instance:  instance,
        version:   "unknown",
        gitCommit: "unknown",
    }
}

// NewMetricsWriterWithVersion creates a MetricsWriter with version info for build_info metric
func NewMetricsWriterWithVersion(log logger.Logger, cat catalog.Catalog, instance, version, gitCommit string) *MetricsWriter {
    return &MetricsWriter{
        log:       log,
        catalog:   cat,
        instance:  instance,
        version:   version,
        gitCommit: gitCommit,
    }
}

@@ -42,6 +57,25 @@ type BackupMetrics struct {
    FailureCount int
    Verified     bool
    RPOSeconds   float64
    // Backup type tracking
    LastBackupType string // "full", "incremental", "pitr_base"
    FullCount      int    // Count of full backups
    IncrCount      int    // Count of incremental backups
    PITRBaseCount  int    // Count of PITR base backups
}

// PITRMetrics holds PITR-specific metrics for a database
type PITRMetrics struct {
    Database        string
    Engine          string
    Enabled         bool
    LastArchived    time.Time
    ArchiveLag      float64 // Seconds since last archive
    ArchiveCount    int
    ArchiveSize     int64
    ChainValid      bool
    GapCount        int
    RecoveryMinutes float64 // Estimated recovery window in minutes
}

// WriteTextfile writes metrics to a Prometheus textfile collector file
@@ -110,6 +144,20 @@ func (m *MetricsWriter) collectMetrics() ([]BackupMetrics, error) {

        metrics.TotalBackups++

        // Track backup type counts
        backupType := e.BackupType
        if backupType == "" {
            backupType = "full" // Default to full if not specified
        }
        switch backupType {
        case "full":
            metrics.FullCount++
        case "incremental":
            metrics.IncrCount++
        case "pitr_base", "pitr":
            metrics.PITRBaseCount++
        }

        isSuccess := e.Status == catalog.StatusCompleted || e.Status == catalog.StatusVerified
        if isSuccess {
            metrics.SuccessCount++
@@ -120,6 +168,7 @@ func (m *MetricsWriter) collectMetrics() ([]BackupMetrics, error) {
                metrics.LastSize = e.SizeBytes
                metrics.Verified = e.VerifiedAt != nil && e.VerifyValid != nil && *e.VerifyValid
                metrics.Engine = e.DatabaseType
                metrics.LastBackupType = backupType
            }
        } else {
            metrics.FailureCount++
@@ -159,13 +208,24 @@ func (m *MetricsWriter) formatMetrics(metrics []BackupMetrics) string {
    b.WriteString(fmt.Sprintf("# Server: %s\n", m.instance))
    b.WriteString("\n")

    // dbbackup_build_info - version and build information
    b.WriteString("# HELP dbbackup_build_info Build information for dbbackup exporter\n")
    b.WriteString("# TYPE dbbackup_build_info gauge\n")
    b.WriteString(fmt.Sprintf("dbbackup_build_info{server=%q,version=%q,commit=%q} 1\n",
        m.instance, m.version, m.gitCommit))
    b.WriteString("\n")

    // dbbackup_last_success_timestamp
    b.WriteString("# HELP dbbackup_last_success_timestamp Unix timestamp of last successful backup\n")
    b.WriteString("# TYPE dbbackup_last_success_timestamp gauge\n")
    for _, met := range metrics {
        if !met.LastSuccess.IsZero() {
            b.WriteString(fmt.Sprintf("dbbackup_last_success_timestamp{server=%q,database=%q,engine=%q} %d\n",
                m.instance, met.Database, met.Engine, met.LastSuccess.Unix()))
            backupType := met.LastBackupType
            if backupType == "" {
                backupType = "full"
            }
            b.WriteString(fmt.Sprintf("dbbackup_last_success_timestamp{server=%q,database=%q,engine=%q,backup_type=%q} %d\n",
                m.instance, met.Database, met.Engine, backupType, met.LastSuccess.Unix()))
        }
    }
    b.WriteString("\n")
@@ -175,8 +235,12 @@ func (m *MetricsWriter) formatMetrics(metrics []BackupMetrics) string {
    b.WriteString("# TYPE dbbackup_last_backup_duration_seconds gauge\n")
    for _, met := range metrics {
        if met.LastDuration > 0 {
            b.WriteString(fmt.Sprintf("dbbackup_last_backup_duration_seconds{server=%q,database=%q,engine=%q} %.2f\n",
                m.instance, met.Database, met.Engine, met.LastDuration.Seconds()))
            backupType := met.LastBackupType
            if backupType == "" {
                backupType = "full"
            }
            b.WriteString(fmt.Sprintf("dbbackup_last_backup_duration_seconds{server=%q,database=%q,engine=%q,backup_type=%q} %.2f\n",
                m.instance, met.Database, met.Engine, backupType, met.LastDuration.Seconds()))
        }
    }
    b.WriteString("\n")
@@ -186,16 +250,21 @@ func (m *MetricsWriter) formatMetrics(metrics []BackupMetrics) string {
    b.WriteString("# TYPE dbbackup_last_backup_size_bytes gauge\n")
    for _, met := range metrics {
        if met.LastSize > 0 {
            b.WriteString(fmt.Sprintf("dbbackup_last_backup_size_bytes{server=%q,database=%q,engine=%q} %d\n",
                m.instance, met.Database, met.Engine, met.LastSize))
            backupType := met.LastBackupType
            if backupType == "" {
                backupType = "full"
            }
            b.WriteString(fmt.Sprintf("dbbackup_last_backup_size_bytes{server=%q,database=%q,engine=%q,backup_type=%q} %d\n",
                m.instance, met.Database, met.Engine, backupType, met.LastSize))
        }
    }
    b.WriteString("\n")

    // dbbackup_backup_total (counter)
    b.WriteString("# HELP dbbackup_backup_total Total number of backup attempts\n")
    b.WriteString("# TYPE dbbackup_backup_total counter\n")
    // dbbackup_backup_total - now with backup_type dimension
    b.WriteString("# HELP dbbackup_backup_total Total number of backup attempts by type and status\n")
    b.WriteString("# TYPE dbbackup_backup_total gauge\n")
    for _, met := range metrics {
        // Success/failure by status (legacy compatibility)
        b.WriteString(fmt.Sprintf("dbbackup_backup_total{server=%q,database=%q,status=\"success\"} %d\n",
            m.instance, met.Database, met.SuccessCount))
        b.WriteString(fmt.Sprintf("dbbackup_backup_total{server=%q,database=%q,status=\"failure\"} %d\n",
@@ -203,13 +272,36 @@ func (m *MetricsWriter) formatMetrics(metrics []BackupMetrics) string {
    }
    b.WriteString("\n")

    // dbbackup_backup_by_type - backup counts by type
    b.WriteString("# HELP dbbackup_backup_by_type Total number of backups by backup type\n")
    b.WriteString("# TYPE dbbackup_backup_by_type gauge\n")
    for _, met := range metrics {
        if met.FullCount > 0 {
            b.WriteString(fmt.Sprintf("dbbackup_backup_by_type{server=%q,database=%q,backup_type=\"full\"} %d\n",
                m.instance, met.Database, met.FullCount))
        }
        if met.IncrCount > 0 {
            b.WriteString(fmt.Sprintf("dbbackup_backup_by_type{server=%q,database=%q,backup_type=\"incremental\"} %d\n",
                m.instance, met.Database, met.IncrCount))
        }
        if met.PITRBaseCount > 0 {
            b.WriteString(fmt.Sprintf("dbbackup_backup_by_type{server=%q,database=%q,backup_type=\"pitr_base\"} %d\n",
                m.instance, met.Database, met.PITRBaseCount))
        }
    }
    b.WriteString("\n")

    // dbbackup_rpo_seconds
    b.WriteString("# HELP dbbackup_rpo_seconds Recovery Point Objective - seconds since last successful backup\n")
    b.WriteString("# TYPE dbbackup_rpo_seconds gauge\n")
    for _, met := range metrics {
        if met.RPOSeconds > 0 {
            b.WriteString(fmt.Sprintf("dbbackup_rpo_seconds{server=%q,database=%q} %.0f\n",
                m.instance, met.Database, met.RPOSeconds))
            backupType := met.LastBackupType
            if backupType == "" {
                backupType = "full"
            }
            b.WriteString(fmt.Sprintf("dbbackup_rpo_seconds{server=%q,database=%q,backup_type=%q} %.0f\n",
                m.instance, met.Database, backupType, met.RPOSeconds))
        }
    }
    b.WriteString("\n")
@@ -243,3 +335,150 @@ func (m *MetricsWriter) GenerateMetricsString() (string, error) {
    }
    return m.formatMetrics(metrics), nil
}

// PITRMetricsWriter writes PITR-specific metrics
type PITRMetricsWriter struct {
    log      logger.Logger
    instance string
}

// NewPITRMetricsWriter creates a new PITR metrics writer
func NewPITRMetricsWriter(log logger.Logger, instance string) *PITRMetricsWriter {
    return &PITRMetricsWriter{
        log:      log,
        instance: instance,
    }
}

// FormatPITRMetrics formats PITR metrics in Prometheus exposition format
func (p *PITRMetricsWriter) FormatPITRMetrics(pitrMetrics []PITRMetrics) string {
    var b strings.Builder
    now := time.Now().Unix()

    b.WriteString("# DBBackup PITR Prometheus Metrics\n")
    b.WriteString(fmt.Sprintf("# Generated at: %s\n", time.Now().Format(time.RFC3339)))
    b.WriteString(fmt.Sprintf("# Server: %s\n", p.instance))
    b.WriteString("\n")

    // dbbackup_pitr_enabled
    b.WriteString("# HELP dbbackup_pitr_enabled Whether PITR is enabled for database (1=enabled, 0=disabled)\n")
    b.WriteString("# TYPE dbbackup_pitr_enabled gauge\n")
    for _, met := range pitrMetrics {
        enabled := 0
        if met.Enabled {
            enabled = 1
        }
        b.WriteString(fmt.Sprintf("dbbackup_pitr_enabled{server=%q,database=%q,engine=%q} %d\n",
            p.instance, met.Database, met.Engine, enabled))
    }
    b.WriteString("\n")

    // dbbackup_pitr_last_archived_timestamp
    b.WriteString("# HELP dbbackup_pitr_last_archived_timestamp Unix timestamp of last archived WAL/binlog\n")
    b.WriteString("# TYPE dbbackup_pitr_last_archived_timestamp gauge\n")
    for _, met := range pitrMetrics {
        if met.Enabled && !met.LastArchived.IsZero() {
            b.WriteString(fmt.Sprintf("dbbackup_pitr_last_archived_timestamp{server=%q,database=%q,engine=%q} %d\n",
                p.instance, met.Database, met.Engine, met.LastArchived.Unix()))
        }
    }
    b.WriteString("\n")

    // dbbackup_pitr_archive_lag_seconds
    b.WriteString("# HELP dbbackup_pitr_archive_lag_seconds Seconds since last WAL/binlog was archived\n")
    b.WriteString("# TYPE dbbackup_pitr_archive_lag_seconds gauge\n")
    for _, met := range pitrMetrics {
        if met.Enabled {
            b.WriteString(fmt.Sprintf("dbbackup_pitr_archive_lag_seconds{server=%q,database=%q,engine=%q} %.0f\n",
                p.instance, met.Database, met.Engine, met.ArchiveLag))
        }
    }
    b.WriteString("\n")

    // dbbackup_pitr_archive_count
    b.WriteString("# HELP dbbackup_pitr_archive_count Total number of archived WAL segments/binlog files\n")
    b.WriteString("# TYPE dbbackup_pitr_archive_count gauge\n")
    for _, met := range pitrMetrics {
        if met.Enabled {
            b.WriteString(fmt.Sprintf("dbbackup_pitr_archive_count{server=%q,database=%q,engine=%q} %d\n",
                p.instance, met.Database, met.Engine, met.ArchiveCount))
        }
    }
    b.WriteString("\n")

    // dbbackup_pitr_archive_size_bytes
    b.WriteString("# HELP dbbackup_pitr_archive_size_bytes Total size of archived logs in bytes\n")
    b.WriteString("# TYPE dbbackup_pitr_archive_size_bytes gauge\n")
    for _, met := range pitrMetrics {
        if met.Enabled {
            b.WriteString(fmt.Sprintf("dbbackup_pitr_archive_size_bytes{server=%q,database=%q,engine=%q} %d\n",
                p.instance, met.Database, met.Engine, met.ArchiveSize))
        }
    }
    b.WriteString("\n")

    // dbbackup_pitr_chain_valid
    b.WriteString("# HELP dbbackup_pitr_chain_valid Whether the WAL/binlog chain is valid (1=valid, 0=gaps detected)\n")
    b.WriteString("# TYPE dbbackup_pitr_chain_valid gauge\n")
    for _, met := range pitrMetrics {
        if met.Enabled {
            valid := 0
            if met.ChainValid {
                valid = 1
            }
            b.WriteString(fmt.Sprintf("dbbackup_pitr_chain_valid{server=%q,database=%q,engine=%q} %d\n",
                p.instance, met.Database, met.Engine, valid))
        }
    }
    b.WriteString("\n")

    // dbbackup_pitr_gap_count
    b.WriteString("# HELP dbbackup_pitr_gap_count Number of gaps detected in WAL/binlog chain\n")
    b.WriteString("# TYPE dbbackup_pitr_gap_count gauge\n")
    for _, met := range pitrMetrics {
        if met.Enabled {
            b.WriteString(fmt.Sprintf("dbbackup_pitr_gap_count{server=%q,database=%q,engine=%q} %d\n",
                p.instance, met.Database, met.Engine, met.GapCount))
        }
    }
    b.WriteString("\n")

    // dbbackup_pitr_recovery_window_minutes
    b.WriteString("# HELP dbbackup_pitr_recovery_window_minutes Estimated recovery window in minutes (time span covered by archived logs)\n")
    b.WriteString("# TYPE dbbackup_pitr_recovery_window_minutes gauge\n")
    for _, met := range pitrMetrics {
        if met.Enabled && met.RecoveryMinutes > 0 {
            b.WriteString(fmt.Sprintf("dbbackup_pitr_recovery_window_minutes{server=%q,database=%q,engine=%q} %.1f\n",
                p.instance, met.Database, met.Engine, met.RecoveryMinutes))
        }
    }
    b.WriteString("\n")

    // dbbackup_pitr_scrape_timestamp
    b.WriteString("# HELP dbbackup_pitr_scrape_timestamp Unix timestamp when PITR metrics were collected\n")
    b.WriteString("# TYPE dbbackup_pitr_scrape_timestamp gauge\n")
    b.WriteString(fmt.Sprintf("dbbackup_pitr_scrape_timestamp{server=%q} %d\n", p.instance, now))

    return b.String()
}

// CollectPITRMetricsFromStatus converts PITRStatus to PITRMetrics
// This is a helper for integration with the PITR subsystem
func CollectPITRMetricsFromStatus(database, engine string, enabled bool, lastArchived time.Time, archiveCount int, archiveSize int64, chainValid bool, gapCount int, recoveryMinutes float64) PITRMetrics {
    lag := float64(0)
    if enabled && !lastArchived.IsZero() {
        lag = time.Since(lastArchived).Seconds()
    }
    return PITRMetrics{
        Database:        database,
        Engine:          engine,
        Enabled:         enabled,
        LastArchived:    lastArchived,
        ArchiveLag:      lag,
        ArchiveCount:    archiveCount,
        ArchiveSize:     archiveSize,
        ChainValid:      chainValid,
        GapCount:        gapCount,
        RecoveryMinutes: recoveryMinutes,
    }
}
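A minimal end-to-end use of the new PITR writer API added above might look like the sketch below. It is illustrative only: the function lives in the same `prometheus` package as the code above (package name assumed from the `prometheus.` call sites in the metrics command), the metric values are invented, and in dbbackup itself the status would come from the PITR subsystem rather than being hard-coded.

```go
package prometheus

import (
    "fmt"
    "time"

    "dbbackup/internal/logger"
)

// examplePITRExport is a sketch, not part of the diff: it collects one
// database's PITR status into PITRMetrics and prints the exposition text.
func examplePITRExport(log logger.Logger) {
    pitr := CollectPITRMetricsFromStatus(
        "myapp",    // database
        "postgres", // engine
        true,       // PITR enabled
        time.Now().Add(-45*time.Second), // last archived WAL
        1024,    // archived segment count
        17<<30,  // archive size in bytes (~17 GiB)
        true,    // chain valid
        0,       // gap count
        7*24*60, // recovery window: 7 days in minutes
    )

    w := NewPITRMetricsWriter(log, "production")
    fmt.Print(w.FormatPITRMetrics([]PITRMetrics{pitr}))
}
```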
2 main.go

@@ -16,7 +16,7 @@ import (

// Build information (set by ldflags)
var (
    version = "4.0.1"
    version = "4.1.2"
    buildTime = "unknown"
    gitCommit = "unknown"
)
5 binary files changed (contents not shown).