Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 79f2efaaac | |||
| 19f44749b1 | |||
| c7904c7857 | |||
| 1747365d0d | |||
| 8cf107b8d4 | |||
| ed5ed8cf5e | |||
| d58240b6c0 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -37,3 +37,6 @@ CRITICAL_BUGS_FIXED.md
|
||||
LEGAL_DOCUMENTATION.md
|
||||
LEGAL_*.md
|
||||
legal/
|
||||
|
||||
# Release binaries (uploaded via gh release, not git)
|
||||
release/dbbackup_*
|
||||
|
||||
46
CHANGELOG.md
46
CHANGELOG.md
@@ -5,6 +5,52 @@ All notable changes to dbbackup will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [4.1.2] - 2026-01-27
|
||||
|
||||
### Added
|
||||
- **`--socket` flag for MySQL/MariaDB** - Connect via Unix socket instead of TCP/IP
|
||||
- Usage: `dbbackup backup single mydb --db-type mysql --socket /var/run/mysqld/mysqld.sock`
|
||||
- Works for both backup and restore operations
|
||||
- Supports socket auth (no password required with proper permissions)
|
||||
|
||||
### Fixed
|
||||
- **Socket path as --host now works** - If `--host` starts with `/`, it's auto-detected as a socket path
|
||||
- Example: `--host /var/run/mysqld/mysqld.sock` now works correctly instead of DNS lookup error
|
||||
- Auto-converts to `--socket` internally
|
||||
|
||||
## [4.1.1] - 2026-01-25
|
||||
|
||||
### Added
|
||||
- **`dbbackup_build_info` metric** - Exposes version and git commit as Prometheus labels
|
||||
- Useful for tracking deployed versions across a fleet
|
||||
- Labels: `server`, `version`, `commit`
|
||||
|
||||
### Fixed
|
||||
- **Documentation clarification**: The `pitr_base` value for `backup_type` label is auto-assigned
|
||||
by `dbbackup pitr base` command. CLI `--backup-type` flag only accepts `full` or `incremental`.
|
||||
This was causing confusion in deployments.
|
||||
|
||||
## [4.1.0] - 2026-01-25
|
||||
|
||||
### Added
|
||||
- **Backup Type Tracking**: All backup metrics now include a `backup_type` label
|
||||
(`full`, `incremental`, or `pitr_base` for PITR base backups)
|
||||
- **PITR Metrics**: Complete Point-in-Time Recovery monitoring
|
||||
- `dbbackup_pitr_enabled` - Whether PITR is enabled (1/0)
|
||||
- `dbbackup_pitr_archive_lag_seconds` - Seconds since last WAL/binlog archived
|
||||
- `dbbackup_pitr_chain_valid` - WAL/binlog chain integrity (1=valid)
|
||||
- `dbbackup_pitr_gap_count` - Number of gaps in archive chain
|
||||
- `dbbackup_pitr_archive_count` - Total archived segments
|
||||
- `dbbackup_pitr_archive_size_bytes` - Total archive storage
|
||||
- `dbbackup_pitr_recovery_window_minutes` - Estimated PITR coverage
|
||||
- **PITR Alerting Rules**: 6 new alerts for PITR monitoring
|
||||
- PITRArchiveLag, PITRChainBroken, PITRGapsDetected, PITRArchiveStalled,
|
||||
PITRStorageGrowing, PITRDisabledUnexpectedly
|
||||
- **`dbbackup_backup_by_type` metric** - Count backups by type
|
||||
|
||||
### Changed
|
||||
- `dbbackup_backup_total` type changed from counter to gauge for snapshot-based collection
|
||||
|
||||
## [3.42.110] - 2026-01-24
|
||||
|
||||
### Improved - Code Quality & Testing
|
||||
|
||||
3
QUICK.md
3
QUICK.md
@@ -14,6 +14,9 @@ dbbackup backup single myapp
|
||||
# MySQL
|
||||
dbbackup backup single gitea --db-type mysql --host 127.0.0.1 --port 3306
|
||||
|
||||
# MySQL/MariaDB with Unix socket
|
||||
dbbackup backup single myapp --db-type mysql --socket /var/run/mysqld/mysqld.sock
|
||||
|
||||
# With compression level (0-9, default 6)
|
||||
dbbackup backup cluster --compression 9
|
||||
|
||||
|
||||
18
README.md
18
README.md
@@ -1,19 +1,21 @@
|
||||
```
|
||||
██╗ ██╗ ██████╗
|
||||
██║ ██║ ██╔═████╗
|
||||
███████║ ██║██╔██║
|
||||
╚════██║ ████╔╝██║
|
||||
██║██╗╚██████╔╝
|
||||
╚═╝╚═╝ ╚═════╝
|
||||
_ _ _ _
|
||||
| | | | | | |
|
||||
__| | |__ | |__ __ _ ___| | ___ _ _ __
|
||||
/ _` | '_ \| '_ \ / _` |/ __| |/ / | | | '_ \
|
||||
| (_| | |_) | |_) | (_| | (__| <| |_| | |_) |
|
||||
\__,_|_.__/|_.__/ \__,_|\___|_|\_\\__,_| .__/
|
||||
| |
|
||||
|_|
|
||||
```
|
||||
|
||||
# dbbackup v4.0.0
|
||||
# dbbackup
|
||||
|
||||
Database backup and restore utility for PostgreSQL, MySQL, and MariaDB.
|
||||
|
||||
[](https://opensource.org/licenses/Apache-2.0)
|
||||
[](https://golang.org/)
|
||||
[](https://github.com/PlusOne/dbbackup/releases/tag/v4.0.0)
|
||||
[](https://github.com/PlusOne/dbbackup/releases/latest)
|
||||
|
||||
**Repository:** https://git.uuxo.net/UUXO/dbbackup
|
||||
**Mirror:** https://github.com/PlusOne/dbbackup
|
||||
|
||||
@@ -271,12 +271,20 @@ func runCatalogSync(cmd *cobra.Command, args []string) error {
|
||||
fmt.Printf(" [OK] Added: %d\n", result.Added)
|
||||
fmt.Printf(" [SYNC] Updated: %d\n", result.Updated)
|
||||
fmt.Printf(" [DEL] Removed: %d\n", result.Removed)
|
||||
if result.Skipped > 0 {
|
||||
fmt.Printf(" [SKIP] Skipped: %d (legacy files without metadata)\n", result.Skipped)
|
||||
}
|
||||
if result.Errors > 0 {
|
||||
fmt.Printf(" [FAIL] Errors: %d\n", result.Errors)
|
||||
}
|
||||
fmt.Printf(" [TIME] Duration: %.2fs\n", result.Duration)
|
||||
fmt.Printf("=====================================================\n")
|
||||
|
||||
// Show legacy backup warning
|
||||
if result.LegacyWarning != "" {
|
||||
fmt.Printf("\n[WARN] %s\n", result.LegacyWarning)
|
||||
}
|
||||
|
||||
// Show details if verbose
|
||||
if catalogVerbose && len(result.Details) > 0 {
|
||||
fmt.Printf("\nDetails:\n")
|
||||
|
||||
@@ -5,8 +5,10 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
|
||||
"dbbackup/internal/catalog"
|
||||
"dbbackup/internal/prometheus"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
@@ -84,37 +86,56 @@ Endpoints:
|
||||
},
|
||||
}
|
||||
|
||||
var metricsCatalogDB string
|
||||
|
||||
func init() {
|
||||
rootCmd.AddCommand(metricsCmd)
|
||||
metricsCmd.AddCommand(metricsExportCmd)
|
||||
metricsCmd.AddCommand(metricsServeCmd)
|
||||
|
||||
// Default catalog path (same as catalog command)
|
||||
home, _ := os.UserHomeDir()
|
||||
defaultCatalogPath := filepath.Join(home, ".dbbackup", "catalog.db")
|
||||
|
||||
// Export flags
|
||||
metricsExportCmd.Flags().StringVar(&metricsServer, "server", "default", "Server name for metrics labels")
|
||||
metricsExportCmd.Flags().StringVar(&metricsServer, "server", "", "Server name for metrics labels (default: hostname)")
|
||||
metricsExportCmd.Flags().StringVarP(&metricsOutput, "output", "o", "/var/lib/dbbackup/metrics/dbbackup.prom", "Output file path")
|
||||
metricsExportCmd.Flags().StringVar(&metricsCatalogDB, "catalog-db", defaultCatalogPath, "Path to catalog SQLite database")
|
||||
|
||||
// Serve flags
|
||||
metricsServeCmd.Flags().StringVar(&metricsServer, "server", "default", "Server name for metrics labels")
|
||||
metricsServeCmd.Flags().StringVar(&metricsServer, "server", "", "Server name for metrics labels (default: hostname)")
|
||||
metricsServeCmd.Flags().IntVarP(&metricsPort, "port", "p", 9399, "HTTP server port")
|
||||
metricsServeCmd.Flags().StringVar(&metricsCatalogDB, "catalog-db", defaultCatalogPath, "Path to catalog SQLite database")
|
||||
}
|
||||
|
||||
func runMetricsExport(ctx context.Context) error {
|
||||
// Open catalog
|
||||
cat, err := openCatalog()
|
||||
// Auto-detect hostname if server not specified
|
||||
server := metricsServer
|
||||
if server == "" {
|
||||
hostname, err := os.Hostname()
|
||||
if err != nil {
|
||||
server = "unknown"
|
||||
} else {
|
||||
server = hostname
|
||||
}
|
||||
}
|
||||
|
||||
// Open catalog using specified path
|
||||
cat, err := catalog.NewSQLiteCatalog(metricsCatalogDB)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open catalog: %w", err)
|
||||
}
|
||||
defer cat.Close()
|
||||
|
||||
// Create metrics writer
|
||||
writer := prometheus.NewMetricsWriter(log, cat, metricsServer)
|
||||
// Create metrics writer with version info
|
||||
writer := prometheus.NewMetricsWriterWithVersion(log, cat, server, cfg.Version, cfg.GitCommit)
|
||||
|
||||
// Write textfile
|
||||
if err := writer.WriteTextfile(metricsOutput); err != nil {
|
||||
return fmt.Errorf("failed to write metrics: %w", err)
|
||||
}
|
||||
|
||||
log.Info("Exported metrics to textfile", "path", metricsOutput, "server", metricsServer)
|
||||
log.Info("Exported metrics to textfile", "path", metricsOutput, "server", server)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -123,15 +144,26 @@ func runMetricsServe(ctx context.Context) error {
|
||||
ctx, cancel := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM)
|
||||
defer cancel()
|
||||
|
||||
// Open catalog
|
||||
cat, err := openCatalog()
|
||||
// Auto-detect hostname if server not specified
|
||||
server := metricsServer
|
||||
if server == "" {
|
||||
hostname, err := os.Hostname()
|
||||
if err != nil {
|
||||
server = "unknown"
|
||||
} else {
|
||||
server = hostname
|
||||
}
|
||||
}
|
||||
|
||||
// Open catalog using specified path
|
||||
cat, err := catalog.NewSQLiteCatalog(metricsCatalogDB)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open catalog: %w", err)
|
||||
}
|
||||
defer cat.Close()
|
||||
|
||||
// Create exporter
|
||||
exporter := prometheus.NewExporter(log, cat, metricsServer, metricsPort)
|
||||
// Create exporter with version info
|
||||
exporter := prometheus.NewExporterWithVersion(log, cat, server, metricsPort, cfg.Version, cfg.GitCommit)
|
||||
|
||||
// Run server (blocks until context is cancelled)
|
||||
return exporter.Serve(ctx)
|
||||
|
||||
@@ -3,6 +3,7 @@ package cmd
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"dbbackup/internal/config"
|
||||
"dbbackup/internal/logger"
|
||||
@@ -107,6 +108,12 @@ For help with specific commands, use: dbbackup [command] --help`,
|
||||
}
|
||||
}
|
||||
|
||||
// Auto-detect socket from --host path (if host starts with /)
|
||||
if strings.HasPrefix(cfg.Host, "/") && cfg.Socket == "" {
|
||||
cfg.Socket = cfg.Host
|
||||
cfg.Host = "localhost" // Reset host for socket connections
|
||||
}
|
||||
|
||||
return cfg.SetDatabaseType(cfg.DatabaseType)
|
||||
},
|
||||
}
|
||||
@@ -136,6 +143,7 @@ func Execute(ctx context.Context, config *config.Config, logger logger.Logger) e
|
||||
// Add persistent flags
|
||||
rootCmd.PersistentFlags().StringVar(&cfg.Host, "host", cfg.Host, "Database host")
|
||||
rootCmd.PersistentFlags().IntVar(&cfg.Port, "port", cfg.Port, "Database port")
|
||||
rootCmd.PersistentFlags().StringVar(&cfg.Socket, "socket", cfg.Socket, "Unix socket path for MySQL/MariaDB (e.g., /var/run/mysqld/mysqld.sock)")
|
||||
rootCmd.PersistentFlags().StringVar(&cfg.User, "user", cfg.User, "Database user")
|
||||
rootCmd.PersistentFlags().StringVar(&cfg.Database, "database", cfg.Database, "Database name")
|
||||
rootCmd.PersistentFlags().StringVar(&cfg.Password, "password", cfg.Password, "Database password")
|
||||
|
||||
@@ -90,6 +90,53 @@ groups:
|
||||
summary: "Backup not verified for {{ $labels.database }}"
|
||||
description: "Last backup was not verified. Run dbbackup verify to check integrity."
|
||||
|
||||
# PITR Alerts
|
||||
- alert: DBBackupPITRArchiveLag
|
||||
expr: dbbackup_pitr_archive_lag_seconds > 600
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PITR archive lag on {{ $labels.server }}"
|
||||
description: "WAL/binlog archiving for {{ $labels.database }} is {{ $value | humanizeDuration }} behind."
|
||||
|
||||
- alert: DBBackupPITRArchiveCritical
|
||||
expr: dbbackup_pitr_archive_lag_seconds > 1800
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR archive critically behind on {{ $labels.server }}"
|
||||
description: "WAL/binlog archiving for {{ $labels.database }} is {{ $value | humanizeDuration }} behind. PITR capability at risk!"
|
||||
|
||||
- alert: DBBackupPITRChainBroken
|
||||
expr: dbbackup_pitr_chain_valid == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR chain broken for {{ $labels.database }}"
|
||||
description: "WAL/binlog chain has gaps. Point-in-time recovery NOT possible. New base backup required."
|
||||
|
||||
- alert: DBBackupPITRGaps
|
||||
expr: dbbackup_pitr_gap_count > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PITR chain gaps for {{ $labels.database }}"
|
||||
description: "{{ $value }} gaps in WAL/binlog chain. Recovery to points within gaps will fail."
|
||||
|
||||
# Backup Type Alerts
|
||||
- alert: DBBackupNoRecentFull
|
||||
expr: time() - dbbackup_last_success_timestamp{backup_type="full"} > 604800
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "No full backup in 7+ days for {{ $labels.database }}"
|
||||
description: "Consider taking a full backup. Incremental chains depend on valid base."
|
||||
|
||||
# Exporter Health
|
||||
- alert: DBBackupExporterDown
|
||||
expr: up{job="dbbackup"} == 0
|
||||
|
||||
537
docs/EXPORTER.md
Normal file
537
docs/EXPORTER.md
Normal file
@@ -0,0 +1,537 @@
|
||||
# DBBackup Prometheus Exporter & Grafana Dashboard
|
||||
|
||||
This document provides complete reference for the DBBackup Prometheus exporter, including all exported metrics, setup instructions, and Grafana dashboard configuration.
|
||||
|
||||
## What's New (January 2026)
|
||||
|
||||
### New Features
|
||||
- **Backup Type Tracking**: All backup metrics now include a `backup_type` label (`full`, `incremental`, or `pitr_base` for PITR base backups)
|
||||
- **Note**: CLI `--backup-type` flag only accepts `full` or `incremental`. The `pitr_base` label is auto-assigned when using `dbbackup pitr base`
|
||||
- **PITR Metrics**: Complete Point-in-Time Recovery monitoring for PostgreSQL WAL and MySQL binlog archiving
|
||||
- **New Alerts**: PITR-specific alerts for archive lag, chain integrity, and gap detection
|
||||
|
||||
### New Metrics Added
|
||||
| Metric | Description |
|
||||
|--------|-------------|
|
||||
| `dbbackup_build_info` | Build info with version and commit labels |
|
||||
| `dbbackup_backup_by_type` | Count backups by type (full/incremental/pitr_base) |
|
||||
| `dbbackup_pitr_enabled` | Whether PITR is enabled (1/0) |
|
||||
| `dbbackup_pitr_archive_lag_seconds` | Seconds since last WAL/binlog archived |
|
||||
| `dbbackup_pitr_chain_valid` | WAL/binlog chain integrity (1=valid) |
|
||||
| `dbbackup_pitr_gap_count` | Number of gaps in archive chain |
|
||||
| `dbbackup_pitr_archive_count` | Total archived segments |
|
||||
| `dbbackup_pitr_archive_size_bytes` | Total archive storage |
|
||||
| `dbbackup_pitr_recovery_window_minutes` | Estimated PITR coverage |
|
||||
|
||||
### Label Changes
|
||||
- `backup_type` label added to: `dbbackup_rpo_seconds`, `dbbackup_last_success_timestamp`, `dbbackup_last_backup_duration_seconds`, `dbbackup_last_backup_size_bytes`
|
||||
- `dbbackup_backup_total` type changed from counter to gauge (more accurate for snapshot-based collection)
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Quick Start](#quick-start)
|
||||
- [Exporter Modes](#exporter-modes)
|
||||
- [Complete Metrics Reference](#complete-metrics-reference)
|
||||
- [Grafana Dashboard Setup](#grafana-dashboard-setup)
|
||||
- [Alerting Rules](#alerting-rules)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
|
||||
---
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Start the Metrics Server
|
||||
|
||||
```bash
|
||||
# Start HTTP exporter on default port 9399 (auto-detects hostname for server label)
|
||||
dbbackup metrics serve
|
||||
|
||||
# Custom port
|
||||
dbbackup metrics serve --port 9100
|
||||
|
||||
# Specify server name for labels (overrides auto-detection)
|
||||
dbbackup metrics serve --server production-db-01
|
||||
|
||||
# Specify custom catalog database location
|
||||
dbbackup metrics serve --catalog-db /path/to/catalog.db
|
||||
```
|
||||
|
||||
### Export to Textfile (for node_exporter)
|
||||
|
||||
```bash
|
||||
# Export to default location
|
||||
dbbackup metrics export
|
||||
|
||||
# Custom output path
|
||||
dbbackup metrics export --output /var/lib/node_exporter/textfile_collector/dbbackup.prom
|
||||
|
||||
# Specify catalog database and server name
|
||||
dbbackup metrics export --catalog-db /root/.dbbackup/catalog.db --server myhost
|
||||
```
|
||||
|
||||
### Install as Systemd Service
|
||||
|
||||
```bash
|
||||
# Install with metrics exporter
|
||||
sudo dbbackup install --with-metrics
|
||||
|
||||
# Start the service
|
||||
sudo systemctl start dbbackup-exporter
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Exporter Modes
|
||||
|
||||
### HTTP Server Mode (`metrics serve`)
|
||||
|
||||
Runs a standalone HTTP server exposing metrics for direct Prometheus scraping.
|
||||
|
||||
| Endpoint | Description |
|
||||
|-------------|----------------------------------|
|
||||
| `/metrics` | Prometheus metrics |
|
||||
| `/health` | Health check (returns 200 OK) |
|
||||
| `/` | Service info page |
|
||||
|
||||
**Default Port:** 9399
|
||||
|
||||
**Server Label:** Auto-detected from hostname (use `--server` to override)
|
||||
|
||||
**Catalog Location:** `~/.dbbackup/catalog.db` (use `--catalog-db` to override)
|
||||
|
||||
**Configuration:**
|
||||
```bash
|
||||
dbbackup metrics serve [--server <instance-name>] [--port <port>] [--catalog-db <path>]
|
||||
```
|
||||
|
||||
| Flag | Default | Description |
|
||||
|------|---------|-------------|
|
||||
| `--server` | hostname | Server label for metrics (auto-detected if not set) |
|
||||
| `--port` | 9399 | HTTP server port |
|
||||
| `--catalog-db` | ~/.dbbackup/catalog.db | Path to catalog SQLite database |
|
||||
|
||||
### Textfile Mode (`metrics export`)
|
||||
|
||||
Writes metrics to a file for collection by node_exporter's textfile collector.
|
||||
|
||||
**Default Path:** `/var/lib/dbbackup/metrics/dbbackup.prom`
|
||||
|
||||
| Flag | Default | Description |
|
||||
|------|---------|-------------|
|
||||
| `--server` | hostname | Server label for metrics (auto-detected if not set) |
|
||||
| `--output` | /var/lib/dbbackup/metrics/dbbackup.prom | Output file path |
|
||||
| `--catalog-db` | ~/.dbbackup/catalog.db | Path to catalog SQLite database |
|
||||
|
||||
**node_exporter Configuration:**
|
||||
```bash
|
||||
node_exporter --collector.textfile.directory=/var/lib/dbbackup/metrics/
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Complete Metrics Reference
|
||||
|
||||
All metrics use the `dbbackup_` prefix. Below is the **validated** list of metrics exported by DBBackup.
|
||||
|
||||
### Backup Status Metrics
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|-------------|------|--------|-------------|
|
||||
| `dbbackup_last_success_timestamp` | gauge | `server`, `database`, `engine`, `backup_type` | Unix timestamp of last successful backup |
|
||||
| `dbbackup_last_backup_duration_seconds` | gauge | `server`, `database`, `engine`, `backup_type` | Duration of last successful backup in seconds |
|
||||
| `dbbackup_last_backup_size_bytes` | gauge | `server`, `database`, `engine`, `backup_type` | Size of last successful backup in bytes |
|
||||
| `dbbackup_backup_total` | gauge | `server`, `database`, `status` | Total backup attempts (status: `success` or `failure`) |
|
||||
| `dbbackup_backup_by_type` | gauge | `server`, `database`, `backup_type` | Backup count by type (`full`, `incremental`, `pitr_base`) |
|
||||
| `dbbackup_rpo_seconds` | gauge | `server`, `database`, `backup_type` | Seconds since last successful backup (RPO) |
|
||||
| `dbbackup_backup_verified` | gauge | `server`, `database` | Whether last backup was verified (1=yes, 0=no) |
|
||||
| `dbbackup_scrape_timestamp` | gauge | `server` | Unix timestamp when metrics were collected |
|
||||
|
||||
### PITR (Point-in-Time Recovery) Metrics
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|-------------|------|--------|-------------|
|
||||
| `dbbackup_pitr_enabled` | gauge | `server`, `database`, `engine` | Whether PITR is enabled (1=yes, 0=no) |
|
||||
| `dbbackup_pitr_last_archived_timestamp` | gauge | `server`, `database`, `engine` | Unix timestamp of last archived WAL/binlog |
|
||||
| `dbbackup_pitr_archive_lag_seconds` | gauge | `server`, `database`, `engine` | Seconds since last archive (lower is better) |
|
||||
| `dbbackup_pitr_archive_count` | gauge | `server`, `database`, `engine` | Total archived WAL segments or binlog files |
|
||||
| `dbbackup_pitr_archive_size_bytes` | gauge | `server`, `database`, `engine` | Total size of archived logs in bytes |
|
||||
| `dbbackup_pitr_chain_valid` | gauge | `server`, `database`, `engine` | Whether archive chain is valid (1=yes, 0=gaps) |
|
||||
| `dbbackup_pitr_gap_count` | gauge | `server`, `database`, `engine` | Number of gaps in archive chain |
|
||||
| `dbbackup_pitr_recovery_window_minutes` | gauge | `server`, `database`, `engine` | Estimated PITR coverage window in minutes |
|
||||
| `dbbackup_pitr_scrape_timestamp` | gauge | `server` | PITR metrics collection timestamp |
|
||||
|
||||
### Deduplication Metrics
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|-------------|------|--------|-------------|
|
||||
| `dbbackup_dedup_chunks_total` | gauge | `server` | Total unique chunks stored |
|
||||
| `dbbackup_dedup_manifests_total` | gauge | `server` | Total number of deduplicated backups |
|
||||
| `dbbackup_dedup_backup_bytes_total` | gauge | `server` | Total logical size of all backups (bytes) |
|
||||
| `dbbackup_dedup_stored_bytes_total` | gauge | `server` | Total unique data stored after dedup (bytes) |
|
||||
| `dbbackup_dedup_space_saved_bytes` | gauge | `server` | Bytes saved by deduplication |
|
||||
| `dbbackup_dedup_ratio` | gauge | `server` | Dedup efficiency (0-1, higher = better) |
|
||||
| `dbbackup_dedup_disk_usage_bytes` | gauge | `server` | Actual disk usage of chunk store |
|
||||
| `dbbackup_dedup_compression_ratio` | gauge | `server` | Compression ratio (0-1, higher = better) |
|
||||
| `dbbackup_dedup_oldest_chunk_timestamp` | gauge | `server` | Unix timestamp of oldest chunk |
|
||||
| `dbbackup_dedup_newest_chunk_timestamp` | gauge | `server` | Unix timestamp of newest chunk |
|
||||
| `dbbackup_dedup_scrape_timestamp` | gauge | `server` | Dedup metrics collection timestamp |
|
||||
|
||||
### Per-Database Dedup Metrics
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|-------------|------|--------|-------------|
|
||||
| `dbbackup_dedup_database_backup_count` | gauge | `server`, `database` | Deduplicated backups per database |
|
||||
| `dbbackup_dedup_database_ratio` | gauge | `server`, `database` | Per-database dedup ratio |
|
||||
| `dbbackup_dedup_database_last_backup_timestamp` | gauge | `server`, `database` | Last backup timestamp per database |
|
||||
| `dbbackup_dedup_database_total_bytes` | gauge | `server`, `database` | Total logical size per database |
|
||||
| `dbbackup_dedup_database_stored_bytes` | gauge | `server`, `database` | Stored bytes per database (after dedup) |
|
||||
| `dbbackup_rpo_seconds` | gauge | `server`, `database` | Seconds since last backup (same as regular backups for unified alerting) |
|
||||
|
||||
> **Note:** The `dbbackup_rpo_seconds` metric is exported by both regular backups and dedup backups, enabling unified alerting without complex PromQL expressions.
|
||||
|
||||
---
|
||||
|
||||
## Example Metrics Output
|
||||
|
||||
```prometheus
|
||||
# DBBackup Prometheus Metrics
|
||||
# Generated at: 2026-01-27T10:30:00Z
|
||||
# Server: production
|
||||
|
||||
# HELP dbbackup_last_success_timestamp Unix timestamp of last successful backup
|
||||
# TYPE dbbackup_last_success_timestamp gauge
|
||||
dbbackup_last_success_timestamp{server="production",database="myapp",engine="postgres",backup_type="full"} 1769506200
|
||||
|
||||
# HELP dbbackup_last_backup_duration_seconds Duration of last successful backup in seconds
|
||||
# TYPE dbbackup_last_backup_duration_seconds gauge
|
||||
dbbackup_last_backup_duration_seconds{server="production",database="myapp",engine="postgres",backup_type="full"} 125.50
|
||||
|
||||
# HELP dbbackup_last_backup_size_bytes Size of last successful backup in bytes
|
||||
# TYPE dbbackup_last_backup_size_bytes gauge
|
||||
dbbackup_last_backup_size_bytes{server="production",database="myapp",engine="postgres",backup_type="full"} 1073741824
|
||||
|
||||
# HELP dbbackup_backup_total Total number of backup attempts by type and status
|
||||
# TYPE dbbackup_backup_total gauge
|
||||
dbbackup_backup_total{server="production",database="myapp",status="success"} 42
|
||||
dbbackup_backup_total{server="production",database="myapp",status="failure"} 2
|
||||
|
||||
# HELP dbbackup_backup_by_type Total number of backups by backup type
|
||||
# TYPE dbbackup_backup_by_type gauge
|
||||
dbbackup_backup_by_type{server="production",database="myapp",backup_type="full"} 30
|
||||
dbbackup_backup_by_type{server="production",database="myapp",backup_type="incremental"} 12
|
||||
|
||||
# HELP dbbackup_rpo_seconds Recovery Point Objective - seconds since last successful backup
|
||||
# TYPE dbbackup_rpo_seconds gauge
|
||||
dbbackup_rpo_seconds{server="production",database="myapp",backup_type="full"} 3600
|
||||
|
||||
# HELP dbbackup_backup_verified Whether the last backup was verified (1=yes, 0=no)
|
||||
# TYPE dbbackup_backup_verified gauge
|
||||
dbbackup_backup_verified{server="production",database="myapp"} 1
|
||||
|
||||
# HELP dbbackup_pitr_enabled Whether PITR is enabled for database (1=enabled, 0=disabled)
|
||||
# TYPE dbbackup_pitr_enabled gauge
|
||||
dbbackup_pitr_enabled{server="production",database="myapp",engine="postgres"} 1
|
||||
|
||||
# HELP dbbackup_pitr_archive_lag_seconds Seconds since last WAL/binlog was archived
|
||||
# TYPE dbbackup_pitr_archive_lag_seconds gauge
|
||||
dbbackup_pitr_archive_lag_seconds{server="production",database="myapp",engine="postgres"} 45
|
||||
|
||||
# HELP dbbackup_pitr_chain_valid Whether the WAL/binlog chain is valid (1=valid, 0=gaps detected)
|
||||
# TYPE dbbackup_pitr_chain_valid gauge
|
||||
dbbackup_pitr_chain_valid{server="production",database="myapp",engine="postgres"} 1
|
||||
|
||||
# HELP dbbackup_pitr_recovery_window_minutes Estimated recovery window in minutes
|
||||
# TYPE dbbackup_pitr_recovery_window_minutes gauge
|
||||
dbbackup_pitr_recovery_window_minutes{server="production",database="myapp",engine="postgres"} 10080
|
||||
|
||||
# HELP dbbackup_dedup_ratio Deduplication ratio (0-1, higher is better)
|
||||
# TYPE dbbackup_dedup_ratio gauge
|
||||
dbbackup_dedup_ratio{server="production"} 0.6500
|
||||
|
||||
# HELP dbbackup_dedup_space_saved_bytes Bytes saved by deduplication
|
||||
# TYPE dbbackup_dedup_space_saved_bytes gauge
|
||||
dbbackup_dedup_space_saved_bytes{server="production"} 5368709120
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Prometheus Scrape Configuration
|
||||
|
||||
Add to your `prometheus.yml`:
|
||||
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'dbbackup'
|
||||
scrape_interval: 60s
|
||||
scrape_timeout: 10s
|
||||
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'db-server-01:9399'
|
||||
- 'db-server-02:9399'
|
||||
labels:
|
||||
environment: 'production'
|
||||
|
||||
- targets:
|
||||
- 'db-staging:9399'
|
||||
labels:
|
||||
environment: 'staging'
|
||||
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: instance
|
||||
regex: '([^:]+):\d+'
|
||||
replacement: '$1'
|
||||
```
|
||||
|
||||
### File-based Service Discovery
|
||||
|
||||
```yaml
|
||||
- job_name: 'dbbackup-sd'
|
||||
scrape_interval: 60s
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets/dbbackup/*.yml'
|
||||
refresh_interval: 5m
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Grafana Dashboard Setup
|
||||
|
||||
### Import Dashboard
|
||||
|
||||
1. Open Grafana → **Dashboards** → **Import**
|
||||
2. Upload `grafana/dbbackup-dashboard.json` or paste the JSON
|
||||
3. Select your Prometheus data source
|
||||
4. Click **Import**
|
||||
|
||||
### Dashboard Panels
|
||||
|
||||
The dashboard includes the following panels:
|
||||
|
||||
#### Backup Overview Row
|
||||
| Panel | Metric Used | Description |
|
||||
|-------|-------------|-------------|
|
||||
| Last Backup Status | `dbbackup_rpo_seconds < bool 604800` | SUCCESS/FAILED indicator |
|
||||
| Time Since Last Backup | `dbbackup_rpo_seconds` | Time elapsed since last backup |
|
||||
| Verification Status | `dbbackup_backup_verified` | VERIFIED/NOT VERIFIED |
|
||||
| Total Successful Backups | `dbbackup_backup_total{status="success"}` | Counter |
|
||||
| Total Failed Backups | `dbbackup_backup_total{status="failure"}` | Counter |
|
||||
| RPO Over Time | `dbbackup_rpo_seconds` | Time series graph |
|
||||
| Backup Size | `dbbackup_last_backup_size_bytes` | Bar chart |
|
||||
| Backup Duration | `dbbackup_last_backup_duration_seconds` | Time series |
|
||||
| Backup Status Overview | Multiple metrics | Table with color-coded status |
|
||||
|
||||
#### Deduplication Statistics Row
|
||||
| Panel | Metric Used | Description |
|
||||
|-------|-------------|-------------|
|
||||
| Dedup Ratio | `dbbackup_dedup_ratio` | Percentage efficiency |
|
||||
| Space Saved | `dbbackup_dedup_space_saved_bytes` | Total bytes saved |
|
||||
| Disk Usage | `dbbackup_dedup_disk_usage_bytes` | Actual storage used |
|
||||
| Total Chunks | `dbbackup_dedup_chunks_total` | Chunk count |
|
||||
| Compression Ratio | `dbbackup_dedup_compression_ratio` | Compression efficiency |
|
||||
| Oldest Chunk | `dbbackup_dedup_oldest_chunk_timestamp` | Age of oldest data |
|
||||
| Newest Chunk | `dbbackup_dedup_newest_chunk_timestamp` | Most recent chunk |
|
||||
| Dedup Ratio by Database | `dbbackup_dedup_database_ratio` | Per-database efficiency |
|
||||
| Dedup Storage Over Time | `dbbackup_dedup_space_saved_bytes`, `dbbackup_dedup_disk_usage_bytes` | Storage trends |
|
||||
|
||||
### Dashboard Variables
|
||||
|
||||
| Variable | Query | Description |
|
||||
|----------|-------|-------------|
|
||||
| `$server` | `label_values(dbbackup_rpo_seconds, server)` | Filter by server |
|
||||
| `$DS_PROMETHEUS` | datasource | Prometheus data source |
|
||||
|
||||
### Dashboard Thresholds
|
||||
|
||||
#### RPO Thresholds
|
||||
- **Green:** < 12 hours (43200 seconds)
|
||||
- **Yellow:** 12-24 hours
|
||||
- **Red:** > 24 hours (86400 seconds)
|
||||
|
||||
#### Backup Status Thresholds
|
||||
- **1 (Green):** SUCCESS
|
||||
- **0 (Red):** FAILED
|
||||
|
||||
---
|
||||
|
||||
## Alerting Rules
|
||||
|
||||
### Pre-configured Alerts
|
||||
|
||||
Import `deploy/prometheus/alerting-rules.yaml` into Prometheus/Alertmanager.
|
||||
|
||||
#### Backup Status Alerts
|
||||
| Alert | Expression | Severity | Description |
|
||||
|-------|------------|----------|-------------|
|
||||
| `DBBackupRPOWarning` | `dbbackup_rpo_seconds > 43200` | warning | No backup for 12+ hours |
|
||||
| `DBBackupRPOCritical` | `dbbackup_rpo_seconds > 86400` | critical | No backup for 24+ hours |
|
||||
| `DBBackupFailed` | `increase(dbbackup_backup_total{status="failure"}[1h]) > 0` | critical | Backup failed |
|
||||
| `DBBackupFailureRateHigh` | Failure rate > 10% in 24h | warning | High failure rate |
|
||||
| `DBBackupSizeAnomaly` | Size changed > 50% vs 7-day avg | warning | Unusual backup size |
|
||||
| `DBBackupSizeZero` | `dbbackup_last_backup_size_bytes == 0` | critical | Empty backup file |
|
||||
| `DBBackupDurationHigh` | `dbbackup_last_backup_duration_seconds > 3600` | warning | Backup taking > 1 hour |
|
||||
| `DBBackupNotVerified` | `dbbackup_backup_verified == 0` for 24h | warning | Backup not verified |
|
||||
| `DBBackupNoRecentFull` | No full backup in 7+ days | warning | Need full backup for incremental chain |
|
||||
|
||||
#### PITR Alerts (New)
|
||||
| Alert | Expression | Severity | Description |
|
||||
|-------|------------|----------|-------------|
|
||||
| `DBBackupPITRArchiveLag` | `dbbackup_pitr_archive_lag_seconds > 600` | warning | Archive 10+ min behind |
|
||||
| `DBBackupPITRArchiveCritical` | `dbbackup_pitr_archive_lag_seconds > 1800` | critical | Archive 30+ min behind |
|
||||
| `DBBackupPITRChainBroken` | `dbbackup_pitr_chain_valid == 0` | critical | Gaps in WAL/binlog chain |
|
||||
| `DBBackupPITRGaps` | `dbbackup_pitr_gap_count > 0` | warning | Gaps detected in archive chain |
|
||||
| `DBBackupPITRDisabled` | PITR unexpectedly disabled | critical | PITR was enabled but now off |
|
||||
|
||||
#### Infrastructure Alerts
|
||||
| Alert | Expression | Severity | Description |
|
||||
|-------|------------|----------|-------------|
|
||||
| `DBBackupExporterDown` | `up{job="dbbackup"} == 0` | critical | Exporter unreachable |
|
||||
| `DBBackupDedupRatioLow` | `dbbackup_dedup_ratio < 0.2` for 24h | info | Low dedup efficiency |
|
||||
| `DBBackupStorageHigh` | `dbbackup_dedup_disk_usage_bytes > 1TB` | warning | High storage usage |
|
||||
|
||||
### Example Alert Configuration
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
- name: dbbackup
|
||||
rules:
|
||||
- alert: DBBackupRPOCritical
|
||||
expr: dbbackup_rpo_seconds > 86400
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "No backup for {{ $labels.database }} in 24+ hours"
|
||||
description: "RPO violation on {{ $labels.server }}. Last backup: {{ $value | humanizeDuration }} ago."
|
||||
|
||||
- alert: DBBackupPITRChainBroken
|
||||
expr: dbbackup_pitr_chain_valid == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR chain broken for {{ $labels.database }}"
|
||||
description: "WAL/binlog chain has gaps. Point-in-time recovery is NOT possible. New base backup required."
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Exporter Not Returning Metrics
|
||||
|
||||
1. **Check catalog access:**
|
||||
```bash
|
||||
dbbackup catalog list
|
||||
```
|
||||
|
||||
2. **Verify port is open:**
|
||||
```bash
|
||||
curl -v http://localhost:9399/metrics
|
||||
```
|
||||
|
||||
3. **Check logs:**
|
||||
```bash
|
||||
journalctl -u dbbackup-exporter -f
|
||||
```
|
||||
|
||||
### Missing Dedup Metrics
|
||||
|
||||
Dedup metrics are only exported when using deduplication:
|
||||
```bash
|
||||
# Ensure dedup is enabled
|
||||
dbbackup dedup status
|
||||
```
|
||||
|
||||
### Metrics Not Updating
|
||||
|
||||
The exporter caches metrics for 30 seconds. The `/health` endpoint can confirm the exporter is running.
|
||||
|
||||
### Stale or Empty Metrics (Catalog Location Mismatch)
|
||||
|
||||
If the exporter shows stale or no backup data, verify the catalog database location:
|
||||
|
||||
```bash
|
||||
# Check where catalog sync writes
|
||||
dbbackup catalog sync /path/to/backups
|
||||
# Output shows: [STATS] Catalog database: /root/.dbbackup/catalog.db
|
||||
|
||||
# Ensure exporter reads from the same location
|
||||
dbbackup metrics serve --catalog-db /root/.dbbackup/catalog.db
|
||||
```
|
||||
|
||||
**Common Issue:** If backup scripts run as root but the exporter runs as a different user, they may use different catalog locations. Use `--catalog-db` to ensure consistency.
|
||||
|
||||
### Dashboard Shows "No Data"
|
||||
|
||||
1. Verify Prometheus is scraping successfully:
|
||||
```bash
|
||||
curl http://prometheus:9090/api/v1/targets | grep dbbackup
|
||||
```
|
||||
|
||||
2. Check metric names match (case-sensitive):
|
||||
```promql
|
||||
{__name__=~"dbbackup_.*"}
|
||||
```
|
||||
|
||||
3. Verify `server` label matches dashboard variable.
|
||||
|
||||
### Label Mismatch Issues
|
||||
|
||||
Ensure the `--server` flag matches across all instances:
|
||||
```bash
|
||||
# Consistent naming (or let it auto-detect from hostname)
|
||||
dbbackup metrics serve --server prod-db-01
|
||||
```
|
||||
|
||||
> **Note:** As of v3.x, the exporter auto-detects hostname if `--server` is not specified. This ensures unique server labels in multi-host deployments.
|
||||
|
||||
---
|
||||
|
||||
## Metrics Validation Checklist
|
||||
|
||||
Use this checklist to validate your exporter setup:
|
||||
|
||||
- [ ] `/metrics` endpoint returns HTTP 200
|
||||
- [ ] `/health` endpoint returns `{"status":"ok"}`
|
||||
- [ ] `dbbackup_rpo_seconds` shows correct RPO values
|
||||
- [ ] `dbbackup_backup_total` increments after backups
|
||||
- [ ] `dbbackup_backup_verified` reflects verification status
|
||||
- [ ] `dbbackup_last_backup_size_bytes` matches actual backup sizes
|
||||
- [ ] Prometheus scrape succeeds (check targets page)
|
||||
- [ ] Grafana dashboard loads without errors
|
||||
- [ ] Dashboard variables populate correctly
|
||||
- [ ] All panels show data (no "No Data" messages)
|
||||
|
||||
---
|
||||
|
||||
## Files Reference
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `grafana/dbbackup-dashboard.json` | Grafana dashboard JSON |
|
||||
| `grafana/alerting-rules.yaml` | Grafana alerting rules |
|
||||
| `deploy/prometheus/alerting-rules.yaml` | Prometheus alerting rules |
|
||||
| `deploy/prometheus/scrape-config.yaml` | Prometheus scrape configuration |
|
||||
| `docs/METRICS.md` | Metrics documentation |
|
||||
|
||||
---
|
||||
|
||||
## Version Compatibility
|
||||
|
||||
| DBBackup Version | Metrics Version | Dashboard UID |
|
||||
|------------------|-----------------|---------------|
|
||||
| 1.0.0+ | v1 | `dbbackup-overview` |
|
||||
|
||||
---
|
||||
|
||||
## Support
|
||||
|
||||
For issues with the exporter or dashboard:
|
||||
1. Check the [troubleshooting section](#troubleshooting)
|
||||
2. Review logs: `journalctl -u dbbackup-exporter`
|
||||
3. Open an issue with metrics output and dashboard screenshots
|
||||
161
docs/METRICS.md
161
docs/METRICS.md
@ -6,7 +6,7 @@ This document describes all Prometheus metrics exposed by DBBackup for monitorin
|
||||
|
||||
### `dbbackup_rpo_seconds`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Labels:** `server`, `database`, `backup_type`
|
||||
**Description:** Time in seconds since the last successful backup (Recovery Point Objective).
|
||||
|
||||
**Recommended Thresholds:**
|
||||
@ -17,19 +17,45 @@ This document describes all Prometheus metrics exposed by DBBackup for monitorin
|
||||
**Example Query:**
|
||||
```promql
|
||||
dbbackup_rpo_seconds{server="prod-db-01"} > 86400
|
||||
|
||||
# RPO by backup type
|
||||
dbbackup_rpo_seconds{backup_type="full"}
|
||||
dbbackup_rpo_seconds{backup_type="incremental"}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_backup_total`
|
||||
**Type:** Counter
|
||||
**Labels:** `server`, `database`, `engine`, `status`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `status`
|
||||
**Description:** Total count of backup attempts, labeled by status (`success` or `failure`).
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Failure rate over last hour
|
||||
rate(dbbackup_backup_total{status="failure"}[1h])
|
||||
# Total successful backups
|
||||
dbbackup_backup_total{status="success"}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_backup_by_type`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `backup_type`
|
||||
**Description:** Total count of backups by backup type (`full`, `incremental`, `pitr_base`).
|
||||
|
||||
> **Note:** The `backup_type` label values are:
|
||||
> - `full` - Created with `--backup-type full` (default)
|
||||
> - `incremental` - Created with `--backup-type incremental`
|
||||
> - `pitr_base` - Auto-assigned when using `dbbackup pitr base` command
|
||||
>
|
||||
> The CLI `--backup-type` flag only accepts `full` or `incremental`.
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Count of each backup type
|
||||
dbbackup_backup_by_type{backup_type="full"}
|
||||
dbbackup_backup_by_type{backup_type="incremental"}
|
||||
dbbackup_backup_by_type{backup_type="pitr_base"}
|
||||
```
|
||||
|
||||
---
|
||||
@ -43,24 +69,115 @@ rate(dbbackup_backup_total{status="failure"}[1h])
|
||||
|
||||
### `dbbackup_last_backup_size_bytes`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Labels:** `server`, `database`, `engine`, `backup_type`
|
||||
**Description:** Size of the last successful backup in bytes.
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Total backup storage across all databases
|
||||
sum(dbbackup_last_backup_size_bytes)
|
||||
|
||||
# Size by backup type
|
||||
dbbackup_last_backup_size_bytes{backup_type="full"}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_last_backup_duration_seconds`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Labels:** `server`, `database`, `engine`, `backup_type`
|
||||
**Description:** Duration of the last backup operation in seconds.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_last_success_timestamp`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`, `backup_type`
|
||||
**Description:** Unix timestamp of the last successful backup.
|
||||
|
||||
---
|
||||
|
||||
## PITR (Point-in-Time Recovery) Metrics
|
||||
|
||||
### `dbbackup_pitr_enabled`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Whether PITR is enabled for the database (1 = enabled, 0 = disabled).
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Check if PITR is enabled
|
||||
dbbackup_pitr_enabled{database="production"} == 1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_last_archived_timestamp`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Unix timestamp of the last archived WAL segment (PostgreSQL) or binlog file (MySQL).
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_archive_lag_seconds`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Seconds since the last WAL/binlog was archived. High values indicate archiving issues.
|
||||
|
||||
**Recommended Thresholds:**
|
||||
- Green: < 300 (5 minutes)
|
||||
- Yellow: 300-600 (5-10 minutes)
|
||||
- Red: > 600 (10+ minutes)
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Alert on high archive lag
|
||||
dbbackup_pitr_archive_lag_seconds > 600
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_archive_count`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Total number of archived WAL segments or binlog files.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_archive_size_bytes`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Total size of archived logs in bytes.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_chain_valid`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Whether the WAL/binlog chain is valid (1 = valid, 0 = gaps detected).
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Alert on broken chain
|
||||
dbbackup_pitr_chain_valid == 0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_gap_count`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Number of gaps detected in the WAL/binlog chain. Any value > 0 requires investigation.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_recovery_window_minutes`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Estimated recovery window in minutes - the time span covered by archived logs.
|
||||
|
||||
---
|
||||
|
||||
## Deduplication Metrics
|
||||
|
||||
### `dbbackup_dedup_ratio`
|
||||
@ -119,6 +236,32 @@ sum(dbbackup_last_backup_size_bytes)
|
||||
|
||||
---
|
||||
|
||||
## Build Information Metrics
|
||||
|
||||
### `dbbackup_build_info`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `version`, `commit`
|
||||
**Description:** Build information for the dbbackup exporter. Value is always 1.
|
||||
|
||||
This metric is useful for:
|
||||
- Tracking which version is deployed across your fleet
|
||||
- Alerting when versions drift between servers
|
||||
- Correlating behavior changes with deployments
|
||||
|
||||
**Example Queries:**
|
||||
```promql
|
||||
# Show all deployed versions
|
||||
group by (version) (dbbackup_build_info)
|
||||
|
||||
# Find servers not on latest version
|
||||
dbbackup_build_info{version!="4.1.1"}
|
||||
|
||||
# Alert on version drift
|
||||
count(count by (version) (dbbackup_build_info)) > 1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Alerting Rules
|
||||
|
||||
See [alerting-rules.yaml](../grafana/alerting-rules.yaml) for pre-configured Prometheus alerting rules.
|
||||
@ -131,6 +274,10 @@ See [alerting-rules.yaml](../grafana/alerting-rules.yaml) for pre-configured Pro
|
||||
| BackupFailed | `increase(dbbackup_backup_total{status="failure"}[1h]) > 0` | Warning |
|
||||
| BackupNotVerified | `dbbackup_backup_verified == 0` | Warning |
|
||||
| DedupDegraded | `dbbackup_dedup_ratio < 0.1` | Info |
|
||||
| PITRArchiveLag | `dbbackup_pitr_archive_lag_seconds > 600` | Warning |
|
||||
| PITRChainBroken | `dbbackup_pitr_chain_valid == 0` | Critical |
|
||||
| PITRDisabled | `dbbackup_pitr_enabled == 0` (unexpected) | Critical |
|
||||
| NoIncrementalBackups | `dbbackup_backup_by_type{backup_type="incremental"} == 0` for 7d | Info |
|
||||
|
||||
---
|
||||
|
||||
|
||||
@ -96,6 +96,90 @@ groups:
|
||||
Current usage: {{ $value | humanize1024 }}B
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#storage-growth"
|
||||
|
||||
# PITR: Archive lag high
|
||||
- alert: DBBackupPITRArchiveLag
|
||||
expr: dbbackup_pitr_archive_lag_seconds > 600
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PITR archive lag high for {{ $labels.database }}"
|
||||
description: |
|
||||
WAL/binlog archiving for {{ $labels.database }} on {{ $labels.server }}
|
||||
is {{ $value | humanizeDuration }} behind. This reduces the PITR
|
||||
recovery point. Check archive process and disk space.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-archive-lag"
|
||||
|
||||
# PITR: Archive lag critical
|
||||
- alert: DBBackupPITRArchiveLagCritical
|
||||
expr: dbbackup_pitr_archive_lag_seconds > 1800
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR archive severely behind for {{ $labels.database }}"
|
||||
description: |
|
||||
WAL/binlog archiving for {{ $labels.database }} is {{ $value | humanizeDuration }}
|
||||
behind. Point-in-time recovery capability is at risk. Immediate action required.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-archive-critical"
|
||||
|
||||
# PITR: Chain broken (gaps detected)
|
||||
- alert: DBBackupPITRChainBroken
|
||||
expr: dbbackup_pitr_chain_valid == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR chain broken for {{ $labels.database }}"
|
||||
description: |
|
||||
The WAL/binlog chain for {{ $labels.database }} on {{ $labels.server }}
|
||||
has gaps. Point-in-time recovery to arbitrary points is NOT possible.
|
||||
A new base backup is required to restore PITR capability.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-chain-broken"
|
||||
|
||||
# PITR: Gaps in chain
|
||||
- alert: DBBackupPITRGapsDetected
|
||||
expr: dbbackup_pitr_gap_count > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PITR chain has {{ $value }} gaps for {{ $labels.database }}"
|
||||
description: |
|
||||
{{ $value }} gaps detected in WAL/binlog chain for {{ $labels.database }}.
|
||||
Recovery to points within gaps will fail. Consider taking a new base backup.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-gaps"
|
||||
|
||||
# PITR: Unexpectedly disabled
|
||||
- alert: DBBackupPITRDisabled
|
||||
expr: |
|
||||
dbbackup_pitr_enabled == 0
|
||||
and on(database) dbbackup_pitr_archive_count > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR unexpectedly disabled for {{ $labels.database }}"
|
||||
description: |
|
||||
PITR was previously enabled for {{ $labels.database }} (has archived logs)
|
||||
but is now disabled. This may indicate a configuration issue or
|
||||
database restart without PITR settings.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-disabled"
|
||||
|
||||
# Backup type: No full backups recently
|
||||
- alert: DBBackupNoRecentFullBackup
|
||||
expr: |
|
||||
time() - dbbackup_last_success_timestamp{backup_type="full"} > 604800
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "No full backup in 7+ days for {{ $labels.database }}"
|
||||
description: |
|
||||
Database {{ $labels.database }} has not had a full backup in over 7 days.
|
||||
Incremental backups depend on a valid full backup base.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#no-full-backup"
|
||||
|
||||
# Info: Exporter not responding
|
||||
- alert: DBBackupExporterDown
|
||||
expr: up{job="dbbackup"} == 0
|
||||
|
||||
@ -150,12 +150,14 @@ type Catalog interface {
|
||||
|
||||
// SyncResult contains results from a catalog sync operation
|
||||
type SyncResult struct {
|
||||
Added int `json:"added"`
|
||||
Updated int `json:"updated"`
|
||||
Removed int `json:"removed"`
|
||||
Errors int `json:"errors"`
|
||||
Duration float64 `json:"duration_seconds"`
|
||||
Details []string `json:"details,omitempty"`
|
||||
Added int `json:"added"`
|
||||
Updated int `json:"updated"`
|
||||
Removed int `json:"removed"`
|
||||
Skipped int `json:"skipped"` // Files without metadata (legacy backups)
|
||||
Errors int `json:"errors"`
|
||||
Duration float64 `json:"duration_seconds"`
|
||||
Details []string `json:"details,omitempty"`
|
||||
LegacyWarning string `json:"legacy_warning,omitempty"` // Warning about legacy files
|
||||
}
|
||||
|
||||
// FormatSize formats bytes as human-readable string
|
||||
|
||||
@ -30,6 +30,33 @@ func (c *SQLiteCatalog) SyncFromDirectory(ctx context.Context, dir string) (*Syn
|
||||
subMatches, _ := filepath.Glob(subPattern)
|
||||
matches = append(matches, subMatches...)
|
||||
|
||||
// Count legacy backups (files without metadata)
|
||||
legacySkipped := 0
|
||||
legacyPatterns := []string{
|
||||
filepath.Join(dir, "*.sql"),
|
||||
filepath.Join(dir, "*.sql.gz"),
|
||||
filepath.Join(dir, "*.sql.lz4"),
|
||||
filepath.Join(dir, "*.sql.zst"),
|
||||
filepath.Join(dir, "*.dump"),
|
||||
filepath.Join(dir, "*.dump.gz"),
|
||||
filepath.Join(dir, "*", "*.sql"),
|
||||
filepath.Join(dir, "*", "*.sql.gz"),
|
||||
}
|
||||
metaSet := make(map[string]bool)
|
||||
for _, m := range matches {
|
||||
// Store the backup file path (without .meta.json)
|
||||
metaSet[strings.TrimSuffix(m, ".meta.json")] = true
|
||||
}
|
||||
for _, pat := range legacyPatterns {
|
||||
legacyMatches, _ := filepath.Glob(pat)
|
||||
for _, lm := range legacyMatches {
|
||||
// Skip if this file has metadata
|
||||
if !metaSet[lm] {
|
||||
legacySkipped++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, metaPath := range matches {
|
||||
// Derive backup file path from metadata path
|
||||
backupPath := strings.TrimSuffix(metaPath, ".meta.json")
|
||||
@ -97,6 +124,17 @@ func (c *SQLiteCatalog) SyncFromDirectory(ctx context.Context, dir string) (*Syn
|
||||
}
|
||||
}
|
||||
|
||||
// Set legacy backup warning if applicable
|
||||
result.Skipped = legacySkipped
|
||||
if legacySkipped > 0 {
|
||||
result.LegacyWarning = fmt.Sprintf(
|
||||
"%d backup file(s) found without .meta.json metadata. "+
|
||||
"These are likely legacy backups created by raw mysqldump/pg_dump. "+
|
||||
"Only backups created by 'dbbackup backup' (with metadata) can be imported. "+
|
||||
"To track legacy backups, re-create them using 'dbbackup backup' command.",
|
||||
legacySkipped)
|
||||
}
|
||||
|
||||
result.Duration = time.Since(start).Seconds()
|
||||
return result, nil
|
||||
}
|
||||
|
||||
@ -23,6 +23,7 @@ type Config struct {
|
||||
User string
|
||||
Database string
|
||||
Password string
|
||||
Socket string // Unix socket path for MySQL/MariaDB
|
||||
DatabaseType string // "postgres" or "mysql"
|
||||
SSLMode string
|
||||
Insecure bool
|
||||
|
||||
@ -278,8 +278,12 @@ func (m *MySQL) GetTableRowCount(ctx context.Context, database, table string) (i
|
||||
func (m *MySQL) BuildBackupCommand(database, outputFile string, options BackupOptions) []string {
|
||||
cmd := []string{"mysqldump"}
|
||||
|
||||
// Connection parameters - handle localhost vs remote differently
|
||||
if m.cfg.Host == "" || m.cfg.Host == "localhost" {
|
||||
// Connection parameters - socket takes priority, then localhost vs remote
|
||||
if m.cfg.Socket != "" {
|
||||
// Explicit socket path provided
|
||||
cmd = append(cmd, "-S", m.cfg.Socket)
|
||||
cmd = append(cmd, "-u", m.cfg.User)
|
||||
} else if m.cfg.Host == "" || m.cfg.Host == "localhost" {
|
||||
// For localhost, use socket connection (don't specify host/port)
|
||||
cmd = append(cmd, "-u", m.cfg.User)
|
||||
} else {
|
||||
@ -338,8 +342,12 @@ func (m *MySQL) BuildBackupCommand(database, outputFile string, options BackupOp
|
||||
func (m *MySQL) BuildRestoreCommand(database, inputFile string, options RestoreOptions) []string {
|
||||
cmd := []string{"mysql"}
|
||||
|
||||
// Connection parameters - handle localhost vs remote differently
|
||||
if m.cfg.Host == "" || m.cfg.Host == "localhost" {
|
||||
// Connection parameters - socket takes priority, then localhost vs remote
|
||||
if m.cfg.Socket != "" {
|
||||
// Explicit socket path provided
|
||||
cmd = append(cmd, "-S", m.cfg.Socket)
|
||||
cmd = append(cmd, "-u", m.cfg.User)
|
||||
} else if m.cfg.Host == "" || m.cfg.Host == "localhost" {
|
||||
// For localhost, use socket connection (don't specify host/port)
|
||||
cmd = append(cmd, "-u", m.cfg.User)
|
||||
} else {
|
||||
@ -417,8 +425,11 @@ func (m *MySQL) buildDSN() string {
|
||||
|
||||
dsn += "@"
|
||||
|
||||
// Handle localhost with Unix socket vs TCP/IP
|
||||
if m.cfg.Host == "" || m.cfg.Host == "localhost" {
|
||||
// Explicit socket takes priority
|
||||
if m.cfg.Socket != "" {
|
||||
dsn += "unix(" + m.cfg.Socket + ")"
|
||||
} else if m.cfg.Host == "" || m.cfg.Host == "localhost" {
|
||||
// Handle localhost with Unix socket vs TCP/IP
|
||||
// Try common socket paths for localhost connections
|
||||
socketPaths := []string{
|
||||
"/run/mysqld/mysqld.sock",
|
||||
|
||||
@ -261,6 +261,22 @@ func FormatPrometheusMetrics(m *DedupMetrics, server string) string {
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Add RPO (Recovery Point Objective) metric for dedup backups - same metric name as regular backups
|
||||
// This enables unified alerting across regular and dedup backup modes
|
||||
b.WriteString("# HELP dbbackup_rpo_seconds Seconds since last successful backup (Recovery Point Objective)\n")
|
||||
b.WriteString("# TYPE dbbackup_rpo_seconds gauge\n")
|
||||
for _, db := range m.ByDatabase {
|
||||
if !db.LastBackupTime.IsZero() {
|
||||
rpoSeconds := now - db.LastBackupTime.Unix()
|
||||
if rpoSeconds < 0 {
|
||||
rpoSeconds = 0
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("dbbackup_rpo_seconds{server=%q,database=%q} %d\n",
|
||||
server, db.Database, rpoSeconds))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
b.WriteString("# HELP dbbackup_dedup_database_total_bytes Total logical size per database\n")
|
||||
b.WriteString("# TYPE dbbackup_dedup_database_total_bytes gauge\n")
|
||||
for _, db := range m.ByDatabase {
|
||||
|
||||
@ -345,8 +345,10 @@ func (e *MySQLDumpEngine) Restore(ctx context.Context, opts *RestoreOptions) err
|
||||
// Build mysql command
|
||||
args := []string{}
|
||||
|
||||
// Connection parameters
|
||||
if e.config.Host != "" && e.config.Host != "localhost" {
|
||||
// Connection parameters - socket takes priority over host
|
||||
if e.config.Socket != "" {
|
||||
args = append(args, "-S", e.config.Socket)
|
||||
} else if e.config.Host != "" && e.config.Host != "localhost" {
|
||||
args = append(args, "-h", e.config.Host)
|
||||
args = append(args, "-P", strconv.Itoa(e.config.Port))
|
||||
}
|
||||
@ -494,8 +496,10 @@ func (e *MySQLDumpEngine) BackupToWriter(ctx context.Context, w io.Writer, opts
|
||||
func (e *MySQLDumpEngine) buildArgs(database string) []string {
|
||||
args := []string{}
|
||||
|
||||
// Connection parameters
|
||||
if e.config.Host != "" && e.config.Host != "localhost" {
|
||||
// Connection parameters - socket takes priority over host
|
||||
if e.config.Socket != "" {
|
||||
args = append(args, "-S", e.config.Socket)
|
||||
} else if e.config.Host != "" && e.config.Host != "localhost" {
|
||||
args = append(args, "-h", e.config.Host)
|
||||
args = append(args, "-P", strconv.Itoa(e.config.Port))
|
||||
}
|
||||
|
||||
@ -14,10 +14,12 @@ import (
|
||||
|
||||
// Exporter provides an HTTP endpoint for Prometheus metrics
|
||||
type Exporter struct {
|
||||
log logger.Logger
|
||||
catalog catalog.Catalog
|
||||
instance string
|
||||
port int
|
||||
log logger.Logger
|
||||
catalog catalog.Catalog
|
||||
instance string
|
||||
port int
|
||||
version string
|
||||
gitCommit string
|
||||
|
||||
mu sync.RWMutex
|
||||
cachedData string
|
||||
@ -36,6 +38,19 @@ func NewExporter(log logger.Logger, cat catalog.Catalog, instance string, port i
|
||||
}
|
||||
}
|
||||
|
||||
// NewExporterWithVersion creates a new Prometheus exporter with version info
|
||||
func NewExporterWithVersion(log logger.Logger, cat catalog.Catalog, instance string, port int, version, gitCommit string) *Exporter {
|
||||
return &Exporter{
|
||||
log: log,
|
||||
catalog: cat,
|
||||
instance: instance,
|
||||
port: port,
|
||||
version: version,
|
||||
gitCommit: gitCommit,
|
||||
refreshTTL: 30 * time.Second,
|
||||
}
|
||||
}
|
||||
|
||||
// Serve starts the HTTP server and blocks until context is cancelled
|
||||
func (e *Exporter) Serve(ctx context.Context) error {
|
||||
mux := http.NewServeMux()
|
||||
@ -158,7 +173,7 @@ func (e *Exporter) refreshLoop(ctx context.Context) {
|
||||
|
||||
// refresh updates the cached metrics
|
||||
func (e *Exporter) refresh() error {
|
||||
writer := NewMetricsWriter(e.log, e.catalog, e.instance)
|
||||
writer := NewMetricsWriterWithVersion(e.log, e.catalog, e.instance, e.version, e.gitCommit)
|
||||
data, err := writer.GenerateMetricsString()
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@ -16,17 +16,32 @@ import (
|
||||
|
||||
// MetricsWriter writes metrics in Prometheus text format
|
||||
type MetricsWriter struct {
|
||||
log logger.Logger
|
||||
catalog catalog.Catalog
|
||||
instance string
|
||||
log logger.Logger
|
||||
catalog catalog.Catalog
|
||||
instance string
|
||||
version string
|
||||
gitCommit string
|
||||
}
|
||||
|
||||
// NewMetricsWriter creates a new MetricsWriter
|
||||
func NewMetricsWriter(log logger.Logger, cat catalog.Catalog, instance string) *MetricsWriter {
|
||||
return &MetricsWriter{
|
||||
log: log,
|
||||
catalog: cat,
|
||||
instance: instance,
|
||||
log: log,
|
||||
catalog: cat,
|
||||
instance: instance,
|
||||
version: "unknown",
|
||||
gitCommit: "unknown",
|
||||
}
|
||||
}
|
||||
|
||||
// NewMetricsWriterWithVersion creates a MetricsWriter with version info for build_info metric
|
||||
func NewMetricsWriterWithVersion(log logger.Logger, cat catalog.Catalog, instance, version, gitCommit string) *MetricsWriter {
|
||||
return &MetricsWriter{
|
||||
log: log,
|
||||
catalog: cat,
|
||||
instance: instance,
|
||||
version: version,
|
||||
gitCommit: gitCommit,
|
||||
}
|
||||
}
|
||||
|
||||
@ -42,6 +57,25 @@ type BackupMetrics struct {
|
||||
FailureCount int
|
||||
Verified bool
|
||||
RPOSeconds float64
|
||||
// Backup type tracking
|
||||
LastBackupType string // "full", "incremental", "pitr_base"
|
||||
FullCount int // Count of full backups
|
||||
IncrCount int // Count of incremental backups
|
||||
PITRBaseCount int // Count of PITR base backups
|
||||
}
|
||||
|
||||
// PITRMetrics holds PITR-specific metrics for a database
|
||||
type PITRMetrics struct {
|
||||
Database string
|
||||
Engine string
|
||||
Enabled bool
|
||||
LastArchived time.Time
|
||||
ArchiveLag float64 // Seconds since last archive
|
||||
ArchiveCount int
|
||||
ArchiveSize int64
|
||||
ChainValid bool
|
||||
GapCount int
|
||||
RecoveryMinutes float64 // Estimated recovery window in minutes
|
||||
}
|
||||
|
||||
// WriteTextfile writes metrics to a Prometheus textfile collector file
|
||||
@ -110,6 +144,20 @@ func (m *MetricsWriter) collectMetrics() ([]BackupMetrics, error) {
|
||||
|
||||
metrics.TotalBackups++
|
||||
|
||||
// Track backup type counts
|
||||
backupType := e.BackupType
|
||||
if backupType == "" {
|
||||
backupType = "full" // Default to full if not specified
|
||||
}
|
||||
switch backupType {
|
||||
case "full":
|
||||
metrics.FullCount++
|
||||
case "incremental":
|
||||
metrics.IncrCount++
|
||||
case "pitr_base", "pitr":
|
||||
metrics.PITRBaseCount++
|
||||
}
|
||||
|
||||
isSuccess := e.Status == catalog.StatusCompleted || e.Status == catalog.StatusVerified
|
||||
if isSuccess {
|
||||
metrics.SuccessCount++
|
||||
@ -120,6 +168,7 @@ func (m *MetricsWriter) collectMetrics() ([]BackupMetrics, error) {
|
||||
metrics.LastSize = e.SizeBytes
|
||||
metrics.Verified = e.VerifiedAt != nil && e.VerifyValid != nil && *e.VerifyValid
|
||||
metrics.Engine = e.DatabaseType
|
||||
metrics.LastBackupType = backupType
|
||||
}
|
||||
} else {
|
||||
metrics.FailureCount++
|
||||
@ -159,13 +208,24 @@ func (m *MetricsWriter) formatMetrics(metrics []BackupMetrics) string {
|
||||
b.WriteString(fmt.Sprintf("# Server: %s\n", m.instance))
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_build_info - version and build information
|
||||
b.WriteString("# HELP dbbackup_build_info Build information for dbbackup exporter\n")
|
||||
b.WriteString("# TYPE dbbackup_build_info gauge\n")
|
||||
b.WriteString(fmt.Sprintf("dbbackup_build_info{server=%q,version=%q,commit=%q} 1\n",
|
||||
m.instance, m.version, m.gitCommit))
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_last_success_timestamp
|
||||
b.WriteString("# HELP dbbackup_last_success_timestamp Unix timestamp of last successful backup\n")
|
||||
b.WriteString("# TYPE dbbackup_last_success_timestamp gauge\n")
|
||||
for _, met := range metrics {
|
||||
if !met.LastSuccess.IsZero() {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_last_success_timestamp{server=%q,database=%q,engine=%q} %d\n",
|
||||
m.instance, met.Database, met.Engine, met.LastSuccess.Unix()))
|
||||
backupType := met.LastBackupType
|
||||
if backupType == "" {
|
||||
backupType = "full"
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("dbbackup_last_success_timestamp{server=%q,database=%q,engine=%q,backup_type=%q} %d\n",
|
||||
m.instance, met.Database, met.Engine, backupType, met.LastSuccess.Unix()))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
@ -175,8 +235,12 @@ func (m *MetricsWriter) formatMetrics(metrics []BackupMetrics) string {
|
||||
b.WriteString("# TYPE dbbackup_last_backup_duration_seconds gauge\n")
|
||||
for _, met := range metrics {
|
||||
if met.LastDuration > 0 {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_last_backup_duration_seconds{server=%q,database=%q,engine=%q} %.2f\n",
|
||||
m.instance, met.Database, met.Engine, met.LastDuration.Seconds()))
|
||||
backupType := met.LastBackupType
|
||||
if backupType == "" {
|
||||
backupType = "full"
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("dbbackup_last_backup_duration_seconds{server=%q,database=%q,engine=%q,backup_type=%q} %.2f\n",
|
||||
m.instance, met.Database, met.Engine, backupType, met.LastDuration.Seconds()))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
@ -186,16 +250,21 @@ func (m *MetricsWriter) formatMetrics(metrics []BackupMetrics) string {
|
||||
b.WriteString("# TYPE dbbackup_last_backup_size_bytes gauge\n")
|
||||
for _, met := range metrics {
|
||||
if met.LastSize > 0 {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_last_backup_size_bytes{server=%q,database=%q,engine=%q} %d\n",
|
||||
m.instance, met.Database, met.Engine, met.LastSize))
|
||||
backupType := met.LastBackupType
|
||||
if backupType == "" {
|
||||
backupType = "full"
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("dbbackup_last_backup_size_bytes{server=%q,database=%q,engine=%q,backup_type=%q} %d\n",
|
||||
m.instance, met.Database, met.Engine, backupType, met.LastSize))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_backup_total (counter)
|
||||
b.WriteString("# HELP dbbackup_backup_total Total number of backup attempts\n")
|
||||
b.WriteString("# TYPE dbbackup_backup_total counter\n")
|
||||
// dbbackup_backup_total - now with backup_type dimension
|
||||
b.WriteString("# HELP dbbackup_backup_total Total number of backup attempts by type and status\n")
|
||||
b.WriteString("# TYPE dbbackup_backup_total gauge\n")
|
||||
for _, met := range metrics {
|
||||
// Success/failure by status (legacy compatibility)
|
||||
b.WriteString(fmt.Sprintf("dbbackup_backup_total{server=%q,database=%q,status=\"success\"} %d\n",
|
||||
m.instance, met.Database, met.SuccessCount))
|
||||
b.WriteString(fmt.Sprintf("dbbackup_backup_total{server=%q,database=%q,status=\"failure\"} %d\n",
|
||||
@ -203,13 +272,36 @@ func (m *MetricsWriter) formatMetrics(metrics []BackupMetrics) string {
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_backup_by_type - backup counts by type
|
||||
b.WriteString("# HELP dbbackup_backup_by_type Total number of backups by backup type\n")
|
||||
b.WriteString("# TYPE dbbackup_backup_by_type gauge\n")
|
||||
for _, met := range metrics {
|
||||
if met.FullCount > 0 {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_backup_by_type{server=%q,database=%q,backup_type=\"full\"} %d\n",
|
||||
m.instance, met.Database, met.FullCount))
|
||||
}
|
||||
if met.IncrCount > 0 {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_backup_by_type{server=%q,database=%q,backup_type=\"incremental\"} %d\n",
|
||||
m.instance, met.Database, met.IncrCount))
|
||||
}
|
||||
if met.PITRBaseCount > 0 {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_backup_by_type{server=%q,database=%q,backup_type=\"pitr_base\"} %d\n",
|
||||
m.instance, met.Database, met.PITRBaseCount))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_rpo_seconds
|
||||
b.WriteString("# HELP dbbackup_rpo_seconds Recovery Point Objective - seconds since last successful backup\n")
|
||||
b.WriteString("# TYPE dbbackup_rpo_seconds gauge\n")
|
||||
for _, met := range metrics {
|
||||
if met.RPOSeconds > 0 {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_rpo_seconds{server=%q,database=%q} %.0f\n",
|
||||
m.instance, met.Database, met.RPOSeconds))
|
||||
backupType := met.LastBackupType
|
||||
if backupType == "" {
|
||||
backupType = "full"
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("dbbackup_rpo_seconds{server=%q,database=%q,backup_type=%q} %.0f\n",
|
||||
m.instance, met.Database, backupType, met.RPOSeconds))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
@ -243,3 +335,150 @@ func (m *MetricsWriter) GenerateMetricsString() (string, error) {
|
||||
}
|
||||
return m.formatMetrics(metrics), nil
|
||||
}
|
||||
|
||||
// PITRMetricsWriter writes PITR-specific metrics
|
||||
type PITRMetricsWriter struct {
|
||||
log logger.Logger
|
||||
instance string
|
||||
}
|
||||
|
||||
// NewPITRMetricsWriter creates a new PITR metrics writer
|
||||
func NewPITRMetricsWriter(log logger.Logger, instance string) *PITRMetricsWriter {
|
||||
return &PITRMetricsWriter{
|
||||
log: log,
|
||||
instance: instance,
|
||||
}
|
||||
}
|
||||
|
||||
// FormatPITRMetrics formats PITR metrics in Prometheus exposition format
|
||||
func (p *PITRMetricsWriter) FormatPITRMetrics(pitrMetrics []PITRMetrics) string {
|
||||
var b strings.Builder
|
||||
now := time.Now().Unix()
|
||||
|
||||
b.WriteString("# DBBackup PITR Prometheus Metrics\n")
|
||||
b.WriteString(fmt.Sprintf("# Generated at: %s\n", time.Now().Format(time.RFC3339)))
|
||||
b.WriteString(fmt.Sprintf("# Server: %s\n", p.instance))
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_pitr_enabled
|
||||
b.WriteString("# HELP dbbackup_pitr_enabled Whether PITR is enabled for database (1=enabled, 0=disabled)\n")
|
||||
b.WriteString("# TYPE dbbackup_pitr_enabled gauge\n")
|
||||
for _, met := range pitrMetrics {
|
||||
enabled := 0
|
||||
if met.Enabled {
|
||||
enabled = 1
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("dbbackup_pitr_enabled{server=%q,database=%q,engine=%q} %d\n",
|
||||
p.instance, met.Database, met.Engine, enabled))
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_pitr_last_archived_timestamp
|
||||
b.WriteString("# HELP dbbackup_pitr_last_archived_timestamp Unix timestamp of last archived WAL/binlog\n")
|
||||
b.WriteString("# TYPE dbbackup_pitr_last_archived_timestamp gauge\n")
|
||||
for _, met := range pitrMetrics {
|
||||
if met.Enabled && !met.LastArchived.IsZero() {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_pitr_last_archived_timestamp{server=%q,database=%q,engine=%q} %d\n",
|
||||
p.instance, met.Database, met.Engine, met.LastArchived.Unix()))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_pitr_archive_lag_seconds
|
||||
b.WriteString("# HELP dbbackup_pitr_archive_lag_seconds Seconds since last WAL/binlog was archived\n")
|
||||
b.WriteString("# TYPE dbbackup_pitr_archive_lag_seconds gauge\n")
|
||||
for _, met := range pitrMetrics {
|
||||
if met.Enabled {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_pitr_archive_lag_seconds{server=%q,database=%q,engine=%q} %.0f\n",
|
||||
p.instance, met.Database, met.Engine, met.ArchiveLag))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_pitr_archive_count
|
||||
b.WriteString("# HELP dbbackup_pitr_archive_count Total number of archived WAL segments/binlog files\n")
|
||||
b.WriteString("# TYPE dbbackup_pitr_archive_count gauge\n")
|
||||
for _, met := range pitrMetrics {
|
||||
if met.Enabled {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_pitr_archive_count{server=%q,database=%q,engine=%q} %d\n",
|
||||
p.instance, met.Database, met.Engine, met.ArchiveCount))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_pitr_archive_size_bytes
|
||||
b.WriteString("# HELP dbbackup_pitr_archive_size_bytes Total size of archived logs in bytes\n")
|
||||
b.WriteString("# TYPE dbbackup_pitr_archive_size_bytes gauge\n")
|
||||
for _, met := range pitrMetrics {
|
||||
if met.Enabled {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_pitr_archive_size_bytes{server=%q,database=%q,engine=%q} %d\n",
|
||||
p.instance, met.Database, met.Engine, met.ArchiveSize))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_pitr_chain_valid
|
||||
b.WriteString("# HELP dbbackup_pitr_chain_valid Whether the WAL/binlog chain is valid (1=valid, 0=gaps detected)\n")
|
||||
b.WriteString("# TYPE dbbackup_pitr_chain_valid gauge\n")
|
||||
for _, met := range pitrMetrics {
|
||||
if met.Enabled {
|
||||
valid := 0
|
||||
if met.ChainValid {
|
||||
valid = 1
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("dbbackup_pitr_chain_valid{server=%q,database=%q,engine=%q} %d\n",
|
||||
p.instance, met.Database, met.Engine, valid))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_pitr_gap_count
|
||||
b.WriteString("# HELP dbbackup_pitr_gap_count Number of gaps detected in WAL/binlog chain\n")
|
||||
b.WriteString("# TYPE dbbackup_pitr_gap_count gauge\n")
|
||||
for _, met := range pitrMetrics {
|
||||
if met.Enabled {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_pitr_gap_count{server=%q,database=%q,engine=%q} %d\n",
|
||||
p.instance, met.Database, met.Engine, met.GapCount))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_pitr_recovery_window_minutes
|
||||
b.WriteString("# HELP dbbackup_pitr_recovery_window_minutes Estimated recovery window in minutes (time span covered by archived logs)\n")
|
||||
b.WriteString("# TYPE dbbackup_pitr_recovery_window_minutes gauge\n")
|
||||
for _, met := range pitrMetrics {
|
||||
if met.Enabled && met.RecoveryMinutes > 0 {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_pitr_recovery_window_minutes{server=%q,database=%q,engine=%q} %.1f\n",
|
||||
p.instance, met.Database, met.Engine, met.RecoveryMinutes))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_pitr_scrape_timestamp
|
||||
b.WriteString("# HELP dbbackup_pitr_scrape_timestamp Unix timestamp when PITR metrics were collected\n")
|
||||
b.WriteString("# TYPE dbbackup_pitr_scrape_timestamp gauge\n")
|
||||
b.WriteString(fmt.Sprintf("dbbackup_pitr_scrape_timestamp{server=%q} %d\n", p.instance, now))
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// CollectPITRMetricsFromStatus converts PITRStatus to PITRMetrics
|
||||
// This is a helper for integration with the PITR subsystem
|
||||
func CollectPITRMetricsFromStatus(database, engine string, enabled bool, lastArchived time.Time, archiveCount int, archiveSize int64, chainValid bool, gapCount int, recoveryMinutes float64) PITRMetrics {
|
||||
lag := float64(0)
|
||||
if enabled && !lastArchived.IsZero() {
|
||||
lag = time.Since(lastArchived).Seconds()
|
||||
}
|
||||
return PITRMetrics{
|
||||
Database: database,
|
||||
Engine: engine,
|
||||
Enabled: enabled,
|
||||
LastArchived: lastArchived,
|
||||
ArchiveLag: lag,
|
||||
ArchiveCount: archiveCount,
|
||||
ArchiveSize: archiveSize,
|
||||
ChainValid: chainValid,
|
||||
GapCount: gapCount,
|
||||
RecoveryMinutes: recoveryMinutes,
|
||||
}
|
||||
}
|
||||
|
||||
2
main.go
2
main.go
@ -16,7 +16,7 @@ import (
|
||||
|
||||
// Build information (set by ldflags).
// Defaults apply to local `go build`; release builds override these via
// -ldflags "-X main.version=... -X main.buildTime=... -X main.gitCommit=...".
var (
	version   = "4.1.2"    // current release (keep in sync with CHANGELOG.md)
	buildTime = "unknown"  // RFC3339 build timestamp, injected at release
	gitCommit = "unknown"  // short git SHA, injected at release
)
|
||||
|
||||
1588
release/dbbackup-dashboard.json
Normal file
1588
release/dbbackup-dashboard.json
Normal file
@ -0,0 +1,1588 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Comprehensive monitoring dashboard for DBBackup - tracks backup status, RPO, deduplication, and verification across all database servers.",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 200,
|
||||
"panels": [],
|
||||
"title": "Backup Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Shows SUCCESS if RPO is under 7 days, FAILED otherwise. Green = healthy backup schedule.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "red",
|
||||
"index": 1,
|
||||
"text": "FAILED"
|
||||
},
|
||||
"1": {
|
||||
"color": "green",
|
||||
"index": 0,
|
||||
"text": "SUCCESS"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 5,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_rpo_seconds{server=~\"$server\"} < bool 604800",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Last Backup Status",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Time elapsed since the last successful backup. Green < 12h, Yellow < 24h, Red > 24h.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 43200
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 86400
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 5,
|
||||
"x": 5,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_rpo_seconds{server=~\"$server\"}",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Time Since Last Backup",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Whether the most recent backup was verified successfully. 1 = verified and valid.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "orange",
|
||||
"index": 1,
|
||||
"text": "NOT VERIFIED"
|
||||
},
|
||||
"1": {
|
||||
"color": "green",
|
||||
"index": 0,
|
||||
"text": "VERIFIED"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "orange",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 5,
|
||||
"x": 10,
|
||||
"y": 1
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_backup_verified{server=~\"$server\"}",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Verification Status",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Total count of successful backup completions.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 4,
|
||||
"x": 15,
|
||||
"y": 1
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_backup_total{server=~\"$server\", status=\"success\"}",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Total Successful Backups",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Total count of failed backup attempts. Any value > 0 warrants investigation.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 5,
|
||||
"x": 19,
|
||||
"y": 1
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_backup_total{server=~\"$server\", status=\"failure\"}",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Total Failed Backups",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Recovery Point Objective over time. Shows how long since the last successful backup. Red line at 24h threshold.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "line"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 86400
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_rpo_seconds{server=~\"$server\"}",
|
||||
"legendFormat": "{{server}} - {{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "RPO Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Size of each backup over time. Useful for capacity planning and detecting unexpected growth.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "bars",
|
||||
"fillOpacity": 100,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 5
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_last_backup_size_bytes{server=~\"$server\"}",
|
||||
"legendFormat": "{{server}} - {{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Backup Size",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "How long each backup takes. Monitor for trends that may indicate database growth or performance issues.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 13
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_last_backup_duration_seconds{server=~\"$server\"}",
|
||||
"legendFormat": "{{server}} - {{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Backup Duration",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Summary table showing current status of all databases with color-coded RPO and backup sizes.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"custom": {
|
||||
"align": "auto",
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Status"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "red",
|
||||
"index": 1,
|
||||
"text": "FAILED"
|
||||
},
|
||||
"1": {
|
||||
"color": "green",
|
||||
"index": 0,
|
||||
"text": "SUCCESS"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "custom.cellOptions",
|
||||
"value": {
|
||||
"mode": "basic",
|
||||
"type": "color-background"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "RPO"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "unit",
|
||||
"value": "s"
|
||||
},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 43200
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 86400
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "custom.cellOptions",
|
||||
"value": {
|
||||
"mode": "basic",
|
||||
"type": "color-background"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Size"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "unit",
|
||||
"value": "bytes"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 13
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"cellHeight": "sm",
|
||||
"footer": {
|
||||
"countRows": false,
|
||||
"fields": "",
|
||||
"reducer": [
|
||||
"sum"
|
||||
],
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_rpo_seconds{server=~\"$server\"}",
|
||||
"format": "table",
|
||||
"hide": false,
|
||||
"instant": true,
|
||||
"legendFormat": "__auto",
|
||||
"range": false,
|
||||
"refId": "RPO"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_last_backup_size_bytes{server=~\"$server\"}",
|
||||
"format": "table",
|
||||
"hide": false,
|
||||
"instant": true,
|
||||
"legendFormat": "__auto",
|
||||
"range": false,
|
||||
"refId": "Size"
|
||||
}
|
||||
],
|
||||
"title": "Backup Status Overview",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {
|
||||
"byField": "database",
|
||||
"mode": "outer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Time 1": true,
|
||||
"Time 2": true,
|
||||
"__name__": true,
|
||||
"__name__ 1": true,
|
||||
"__name__ 2": true,
|
||||
"instance 1": true,
|
||||
"instance 2": true,
|
||||
"job": true,
|
||||
"job 1": true,
|
||||
"job 2": true,
|
||||
"engine 1": true,
|
||||
"engine 2": true
|
||||
},
|
||||
"indexByName": {
|
||||
"Database": 0,
|
||||
"Instance": 1,
|
||||
"Engine": 2,
|
||||
"RPO": 3,
|
||||
"Size": 4
|
||||
},
|
||||
"renameByName": {
|
||||
"Value #RPO": "RPO",
|
||||
"Value #Size": "Size",
|
||||
"database": "Database",
|
||||
"instance": "Instance",
|
||||
"engine": "Engine"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
},
|
||||
"id": 100,
|
||||
"panels": [],
|
||||
"title": "Deduplication Statistics",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Overall deduplication efficiency (0-1). Higher values mean more duplicate data eliminated. 0.5 = 50% space savings.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "blue",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 22
|
||||
},
|
||||
"id": 101,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_ratio{server=~\"$server\"}",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Dedup Ratio",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Total bytes saved by deduplication across all backups.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 22
|
||||
},
|
||||
"id": 102,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_space_saved_bytes{server=~\"$server\"}",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Space Saved",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Actual disk usage of the chunk store after deduplication.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 22
|
||||
},
|
||||
"id": 103,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_disk_usage_bytes{server=~\"$server\"}",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Disk Usage",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Total number of unique content-addressed chunks in the dedup store.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "purple",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 22
|
||||
},
|
||||
"id": 104,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_chunks_total{server=~\"$server\"}",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Total Chunks",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Compression ratio achieved (0-1). Higher = better compression of chunk data.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "orange",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 4,
|
||||
"x": 0,
|
||||
"y": 27
|
||||
},
|
||||
"id": 107,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_compression_ratio{server=~\"$server\"}",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Compression Ratio",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Timestamp of the oldest chunk - useful for monitoring retention policy.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "semi-dark-blue",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "dateTimeFromNow"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 4,
|
||||
"x": 4,
|
||||
"y": 27
|
||||
},
|
||||
"id": 108,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_oldest_chunk_timestamp{server=~\"$server\"} * 1000",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Oldest Chunk",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Timestamp of the newest chunk - confirms dedup is working on recent backups.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "semi-dark-green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "dateTimeFromNow"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 4,
|
||||
"x": 8,
|
||||
"y": 27
|
||||
},
|
||||
"id": 109,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_newest_chunk_timestamp{server=~\"$server\"} * 1000",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Newest Chunk",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Per-database deduplication efficiency over time. Compare databases to identify which benefit most from dedup.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"id": 105,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_database_ratio{server=~\"$server\"}",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Dedup Ratio by Database",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Storage trends: compare space saved by dedup vs actual disk usage over time.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 32
|
||||
},
|
||||
"id": 106,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_space_saved_bytes{server=~\"$server\"}",
|
||||
"legendFormat": "Space Saved",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_disk_usage_bytes{server=~\"$server\"}",
|
||||
"legendFormat": "Disk Usage",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Dedup Storage Over Time",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"tags": [
|
||||
"dbbackup",
|
||||
"backup",
|
||||
"database",
|
||||
"dedup",
|
||||
"monitoring"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"definition": "label_values(dbbackup_rpo_seconds, server)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Server",
|
||||
"multi": true,
|
||||
"name": "server",
|
||||
"options": [],
|
||||
"query": {
|
||||
"query": "label_values(dbbackup_rpo_seconds, server)",
|
||||
"refId": "StandardVariableQuery"
|
||||
},
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"hide": 2,
|
||||
"name": "DS_PROMETHEUS",
|
||||
"query": "prometheus",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-24h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "DBBackup Overview",
|
||||
"uid": "dbbackup-overview",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
Reference in New Issue
Block a user