feat: add embedded systemd installer and Prometheus metrics
Some checks failed
CI/CD / Test (push) Successful in 2m42s
CI/CD / Lint (push) Successful in 2m50s
CI/CD / Build (amd64, darwin) (push) Successful in 2m0s
CI/CD / Build (amd64, linux) (push) Successful in 1m58s
CI/CD / Build (arm64, darwin) (push) Successful in 2m1s
CI/CD / Build (arm64, linux) (push) Has been cancelled

Systemd Integration:
- New 'dbbackup install' command creates service/timer units
- Supports single-database and cluster backup modes
- Automatic dbbackup user/group creation with proper permissions
- Hardened service units with security features
- Template units with configurable OnCalendar schedules
- 'dbbackup uninstall' for clean removal

Prometheus Metrics:
- 'dbbackup metrics export' for textfile collector format
- 'dbbackup metrics serve' runs HTTP exporter on port 9399
- Metrics: last_success_timestamp, rpo_seconds, backup_total, etc.
- Integration with node_exporter textfile collector
- --with-metrics flag during install

Technical:
- Systemd templates embedded with //go:embed
- Service units include ReadWritePaths, OOMScoreAdjust
- Metrics exporter caches with 30s TTL
- Graceful shutdown on SIGTERM
This commit is contained in:
2026-01-07 11:18:09 +01:00
parent 120ee33e3b
commit 7e32a0369d
12 changed files with 1641 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
// Package installer provides systemd service installation for dbbackup
package installer
import (
"embed"
)
// Templates contains the embedded systemd unit files.
//
// The embed patterns below pull in every *.service and *.timer file from the
// templates directory next to this source file. Entries are read with paths
// relative to this package, e.g. "templates/dbbackup-cluster.service".
//
//go:embed templates/*.service templates/*.timer
var Templates embed.FS

View File

@@ -0,0 +1,634 @@
// Package installer provides systemd service installation for dbbackup
package installer
import (
"context"
"fmt"
"os"
"os/exec"
"os/user"
"path/filepath"
"runtime"
"strings"
"text/template"
"dbbackup/internal/logger"
)
// Installer handles systemd service installation.
//
// It renders the embedded unit templates into unitDir and invokes systemctl to
// reload, enable, stop and disable units.
type Installer struct {
	log     logger.Logger // receives progress, warning and dry-run messages
	unitDir string        // /etc/systemd/system or custom
	dryRun  bool          // when true, file writes/removals and calls made through i.systemctl are logged instead of executed
}
// InstallOptions configures the installation.
//
// Zero values are replaced with defaults by the installer before use.
type InstallOptions struct {
	// Instance is the instance name (e.g., "production", "staging").
	Instance string

	// BinaryPath is the dbbackup executable; auto-detected if empty.
	BinaryPath string

	// BackupType is "single" or "cluster".
	BackupType string

	// Schedule is a systemd OnCalendar expression,
	// e.g., "daily", "*-*-* 02:00:00".
	Schedule string

	// User and Group are the account the service units run as.
	User, Group string

	// BackupDir is where backups are stored; ConfigPath is the dbbackup
	// configuration file passed to the binary.
	BackupDir, ConfigPath string

	// TimeoutSeconds is the backup job timeout in seconds (default: 3600).
	TimeoutSeconds int

	// WithMetrics requests installation of the metrics exporter service,
	// which listens on MetricsPort.
	WithMetrics bool
	MetricsPort int
}
// ServiceStatus contains information about installed services.
type ServiceStatus struct {
	// Installed reports whether the service unit file was found on disk.
	// Enabled/Active describe the service unit, TimerEnabled/TimerActive
	// the matching timer unit.
	Installed, Enabled, Active bool
	TimerEnabled, TimerActive  bool

	// LastRun and NextRun hold the timer's trigger times as reported by
	// systemctl (empty when unavailable).
	LastRun, NextRun string

	// Paths of the unit files this status refers to.
	ServicePath, TimerPath, ExporterPath string
}
// NewInstaller creates a new Installer that writes units to
// /etc/systemd/system. With dryRun set, the installer only logs the changes
// it would make.
func NewInstaller(log logger.Logger, dryRun bool) *Installer {
	inst := new(Installer)
	inst.log = log
	inst.dryRun = dryRun
	inst.unitDir = "/etc/systemd/system"
	return inst
}
// SetUnitDir allows overriding the systemd unit directory (for testing).
//
// The directory is used as-is for every unit file path built afterwards; it is
// not created or validated here.
func (i *Installer) SetUnitDir(dir string) {
	i.unitDir = dir
}
// Install installs the systemd service and timer.
//
// The sequence is: platform check, prerequisite check, option defaulting,
// service user creation, directory creation, unit file rendering,
// daemon-reload, and enabling the timer. The first failing step aborts the
// installation and its error is returned. The optional metrics exporter is
// installed afterwards on a best-effort basis: a failure there is only logged.
func (i *Installer) Install(ctx context.Context, opts InstallOptions) error {
	if goos := runtime.GOOS; goos != "linux" {
		return fmt.Errorf("systemd installation only supported on Linux (current: %s)", goos)
	}

	// Ordered preparation steps. The closures read opts when they run, so
	// every step after setDefaults sees the defaulted values.
	steps := []func() error{
		i.validatePrerequisites,
		func() error { return i.setDefaults(&opts) },
		func() error { return i.ensureUser(opts.User, opts.Group) },
		func() error { return i.createDirectories(opts) },
		func() error { return i.writeUnitFiles(opts) },
		func() error { return i.systemctl(ctx, "daemon-reload") },
	}
	for _, step := range steps {
		if err := step(); err != nil {
			return err
		}
	}

	// The timer is enabled here but not started; printNextSteps tells the
	// operator how to start it.
	timerName := i.getTimerName(opts)
	if err := i.systemctl(ctx, "enable", timerName); err != nil {
		return err
	}

	if opts.WithMetrics {
		if err := i.installExporter(ctx, opts); err != nil {
			i.log.Warn("Failed to install metrics exporter", "error", err)
		}
	}

	i.log.Info("Installation complete",
		"instance", opts.Instance,
		"timer", timerName,
		"schedule", opts.Schedule)
	i.printNextSteps(opts)
	return nil
}
// Uninstall removes the systemd service and timer.
//
// An instance of "cluster" or "" selects the cluster units; any other value
// selects the dbbackup@<instance> units. Stopping and disabling units is best
// effort (errors are ignored because the units may not exist or may not be
// running). Unit file removal failures other than "file does not exist" are
// logged as warnings so the operator can see that files were left behind.
//
// The metrics exporter unit is always removed, regardless of instance.
// With purge set, /etc/dbbackup and /var/lib/dbbackup are deleted recursively.
//
// Returns an error only when the platform or prerequisite checks fail.
func (i *Installer) Uninstall(ctx context.Context, instance string, purge bool) error {
	if runtime.GOOS != "linux" {
		return fmt.Errorf("systemd uninstallation only supported on Linux")
	}
	if err := i.validatePrerequisites(); err != nil {
		return err
	}

	// Determine service names
	templateInstance := instance != "cluster" && instance != ""
	serviceName, timerName := "dbbackup-cluster.service", "dbbackup-cluster.timer"
	if templateInstance {
		serviceName = fmt.Sprintf("dbbackup@%s.service", instance)
		timerName = fmt.Sprintf("dbbackup@%s.timer", instance)
	}

	// Stop and disable the timer first so it cannot re-trigger the service
	// while it is being removed.
	for _, unit := range []string{timerName, serviceName} {
		_ = i.systemctl(ctx, "stop", unit)
		_ = i.systemctl(ctx, "disable", unit)
	}

	// Remove unit files
	i.removeUnitFile("service", filepath.Join(i.unitDir, serviceName))
	i.removeUnitFile("timer", filepath.Join(i.unitDir, timerName))

	// Also remove the shared template units, but only if no other instances
	// are using them.
	if templateInstance && i.canRemoveTemplates() {
		i.removeUnitFile("service", filepath.Join(i.unitDir, "dbbackup@.service"))
		i.removeUnitFile("timer", filepath.Join(i.unitDir, "dbbackup@.timer"))
	}

	// Remove exporter
	_ = i.systemctl(ctx, "stop", "dbbackup-exporter.service")
	_ = i.systemctl(ctx, "disable", "dbbackup-exporter.service")
	i.removeUnitFile("exporter", filepath.Join(i.unitDir, "dbbackup-exporter.service"))

	// Reload systemd
	_ = i.systemctl(ctx, "daemon-reload")

	// Purge config files if requested
	if purge {
		for _, dir := range []string{"/etc/dbbackup", "/var/lib/dbbackup"} {
			if i.dryRun {
				i.log.Info("Would remove directory", "path", dir)
				continue
			}
			if err := os.RemoveAll(dir); err != nil {
				i.log.Warn("Failed to remove directory", "path", dir, "error", err)
			} else {
				i.log.Info("Removed directory", "path", dir)
			}
		}
	}

	i.log.Info("Uninstallation complete", "instance", instance, "purge", purge)
	return nil
}

// removeUnitFile deletes one unit file, or logs what would be deleted in
// dry-run mode. kind is used as the log key ("service", "timer", ...).
// A missing file is not an error; any other failure is logged as a warning.
func (i *Installer) removeUnitFile(kind, path string) {
	if i.dryRun {
		i.log.Info("Would remove", kind, path)
		return
	}
	if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
		i.log.Warn("Failed to remove unit file", "path", path, "error", err)
	}
}
// Status returns the current installation status.
//
// An instance of "cluster" or "" selects the cluster units; any other value
// selects the dbbackup@<instance> units. For those, the installer only writes
// the shared template files (dbbackup@.service / dbbackup@.timer), so when no
// instance-specific unit file exists the template file is used to decide
// whether the service is installed, and ServicePath/TimerPath point at it.
//
// Enabled/active state and timer run times are only queried when the service
// is installed. ExporterPath is always filled in, whether or not the exporter
// unit exists.
func (i *Installer) Status(ctx context.Context, instance string) (*ServiceStatus, error) {
	if runtime.GOOS != "linux" {
		return nil, fmt.Errorf("systemd status only supported on Linux")
	}

	// Determine service names, plus the template files that back them when
	// the unit is a template instance.
	serviceName, timerName := "dbbackup-cluster.service", "dbbackup-cluster.timer"
	var serviceFallback, timerFallback string
	if instance != "cluster" && instance != "" {
		serviceName = fmt.Sprintf("dbbackup@%s.service", instance)
		timerName = fmt.Sprintf("dbbackup@%s.timer", instance)
		serviceFallback = filepath.Join(i.unitDir, "dbbackup@.service")
		timerFallback = filepath.Join(i.unitDir, "dbbackup@.timer")
	}

	exists := func(path string) bool {
		_, err := os.Stat(path)
		return err == nil
	}

	status := &ServiceStatus{
		ServicePath:  filepath.Join(i.unitDir, serviceName),
		TimerPath:    filepath.Join(i.unitDir, timerName),
		ExporterPath: filepath.Join(i.unitDir, "dbbackup-exporter.service"),
	}

	// Check service file exists (instance-specific file first, then template).
	switch {
	case exists(status.ServicePath):
		status.Installed = true
	case serviceFallback != "" && exists(serviceFallback):
		status.ServicePath = serviceFallback
		status.Installed = true
	}

	// Point TimerPath at the template when that is the file actually on disk.
	if !exists(status.TimerPath) && timerFallback != "" && exists(timerFallback) {
		status.TimerPath = timerFallback
	}

	// Check enabled/active status
	if status.Installed {
		status.Enabled = i.isEnabled(ctx, serviceName)
		status.Active = i.isActive(ctx, serviceName)
		status.TimerEnabled = i.isEnabled(ctx, timerName)
		status.TimerActive = i.isActive(ctx, timerName)

		// Get timer info
		status.NextRun = i.getTimerNext(ctx, timerName)
		status.LastRun = i.getTimerLast(ctx, timerName)
	}
	return status, nil
}
// validatePrerequisites checks system requirements: the process must run as
// root and systemctl must be on PATH. Running inside Docker (detected via
// /.dockerenv) only produces a warning.
func (i *Installer) validatePrerequisites() error {
	if uid := os.Getuid(); uid != 0 {
		return fmt.Errorf("installation requires root privileges (use sudo)")
	}

	_, lookErr := exec.LookPath("systemctl")
	if lookErr != nil {
		return fmt.Errorf("systemctl not found - is this a systemd-based system?")
	}

	// The marker file exists inside Docker containers.
	_, statErr := os.Stat("/.dockerenv")
	if statErr == nil {
		i.log.Warn("Running inside Docker container - systemd may not work correctly")
	}
	return nil
}
// setDefaults fills in default values for every option left at its zero
// value. It fails only when the binary path has to be auto-detected and the
// running executable cannot be located or its symlinks cannot be resolved.
func (i *Installer) setDefaults(opts *InstallOptions) error {
	// Auto-detect the binary from the running executable, following
	// symlinks so the unit file references the real file.
	if opts.BinaryPath == "" {
		exe, err := os.Executable()
		if err != nil {
			return fmt.Errorf("failed to detect binary path: %w", err)
		}
		resolved, err := filepath.EvalSymlinks(exe)
		if err != nil {
			return fmt.Errorf("failed to resolve binary path: %w", err)
		}
		opts.BinaryPath = resolved
	}

	stringDefaults := []struct {
		field    *string
		fallback string
	}{
		{&opts.Instance, "default"},
		{&opts.BackupType, "single"},
		{&opts.Schedule, "*-*-* 02:00:00"}, // daily at 2am
		{&opts.User, "dbbackup"},
		{&opts.Group, "dbbackup"},
		{&opts.BackupDir, "/var/lib/dbbackup/backups"},
		{&opts.ConfigPath, "/etc/dbbackup/dbbackup.conf"},
	}
	for _, d := range stringDefaults {
		if *d.field == "" {
			*d.field = d.fallback
		}
	}

	intDefaults := []struct {
		field    *int
		fallback int
	}{
		{&opts.TimeoutSeconds, 3600}, // 1 hour
		{&opts.MetricsPort, 9399},
	}
	for _, d := range intDefaults {
		if *d.field == 0 {
			*d.field = d.fallback
		}
	}
	return nil
}
// ensureUser creates the service user (and its group) if it doesn't exist.
//
// Existence is decided by looking the account up rather than by matching the
// text printed by groupadd/useradd, because that text depends on the system
// locale. The "already exists" text match is kept as a secondary check.
//
// In dry-run mode nothing is created; the intended action is logged.
// Returns an error when useradd fails and the user still does not exist.
func (i *Installer) ensureUser(username, groupname string) error {
	// Check if user exists
	if _, err := user.Lookup(username); err == nil {
		i.log.Debug("User already exists", "user", username)
		return nil
	}
	if i.dryRun {
		i.log.Info("Would create user", "user", username, "group", groupname)
		return nil
	}

	// Create group first, unless it is already there.
	if _, err := user.LookupGroup(groupname); err != nil {
		output, addErr := exec.Command("groupadd", "--system", groupname).CombinedOutput()
		if addErr != nil && !strings.Contains(string(output), "already exists") {
			if _, lookupErr := user.LookupGroup(groupname); lookupErr != nil {
				// Not fatal here: useradd below reports the real failure if
				// the group is genuinely missing.
				i.log.Warn("Group creation failed",
					"group", groupname,
					"error", addErr,
					"output", strings.TrimSpace(string(output)))
			}
		}
	}

	// Create user
	userCmd := exec.Command("useradd",
		"--system",
		"--shell", "/usr/sbin/nologin",
		"--home-dir", "/var/lib/dbbackup",
		"--gid", groupname,
		username)
	if output, err := userCmd.CombinedOutput(); err != nil {
		_, lookupErr := user.Lookup(username)
		if lookupErr != nil && !strings.Contains(string(output), "already exists") {
			return fmt.Errorf("failed to create user %s: %w (%s)", username, err, output)
		}
		// Someone else created the account in the meantime.
		i.log.Debug("User already exists", "user", username)
		return nil
	}

	i.log.Info("Created system user", "user", username, "group", groupname)
	return nil
}
// createDirectories creates required directories and hands them to the
// service user.
//
// The fixed dbbackup directories plus opts.BackupDir are created with
// os.MkdirAll. Ownership is then set to the uid and primary gid of opts.User.
// Ownership problems (unknown user, unparsable ids, failing chown) are logged
// as warnings and do not abort the installation; a directory that cannot be
// created does.
//
// In dry-run mode the directories are only listed in the log.
func (i *Installer) createDirectories(opts InstallOptions) error {
	dirs := []struct {
		path string
		mode os.FileMode
	}{
		{"/etc/dbbackup", 0755},
		{"/etc/dbbackup/env.d", 0700},
		{"/var/lib/dbbackup", 0750},
		{"/var/lib/dbbackup/backups", 0750},
		{"/var/lib/dbbackup/metrics", 0755},
		{"/var/log/dbbackup", 0750},
		{opts.BackupDir, 0750},
	}

	if i.dryRun {
		for _, d := range dirs {
			i.log.Info("Would create directory", "path", d.path, "mode", d.mode)
		}
		return nil
	}

	// Resolve the owner once; it is the same for every directory.
	var uid, gid int
	ownerKnown := false
	if u, err := user.Lookup(opts.User); err != nil {
		i.log.Warn("Service user not found; directory ownership left unchanged",
			"user", opts.User, "error", err)
	} else {
		_, uidErr := fmt.Sscanf(u.Uid, "%d", &uid)
		_, gidErr := fmt.Sscanf(u.Gid, "%d", &gid)
		if uidErr != nil || gidErr != nil {
			// Never fall back to 0/0: that would chown the directories to root.
			i.log.Warn("Service user has non-numeric ids; directory ownership left unchanged",
				"user", opts.User, "uid", u.Uid, "gid", u.Gid)
		} else {
			ownerKnown = true
		}
	}

	for _, d := range dirs {
		if err := os.MkdirAll(d.path, d.mode); err != nil {
			return fmt.Errorf("failed to create directory %s: %w", d.path, err)
		}
		if !ownerKnown {
			continue
		}
		// Set ownership
		if err := os.Chown(d.path, uid, gid); err != nil {
			i.log.Warn("Failed to set directory ownership", "path", d.path, "error", err)
		}
	}
	return nil
}
// writeUnitFiles renders and writes the systemd unit files: the cluster
// service/timer pair when opts.BackupType is "cluster", otherwise the
// dbbackup@ template pair. The service file is written before the timer file.
func (i *Installer) writeUnitFiles(opts InstallOptions) error {
	// Each unit is written under the same file name its embedded template has.
	serviceName, timerName := "dbbackup@.service", "dbbackup@.timer"
	if opts.BackupType == "cluster" {
		serviceName, timerName = "dbbackup-cluster.service", "dbbackup-cluster.timer"
	}

	// Values made available to the templates.
	values := map[string]interface{}{
		"BinaryPath":     opts.BinaryPath,
		"ConfigPath":     opts.ConfigPath,
		"BackupDir":      opts.BackupDir,
		"BackupType":     opts.BackupType,
		"Schedule":       opts.Schedule,
		"TimeoutSeconds": opts.TimeoutSeconds,
		"MetricsPort":    opts.MetricsPort,
		"User":           opts.User,
		"Group":          opts.Group,
	}

	if err := i.writeTemplateFile("templates/"+serviceName, serviceName, values); err != nil {
		return fmt.Errorf("failed to write service file: %w", err)
	}
	if err := i.writeTemplateFile("templates/"+timerName, timerName, values); err != nil {
		return fmt.Errorf("failed to write timer file: %w", err)
	}
	return nil
}
// writeTemplateFile reads an embedded template, renders it with data and
// writes the result to outputName inside the unit directory (mode 0644).
// In dry-run mode the target path and the rendered content are logged and no
// file is written.
func (i *Installer) writeTemplateFile(templatePath, outputName string, data map[string]interface{}) error {
	raw, err := Templates.ReadFile(templatePath)
	if err != nil {
		return fmt.Errorf("failed to read template %s: %w", templatePath, err)
	}

	parsed, err := template.New(outputName).Parse(string(raw))
	if err != nil {
		return fmt.Errorf("failed to parse template %s: %w", templatePath, err)
	}

	var rendered strings.Builder
	if err = parsed.Execute(&rendered, data); err != nil {
		return fmt.Errorf("failed to render template %s: %w", templatePath, err)
	}
	unit := rendered.String()

	target := filepath.Join(i.unitDir, outputName)
	if !i.dryRun {
		if err = os.WriteFile(target, []byte(unit), 0644); err != nil {
			return fmt.Errorf("failed to write %s: %w", target, err)
		}
		i.log.Info("Created unit file", "path", target)
		return nil
	}

	i.log.Info("Would write unit file", "path", target)
	i.log.Debug("Unit file content", "content", unit)
	return nil
}
// installExporter installs the metrics exporter service: it renders
// dbbackup-exporter.service, reloads systemd, then enables and starts the
// unit. The first failing step is returned.
func (i *Installer) installExporter(ctx context.Context, opts InstallOptions) error {
	const unit = "dbbackup-exporter.service"

	values := map[string]interface{}{
		"BinaryPath":  opts.BinaryPath,
		"ConfigPath":  opts.ConfigPath,
		"MetricsPort": opts.MetricsPort,
		"User":        opts.User,
		"Group":       opts.Group,
	}
	if err := i.writeTemplateFile("templates/"+unit, unit, values); err != nil {
		return err
	}

	// systemctl invocations, in the order they must run.
	commands := [][]string{
		{"daemon-reload"},
		{"enable", unit},
		{"start", unit},
	}
	for _, args := range commands {
		if err := i.systemctl(ctx, args...); err != nil {
			return err
		}
	}

	i.log.Info("Installed metrics exporter", "port", opts.MetricsPort)
	return nil
}
// getTimerName returns the timer unit name for the given options:
// the fixed cluster timer for cluster backups, otherwise the dbbackup@ timer
// instantiated with opts.Instance.
func (i *Installer) getTimerName(opts InstallOptions) string {
	switch opts.BackupType {
	case "cluster":
		return "dbbackup-cluster.timer"
	default:
		return "dbbackup@" + opts.Instance + ".timer"
	}
}
// systemctl runs a systemctl command with the given arguments. In dry-run
// mode the command is logged and not executed. On failure the returned error
// carries the arguments and the combined stdout/stderr of the command.
func (i *Installer) systemctl(ctx context.Context, args ...string) error {
	if !i.dryRun {
		out, runErr := exec.CommandContext(ctx, "systemctl", args...).CombinedOutput()
		if runErr == nil {
			return nil
		}
		return fmt.Errorf("systemctl %v failed: %w\n%s", args, runErr, string(out))
	}

	i.log.Info("Would run: systemctl", "args", args)
	return nil
}
// isEnabled checks if a unit is enabled, i.e. whether
// `systemctl is-enabled <unit>` exits successfully. Any failure to run the
// command counts as "not enabled". Runs even in dry-run mode.
func (i *Installer) isEnabled(ctx context.Context, unit string) bool {
	err := exec.CommandContext(ctx, "systemctl", "is-enabled", unit).Run()
	return err == nil
}
// isActive checks if a unit is active, i.e. whether
// `systemctl is-active <unit>` exits successfully. Any failure to run the
// command counts as "not active". Runs even in dry-run mode.
func (i *Installer) isActive(ctx context.Context, unit string) bool {
	err := exec.CommandContext(ctx, "systemctl", "is-active", unit).Run()
	return err == nil
}
// getTimerNext gets the next run time for a timer by reading its
// NextElapseUSecRealtime property. The value is returned as printed by
// systemctl, with surrounding whitespace removed; "" if the query fails.
func (i *Installer) getTimerNext(ctx context.Context, timer string) string {
	show := exec.CommandContext(ctx, "systemctl", "show", timer, "--property=NextElapseUSecRealtime", "--value")
	raw, err := show.Output()
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
// getTimerLast gets the last run time for a timer by reading its
// LastTriggerUSec property. The value is returned as printed by systemctl,
// with surrounding whitespace removed; "" if the query fails.
func (i *Installer) getTimerLast(ctx context.Context, timer string) string {
	show := exec.CommandContext(ctx, "systemctl", "show", timer, "--property=LastTriggerUSec", "--value")
	raw, err := show.Output()
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
// canRemoveTemplates checks if template units can be safely removed, i.e.
// no dbbackup@ instance appears to be in use any more.
//
// Three signals are checked, any of which blocks removal:
//   - an instance-specific dbbackup@<name>.service file in the unit directory
//     (the template file dbbackup@.service itself is ignored — the glob
//     matches it too, and counting it would block removal forever);
//   - an enabled timer instance, visible as a dbbackup@<name>.timer entry in
//     the unit directory's timers.target.wants sub-directory;
//   - a dbbackup@ service unit listed by systemctl.
//
// Errors from globbing or from systemctl are treated as "nothing found".
func (i *Installer) canRemoveTemplates() bool {
	// Instance-specific unit files, excluding the template itself.
	unitFiles, _ := filepath.Glob(filepath.Join(i.unitDir, "dbbackup@*.service"))
	for _, f := range unitFiles {
		if filepath.Base(f) != "dbbackup@.service" {
			return false
		}
	}

	// Timer instances that are still enabled.
	enabledTimers, _ := filepath.Glob(filepath.Join(i.unitDir, "timers.target.wants", "dbbackup@*.timer"))
	if len(enabledTimers) > 0 {
		return false
	}

	// Also check for running/loaded instances
	cmd := exec.Command("systemctl", "list-units", "--type=service", "--all", "dbbackup@*")
	output, _ := cmd.Output()
	return !strings.Contains(string(output), "dbbackup@")
}
// printNextSteps prints helpful next steps after installation to stdout:
// which files to edit, how to start and inspect the timer, how to trigger a
// manual run, and (when metrics were requested) how to query the exporter.
func (i *Installer) printNextSteps(opts InstallOptions) {
	timerName := i.getTimerName(opts)
	serviceName := strings.Replace(timerName, ".timer", ".service", 1)

	// The cluster unit loads env.d/cluster.conf; template instances load
	// env.d/<instance>.conf. Point the operator at the file the unit reads.
	envName := opts.Instance
	if opts.BackupType == "cluster" {
		envName = "cluster"
	}

	fmt.Println()
	fmt.Println("✅ Installation successful!")
	fmt.Println()
	fmt.Println("📋 Next steps:")
	fmt.Println()
	fmt.Printf(" 1. Edit configuration: sudo nano %s\n", opts.ConfigPath)
	fmt.Printf(" 2. Set credentials: sudo nano /etc/dbbackup/env.d/%s.conf\n", envName)
	fmt.Printf(" 3. Start the timer: sudo systemctl start %s\n", timerName)
	fmt.Printf(" 4. Verify timer status: sudo systemctl status %s\n", timerName)
	fmt.Printf(" 5. Run backup manually: sudo systemctl start %s\n", serviceName)
	fmt.Println()
	fmt.Println("📊 View backup logs:")
	fmt.Printf(" journalctl -u %s -f\n", serviceName)
	fmt.Println()
	if opts.WithMetrics {
		fmt.Println("📈 Prometheus metrics:")
		fmt.Printf(" curl http://localhost:%d/metrics\n", opts.MetricsPort)
		fmt.Println()
	}
}

View File

@@ -0,0 +1,47 @@
# Systemd unit template for the cluster (all databases) backup job.
# The double-brace placeholders are filled in by the installer before the
# file is written to the unit directory.
[Unit]
Description=Database Cluster Backup
Documentation=https://github.com/PlusOne/dbbackup
# Ordering only: start after the network and any local database service.
After=network-online.target postgresql.service mysql.service mariadb.service
Wants=network-online.target
[Service]
# One-shot job: runs to completion each time the timer triggers it.
Type=oneshot
User={{.User}}
Group={{.Group}}
# Security hardening
NoNewPrivileges=yes
ProtectSystem=strict
ProtectHome=read-only
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictSUIDSGID=yes
RestrictRealtime=yes
LockPersonality=yes
RemoveIPC=yes
# Empty assignments: the job runs without any capabilities.
CapabilityBoundingSet=
AmbientCapabilities=
# Directories
# The only writable locations under ProtectSystem=strict.
ReadWritePaths={{.BackupDir}} /var/lib/dbbackup /var/log/dbbackup
# Network access for cloud uploads
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
# Environment
# Leading "-" makes the file optional.
EnvironmentFile=-/etc/dbbackup/env.d/cluster.conf
# Execution - cluster backup (all databases)
ExecStart={{.BinaryPath}} backup cluster --config {{.ConfigPath}}
TimeoutStartSec={{.TimeoutSeconds}}
# Post-backup metrics export
# Leading "-" means a failing export does not mark the unit as failed.
ExecStopPost=-{{.BinaryPath}} metrics export --instance cluster --output /var/lib/dbbackup/metrics/cluster.prom
# OOM protection for large backups
OOMScoreAdjust=-500
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,11 @@
# Timer that triggers dbbackup-cluster.service (the unit with the same name).
[Unit]
Description=Database Cluster Backup Timer
Documentation=https://github.com/PlusOne/dbbackup
[Timer]
# Schedule chosen at install time (systemd calendar expression).
OnCalendar={{.Schedule}}
# Run a missed backup as soon as possible after the machine was off.
Persistent=true
# Spread the start time by a random delay of up to 30 minutes.
RandomizedDelaySec=1800
[Install]
WantedBy=timers.target

View File

@@ -0,0 +1,37 @@
# Systemd unit template for the long-running Prometheus metrics exporter.
# The double-brace placeholders are filled in by the installer.
[Unit]
Description=DBBackup Prometheus Metrics Exporter
Documentation=https://github.com/PlusOne/dbbackup
# After= only orders the units; Wants= is what actually pulls
# network-online.target in, matching the backup service units.
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User={{.User}}
Group={{.Group}}
# Security hardening
NoNewPrivileges=yes
ProtectSystem=strict
ProtectHome=yes
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictSUIDSGID=yes
RestrictRealtime=yes
LockPersonality=yes
RemoveIPC=yes
# Read-only access to catalog and backups
ReadOnlyPaths=/var/lib/dbbackup
# Network for HTTP server
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
# Execution
ExecStart={{.BinaryPath}} metrics serve --port {{.MetricsPort}} --config {{.ConfigPath}}
ExecReload=/bin/kill -HUP $MAINPID
# Restart the exporter automatically if it exits with an error.
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,47 @@
# Systemd template unit for per-instance backups (dbbackup@NAME.service).
# "%i" is replaced by systemd with the instance name; the double-brace
# placeholders are filled in by the installer before the file is written.
[Unit]
Description=Database Backup for %i
Documentation=https://github.com/PlusOne/dbbackup
# Ordering only: start after the network and any local database service.
After=network-online.target postgresql.service mysql.service mariadb.service
Wants=network-online.target
[Service]
# One-shot job: runs to completion each time the timer triggers it.
Type=oneshot
User={{.User}}
Group={{.Group}}
# Security hardening
NoNewPrivileges=yes
ProtectSystem=strict
ProtectHome=read-only
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictSUIDSGID=yes
RestrictRealtime=yes
LockPersonality=yes
RemoveIPC=yes
# Empty assignments: the job runs without any capabilities.
CapabilityBoundingSet=
AmbientCapabilities=
# Directories
# The only writable locations under ProtectSystem=strict.
ReadWritePaths={{.BackupDir}} /var/lib/dbbackup /var/log/dbbackup
# Network access for cloud uploads
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
# Environment
# Per-instance file; leading "-" makes it optional.
EnvironmentFile=-/etc/dbbackup/env.d/%i.conf
# Execution
# NOTE(review): the instance name is passed as the positional argument of the
# backup command — presumably the database to back up; confirm against the CLI.
ExecStart={{.BinaryPath}} backup {{.BackupType}} %i --config {{.ConfigPath}}
TimeoutStartSec={{.TimeoutSeconds}}
# Post-backup metrics export
# Leading "-" means a failing export does not mark the unit as failed.
ExecStopPost=-{{.BinaryPath}} metrics export --instance %i --output /var/lib/dbbackup/metrics/%i.prom
# OOM protection for large backups
OOMScoreAdjust=-500
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,11 @@
# Template timer: dbbackup@NAME.timer triggers dbbackup@NAME.service
# (the service unit with the same name and instance).
[Unit]
Description=Database Backup Timer for %i
Documentation=https://github.com/PlusOne/dbbackup
[Timer]
# Schedule chosen at install time (systemd calendar expression).
OnCalendar={{.Schedule}}
# Run a missed backup as soon as possible after the machine was off.
Persistent=true
# Spread the start time by a random delay of up to 30 minutes.
RandomizedDelaySec=1800
[Install]
WantedBy=timers.target

View File

@@ -0,0 +1,174 @@
// Package prometheus provides Prometheus metrics for dbbackup
package prometheus
import (
"context"
"fmt"
"net/http"
"sync"
"time"
"dbbackup/internal/catalog"
"dbbackup/internal/logger"
)
// Exporter provides an HTTP endpoint for Prometheus metrics.
//
// Metrics are rendered from the catalog into a string that is cached and
// served by the /metrics handler; a background loop re-renders it every
// refreshTTL.
type Exporter struct {
	log         logger.Logger   // receives server lifecycle and refresh messages
	catalog     catalog.Catalog // source of backup records for the metrics
	instance    string          // value of the "instance" label on every metric
	port        int             // TCP port the HTTP server listens on
	mu          sync.RWMutex    // guards cachedData and lastRefresh
	cachedData  string          // last rendered metrics text; empty until the first successful refresh
	lastRefresh time.Time       // time of the last successful refresh
	refreshTTL  time.Duration   // interval between background refreshes
}
// NewExporter creates a new Prometheus exporter that serves metrics for the
// given instance on the given port. The metrics cache is refreshed every
// 30 seconds once Serve is running.
func NewExporter(log logger.Logger, cat catalog.Catalog, instance string, port int) *Exporter {
	exp := new(Exporter)
	exp.log = log
	exp.catalog = cat
	exp.instance = instance
	exp.port = port
	exp.refreshTTL = 30 * time.Second
	return exp
}
// Serve starts the HTTP server and blocks until context is cancelled.
//
// Endpoints: /metrics (cached metrics text), /health (static JSON) and /
// (HTML index). A background goroutine refreshes the metrics cache.
//
// When ctx is cancelled the server is shut down gracefully with a 5 second
// limit, and Serve returns nil only after that shutdown has finished. When
// the listener fails (for example because the port is in use) the error is
// returned and the background goroutines are stopped instead of being left
// running until ctx ends.
func (e *Exporter) Serve(ctx context.Context) error {
	mux := http.NewServeMux()
	mux.HandleFunc("/metrics", e.handleMetrics)
	mux.HandleFunc("/health", e.handleHealth)
	// "/" also matches every unregistered path; handleRoot answers those
	// with 404.
	mux.HandleFunc("/", e.handleRoot)

	addr := fmt.Sprintf(":%d", e.port)
	srv := &http.Server{
		Addr:         addr,
		Handler:      mux,
		ReadTimeout:  10 * time.Second,
		WriteTimeout: 30 * time.Second,
		IdleTimeout:  60 * time.Second,
	}

	// Derived context: cancelled when Serve returns for any reason, so the
	// refresh loop never outlives the server.
	loopCtx, stopLoop := context.WithCancel(ctx)
	defer stopLoop()
	go e.refreshLoop(loopCtx)

	// Graceful shutdown. listenFailed lets the goroutine exit quietly when
	// the server never came up; shutdownDone lets Serve wait for Shutdown.
	listenFailed := make(chan struct{})
	shutdownDone := make(chan struct{})
	go func() {
		defer close(shutdownDone)
		select {
		case <-listenFailed:
			return
		case <-ctx.Done():
		}
		e.log.Info("Shutting down metrics server...")
		shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		if err := srv.Shutdown(shutdownCtx); err != nil {
			e.log.Error("Server shutdown error", "error", err)
		}
	}()

	e.log.Info("Starting Prometheus metrics server", "addr", addr)
	if err := srv.ListenAndServe(); err != http.ErrServerClosed {
		close(listenFailed)
		return fmt.Errorf("server error: %w", err)
	}

	// ListenAndServe returns as soon as Shutdown is called; wait until
	// in-flight requests have been drained (or the 5s limit was hit).
	<-shutdownDone
	return nil
}
// handleMetrics handles /metrics endpoint. It serves the cached metrics
// text; if the cache is still empty it triggers one synchronous refresh and
// answers 500 when that refresh fails.
func (e *Exporter) handleMetrics(w http.ResponseWriter, r *http.Request) {
	cached := func() string {
		e.mu.RLock()
		defer e.mu.RUnlock()
		return e.cachedData
	}

	body := cached()
	if body == "" {
		// Force refresh if cache is empty
		if err := e.refresh(); err != nil {
			http.Error(w, "Failed to collect metrics", http.StatusInternalServerError)
			return
		}
		body = cached()
	}

	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
	w.WriteHeader(http.StatusOK)
	w.Write([]byte(body))
}
// handleHealth handles /health endpoint. It always answers 200 with a fixed
// JSON document; no backend state is consulted.
func (e *Exporter) handleHealth(w http.ResponseWriter, r *http.Request) {
	const payload = `{"status":"ok","service":"dbbackup-exporter"}`

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)
	fmt.Fprint(w, payload)
}
// handleRoot handles / endpoint. The exact path "/" gets a small HTML index
// linking to the other endpoints; every other path gets 404.
func (e *Exporter) handleRoot(w http.ResponseWriter, r *http.Request) {
	const indexPage = `<!DOCTYPE html>
<html>
<head>
<title>DBBackup Exporter</title>
</head>
<body>
<h1>DBBackup Prometheus Exporter</h1>
<p>This is a Prometheus metrics exporter for DBBackup.</p>
<ul>
<li><a href="/metrics">/metrics</a> - Prometheus metrics</li>
<li><a href="/health">/health</a> - Health check</li>
</ul>
</body>
</html>`

	if r.URL.Path == "/" {
		w.Header().Set("Content-Type", "text/html")
		w.WriteHeader(http.StatusOK)
		w.Write([]byte(indexPage))
		return
	}
	http.NotFound(w, r)
}
// refreshLoop periodically refreshes the metrics cache: once immediately,
// then every refreshTTL until ctx is cancelled. Refresh failures are logged
// and do not stop the loop.
func (e *Exporter) refreshLoop(ctx context.Context) {
	tick := time.NewTicker(e.refreshTTL)
	defer tick.Stop()

	// attempt runs one refresh and logs a failure under the given message.
	attempt := func(failureMsg string) {
		if err := e.refresh(); err != nil {
			e.log.Error(failureMsg, "error", err)
		}
	}

	attempt("Initial metrics refresh failed")
	for {
		select {
		case <-tick.C:
			attempt("Metrics refresh failed")
		case <-ctx.Done():
			return
		}
	}
}
// refresh updates the cached metrics. The metrics text is generated outside
// the lock; on failure the previous cache content is kept and the error is
// returned.
func (e *Exporter) refresh() error {
	rendered, err := NewMetricsWriter(e.log, e.catalog, e.instance).GenerateMetricsString()
	if err != nil {
		return err
	}

	func() {
		e.mu.Lock()
		defer e.mu.Unlock()
		e.cachedData = rendered
		e.lastRefresh = time.Now()
	}()

	e.log.Debug("Refreshed metrics cache")
	return nil
}

View File

@@ -0,0 +1,245 @@
// Package prometheus provides Prometheus metrics for dbbackup
package prometheus
import (
"context"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"time"
"dbbackup/internal/catalog"
"dbbackup/internal/logger"
)
// MetricsWriter writes metrics in Prometheus text format, either to a
// textfile-collector file or as a string for the HTTP exporter.
type MetricsWriter struct {
	log      logger.Logger   // receives debug messages about written files
	catalog  catalog.Catalog // source of backup records; nil makes collection fail
	instance string          // value of the "instance" label on every metric
}
// NewMetricsWriter creates a new MetricsWriter that reads backup records
// from cat and labels every metric with instance.
func NewMetricsWriter(log logger.Logger, cat catalog.Catalog, instance string) *MetricsWriter {
	mw := new(MetricsWriter)
	mw.log = log
	mw.catalog = cat
	mw.instance = instance
	return mw
}
// BackupMetrics holds metrics for a single database.
type BackupMetrics struct {
	// Database is the database name; Engine is the database type recorded
	// in the catalog.
	Database, Engine string

	// LastSuccess, LastDuration and LastSize describe the most recent
	// successful backup. LastSuccess is the zero time if there is none.
	LastSuccess  time.Time
	LastDuration time.Duration
	LastSize     int64

	// TotalBackups counts every catalog entry seen for the database;
	// SuccessCount and FailureCount split that total by outcome.
	TotalBackups, SuccessCount, FailureCount int

	// Verified reports whether the most recent successful backup passed
	// verification.
	Verified bool

	// RPOSeconds is the number of seconds since LastSuccess (0 if unset).
	RPOSeconds float64
}
// WriteTextfile writes metrics to a Prometheus textfile collector file.
//
// The parent directory is created if needed. The content is first written to
// "<path>.tmp" and then renamed over path, so a reader never sees a
// half-written file. The temp file is removed again when either the write or
// the rename fails.
//
// Returns an error when metrics cannot be collected or the file cannot be
// written.
func (m *MetricsWriter) WriteTextfile(path string) error {
	metrics, err := m.collectMetrics()
	if err != nil {
		return fmt.Errorf("failed to collect metrics: %w", err)
	}
	output := m.formatMetrics(metrics)

	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("failed to create directory %s: %w", dir, err)
	}

	// Atomic write: write to temp file, then rename
	tmpPath := path + ".tmp"
	if err := os.WriteFile(tmpPath, []byte(output), 0644); err != nil {
		// Best-effort cleanup of a partial file; the write error is what matters.
		_ = os.Remove(tmpPath)
		return fmt.Errorf("failed to write temp file: %w", err)
	}
	if err := os.Rename(tmpPath, path); err != nil {
		_ = os.Remove(tmpPath)
		return fmt.Errorf("failed to rename temp file: %w", err)
	}

	m.log.Debug("Wrote metrics to textfile", "path", path, "databases", len(metrics))
	return nil
}
// collectMetrics gathers metrics from the catalog.
//
// It searches the catalog with a limit of 1000 entries, aggregates them per
// database (entries without a database name are grouped under "unknown") and
// returns one BackupMetrics per database, sorted by database name. Counts
// therefore cover only the entries returned by that search.
//
// Returns an error when no catalog is configured or the search fails.
func (m *MetricsWriter) collectMetrics() ([]BackupMetrics, error) {
	if m.catalog == nil {
		return nil, fmt.Errorf("catalog not available")
	}

	entries, err := m.catalog.Search(context.Background(), &catalog.SearchQuery{Limit: 1000})
	if err != nil {
		return nil, fmt.Errorf("failed to search backups: %w", err)
	}

	// Aggregate per database.
	perDB := make(map[string]*BackupMetrics)
	for _, entry := range entries {
		name := entry.Database
		if name == "" {
			name = "unknown"
		}

		agg := perDB[name]
		if agg == nil {
			agg = &BackupMetrics{Database: name, Engine: entry.DatabaseType}
			perDB[name] = agg
		}
		agg.TotalBackups++

		// Anything that is neither completed nor verified counts as a failure.
		if entry.Status != catalog.StatusCompleted && entry.Status != catalog.StatusVerified {
			agg.FailureCount++
			continue
		}
		agg.SuccessCount++

		// Only the most recent success feeds the "last backup" fields.
		if !entry.CreatedAt.After(agg.LastSuccess) {
			continue
		}
		agg.LastSuccess = entry.CreatedAt
		agg.LastDuration = time.Duration(entry.Duration * float64(time.Second))
		agg.LastSize = entry.SizeBytes
		agg.Verified = entry.VerifiedAt != nil && entry.VerifyValid != nil && *entry.VerifyValid
		agg.Engine = entry.DatabaseType
	}

	// Fill in the RPO (age of the last success) and flatten to a slice.
	now := time.Now()
	result := make([]BackupMetrics, 0, len(perDB))
	for _, agg := range perDB {
		if !agg.LastSuccess.IsZero() {
			agg.RPOSeconds = now.Sub(agg.LastSuccess).Seconds()
		}
		result = append(result, *agg)
	}

	// Map iteration order is random; sort for stable output.
	sort.Slice(result, func(a, b int) bool {
		return result[a].Database < result[b].Database
	})
	return result, nil
}
// promLabelEscaper applies the escaping rules of the Prometheus text
// exposition format for label values: only backslash, double quote and line
// feed are escaped.
var promLabelEscaper = strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)

// formatMetrics formats metrics in Prometheus exposition format.
//
// Label values are quoted with the escaping rules of the exposition format
// rather than Go's %q: %q emits escapes such as \t or \x00 for unusual
// characters, which a Prometheus parser rejects. Invalid UTF-8 in a label
// value is replaced with U+FFFD. For ordinary names the output is unchanged.
//
// Per-database samples for last success, duration, size and RPO are omitted
// when the value is unset (zero); backup_total and backup_verified are always
// written.
func (m *MetricsWriter) formatMetrics(metrics []BackupMetrics) string {
	var b strings.Builder

	// label renders s as a quoted, escaped label value.
	label := func(s string) string {
		return `"` + promLabelEscaper.Replace(strings.ToValidUTF8(s, "\uFFFD")) + `"`
	}
	inst := label(m.instance)

	// One timestamp for both the header comment and the scrape sample.
	generated := time.Now()

	// Header comment
	b.WriteString("# DBBackup Prometheus Metrics\n")
	fmt.Fprintf(&b, "# Generated at: %s\n", generated.Format(time.RFC3339))
	fmt.Fprintf(&b, "# Instance: %s\n", m.instance)
	b.WriteString("\n")

	// dbbackup_last_success_timestamp
	b.WriteString("# HELP dbbackup_last_success_timestamp Unix timestamp of last successful backup\n")
	b.WriteString("# TYPE dbbackup_last_success_timestamp gauge\n")
	for _, met := range metrics {
		if !met.LastSuccess.IsZero() {
			fmt.Fprintf(&b, "dbbackup_last_success_timestamp{instance=%s,database=%s,engine=%s} %d\n",
				inst, label(met.Database), label(met.Engine), met.LastSuccess.Unix())
		}
	}
	b.WriteString("\n")

	// dbbackup_last_backup_duration_seconds
	b.WriteString("# HELP dbbackup_last_backup_duration_seconds Duration of last successful backup in seconds\n")
	b.WriteString("# TYPE dbbackup_last_backup_duration_seconds gauge\n")
	for _, met := range metrics {
		if met.LastDuration > 0 {
			fmt.Fprintf(&b, "dbbackup_last_backup_duration_seconds{instance=%s,database=%s,engine=%s} %.2f\n",
				inst, label(met.Database), label(met.Engine), met.LastDuration.Seconds())
		}
	}
	b.WriteString("\n")

	// dbbackup_last_backup_size_bytes
	b.WriteString("# HELP dbbackup_last_backup_size_bytes Size of last successful backup in bytes\n")
	b.WriteString("# TYPE dbbackup_last_backup_size_bytes gauge\n")
	for _, met := range metrics {
		if met.LastSize > 0 {
			fmt.Fprintf(&b, "dbbackup_last_backup_size_bytes{instance=%s,database=%s,engine=%s} %d\n",
				inst, label(met.Database), label(met.Engine), met.LastSize)
		}
	}
	b.WriteString("\n")

	// dbbackup_backup_total (counter)
	b.WriteString("# HELP dbbackup_backup_total Total number of backup attempts\n")
	b.WriteString("# TYPE dbbackup_backup_total counter\n")
	for _, met := range metrics {
		fmt.Fprintf(&b, "dbbackup_backup_total{instance=%s,database=%s,status=\"success\"} %d\n",
			inst, label(met.Database), met.SuccessCount)
		fmt.Fprintf(&b, "dbbackup_backup_total{instance=%s,database=%s,status=\"failure\"} %d\n",
			inst, label(met.Database), met.FailureCount)
	}
	b.WriteString("\n")

	// dbbackup_rpo_seconds
	b.WriteString("# HELP dbbackup_rpo_seconds Recovery Point Objective - seconds since last successful backup\n")
	b.WriteString("# TYPE dbbackup_rpo_seconds gauge\n")
	for _, met := range metrics {
		if met.RPOSeconds > 0 {
			fmt.Fprintf(&b, "dbbackup_rpo_seconds{instance=%s,database=%s} %.0f\n",
				inst, label(met.Database), met.RPOSeconds)
		}
	}
	b.WriteString("\n")

	// dbbackup_backup_verified
	b.WriteString("# HELP dbbackup_backup_verified Whether the last backup was verified (1=yes, 0=no)\n")
	b.WriteString("# TYPE dbbackup_backup_verified gauge\n")
	for _, met := range metrics {
		verified := 0
		if met.Verified {
			verified = 1
		}
		fmt.Fprintf(&b, "dbbackup_backup_verified{instance=%s,database=%s} %d\n",
			inst, label(met.Database), verified)
	}
	b.WriteString("\n")

	// dbbackup_scrape_timestamp
	b.WriteString("# HELP dbbackup_scrape_timestamp Unix timestamp when metrics were collected\n")
	b.WriteString("# TYPE dbbackup_scrape_timestamp gauge\n")
	fmt.Fprintf(&b, "dbbackup_scrape_timestamp{instance=%s} %d\n", inst, generated.Unix())
	return b.String()
}
// GenerateMetricsString returns metrics as a string (for HTTP endpoint).
// It collects the metrics from the catalog and renders them; a collection
// error is returned unchanged together with an empty string.
func (m *MetricsWriter) GenerateMetricsString() (string, error) {
	collected, err := m.collectMetrics()
	if err == nil {
		return m.formatMetrics(collected), nil
	}
	return "", err
}