feat: add embedded systemd installer and Prometheus metrics
Some checks failed
CI/CD / Test (push) Successful in 2m42s
CI/CD / Lint (push) Successful in 2m50s
CI/CD / Build (amd64, darwin) (push) Successful in 2m0s
CI/CD / Build (amd64, linux) (push) Successful in 1m58s
CI/CD / Build (arm64, darwin) (push) Successful in 2m1s
CI/CD / Build (arm64, linux) (push) Has been cancelled
Some checks failed
CI/CD / Test (push) Successful in 2m42s
CI/CD / Lint (push) Successful in 2m50s
CI/CD / Build (amd64, darwin) (push) Successful in 2m0s
CI/CD / Build (amd64, linux) (push) Successful in 1m58s
CI/CD / Build (arm64, darwin) (push) Successful in 2m1s
CI/CD / Build (arm64, linux) (push) Has been cancelled
Systemd Integration:
- New 'dbbackup install' command creates service/timer units
- Supports single-database and cluster backup modes
- Automatic dbbackup user/group creation with proper permissions
- Hardened service units with security features
- Template units with configurable OnCalendar schedules
- 'dbbackup uninstall' for clean removal

Prometheus Metrics:
- 'dbbackup metrics export' for textfile collector format
- 'dbbackup metrics serve' runs HTTP exporter on port 9399
- Metrics: last_success_timestamp, rpo_seconds, backup_total, etc.
- Integration with node_exporter textfile collector
- --with-metrics flag during install

Technical:
- Systemd templates embedded with //go:embed
- Service units include ReadWritePaths, OOMScoreAdjust
- Metrics exporter caches with 30s TTL
- Graceful shutdown on SIGTERM
This commit is contained in:
245
internal/prometheus/textfile.go
Normal file
245
internal/prometheus/textfile.go
Normal file
@@ -0,0 +1,245 @@
|
||||
// Package prometheus provides Prometheus metrics for dbbackup
|
||||
package prometheus
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"dbbackup/internal/catalog"
|
||||
"dbbackup/internal/logger"
|
||||
)
|
||||
|
||||
// MetricsWriter writes metrics in Prometheus text format
|
||||
type MetricsWriter struct {
|
||||
log logger.Logger
|
||||
catalog catalog.Catalog
|
||||
instance string
|
||||
}
|
||||
|
||||
// NewMetricsWriter creates a new MetricsWriter
|
||||
func NewMetricsWriter(log logger.Logger, cat catalog.Catalog, instance string) *MetricsWriter {
|
||||
return &MetricsWriter{
|
||||
log: log,
|
||||
catalog: cat,
|
||||
instance: instance,
|
||||
}
|
||||
}
|
||||
|
||||
// BackupMetrics is the per-database aggregate computed from catalog entries
// and rendered as Prometheus samples.
type BackupMetrics struct {
	// Database is the database name; entries without a name are grouped
	// under "unknown".
	Database string
	// Engine is the database type of the most recent successful backup, or of
	// the first entry seen when no backup has succeeded.
	Engine string

	// LastSuccess, LastDuration and LastSize describe the most recent
	// successful backup. LastSuccess is the zero time when none succeeded.
	LastSuccess  time.Time
	LastDuration time.Duration
	LastSize     int64

	// Counters over the catalog entries examined for this database.
	TotalBackups, SuccessCount, FailureCount int

	// Verified reports whether the most recent successful backup has a
	// verification result that is valid.
	Verified bool
	// RPOSeconds is the age of the most recent successful backup, in seconds,
	// at collection time. It stays 0 when no backup has succeeded.
	RPOSeconds float64
}
|
||||
|
||||
// WriteTextfile writes metrics to a Prometheus textfile collector file
|
||||
func (m *MetricsWriter) WriteTextfile(path string) error {
|
||||
metrics, err := m.collectMetrics()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to collect metrics: %w", err)
|
||||
}
|
||||
|
||||
output := m.formatMetrics(metrics)
|
||||
|
||||
// Atomic write: write to temp file, then rename
|
||||
dir := filepath.Dir(path)
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create directory %s: %w", dir, err)
|
||||
}
|
||||
|
||||
tmpPath := path + ".tmp"
|
||||
if err := os.WriteFile(tmpPath, []byte(output), 0644); err != nil {
|
||||
return fmt.Errorf("failed to write temp file: %w", err)
|
||||
}
|
||||
|
||||
if err := os.Rename(tmpPath, path); err != nil {
|
||||
os.Remove(tmpPath)
|
||||
return fmt.Errorf("failed to rename temp file: %w", err)
|
||||
}
|
||||
|
||||
m.log.Debug("Wrote metrics to textfile", "path", path, "databases", len(metrics))
|
||||
return nil
|
||||
}
|
||||
|
||||
// collectMetrics gathers metrics from the catalog
|
||||
func (m *MetricsWriter) collectMetrics() ([]BackupMetrics, error) {
|
||||
if m.catalog == nil {
|
||||
return nil, fmt.Errorf("catalog not available")
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// Get recent backups using Search with limit
|
||||
query := &catalog.SearchQuery{
|
||||
Limit: 1000,
|
||||
}
|
||||
entries, err := m.catalog.Search(ctx, query)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to search backups: %w", err)
|
||||
}
|
||||
|
||||
// Group by database
|
||||
byDB := make(map[string]*BackupMetrics)
|
||||
|
||||
for _, e := range entries {
|
||||
key := e.Database
|
||||
if key == "" {
|
||||
key = "unknown"
|
||||
}
|
||||
|
||||
metrics, ok := byDB[key]
|
||||
if !ok {
|
||||
metrics = &BackupMetrics{
|
||||
Database: key,
|
||||
Engine: e.DatabaseType,
|
||||
}
|
||||
byDB[key] = metrics
|
||||
}
|
||||
|
||||
metrics.TotalBackups++
|
||||
|
||||
isSuccess := e.Status == catalog.StatusCompleted || e.Status == catalog.StatusVerified
|
||||
if isSuccess {
|
||||
metrics.SuccessCount++
|
||||
// Track most recent success
|
||||
if e.CreatedAt.After(metrics.LastSuccess) {
|
||||
metrics.LastSuccess = e.CreatedAt
|
||||
metrics.LastDuration = time.Duration(e.Duration * float64(time.Second))
|
||||
metrics.LastSize = e.SizeBytes
|
||||
metrics.Verified = e.VerifiedAt != nil && e.VerifyValid != nil && *e.VerifyValid
|
||||
metrics.Engine = e.DatabaseType
|
||||
}
|
||||
} else {
|
||||
metrics.FailureCount++
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate RPO for each database
|
||||
now := time.Now()
|
||||
for _, metrics := range byDB {
|
||||
if !metrics.LastSuccess.IsZero() {
|
||||
metrics.RPOSeconds = now.Sub(metrics.LastSuccess).Seconds()
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to slice and sort
|
||||
result := make([]BackupMetrics, 0, len(byDB))
|
||||
for _, metrics := range byDB {
|
||||
result = append(result, *metrics)
|
||||
}
|
||||
sort.Slice(result, func(i, j int) bool {
|
||||
return result[i].Database < result[j].Database
|
||||
})
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// formatMetrics formats metrics in Prometheus exposition format
|
||||
func (m *MetricsWriter) formatMetrics(metrics []BackupMetrics) string {
|
||||
var b strings.Builder
|
||||
|
||||
// Timestamp of metrics generation
|
||||
now := time.Now().Unix()
|
||||
|
||||
// Header comment
|
||||
b.WriteString("# DBBackup Prometheus Metrics\n")
|
||||
b.WriteString(fmt.Sprintf("# Generated at: %s\n", time.Now().Format(time.RFC3339)))
|
||||
b.WriteString(fmt.Sprintf("# Instance: %s\n", m.instance))
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_last_success_timestamp
|
||||
b.WriteString("# HELP dbbackup_last_success_timestamp Unix timestamp of last successful backup\n")
|
||||
b.WriteString("# TYPE dbbackup_last_success_timestamp gauge\n")
|
||||
for _, met := range metrics {
|
||||
if !met.LastSuccess.IsZero() {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_last_success_timestamp{instance=%q,database=%q,engine=%q} %d\n",
|
||||
m.instance, met.Database, met.Engine, met.LastSuccess.Unix()))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_last_backup_duration_seconds
|
||||
b.WriteString("# HELP dbbackup_last_backup_duration_seconds Duration of last successful backup in seconds\n")
|
||||
b.WriteString("# TYPE dbbackup_last_backup_duration_seconds gauge\n")
|
||||
for _, met := range metrics {
|
||||
if met.LastDuration > 0 {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_last_backup_duration_seconds{instance=%q,database=%q,engine=%q} %.2f\n",
|
||||
m.instance, met.Database, met.Engine, met.LastDuration.Seconds()))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_last_backup_size_bytes
|
||||
b.WriteString("# HELP dbbackup_last_backup_size_bytes Size of last successful backup in bytes\n")
|
||||
b.WriteString("# TYPE dbbackup_last_backup_size_bytes gauge\n")
|
||||
for _, met := range metrics {
|
||||
if met.LastSize > 0 {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_last_backup_size_bytes{instance=%q,database=%q,engine=%q} %d\n",
|
||||
m.instance, met.Database, met.Engine, met.LastSize))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_backup_total (counter)
|
||||
b.WriteString("# HELP dbbackup_backup_total Total number of backup attempts\n")
|
||||
b.WriteString("# TYPE dbbackup_backup_total counter\n")
|
||||
for _, met := range metrics {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_backup_total{instance=%q,database=%q,status=\"success\"} %d\n",
|
||||
m.instance, met.Database, met.SuccessCount))
|
||||
b.WriteString(fmt.Sprintf("dbbackup_backup_total{instance=%q,database=%q,status=\"failure\"} %d\n",
|
||||
m.instance, met.Database, met.FailureCount))
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_rpo_seconds
|
||||
b.WriteString("# HELP dbbackup_rpo_seconds Recovery Point Objective - seconds since last successful backup\n")
|
||||
b.WriteString("# TYPE dbbackup_rpo_seconds gauge\n")
|
||||
for _, met := range metrics {
|
||||
if met.RPOSeconds > 0 {
|
||||
b.WriteString(fmt.Sprintf("dbbackup_rpo_seconds{instance=%q,database=%q} %.0f\n",
|
||||
m.instance, met.Database, met.RPOSeconds))
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_backup_verified
|
||||
b.WriteString("# HELP dbbackup_backup_verified Whether the last backup was verified (1=yes, 0=no)\n")
|
||||
b.WriteString("# TYPE dbbackup_backup_verified gauge\n")
|
||||
for _, met := range metrics {
|
||||
verified := 0
|
||||
if met.Verified {
|
||||
verified = 1
|
||||
}
|
||||
b.WriteString(fmt.Sprintf("dbbackup_backup_verified{instance=%q,database=%q} %d\n",
|
||||
m.instance, met.Database, verified))
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// dbbackup_scrape_timestamp
|
||||
b.WriteString("# HELP dbbackup_scrape_timestamp Unix timestamp when metrics were collected\n")
|
||||
b.WriteString("# TYPE dbbackup_scrape_timestamp gauge\n")
|
||||
b.WriteString(fmt.Sprintf("dbbackup_scrape_timestamp{instance=%q} %d\n", m.instance, now))
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// GenerateMetricsString returns metrics as a string (for HTTP endpoint)
|
||||
func (m *MetricsWriter) GenerateMetricsString() (string, error) {
|
||||
metrics, err := m.collectMetrics()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return m.formatMetrics(metrics), nil
|
||||
}
|
||||
Reference in New Issue
Block a user