diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f891e7..fd7cf09 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,55 @@ All notable changes to dbbackup will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.42.0] - 2026-01-07 "The Operator"
+
+### Added - 🐧 Systemd Integration & Prometheus Metrics
+
+**Embedded Systemd Installer:**
+- New `dbbackup install` command installs dbbackup as a systemd service and timer
+- Supports single-database (`--backup-type single`) and cluster (`--backup-type cluster`) modes
+- Automatic `dbbackup` user/group creation with proper permissions
+- Hardened service units with security features (NoNewPrivileges, ProtectSystem, CapabilityBoundingSet)
+- Templated timer units with configurable schedules (daily, weekly, or custom OnCalendar)
+- Built-in dry-run mode (`--dry-run`) to preview installation
+- `dbbackup install --status` shows current installation state
+- `dbbackup uninstall` cleanly removes all systemd units and, with `--purge`, configuration
+
+**Prometheus Metrics Support:**
+- New `dbbackup metrics export` command writes node_exporter textfile collector format
+- New `dbbackup metrics serve` command runs an HTTP exporter on port 9399
+- Metrics: `dbbackup_last_success_timestamp`, `dbbackup_rpo_seconds`, `dbbackup_backup_total`, etc.
+- Integration with node_exporter textfile collector
+- Metrics automatically updated via ExecStopPost in service units
+- `--with-metrics` flag during install sets up the exporter as a systemd service
+
+**New Commands:**
+```bash
+# Install as systemd service
+sudo dbbackup install --backup-type cluster --schedule daily
+
+# Install with Prometheus metrics
+sudo dbbackup install --with-metrics --metrics-port 9399
+
+# Check installation status
+dbbackup install --status
+
+# Export metrics for node_exporter
+dbbackup metrics export --output /var/lib/dbbackup/metrics/dbbackup.prom
+
+# Run HTTP metrics server
+dbbackup metrics serve --port 9399
+```
+
+### Technical Details
+- Systemd templates embedded with `//go:embed` for a self-contained binary
+- Templates use ReadWritePaths for security isolation
+- Service units set OOMScoreAdjust (-500) to protect running backups from the OOM killer
+- Metrics exporter caches collected metrics with a 30-second TTL for performance
+- Graceful shutdown on SIGTERM for the metrics server
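+
+**Verification quick-start (illustrative):**
+
+The commands below sketch one way to confirm a fresh install end to end; they assume the default unit names, paths, and port shown above.
+
+```bash
+# Confirm the timer is scheduled, and preview how an OnCalendar expression fires
+systemctl list-timers 'dbbackup*'
+systemd-analyze calendar "*-*-* 02:00:00"
+
+# Spot-check the HTTP exporter and the exported textfile
+curl -s http://localhost:9399/metrics | grep dbbackup_rpo_seconds
+cat /var/lib/dbbackup/metrics/dbbackup.prom
+```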
+
+---
+
 ## [3.41.0] - 2026-01-07 "The Pre-Flight Check"
 
 ### Added - 🛡️ Pre-Restore Validation
diff --git a/cmd/install.go b/cmd/install.go
new file mode 100644
index 0000000..58b5839
--- /dev/null
+++ b/cmd/install.go
@@ -0,0 +1,237 @@
+package cmd
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/signal"
+	"strings"
+	"syscall"
+
+	"dbbackup/internal/installer"
+
+	"github.com/spf13/cobra"
+)
+
+var (
+	// Install flags
+	installInstance    string
+	installSchedule    string
+	installBackupType  string
+	installUser        string
+	installGroup       string
+	installBackupDir   string
+	installConfigPath  string
+	installTimeout     int
+	installWithMetrics bool
+	installMetricsPort int
+	installDryRun      bool
+	installStatus      bool
+
+	// Uninstall flags
+	uninstallPurge bool
+)
+
+// installCmd represents the install command
+var installCmd = &cobra.Command{
+	Use:   "install",
+	Short: "Install dbbackup as a systemd service",
+	Long: `Install dbbackup as a systemd service with automatic scheduling.
+
+This command creates systemd service and timer units for automated database backups.
+It supports both single database and cluster backup modes.
+
+Examples:
+  # Interactive installation (will prompt for options)
+  sudo dbbackup install
+
+  # Install cluster backup running daily at 2am
+  sudo dbbackup install --backup-type cluster --schedule "daily"
+
+  # Install single database backup with custom schedule
+  sudo dbbackup install --instance production --backup-type single --schedule "*-*-* 03:00:00"
+
+  # Install with Prometheus metrics exporter
+  sudo dbbackup install --with-metrics --metrics-port 9399
+
+  # Check installation status
+  dbbackup install --status
+
+  # Dry-run to see what would be installed
+  sudo dbbackup install --dry-run
+
+Schedule format (OnCalendar; shortcuts are expanded by dbbackup):
+  hourly          - Every hour on the hour
+  daily           - Every day at 2am
+  weekly          - Every Monday at 2am
+  monthly         - 1st of each month at 2am
+  *-*-* 02:00:00  - Every day at 2am
+  *-*-* 02,14:00  - Twice daily at 2am and 2pm
+  Mon *-*-* 03:00 - Every Monday at 3am
+`,
+	RunE: func(cmd *cobra.Command, args []string) error {
+		// Handle --status flag
+		if installStatus {
+			return runInstallStatus(cmd.Context())
+		}
+
+		return runInstall(cmd.Context())
+	},
+}
+
+// uninstallCmd represents the uninstall command
+var uninstallCmd = &cobra.Command{
+	Use:   "uninstall [instance]",
+	Short: "Uninstall dbbackup systemd service",
+	Long: `Uninstall dbbackup systemd service and timer.
+
+Examples:
+  # Uninstall default instance
+  sudo dbbackup uninstall
+
+  # Uninstall specific instance
+  sudo dbbackup uninstall production
+
+  # Uninstall and remove all configuration
+  sudo dbbackup uninstall --purge
+`,
+	RunE: func(cmd *cobra.Command, args []string) error {
+		instance := "cluster"
+		if len(args) > 0 {
+			instance = args[0]
+		}
+		return runUninstall(cmd.Context(), instance)
+	},
+}
+
+func init() {
+	rootCmd.AddCommand(installCmd)
+	rootCmd.AddCommand(uninstallCmd)
+
+	// Install flags
+	installCmd.Flags().StringVarP(&installInstance, "instance", "i", "", "Instance name (e.g., production, staging)")
+	installCmd.Flags().StringVarP(&installSchedule, "schedule", "s", "daily", "Backup schedule (OnCalendar format)")
+	installCmd.Flags().StringVarP(&installBackupType, "backup-type", "t", "cluster", "Backup type: single or cluster")
+	installCmd.Flags().StringVar(&installUser, "user", "dbbackup", "System user to run backups")
+	installCmd.Flags().StringVar(&installGroup, "group", "dbbackup", "System group for backup user")
+	installCmd.Flags().StringVar(&installBackupDir, "backup-dir", "/var/lib/dbbackup/backups", "Directory for backups")
+	installCmd.Flags().StringVar(&installConfigPath, "config-path", "/etc/dbbackup/dbbackup.conf", "Path to config file")
+	installCmd.Flags().IntVar(&installTimeout, "timeout", 3600, "Backup timeout in seconds")
+	installCmd.Flags().BoolVar(&installWithMetrics, "with-metrics", false, "Install Prometheus metrics exporter")
+	installCmd.Flags().IntVar(&installMetricsPort, "metrics-port", 9399, "Prometheus metrics port")
+	installCmd.Flags().BoolVar(&installDryRun, "dry-run", false, "Show what would be installed without making changes")
+	installCmd.Flags().BoolVar(&installStatus, "status", false, "Show installation status")
+
+	// Uninstall flags
+	uninstallCmd.Flags().BoolVar(&uninstallPurge, "purge", false, "Also remove configuration files")
+}
+
+func runInstall(ctx context.Context) error {
+	// Create context with signal handling
+	ctx, cancel := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM)
+	defer cancel()
+
+	// Expand schedule shortcuts
+	schedule := expandSchedule(installSchedule)
+
+	// Create installer
+	inst := installer.NewInstaller(log, installDryRun)
+
+	// Set up options
+	opts := 
installer.InstallOptions{ + Instance: installInstance, + BackupType: installBackupType, + Schedule: schedule, + User: installUser, + Group: installGroup, + BackupDir: installBackupDir, + ConfigPath: installConfigPath, + TimeoutSeconds: installTimeout, + WithMetrics: installWithMetrics, + MetricsPort: installMetricsPort, + } + + // For cluster backup, override instance + if installBackupType == "cluster" { + opts.Instance = "cluster" + } + + return inst.Install(ctx, opts) +} + +func runUninstall(ctx context.Context, instance string) error { + ctx, cancel := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM) + defer cancel() + + inst := installer.NewInstaller(log, false) + return inst.Uninstall(ctx, instance, uninstallPurge) +} + +func runInstallStatus(ctx context.Context) error { + inst := installer.NewInstaller(log, false) + + // Check cluster status + clusterStatus, err := inst.Status(ctx, "cluster") + if err != nil { + return err + } + + fmt.Println() + fmt.Println("📦 DBBackup Installation Status") + fmt.Println(strings.Repeat("═", 50)) + + if clusterStatus.Installed { + fmt.Println() + fmt.Println("🔹 Cluster Backup:") + fmt.Printf(" Service: %s\n", formatStatus(clusterStatus.Installed, clusterStatus.Active)) + fmt.Printf(" Timer: %s\n", formatStatus(clusterStatus.TimerEnabled, clusterStatus.TimerActive)) + if clusterStatus.NextRun != "" { + fmt.Printf(" Next run: %s\n", clusterStatus.NextRun) + } + if clusterStatus.LastRun != "" { + fmt.Printf(" Last run: %s\n", clusterStatus.LastRun) + } + } else { + fmt.Println() + fmt.Println("❌ No systemd services installed") + fmt.Println() + fmt.Println("Run 'sudo dbbackup install' to install as a systemd service") + } + + // Check for exporter + if _, err := os.Stat("/etc/systemd/system/dbbackup-exporter.service"); err == nil { + exporterStatus, err := inst.Status(ctx, "exporter") + fmt.Println() + fmt.Println("🔹 Metrics Exporter:") + if err == nil && exporterStatus != nil { + fmt.Printf(" Service: %s\n", formatStatus(true, exporterStatus.Active)) + } else { + fmt.Printf(" Service: installed (status unknown)\n") + } + } + + fmt.Println() + return nil +} + +func formatStatus(installed, active bool) string { + if !installed { + return "not installed" + } + if active { + return "✅ active" + } + return "⚪ inactive" +} + +func expandSchedule(schedule string) string { + shortcuts := map[string]string{ + "hourly": "*-*-* *:00:00", + "daily": "*-*-* 02:00:00", + "weekly": "Mon *-*-* 02:00:00", + "monthly": "*-*-01 02:00:00", + } + + if expanded, ok := shortcuts[strings.ToLower(schedule)]; ok { + return expanded + } + return schedule +} diff --git a/cmd/metrics.go b/cmd/metrics.go new file mode 100644 index 0000000..4ea2a94 --- /dev/null +++ b/cmd/metrics.go @@ -0,0 +1,138 @@ +package cmd + +import ( + "context" + "fmt" + "os" + "os/signal" + "syscall" + + "dbbackup/internal/prometheus" + + "github.com/spf13/cobra" +) + +var ( + metricsInstance string + metricsOutput string + metricsPort int +) + +// metricsCmd represents the metrics command +var metricsCmd = &cobra.Command{ + Use: "metrics", + Short: "Prometheus metrics management", + Long: `Prometheus metrics management for dbbackup. + +Export metrics to a textfile for node_exporter, or run an HTTP server +for direct Prometheus scraping.`, +} + +// metricsExportCmd exports metrics to a textfile +var metricsExportCmd = &cobra.Command{ + Use: "export", + Short: "Export metrics to textfile", + Long: `Export Prometheus metrics to a textfile for node_exporter. 
+ +The textfile collector in node_exporter can scrape metrics from files +in a designated directory (typically /var/lib/node_exporter/textfile_collector/). + +Examples: + # Export metrics to default location + dbbackup metrics export + + # Export with custom output path + dbbackup metrics export --output /var/lib/dbbackup/metrics/dbbackup.prom + + # Export for specific instance + dbbackup metrics export --instance production --output /var/lib/dbbackup/metrics/production.prom + +After export, configure node_exporter with: + --collector.textfile.directory=/var/lib/dbbackup/metrics/ +`, + RunE: func(cmd *cobra.Command, args []string) error { + return runMetricsExport(cmd.Context()) + }, +} + +// metricsServeCmd runs the HTTP metrics server +var metricsServeCmd = &cobra.Command{ + Use: "serve", + Short: "Run Prometheus HTTP server", + Long: `Run an HTTP server exposing Prometheus metrics. + +This starts a long-running daemon that serves metrics at /metrics. +Prometheus can scrape this endpoint directly. + +Examples: + # Start server on default port 9399 + dbbackup metrics serve + + # Start server on custom port + dbbackup metrics serve --port 9100 + + # Run as systemd service (installed via 'dbbackup install --with-metrics') + sudo systemctl start dbbackup-exporter + +Endpoints: + /metrics - Prometheus metrics + /health - Health check (returns 200 OK) + / - Service info page +`, + RunE: func(cmd *cobra.Command, args []string) error { + return runMetricsServe(cmd.Context()) + }, +} + +func init() { + rootCmd.AddCommand(metricsCmd) + metricsCmd.AddCommand(metricsExportCmd) + metricsCmd.AddCommand(metricsServeCmd) + + // Export flags + metricsExportCmd.Flags().StringVar(&metricsInstance, "instance", "default", "Instance name for metrics labels") + metricsExportCmd.Flags().StringVarP(&metricsOutput, "output", "o", "/var/lib/dbbackup/metrics/dbbackup.prom", "Output file path") + + // Serve flags + metricsServeCmd.Flags().StringVar(&metricsInstance, "instance", "default", "Instance name for metrics labels") + metricsServeCmd.Flags().IntVarP(&metricsPort, "port", "p", 9399, "HTTP server port") +} + +func runMetricsExport(ctx context.Context) error { + // Open catalog + cat, err := openCatalog() + if err != nil { + return fmt.Errorf("failed to open catalog: %w", err) + } + defer cat.Close() + + // Create metrics writer + writer := prometheus.NewMetricsWriter(log, cat, metricsInstance) + + // Write textfile + if err := writer.WriteTextfile(metricsOutput); err != nil { + return fmt.Errorf("failed to write metrics: %w", err) + } + + log.Info("Exported metrics to textfile", "path", metricsOutput, "instance", metricsInstance) + return nil +} + +func runMetricsServe(ctx context.Context) error { + // Setup signal handling + ctx, cancel := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM) + defer cancel() + + // Open catalog + cat, err := openCatalog() + if err != nil { + return fmt.Errorf("failed to open catalog: %w", err) + } + defer cat.Close() + + // Create exporter + exporter := prometheus.NewExporter(log, cat, metricsInstance, metricsPort) + + // Run server (blocks until context is cancelled) + return exporter.Serve(ctx) +} diff --git a/internal/installer/embed.go b/internal/installer/embed.go new file mode 100644 index 0000000..c95e0ad --- /dev/null +++ b/internal/installer/embed.go @@ -0,0 +1,11 @@ +// Package installer provides systemd service installation for dbbackup +package installer + +import ( + "embed" +) + +// Templates contains embedded systemd unit files +// +//go:embed 
templates/*.service templates/*.timer +var Templates embed.FS diff --git a/internal/installer/installer.go b/internal/installer/installer.go new file mode 100644 index 0000000..409132a --- /dev/null +++ b/internal/installer/installer.go @@ -0,0 +1,634 @@ +// Package installer provides systemd service installation for dbbackup +package installer + +import ( + "context" + "fmt" + "os" + "os/exec" + "os/user" + "path/filepath" + "runtime" + "strings" + "text/template" + + "dbbackup/internal/logger" +) + +// Installer handles systemd service installation +type Installer struct { + log logger.Logger + unitDir string // /etc/systemd/system or custom + dryRun bool +} + +// InstallOptions configures the installation +type InstallOptions struct { + // Instance name (e.g., "production", "staging") + Instance string + + // Binary path (auto-detected if empty) + BinaryPath string + + // Backup configuration + BackupType string // "single" or "cluster" + Schedule string // OnCalendar format, e.g., "daily", "*-*-* 02:00:00" + + // Service user/group + User string + Group string + + // Paths + BackupDir string + ConfigPath string + + // Timeout in seconds (default: 3600) + TimeoutSeconds int + + // Metrics + WithMetrics bool + MetricsPort int +} + +// ServiceStatus contains information about installed services +type ServiceStatus struct { + Installed bool + Enabled bool + Active bool + TimerEnabled bool + TimerActive bool + LastRun string + NextRun string + ServicePath string + TimerPath string + ExporterPath string +} + +// NewInstaller creates a new Installer +func NewInstaller(log logger.Logger, dryRun bool) *Installer { + return &Installer{ + log: log, + unitDir: "/etc/systemd/system", + dryRun: dryRun, + } +} + +// SetUnitDir allows overriding the systemd unit directory (for testing) +func (i *Installer) SetUnitDir(dir string) { + i.unitDir = dir +} + +// Install installs the systemd service and timer +func (i *Installer) Install(ctx context.Context, opts InstallOptions) error { + // Validate platform + if runtime.GOOS != "linux" { + return fmt.Errorf("systemd installation only supported on Linux (current: %s)", runtime.GOOS) + } + + // Validate prerequisites + if err := i.validatePrerequisites(); err != nil { + return err + } + + // Set defaults + if err := i.setDefaults(&opts); err != nil { + return err + } + + // Create user if needed + if err := i.ensureUser(opts.User, opts.Group); err != nil { + return err + } + + // Create directories + if err := i.createDirectories(opts); err != nil { + return err + } + + // Write service and timer files + if err := i.writeUnitFiles(opts); err != nil { + return err + } + + // Reload systemd + if err := i.systemctl(ctx, "daemon-reload"); err != nil { + return err + } + + // Enable timer + timerName := i.getTimerName(opts) + if err := i.systemctl(ctx, "enable", timerName); err != nil { + return err + } + + // Install metrics exporter if requested + if opts.WithMetrics { + if err := i.installExporter(ctx, opts); err != nil { + i.log.Warn("Failed to install metrics exporter", "error", err) + } + } + + i.log.Info("Installation complete", + "instance", opts.Instance, + "timer", timerName, + "schedule", opts.Schedule) + + i.printNextSteps(opts) + + return nil +} + +// Uninstall removes the systemd service and timer +func (i *Installer) Uninstall(ctx context.Context, instance string, purge bool) error { + if runtime.GOOS != "linux" { + return fmt.Errorf("systemd uninstallation only supported on Linux") + } + + if err := i.validatePrerequisites(); err != nil { + 
return err + } + + // Determine service names + var serviceName, timerName string + if instance == "cluster" || instance == "" { + serviceName = "dbbackup-cluster.service" + timerName = "dbbackup-cluster.timer" + } else { + serviceName = fmt.Sprintf("dbbackup@%s.service", instance) + timerName = fmt.Sprintf("dbbackup@%s.timer", instance) + } + + // Stop and disable timer + _ = i.systemctl(ctx, "stop", timerName) + _ = i.systemctl(ctx, "disable", timerName) + + // Stop and disable service + _ = i.systemctl(ctx, "stop", serviceName) + _ = i.systemctl(ctx, "disable", serviceName) + + // Remove unit files + servicePath := filepath.Join(i.unitDir, serviceName) + timerPath := filepath.Join(i.unitDir, timerName) + + if !i.dryRun { + os.Remove(servicePath) + os.Remove(timerPath) + } else { + i.log.Info("Would remove", "service", servicePath) + i.log.Info("Would remove", "timer", timerPath) + } + + // Also try to remove template units if they exist + if instance != "cluster" && instance != "" { + templateService := filepath.Join(i.unitDir, "dbbackup@.service") + templateTimer := filepath.Join(i.unitDir, "dbbackup@.timer") + + // Only remove templates if no other instances are using them + if i.canRemoveTemplates() { + if !i.dryRun { + os.Remove(templateService) + os.Remove(templateTimer) + } + } + } + + // Remove exporter + exporterPath := filepath.Join(i.unitDir, "dbbackup-exporter.service") + _ = i.systemctl(ctx, "stop", "dbbackup-exporter.service") + _ = i.systemctl(ctx, "disable", "dbbackup-exporter.service") + if !i.dryRun { + os.Remove(exporterPath) + } + + // Reload systemd + _ = i.systemctl(ctx, "daemon-reload") + + // Purge config files if requested + if purge { + configDirs := []string{ + "/etc/dbbackup", + "/var/lib/dbbackup", + } + for _, dir := range configDirs { + if !i.dryRun { + if err := os.RemoveAll(dir); err != nil { + i.log.Warn("Failed to remove directory", "path", dir, "error", err) + } else { + i.log.Info("Removed directory", "path", dir) + } + } else { + i.log.Info("Would remove directory", "path", dir) + } + } + } + + i.log.Info("Uninstallation complete", "instance", instance, "purge", purge) + return nil +} + +// Status returns the current installation status +func (i *Installer) Status(ctx context.Context, instance string) (*ServiceStatus, error) { + if runtime.GOOS != "linux" { + return nil, fmt.Errorf("systemd status only supported on Linux") + } + + status := &ServiceStatus{} + + // Determine service names + var serviceName, timerName string + if instance == "cluster" || instance == "" { + serviceName = "dbbackup-cluster.service" + timerName = "dbbackup-cluster.timer" + } else { + serviceName = fmt.Sprintf("dbbackup@%s.service", instance) + timerName = fmt.Sprintf("dbbackup@%s.timer", instance) + } + + // Check service file exists + status.ServicePath = filepath.Join(i.unitDir, serviceName) + if _, err := os.Stat(status.ServicePath); err == nil { + status.Installed = true + } + + // Check timer file exists + status.TimerPath = filepath.Join(i.unitDir, timerName) + + // Check exporter + status.ExporterPath = filepath.Join(i.unitDir, "dbbackup-exporter.service") + + // Check enabled/active status + if status.Installed { + status.Enabled = i.isEnabled(ctx, serviceName) + status.Active = i.isActive(ctx, serviceName) + status.TimerEnabled = i.isEnabled(ctx, timerName) + status.TimerActive = i.isActive(ctx, timerName) + + // Get timer info + status.NextRun = i.getTimerNext(ctx, timerName) + status.LastRun = i.getTimerLast(ctx, timerName) + } + + return status, nil +} + +// 
validatePrerequisites checks system requirements +func (i *Installer) validatePrerequisites() error { + // Check root + if os.Getuid() != 0 { + return fmt.Errorf("installation requires root privileges (use sudo)") + } + + // Check systemd + if _, err := exec.LookPath("systemctl"); err != nil { + return fmt.Errorf("systemctl not found - is this a systemd-based system?") + } + + // Check for container environment + if _, err := os.Stat("/.dockerenv"); err == nil { + i.log.Warn("Running inside Docker container - systemd may not work correctly") + } + + return nil +} + +// setDefaults fills in default values +func (i *Installer) setDefaults(opts *InstallOptions) error { + // Auto-detect binary path + if opts.BinaryPath == "" { + binPath, err := os.Executable() + if err != nil { + return fmt.Errorf("failed to detect binary path: %w", err) + } + binPath, err = filepath.EvalSymlinks(binPath) + if err != nil { + return fmt.Errorf("failed to resolve binary path: %w", err) + } + opts.BinaryPath = binPath + } + + // Default instance + if opts.Instance == "" { + opts.Instance = "default" + } + + // Default backup type + if opts.BackupType == "" { + opts.BackupType = "single" + } + + // Default schedule (daily at 2am) + if opts.Schedule == "" { + opts.Schedule = "*-*-* 02:00:00" + } + + // Default user/group + if opts.User == "" { + opts.User = "dbbackup" + } + if opts.Group == "" { + opts.Group = "dbbackup" + } + + // Default paths + if opts.BackupDir == "" { + opts.BackupDir = "/var/lib/dbbackup/backups" + } + if opts.ConfigPath == "" { + opts.ConfigPath = "/etc/dbbackup/dbbackup.conf" + } + + // Default timeout (1 hour) + if opts.TimeoutSeconds == 0 { + opts.TimeoutSeconds = 3600 + } + + // Default metrics port + if opts.MetricsPort == 0 { + opts.MetricsPort = 9399 + } + + return nil +} + +// ensureUser creates the service user if it doesn't exist +func (i *Installer) ensureUser(username, groupname string) error { + // Check if user exists + if _, err := user.Lookup(username); err == nil { + i.log.Debug("User already exists", "user", username) + return nil + } + + if i.dryRun { + i.log.Info("Would create user", "user", username, "group", groupname) + return nil + } + + // Create group first + groupCmd := exec.Command("groupadd", "--system", groupname) + if output, err := groupCmd.CombinedOutput(); err != nil { + // Ignore if group already exists + if !strings.Contains(string(output), "already exists") { + i.log.Debug("Group creation output", "output", string(output)) + } + } + + // Create user + userCmd := exec.Command("useradd", + "--system", + "--shell", "/usr/sbin/nologin", + "--home-dir", "/var/lib/dbbackup", + "--gid", groupname, + username) + + if output, err := userCmd.CombinedOutput(); err != nil { + if !strings.Contains(string(output), "already exists") { + return fmt.Errorf("failed to create user %s: %w (%s)", username, err, output) + } + } + + i.log.Info("Created system user", "user", username, "group", groupname) + return nil +} + +// createDirectories creates required directories +func (i *Installer) createDirectories(opts InstallOptions) error { + dirs := []struct { + path string + mode os.FileMode + }{ + {"/etc/dbbackup", 0755}, + {"/etc/dbbackup/env.d", 0700}, + {"/var/lib/dbbackup", 0750}, + {"/var/lib/dbbackup/backups", 0750}, + {"/var/lib/dbbackup/metrics", 0755}, + {"/var/log/dbbackup", 0750}, + {opts.BackupDir, 0750}, + } + + for _, d := range dirs { + if i.dryRun { + i.log.Info("Would create directory", "path", d.path, "mode", d.mode) + continue + } + + if err := 
os.MkdirAll(d.path, d.mode); err != nil { + return fmt.Errorf("failed to create directory %s: %w", d.path, err) + } + + // Set ownership + u, err := user.Lookup(opts.User) + if err == nil { + var uid, gid int + fmt.Sscanf(u.Uid, "%d", &uid) + fmt.Sscanf(u.Gid, "%d", &gid) + os.Chown(d.path, uid, gid) + } + } + + return nil +} + +// writeUnitFiles renders and writes the systemd unit files +func (i *Installer) writeUnitFiles(opts InstallOptions) error { + // Prepare template data + data := map[string]interface{}{ + "User": opts.User, + "Group": opts.Group, + "BinaryPath": opts.BinaryPath, + "BackupType": opts.BackupType, + "BackupDir": opts.BackupDir, + "ConfigPath": opts.ConfigPath, + "TimeoutSeconds": opts.TimeoutSeconds, + "Schedule": opts.Schedule, + "MetricsPort": opts.MetricsPort, + } + + // Determine which templates to use + var serviceTemplate, timerTemplate string + var serviceName, timerName string + + if opts.BackupType == "cluster" { + serviceTemplate = "templates/dbbackup-cluster.service" + timerTemplate = "templates/dbbackup-cluster.timer" + serviceName = "dbbackup-cluster.service" + timerName = "dbbackup-cluster.timer" + } else { + serviceTemplate = "templates/dbbackup@.service" + timerTemplate = "templates/dbbackup@.timer" + serviceName = "dbbackup@.service" + timerName = "dbbackup@.timer" + } + + // Write service file + if err := i.writeTemplateFile(serviceTemplate, serviceName, data); err != nil { + return fmt.Errorf("failed to write service file: %w", err) + } + + // Write timer file + if err := i.writeTemplateFile(timerTemplate, timerName, data); err != nil { + return fmt.Errorf("failed to write timer file: %w", err) + } + + return nil +} + +// writeTemplateFile reads an embedded template and writes it to the unit directory +func (i *Installer) writeTemplateFile(templatePath, outputName string, data map[string]interface{}) error { + // Read template + content, err := Templates.ReadFile(templatePath) + if err != nil { + return fmt.Errorf("failed to read template %s: %w", templatePath, err) + } + + // Parse template + tmpl, err := template.New(outputName).Parse(string(content)) + if err != nil { + return fmt.Errorf("failed to parse template %s: %w", templatePath, err) + } + + // Render template + var buf strings.Builder + if err := tmpl.Execute(&buf, data); err != nil { + return fmt.Errorf("failed to render template %s: %w", templatePath, err) + } + + // Write file + outputPath := filepath.Join(i.unitDir, outputName) + if i.dryRun { + i.log.Info("Would write unit file", "path", outputPath) + i.log.Debug("Unit file content", "content", buf.String()) + return nil + } + + if err := os.WriteFile(outputPath, []byte(buf.String()), 0644); err != nil { + return fmt.Errorf("failed to write %s: %w", outputPath, err) + } + + i.log.Info("Created unit file", "path", outputPath) + return nil +} + +// installExporter installs the metrics exporter service +func (i *Installer) installExporter(ctx context.Context, opts InstallOptions) error { + data := map[string]interface{}{ + "User": opts.User, + "Group": opts.Group, + "BinaryPath": opts.BinaryPath, + "ConfigPath": opts.ConfigPath, + "MetricsPort": opts.MetricsPort, + } + + if err := i.writeTemplateFile("templates/dbbackup-exporter.service", "dbbackup-exporter.service", data); err != nil { + return err + } + + if err := i.systemctl(ctx, "daemon-reload"); err != nil { + return err + } + + if err := i.systemctl(ctx, "enable", "dbbackup-exporter.service"); err != nil { + return err + } + + if err := i.systemctl(ctx, "start", 
"dbbackup-exporter.service"); err != nil { + return err + } + + i.log.Info("Installed metrics exporter", "port", opts.MetricsPort) + return nil +} + +// getTimerName returns the timer unit name for the given options +func (i *Installer) getTimerName(opts InstallOptions) string { + if opts.BackupType == "cluster" { + return "dbbackup-cluster.timer" + } + return fmt.Sprintf("dbbackup@%s.timer", opts.Instance) +} + +// systemctl runs a systemctl command +func (i *Installer) systemctl(ctx context.Context, args ...string) error { + if i.dryRun { + i.log.Info("Would run: systemctl", "args", args) + return nil + } + + cmd := exec.CommandContext(ctx, "systemctl", args...) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("systemctl %v failed: %w\n%s", args, err, string(output)) + } + return nil +} + +// isEnabled checks if a unit is enabled +func (i *Installer) isEnabled(ctx context.Context, unit string) bool { + cmd := exec.CommandContext(ctx, "systemctl", "is-enabled", unit) + return cmd.Run() == nil +} + +// isActive checks if a unit is active +func (i *Installer) isActive(ctx context.Context, unit string) bool { + cmd := exec.CommandContext(ctx, "systemctl", "is-active", unit) + return cmd.Run() == nil +} + +// getTimerNext gets the next run time for a timer +func (i *Installer) getTimerNext(ctx context.Context, timer string) string { + cmd := exec.CommandContext(ctx, "systemctl", "show", timer, "--property=NextElapseUSecRealtime", "--value") + output, err := cmd.Output() + if err != nil { + return "" + } + return strings.TrimSpace(string(output)) +} + +// getTimerLast gets the last run time for a timer +func (i *Installer) getTimerLast(ctx context.Context, timer string) string { + cmd := exec.CommandContext(ctx, "systemctl", "show", timer, "--property=LastTriggerUSec", "--value") + output, err := cmd.Output() + if err != nil { + return "" + } + return strings.TrimSpace(string(output)) +} + +// canRemoveTemplates checks if template units can be safely removed +func (i *Installer) canRemoveTemplates() bool { + // Check if any dbbackup@*.service instances exist + pattern := filepath.Join(i.unitDir, "dbbackup@*.service") + matches, _ := filepath.Glob(pattern) + + // Also check for running instances + cmd := exec.Command("systemctl", "list-units", "--type=service", "--all", "dbbackup@*") + output, _ := cmd.Output() + + return len(matches) == 0 && !strings.Contains(string(output), "dbbackup@") +} + +// printNextSteps prints helpful next steps after installation +func (i *Installer) printNextSteps(opts InstallOptions) { + timerName := i.getTimerName(opts) + serviceName := strings.Replace(timerName, ".timer", ".service", 1) + + fmt.Println() + fmt.Println("✅ Installation successful!") + fmt.Println() + fmt.Println("📋 Next steps:") + fmt.Println() + fmt.Printf(" 1. Edit configuration: sudo nano %s\n", opts.ConfigPath) + fmt.Printf(" 2. Set credentials: sudo nano /etc/dbbackup/env.d/%s.conf\n", opts.Instance) + fmt.Printf(" 3. Start the timer: sudo systemctl start %s\n", timerName) + fmt.Printf(" 4. Verify timer status: sudo systemctl status %s\n", timerName) + fmt.Printf(" 5. 
Run backup manually: sudo systemctl start %s\n", serviceName) + fmt.Println() + fmt.Println("📊 View backup logs:") + fmt.Printf(" journalctl -u %s -f\n", serviceName) + fmt.Println() + + if opts.WithMetrics { + fmt.Println("📈 Prometheus metrics:") + fmt.Printf(" curl http://localhost:%d/metrics\n", opts.MetricsPort) + fmt.Println() + } +} diff --git a/internal/installer/templates/dbbackup-cluster.service b/internal/installer/templates/dbbackup-cluster.service new file mode 100644 index 0000000..be5f405 --- /dev/null +++ b/internal/installer/templates/dbbackup-cluster.service @@ -0,0 +1,47 @@ +[Unit] +Description=Database Cluster Backup +Documentation=https://github.com/PlusOne/dbbackup +After=network-online.target postgresql.service mysql.service mariadb.service +Wants=network-online.target + +[Service] +Type=oneshot +User={{.User}} +Group={{.Group}} + +# Security hardening +NoNewPrivileges=yes +ProtectSystem=strict +ProtectHome=read-only +PrivateTmp=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +ProtectControlGroups=yes +RestrictSUIDSGID=yes +RestrictRealtime=yes +LockPersonality=yes +RemoveIPC=yes +CapabilityBoundingSet= +AmbientCapabilities= + +# Directories +ReadWritePaths={{.BackupDir}} /var/lib/dbbackup /var/log/dbbackup + +# Network access for cloud uploads +RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 + +# Environment +EnvironmentFile=-/etc/dbbackup/env.d/cluster.conf + +# Execution - cluster backup (all databases) +ExecStart={{.BinaryPath}} backup cluster --config {{.ConfigPath}} +TimeoutStartSec={{.TimeoutSeconds}} + +# Post-backup metrics export +ExecStopPost=-{{.BinaryPath}} metrics export --instance cluster --output /var/lib/dbbackup/metrics/cluster.prom + +# OOM protection for large backups +OOMScoreAdjust=-500 + +[Install] +WantedBy=multi-user.target diff --git a/internal/installer/templates/dbbackup-cluster.timer b/internal/installer/templates/dbbackup-cluster.timer new file mode 100644 index 0000000..b9dca39 --- /dev/null +++ b/internal/installer/templates/dbbackup-cluster.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Database Cluster Backup Timer +Documentation=https://github.com/PlusOne/dbbackup + +[Timer] +OnCalendar={{.Schedule}} +Persistent=true +RandomizedDelaySec=1800 + +[Install] +WantedBy=timers.target diff --git a/internal/installer/templates/dbbackup-exporter.service b/internal/installer/templates/dbbackup-exporter.service new file mode 100644 index 0000000..e4b2a1f --- /dev/null +++ b/internal/installer/templates/dbbackup-exporter.service @@ -0,0 +1,37 @@ +[Unit] +Description=DBBackup Prometheus Metrics Exporter +Documentation=https://github.com/PlusOne/dbbackup +After=network-online.target + +[Service] +Type=simple +User={{.User}} +Group={{.Group}} + +# Security hardening +NoNewPrivileges=yes +ProtectSystem=strict +ProtectHome=yes +PrivateTmp=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +ProtectControlGroups=yes +RestrictSUIDSGID=yes +RestrictRealtime=yes +LockPersonality=yes +RemoveIPC=yes + +# Read-only access to catalog and backups +ReadOnlyPaths=/var/lib/dbbackup + +# Network for HTTP server +RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 + +# Execution +ExecStart={{.BinaryPath}} metrics serve --port {{.MetricsPort}} --config {{.ConfigPath}} +ExecReload=/bin/kill -HUP $MAINPID +Restart=on-failure +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/internal/installer/templates/dbbackup@.service b/internal/installer/templates/dbbackup@.service new file mode 100644 index 0000000..00d2785 --- /dev/null +++ 
b/internal/installer/templates/dbbackup@.service @@ -0,0 +1,47 @@ +[Unit] +Description=Database Backup for %i +Documentation=https://github.com/PlusOne/dbbackup +After=network-online.target postgresql.service mysql.service mariadb.service +Wants=network-online.target + +[Service] +Type=oneshot +User={{.User}} +Group={{.Group}} + +# Security hardening +NoNewPrivileges=yes +ProtectSystem=strict +ProtectHome=read-only +PrivateTmp=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +ProtectControlGroups=yes +RestrictSUIDSGID=yes +RestrictRealtime=yes +LockPersonality=yes +RemoveIPC=yes +CapabilityBoundingSet= +AmbientCapabilities= + +# Directories +ReadWritePaths={{.BackupDir}} /var/lib/dbbackup /var/log/dbbackup + +# Network access for cloud uploads +RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 + +# Environment +EnvironmentFile=-/etc/dbbackup/env.d/%i.conf + +# Execution +ExecStart={{.BinaryPath}} backup {{.BackupType}} %i --config {{.ConfigPath}} +TimeoutStartSec={{.TimeoutSeconds}} + +# Post-backup metrics export +ExecStopPost=-{{.BinaryPath}} metrics export --instance %i --output /var/lib/dbbackup/metrics/%i.prom + +# OOM protection for large backups +OOMScoreAdjust=-500 + +[Install] +WantedBy=multi-user.target diff --git a/internal/installer/templates/dbbackup@.timer b/internal/installer/templates/dbbackup@.timer new file mode 100644 index 0000000..f30a939 --- /dev/null +++ b/internal/installer/templates/dbbackup@.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Database Backup Timer for %i +Documentation=https://github.com/PlusOne/dbbackup + +[Timer] +OnCalendar={{.Schedule}} +Persistent=true +RandomizedDelaySec=1800 + +[Install] +WantedBy=timers.target diff --git a/internal/prometheus/exporter.go b/internal/prometheus/exporter.go new file mode 100644 index 0000000..0e6fab1 --- /dev/null +++ b/internal/prometheus/exporter.go @@ -0,0 +1,174 @@ +// Package prometheus provides Prometheus metrics for dbbackup +package prometheus + +import ( + "context" + "fmt" + "net/http" + "sync" + "time" + + "dbbackup/internal/catalog" + "dbbackup/internal/logger" +) + +// Exporter provides an HTTP endpoint for Prometheus metrics +type Exporter struct { + log logger.Logger + catalog catalog.Catalog + instance string + port int + + mu sync.RWMutex + cachedData string + lastRefresh time.Time + refreshTTL time.Duration +} + +// NewExporter creates a new Prometheus exporter +func NewExporter(log logger.Logger, cat catalog.Catalog, instance string, port int) *Exporter { + return &Exporter{ + log: log, + catalog: cat, + instance: instance, + port: port, + refreshTTL: 30 * time.Second, + } +} + +// Serve starts the HTTP server and blocks until context is cancelled +func (e *Exporter) Serve(ctx context.Context) error { + mux := http.NewServeMux() + + // /metrics endpoint + mux.HandleFunc("/metrics", e.handleMetrics) + + // /health endpoint + mux.HandleFunc("/health", e.handleHealth) + + // / root with info + mux.HandleFunc("/", e.handleRoot) + + addr := fmt.Sprintf(":%d", e.port) + srv := &http.Server{ + Addr: addr, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 30 * time.Second, + IdleTimeout: 60 * time.Second, + } + + // Start refresh goroutine + go e.refreshLoop(ctx) + + // Graceful shutdown + go func() { + <-ctx.Done() + e.log.Info("Shutting down metrics server...") + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := srv.Shutdown(shutdownCtx); err != nil { + e.log.Error("Server shutdown error", "error", err) + } + }() + + 
e.log.Info("Starting Prometheus metrics server", "addr", addr)
+	if err := srv.ListenAndServe(); err != http.ErrServerClosed {
+		return fmt.Errorf("server error: %w", err)
+	}
+
+	return nil
+}
+
+// handleMetrics handles /metrics endpoint
+func (e *Exporter) handleMetrics(w http.ResponseWriter, r *http.Request) {
+	e.mu.RLock()
+	data := e.cachedData
+	e.mu.RUnlock()
+
+	if data == "" {
+		// Force refresh if cache is empty
+		if err := e.refresh(); err != nil {
+			http.Error(w, "Failed to collect metrics", http.StatusInternalServerError)
+			return
+		}
+		e.mu.RLock()
+		data = e.cachedData
+		e.mu.RUnlock()
+	}
+
+	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
+	w.WriteHeader(http.StatusOK)
+	w.Write([]byte(data))
+}
+
+// handleHealth handles /health endpoint
+func (e *Exporter) handleHealth(w http.ResponseWriter, r *http.Request) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusOK)
+	w.Write([]byte(`{"status":"ok","service":"dbbackup-exporter"}`))
+}
+
+// handleRoot handles / endpoint
+func (e *Exporter) handleRoot(w http.ResponseWriter, r *http.Request) {
+	if r.URL.Path != "/" {
+		http.NotFound(w, r)
+		return
+	}
+
+	w.Header().Set("Content-Type", "text/html")
+	w.WriteHeader(http.StatusOK)
+	w.Write([]byte(`
+<html>
+<head><title>DBBackup Exporter</title></head>
+<body>
+<h1>DBBackup Prometheus Exporter</h1>
+<p><a href="/metrics">Metrics</a></p>
+<p>This is a Prometheus metrics exporter for DBBackup.</p>
+</body>
+</html>
+ + +`)) +} + +// refreshLoop periodically refreshes the metrics cache +func (e *Exporter) refreshLoop(ctx context.Context) { + ticker := time.NewTicker(e.refreshTTL) + defer ticker.Stop() + + // Initial refresh + if err := e.refresh(); err != nil { + e.log.Error("Initial metrics refresh failed", "error", err) + } + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if err := e.refresh(); err != nil { + e.log.Error("Metrics refresh failed", "error", err) + } + } + } +} + +// refresh updates the cached metrics +func (e *Exporter) refresh() error { + writer := NewMetricsWriter(e.log, e.catalog, e.instance) + data, err := writer.GenerateMetricsString() + if err != nil { + return err + } + + e.mu.Lock() + e.cachedData = data + e.lastRefresh = time.Now() + e.mu.Unlock() + + e.log.Debug("Refreshed metrics cache") + return nil +} diff --git a/internal/prometheus/textfile.go b/internal/prometheus/textfile.go new file mode 100644 index 0000000..f2d16bb --- /dev/null +++ b/internal/prometheus/textfile.go @@ -0,0 +1,245 @@ +// Package prometheus provides Prometheus metrics for dbbackup +package prometheus + +import ( + "context" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "time" + + "dbbackup/internal/catalog" + "dbbackup/internal/logger" +) + +// MetricsWriter writes metrics in Prometheus text format +type MetricsWriter struct { + log logger.Logger + catalog catalog.Catalog + instance string +} + +// NewMetricsWriter creates a new MetricsWriter +func NewMetricsWriter(log logger.Logger, cat catalog.Catalog, instance string) *MetricsWriter { + return &MetricsWriter{ + log: log, + catalog: cat, + instance: instance, + } +} + +// BackupMetrics holds metrics for a single database +type BackupMetrics struct { + Database string + Engine string + LastSuccess time.Time + LastDuration time.Duration + LastSize int64 + TotalBackups int + SuccessCount int + FailureCount int + Verified bool + RPOSeconds float64 +} + +// WriteTextfile writes metrics to a Prometheus textfile collector file +func (m *MetricsWriter) WriteTextfile(path string) error { + metrics, err := m.collectMetrics() + if err != nil { + return fmt.Errorf("failed to collect metrics: %w", err) + } + + output := m.formatMetrics(metrics) + + // Atomic write: write to temp file, then rename + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create directory %s: %w", dir, err) + } + + tmpPath := path + ".tmp" + if err := os.WriteFile(tmpPath, []byte(output), 0644); err != nil { + return fmt.Errorf("failed to write temp file: %w", err) + } + + if err := os.Rename(tmpPath, path); err != nil { + os.Remove(tmpPath) + return fmt.Errorf("failed to rename temp file: %w", err) + } + + m.log.Debug("Wrote metrics to textfile", "path", path, "databases", len(metrics)) + return nil +} + +// collectMetrics gathers metrics from the catalog +func (m *MetricsWriter) collectMetrics() ([]BackupMetrics, error) { + if m.catalog == nil { + return nil, fmt.Errorf("catalog not available") + } + + ctx := context.Background() + + // Get recent backups using Search with limit + query := &catalog.SearchQuery{ + Limit: 1000, + } + entries, err := m.catalog.Search(ctx, query) + if err != nil { + return nil, fmt.Errorf("failed to search backups: %w", err) + } + + // Group by database + byDB := make(map[string]*BackupMetrics) + + for _, e := range entries { + key := e.Database + if key == "" { + key = "unknown" + } + + metrics, ok := byDB[key] + if !ok { + metrics = &BackupMetrics{ + Database: 
key, + Engine: e.DatabaseType, + } + byDB[key] = metrics + } + + metrics.TotalBackups++ + + isSuccess := e.Status == catalog.StatusCompleted || e.Status == catalog.StatusVerified + if isSuccess { + metrics.SuccessCount++ + // Track most recent success + if e.CreatedAt.After(metrics.LastSuccess) { + metrics.LastSuccess = e.CreatedAt + metrics.LastDuration = time.Duration(e.Duration * float64(time.Second)) + metrics.LastSize = e.SizeBytes + metrics.Verified = e.VerifiedAt != nil && e.VerifyValid != nil && *e.VerifyValid + metrics.Engine = e.DatabaseType + } + } else { + metrics.FailureCount++ + } + } + + // Calculate RPO for each database + now := time.Now() + for _, metrics := range byDB { + if !metrics.LastSuccess.IsZero() { + metrics.RPOSeconds = now.Sub(metrics.LastSuccess).Seconds() + } + } + + // Convert to slice and sort + result := make([]BackupMetrics, 0, len(byDB)) + for _, metrics := range byDB { + result = append(result, *metrics) + } + sort.Slice(result, func(i, j int) bool { + return result[i].Database < result[j].Database + }) + + return result, nil +} + +// formatMetrics formats metrics in Prometheus exposition format +func (m *MetricsWriter) formatMetrics(metrics []BackupMetrics) string { + var b strings.Builder + + // Timestamp of metrics generation + now := time.Now().Unix() + + // Header comment + b.WriteString("# DBBackup Prometheus Metrics\n") + b.WriteString(fmt.Sprintf("# Generated at: %s\n", time.Now().Format(time.RFC3339))) + b.WriteString(fmt.Sprintf("# Instance: %s\n", m.instance)) + b.WriteString("\n") + + // dbbackup_last_success_timestamp + b.WriteString("# HELP dbbackup_last_success_timestamp Unix timestamp of last successful backup\n") + b.WriteString("# TYPE dbbackup_last_success_timestamp gauge\n") + for _, met := range metrics { + if !met.LastSuccess.IsZero() { + b.WriteString(fmt.Sprintf("dbbackup_last_success_timestamp{instance=%q,database=%q,engine=%q} %d\n", + m.instance, met.Database, met.Engine, met.LastSuccess.Unix())) + } + } + b.WriteString("\n") + + // dbbackup_last_backup_duration_seconds + b.WriteString("# HELP dbbackup_last_backup_duration_seconds Duration of last successful backup in seconds\n") + b.WriteString("# TYPE dbbackup_last_backup_duration_seconds gauge\n") + for _, met := range metrics { + if met.LastDuration > 0 { + b.WriteString(fmt.Sprintf("dbbackup_last_backup_duration_seconds{instance=%q,database=%q,engine=%q} %.2f\n", + m.instance, met.Database, met.Engine, met.LastDuration.Seconds())) + } + } + b.WriteString("\n") + + // dbbackup_last_backup_size_bytes + b.WriteString("# HELP dbbackup_last_backup_size_bytes Size of last successful backup in bytes\n") + b.WriteString("# TYPE dbbackup_last_backup_size_bytes gauge\n") + for _, met := range metrics { + if met.LastSize > 0 { + b.WriteString(fmt.Sprintf("dbbackup_last_backup_size_bytes{instance=%q,database=%q,engine=%q} %d\n", + m.instance, met.Database, met.Engine, met.LastSize)) + } + } + b.WriteString("\n") + + // dbbackup_backup_total (counter) + b.WriteString("# HELP dbbackup_backup_total Total number of backup attempts\n") + b.WriteString("# TYPE dbbackup_backup_total counter\n") + for _, met := range metrics { + b.WriteString(fmt.Sprintf("dbbackup_backup_total{instance=%q,database=%q,status=\"success\"} %d\n", + m.instance, met.Database, met.SuccessCount)) + b.WriteString(fmt.Sprintf("dbbackup_backup_total{instance=%q,database=%q,status=\"failure\"} %d\n", + m.instance, met.Database, met.FailureCount)) + } + b.WriteString("\n") + + // dbbackup_rpo_seconds + b.WriteString("# 
HELP dbbackup_rpo_seconds Recovery Point Objective - seconds since last successful backup\n") + b.WriteString("# TYPE dbbackup_rpo_seconds gauge\n") + for _, met := range metrics { + if met.RPOSeconds > 0 { + b.WriteString(fmt.Sprintf("dbbackup_rpo_seconds{instance=%q,database=%q} %.0f\n", + m.instance, met.Database, met.RPOSeconds)) + } + } + b.WriteString("\n") + + // dbbackup_backup_verified + b.WriteString("# HELP dbbackup_backup_verified Whether the last backup was verified (1=yes, 0=no)\n") + b.WriteString("# TYPE dbbackup_backup_verified gauge\n") + for _, met := range metrics { + verified := 0 + if met.Verified { + verified = 1 + } + b.WriteString(fmt.Sprintf("dbbackup_backup_verified{instance=%q,database=%q} %d\n", + m.instance, met.Database, verified)) + } + b.WriteString("\n") + + // dbbackup_scrape_timestamp + b.WriteString("# HELP dbbackup_scrape_timestamp Unix timestamp when metrics were collected\n") + b.WriteString("# TYPE dbbackup_scrape_timestamp gauge\n") + b.WriteString(fmt.Sprintf("dbbackup_scrape_timestamp{instance=%q} %d\n", m.instance, now)) + + return b.String() +} + +// GenerateMetricsString returns metrics as a string (for HTTP endpoint) +func (m *MetricsWriter) GenerateMetricsString() (string, error) { + metrics, err := m.collectMetrics() + if err != nil { + return "", err + } + return m.formatMetrics(metrics), nil +}
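
For completeness, a sketch of driving the instanced units after a single-database install; unit names follow the `dbbackup@` template above, and the `production` instance name is illustrative:

```bash
# Install writes the dbbackup@ template units and enables dbbackup@production.timer
sudo dbbackup install --instance production --backup-type single

# Inspect the rendered unit, trigger a run by hand, and tail its logs
systemctl cat dbbackup@production.service
sudo systemctl start dbbackup@production.service
journalctl -u dbbackup@production.service -n 50
```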
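
And a sketch of what the exported textfile looks like once wired into node_exporter; metric names and labels match `formatMetrics` above, while the database name and sample values are made up:

```bash
# Illustrative session: database "appdb" and the values are assumptions
$ cat /var/lib/dbbackup/metrics/dbbackup.prom
dbbackup_last_success_timestamp{instance="default",database="appdb",engine="postgres"} 1767751200
dbbackup_rpo_seconds{instance="default",database="appdb"} 4120

# Point node_exporter at the directory (same flag as in the metrics export help text)
$ node_exporter --collector.textfile.directory=/var/lib/dbbackup/metrics/
```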