Compare commits
151 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 323ccb18bc | |||
| 73fe9ef7fa | |||
| 527435a3b8 | |||
| 6a7cf3c11e | |||
| fd3f8770b7 | |||
| 15f10c280c | |||
| 35a9a6e837 | |||
| 82378be971 | |||
| 9fec2c79f8 | |||
| ae34467b4a | |||
| 379ca06146 | |||
| c9bca42f28 | |||
| c90ec1156e | |||
| 23265a33a4 | |||
| 9b9abbfde7 | |||
| 6282d66693 | |||
| 4486a5d617 | |||
| 75dee1fff5 | |||
| 91d494537d | |||
| 8ffc1ba23c | |||
| 8e8045d8c0 | |||
| 0e94dcf384 | |||
| 33adfbdb38 | |||
| af34eaa073 | |||
| babce7cc83 | |||
| ae8c8fde3d | |||
| 346cb7fb61 | |||
| 18549584b1 | |||
| b1d1d57b61 | |||
| d0e1da1bea | |||
| 343a8b782d | |||
| bc5f7c07f4 | |||
| 821521470f | |||
| 147b9fc234 | |||
| 6f3e81a5a6 | |||
| bf1722c316 | |||
| a759f4d3db | |||
| 7cf1d6f85b | |||
| b305d1342e | |||
| 5456da7183 | |||
| f9ff45cf2a | |||
| 72c06ba5c2 | |||
| a0a401cab1 | |||
| 59a717abe7 | |||
| 490a12f858 | |||
| ea4337e298 | |||
| bbd4f0ceac | |||
| f6f8b04785 | |||
| 670c9af2e7 | |||
| e2cf9adc62 | |||
| 29e089fe3b | |||
| 9396c8e605 | |||
| e363e1937f | |||
| df1ab2f55b | |||
| 0e050b2def | |||
| 62d58c77af | |||
| c5be9bcd2b | |||
| b120f1507e | |||
| dd1db844ce | |||
| 4ea3ec2cf8 | |||
| 9200024e50 | |||
| 698b8a761c | |||
| dd7c4da0eb | |||
| b2a78cad2a | |||
| 5728b465e6 | |||
| bfe99e959c | |||
| 780beaadfb | |||
| 838c5b8c15 | |||
| 9d95a193db | |||
| 3201f0fb6a | |||
| 62ddc57fb7 | |||
| 510175ff04 | |||
| a85ad0c88c | |||
| 4938dc1918 | |||
| 09a917766f | |||
| eeacbfa007 | |||
| 7711a206ab | |||
| ba6e8a2b39 | |||
| ec5e89eab7 | |||
| e24d7ab49f | |||
| 721e53fe6a | |||
| 4e09066aa5 | |||
| 6a24ee39be | |||
| dc6dfd8b2c | |||
| 7b4ab76313 | |||
| c0d92b3a81 | |||
| 8c85d85249 | |||
| e0cdcb28be | |||
| 22a7b9e81e | |||
| c71889be47 | |||
| 222bdbef58 | |||
| f7e9fa64f0 | |||
| f153e61dbf | |||
| d19c065658 | |||
| 8dac5efc10 | |||
| fd5edce5ae | |||
| a7e2c86618 | |||
| b2e0c739e0 | |||
| ad23abdf4e | |||
| 390b830976 | |||
| 7e53950967 | |||
| 59d2094241 | |||
| b1f8c6d646 | |||
| b05c2be19d | |||
| ec33959e3e | |||
| 92402f0fdb | |||
| 682510d1bc | |||
| 83ad62b6b5 | |||
| 55d34be32e | |||
| 1831bd7c1f | |||
| 24377eab8f | |||
| 3e41d88445 | |||
| 5fb88b14ba | |||
| cccee4294f | |||
| 9688143176 | |||
| e821e131b4 | |||
| 15a60d2e71 | |||
| 9c65821250 | |||
| 627061cdbb | |||
| e1a7c57e0f | |||
| 22915102d4 | |||
| 3653ced6da | |||
| 9743d571ce | |||
| c519f08ef2 | |||
| b99b05fedb | |||
| c5f2c3322c | |||
| 56ad0824c7 | |||
| ec65df2976 | |||
| 23cc1e0e08 | |||
| 7770abab6f | |||
| f6a20f035b | |||
| 28e54d118f | |||
| ab0ff3f28d | |||
| b7dd325c51 | |||
| 2ed54141a3 | |||
| 495ee31247 | |||
| 78e10f5057 | |||
| f4a0e2d82c | |||
| f66d19acb0 | |||
| 16f377e9b5 | |||
| 7e32a0369d | |||
| 120ee33e3b | |||
| 9f375621d1 | |||
| 9ad925191e | |||
| 9d8a6e763e | |||
| 63b16eee8b | |||
| 91228552fb | |||
| 9ee55309bd | |||
| 0baf741c0b | |||
| faace7271c | |||
| c3ade7a693 |
@ -37,90 +37,6 @@ jobs:
|
||||
- name: Coverage summary
|
||||
run: go tool cover -func=coverage.out | tail -1
|
||||
|
||||
test-integration:
|
||||
name: Integration Tests
|
||||
runs-on: ubuntu-latest
|
||||
needs: [test]
|
||||
container:
|
||||
image: golang:1.24-bookworm
|
||||
services:
|
||||
postgres:
|
||||
image: postgres:15
|
||||
env:
|
||||
POSTGRES_PASSWORD: postgres
|
||||
POSTGRES_DB: testdb
|
||||
ports: ['5432:5432']
|
||||
mysql:
|
||||
image: mysql:8
|
||||
env:
|
||||
MYSQL_ROOT_PASSWORD: mysql
|
||||
MYSQL_DATABASE: testdb
|
||||
ports: ['3306:3306']
|
||||
steps:
|
||||
- name: Checkout code
|
||||
env:
|
||||
TOKEN: ${{ github.token }}
|
||||
run: |
|
||||
apt-get update && apt-get install -y -qq git ca-certificates postgresql-client default-mysql-client
|
||||
git config --global --add safe.directory "$GITHUB_WORKSPACE"
|
||||
git init
|
||||
git remote add origin "https://${TOKEN}@git.uuxo.net/${GITHUB_REPOSITORY}.git"
|
||||
git fetch --depth=1 origin "${GITHUB_SHA}"
|
||||
git checkout FETCH_HEAD
|
||||
|
||||
- name: Wait for databases
|
||||
run: |
|
||||
echo "Waiting for PostgreSQL..."
|
||||
for i in $(seq 1 30); do
|
||||
pg_isready -h postgres -p 5432 && break || sleep 1
|
||||
done
|
||||
echo "Waiting for MySQL..."
|
||||
for i in $(seq 1 30); do
|
||||
mysqladmin ping -h mysql -u root -pmysql --silent && break || sleep 1
|
||||
done
|
||||
|
||||
- name: Build dbbackup
|
||||
run: go build -o dbbackup .
|
||||
|
||||
- name: Test PostgreSQL backup/restore
|
||||
env:
|
||||
PGHOST: postgres
|
||||
PGUSER: postgres
|
||||
PGPASSWORD: postgres
|
||||
run: |
|
||||
# Create test data
|
||||
psql -h postgres -c "CREATE TABLE test_table (id SERIAL PRIMARY KEY, name TEXT);"
|
||||
psql -h postgres -c "INSERT INTO test_table (name) VALUES ('test1'), ('test2'), ('test3');"
|
||||
# Run backup - database name is positional argument
|
||||
mkdir -p /tmp/backups
|
||||
./dbbackup backup single testdb --db-type postgres --host postgres --user postgres --password postgres --backup-dir /tmp/backups --no-config --allow-root
|
||||
# Verify backup file exists
|
||||
ls -la /tmp/backups/
|
||||
|
||||
- name: Test MySQL backup/restore
|
||||
env:
|
||||
MYSQL_HOST: mysql
|
||||
MYSQL_USER: root
|
||||
MYSQL_PASSWORD: mysql
|
||||
run: |
|
||||
# Create test data
|
||||
mysql -h mysql -u root -pmysql testdb -e "CREATE TABLE test_table (id INT AUTO_INCREMENT PRIMARY KEY, name VARCHAR(255));"
|
||||
mysql -h mysql -u root -pmysql testdb -e "INSERT INTO test_table (name) VALUES ('test1'), ('test2'), ('test3');"
|
||||
# Run backup - positional arg is db to backup, --database is connection db
|
||||
mkdir -p /tmp/mysql_backups
|
||||
./dbbackup backup single testdb --db-type mysql --host mysql --port 3306 --user root --password mysql --database testdb --backup-dir /tmp/mysql_backups --no-config --allow-root
|
||||
# Verify backup file exists
|
||||
ls -la /tmp/mysql_backups/
|
||||
|
||||
- name: Test verify-locks command
|
||||
env:
|
||||
PGHOST: postgres
|
||||
PGUSER: postgres
|
||||
PGPASSWORD: postgres
|
||||
run: |
|
||||
./dbbackup verify-locks --host postgres --db-type postgres --no-config --allow-root | tee verify-locks.out
|
||||
grep -q 'max_locks_per_transaction' verify-locks.out
|
||||
|
||||
lint:
|
||||
name: Lint
|
||||
runs-on: ubuntu-latest
|
||||
@ -160,7 +76,6 @@ jobs:
|
||||
git init
|
||||
git remote add origin "https://${TOKEN}@git.uuxo.net/${GITHUB_REPOSITORY}.git"
|
||||
git fetch --depth=1 origin "${GITHUB_SHA}"
|
||||
git fetch --tags origin
|
||||
git checkout FETCH_HEAD
|
||||
|
||||
- name: Build all platforms
|
||||
|
||||
@ -1,75 +0,0 @@
|
||||
# Backup of .gitea/workflows/ci.yml — created before adding integration-verify-locks job
|
||||
# timestamp: 2026-01-23
|
||||
|
||||
# CI/CD Pipeline for dbbackup (backup copy)
|
||||
# Source: .gitea/workflows/ci.yml
|
||||
# Created: 2026-01-23
|
||||
|
||||
name: CI/CD
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main, master, develop]
|
||||
tags: ['v*']
|
||||
pull_request:
|
||||
branches: [main, master]
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: Test
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: golang:1.24-bookworm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
env:
|
||||
TOKEN: ${{ github.token }}
|
||||
run: |
|
||||
apt-get update && apt-get install -y -qq git ca-certificates
|
||||
git config --global --add safe.directory "$GITHUB_WORKSPACE"
|
||||
git init
|
||||
git remote add origin "https://${TOKEN}@git.uuxo.net/${GITHUB_REPOSITORY}.git"
|
||||
git fetch --depth=1 origin "${GITHUB_SHA}"
|
||||
git checkout FETCH_HEAD
|
||||
|
||||
- name: Download dependencies
|
||||
run: go mod download
|
||||
|
||||
- name: Run tests
|
||||
run: go test -race -coverprofile=coverage.out ./...
|
||||
|
||||
- name: Coverage summary
|
||||
run: go tool cover -func=coverage.out | tail -1
|
||||
|
||||
lint:
|
||||
name: Lint
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: golang:1.24-bookworm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
env:
|
||||
TOKEN: ${{ github.token }}
|
||||
run: |
|
||||
apt-get update && apt-get install -y -qq git ca-certificates
|
||||
git config --global --add safe.directory "$GITHUB_WORKSPACE"
|
||||
git init
|
||||
git remote add origin "https://${TOKEN}@git.uuxo.net/${GITHUB_REPOSITORY}.git"
|
||||
git fetch --depth=1 origin "${GITHUB_SHA}"
|
||||
git checkout FETCH_HEAD
|
||||
|
||||
- name: Install and run golangci-lint
|
||||
run: |
|
||||
go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.8.0
|
||||
golangci-lint run --timeout=5m ./...
|
||||
|
||||
build-and-release:
|
||||
name: Build & Release
|
||||
runs-on: ubuntu-latest
|
||||
needs: [test, lint]
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
container:
|
||||
image: golang:1.24-bookworm
|
||||
steps: |
|
||||
<trimmed for backup>
|
||||
|
||||
6
.gitignore
vendored
6
.gitignore
vendored
@ -13,7 +13,8 @@ logs/
|
||||
/dbbackup
|
||||
/dbbackup_*
|
||||
!dbbackup.png
|
||||
bin/
|
||||
bin/dbbackup_*
|
||||
bin/*.exe
|
||||
|
||||
# Ignore development artifacts
|
||||
*.swp
|
||||
@ -37,6 +38,3 @@ CRITICAL_BUGS_FIXED.md
|
||||
LEGAL_DOCUMENTATION.md
|
||||
LEGAL_*.md
|
||||
legal/
|
||||
|
||||
# Release binaries (uploaded via gh release, not git)
|
||||
release/dbbackup_*
|
||||
|
||||
@ -236,8 +236,8 @@ dbbackup cloud download \
|
||||
# Manual delete
|
||||
dbbackup cloud delete "azure://prod-backups/postgres/old_backup.sql?account=myaccount&key=KEY"
|
||||
|
||||
# Automatic cleanup (keep last 7 days, min 5 backups)
|
||||
dbbackup cleanup "azure://prod-backups/postgres/?account=myaccount&key=KEY" --retention-days 7 --min-backups 5
|
||||
# Automatic cleanup (keep last 7 backups)
|
||||
dbbackup cleanup "azure://prod-backups/postgres/?account=myaccount&key=KEY" --keep 7
|
||||
```
|
||||
|
||||
### Scheduled Backups
|
||||
@ -253,7 +253,7 @@ dbbackup backup single production_db \
|
||||
--compression 9
|
||||
|
||||
# Cleanup old backups
|
||||
dbbackup cleanup "azure://prod-backups/postgres/?account=myaccount&key=${AZURE_STORAGE_KEY}" --retention-days 30 --min-backups 5
|
||||
dbbackup cleanup "azure://prod-backups/postgres/?account=myaccount&key=${AZURE_STORAGE_KEY}" --keep 30
|
||||
```
|
||||
|
||||
**Crontab:**
|
||||
@ -385,7 +385,7 @@ Tests include:
|
||||
### 4. Reliability
|
||||
|
||||
- Test **restore procedures** regularly
|
||||
- Use **retention policies**: `--retention-days 30`
|
||||
- Use **retention policies**: `--keep 30`
|
||||
- Enable **soft delete** in Azure (30-day recovery)
|
||||
- Monitor backup success with Azure Monitor
|
||||
|
||||
337
CHANGELOG.md
337
CHANGELOG.md
@ -5,330 +5,6 @@ All notable changes to dbbackup will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [4.2.0] - 2026-01-30
|
||||
|
||||
### Added - Quick Wins Release
|
||||
|
||||
- **`dbbackup health` command** - Comprehensive backup infrastructure health check
|
||||
- 10 automated health checks: config, DB connectivity, backup dir, catalog, freshness, gaps, verification, file integrity, orphans, disk space
|
||||
- Exit codes for automation: 0=healthy, 1=warning, 2=critical
|
||||
- JSON output for monitoring integration (Prometheus, Nagios, etc.)
|
||||
- Auto-generates actionable recommendations
|
||||
- Custom backup interval for gap detection: `--interval 12h`
|
||||
- Skip database check for offline mode: `--skip-db`
|
||||
- Example: `dbbackup health --format json`
|
||||
|
||||
- **TUI System Health Check** - Interactive health monitoring
|
||||
- Accessible via Tools → System Health Check
|
||||
- Runs all 10 checks asynchronously with progress spinner
|
||||
- Color-coded results: green=healthy, yellow=warning, red=critical
|
||||
- Displays recommendations for any issues found
|
||||
|
||||
- **`dbbackup restore preview` command** - Pre-restore analysis and validation
|
||||
- Shows backup format, compression type, database type
|
||||
- Estimates uncompressed size (3x compression ratio)
|
||||
- Calculates RTO (Recovery Time Objective) based on active profile
|
||||
- Validates backup integrity without actual restore
|
||||
- Displays resource requirements (RAM, CPU, disk space)
|
||||
- Example: `dbbackup restore preview backup.dump.gz`
|
||||
|
||||
- **`dbbackup diff` command** - Compare two backups and track changes
|
||||
- Flexible input: file paths, catalog IDs, or `database:latest/previous`
|
||||
- Shows size delta with percentage change
|
||||
- Calculates database growth rate (GB/day)
|
||||
- Projects time to reach 10GB threshold
|
||||
- Compares backup duration and compression efficiency
|
||||
- JSON output for automation and reporting
|
||||
- Example: `dbbackup diff mydb:latest mydb:previous`
|
||||
|
||||
- **`dbbackup cost analyze` command** - Cloud storage cost optimization
|
||||
- Analyzes 15 storage tiers across 5 cloud providers
|
||||
- AWS S3: Standard, IA, Glacier Instant/Flexible, Deep Archive
|
||||
- Google Cloud Storage: Standard, Nearline, Coldline, Archive
|
||||
- Azure Blob Storage: Hot, Cool, Archive
|
||||
- Backblaze B2 and Wasabi alternatives
|
||||
- Monthly/annual cost projections
|
||||
- Savings calculations vs S3 Standard baseline
|
||||
- Tiered lifecycle strategy recommendations
|
||||
- Shows potential savings of 90%+ with proper policies
|
||||
- Example: `dbbackup cost analyze --database mydb`
|
||||
|
||||
### Enhanced
|
||||
- **TUI restore preview** - Added RTO estimates and size calculations
|
||||
- Shows estimated uncompressed size during restore confirmation
|
||||
- Displays estimated restore time based on current profile
|
||||
- Helps users make informed restore decisions
|
||||
- Keeps TUI simple (essentials only), detailed analysis in CLI
|
||||
|
||||
### Documentation
|
||||
- Updated README.md with new commands and examples
|
||||
- Created QUICK_WINS.md documenting the rapid development sprint
|
||||
- Added backup diff and cost analysis sections
|
||||
|
||||
## [4.1.4] - 2026-01-29
|
||||
|
||||
### Added
|
||||
- **New `turbo` restore profile** - Maximum restore speed, matches native `pg_restore -j8`
|
||||
- `ClusterParallelism = 2` (restore 2 DBs concurrently)
|
||||
- `Jobs = 8` (8 parallel pg_restore jobs)
|
||||
- `BufferedIO = true` (32KB write buffers for faster extraction)
|
||||
- Works on 16GB+ RAM, 4+ cores
|
||||
- Usage: `dbbackup restore cluster backup.tar.gz --profile=turbo --confirm`
|
||||
|
||||
- **Restore startup performance logging** - Shows actual parallelism settings at restore start
|
||||
- Logs profile name, cluster_parallelism, pg_restore_jobs, buffered_io
|
||||
- Helps verify settings before long restore operations
|
||||
|
||||
- **Buffered I/O optimization** - 32KB write buffers during tar extraction (turbo profile)
|
||||
- Reduces system call overhead
|
||||
- Improves I/O throughput for large archives
|
||||
|
||||
### Fixed
|
||||
- **TUI now respects saved profile settings** - Previously TUI forced `conservative` profile on every launch, ignoring user's saved configuration. Now properly loads and respects saved settings.
|
||||
|
||||
### Changed
|
||||
- TUI default profile changed from forced `conservative` to `balanced` (only when no profile configured)
|
||||
- `LargeDBMode` no longer forced on TUI startup - user controls it via settings
|
||||
|
||||
## [4.1.3] - 2026-01-27
|
||||
|
||||
### Added
|
||||
- **`--config` / `-c` global flag** - Specify config file path from anywhere
|
||||
- Example: `dbbackup --config /opt/dbbackup/.dbbackup.conf backup single mydb`
|
||||
- No longer need to `cd` to config directory before running commands
|
||||
- Works with all subcommands (backup, restore, verify, etc.)
|
||||
|
||||
## [4.1.2] - 2026-01-27
|
||||
|
||||
### Added
|
||||
- **`--socket` flag for MySQL/MariaDB** - Connect via Unix socket instead of TCP/IP
|
||||
- Usage: `dbbackup backup single mydb --db-type mysql --socket /var/run/mysqld/mysqld.sock`
|
||||
- Works for both backup and restore operations
|
||||
- Supports socket auth (no password required with proper permissions)
|
||||
|
||||
### Fixed
|
||||
- **Socket path as --host now works** - If `--host` starts with `/`, it's auto-detected as a socket path
|
||||
- Example: `--host /var/run/mysqld/mysqld.sock` now works correctly instead of DNS lookup error
|
||||
- Auto-converts to `--socket` internally
|
||||
|
||||
## [4.1.1] - 2026-01-25
|
||||
|
||||
### Added
|
||||
- **`dbbackup_build_info` metric** - Exposes version and git commit as Prometheus labels
|
||||
- Useful for tracking deployed versions across a fleet
|
||||
- Labels: `server`, `version`, `commit`
|
||||
|
||||
### Fixed
|
||||
- **Documentation clarification**: The `pitr_base` value for `backup_type` label is auto-assigned
|
||||
by `dbbackup pitr base` command. CLI `--backup-type` flag only accepts `full` or `incremental`.
|
||||
This was causing confusion in deployments.
|
||||
|
||||
## [4.1.0] - 2026-01-25
|
||||
|
||||
### Added
|
||||
- **Backup Type Tracking**: All backup metrics now include a `backup_type` label
|
||||
(`full`, `incremental`, or `pitr_base` for PITR base backups)
|
||||
- **PITR Metrics**: Complete Point-in-Time Recovery monitoring
|
||||
- `dbbackup_pitr_enabled` - Whether PITR is enabled (1/0)
|
||||
- `dbbackup_pitr_archive_lag_seconds` - Seconds since last WAL/binlog archived
|
||||
- `dbbackup_pitr_chain_valid` - WAL/binlog chain integrity (1=valid)
|
||||
- `dbbackup_pitr_gap_count` - Number of gaps in archive chain
|
||||
- `dbbackup_pitr_archive_count` - Total archived segments
|
||||
- `dbbackup_pitr_archive_size_bytes` - Total archive storage
|
||||
- `dbbackup_pitr_recovery_window_minutes` - Estimated PITR coverage
|
||||
- **PITR Alerting Rules**: 6 new alerts for PITR monitoring
|
||||
- PITRArchiveLag, PITRChainBroken, PITRGapsDetected, PITRArchiveStalled,
|
||||
PITRStorageGrowing, PITRDisabledUnexpectedly
|
||||
- **`dbbackup_backup_by_type` metric** - Count backups by type
|
||||
|
||||
### Changed
|
||||
- `dbbackup_backup_total` type changed from counter to gauge for snapshot-based collection
|
||||
|
||||
## [3.42.110] - 2026-01-24
|
||||
|
||||
### Improved - Code Quality & Testing
|
||||
- **Cleaned up 40+ unused code items** found by staticcheck:
|
||||
- Removed unused functions, variables, struct fields, and type aliases
|
||||
- Fixed SA4006 warning (unused value assignment in restore engine)
|
||||
- All packages now pass staticcheck with zero warnings
|
||||
|
||||
- **Added golangci-lint integration** to Makefile:
|
||||
- New `make golangci-lint` target with auto-install
|
||||
- Updated `lint` target to include golangci-lint
|
||||
- Updated `install-tools` to install golangci-lint
|
||||
|
||||
- **New unit tests** for improved coverage:
|
||||
- `internal/config/config_test.go` - Tests for config initialization, database types, env helpers
|
||||
- `internal/security/security_test.go` - Tests for checksums, path validation, rate limiting, audit logging
|
||||
|
||||
## [3.42.109] - 2026-01-24
|
||||
|
||||
### Added - Grafana Dashboard & Monitoring Improvements
|
||||
- **Enhanced Grafana dashboard** with comprehensive improvements:
|
||||
- Added dashboard description for better discoverability
|
||||
- New collapsible "Backup Overview" row for organization
|
||||
- New **Verification Status** panel showing last backup verification state
|
||||
- Added descriptions to all 17 panels for better understanding
|
||||
- Enabled shared crosshair (graphTooltip=1) for correlated analysis
|
||||
- Added "monitoring" tag for dashboard discovery
|
||||
|
||||
- **New Prometheus alerting rules** (`grafana/alerting-rules.yaml`):
|
||||
- `DBBackupRPOCritical` - No backup in 24+ hours (critical)
|
||||
- `DBBackupRPOWarning` - No backup in 12+ hours (warning)
|
||||
- `DBBackupFailure` - Backup failures detected
|
||||
- `DBBackupNotVerified` - Backup not verified in 24h
|
||||
- `DBBackupDedupRatioLow` - Dedup ratio below 10%
|
||||
- `DBBackupDedupDiskGrowth` - Rapid storage growth prediction
|
||||
- `DBBackupExporterDown` - Metrics exporter not responding
|
||||
- `DBBackupMetricsStale` - Metrics not updated in 10+ minutes
|
||||
- `DBBackupNeverSucceeded` - Database never backed up successfully
|
||||
|
||||
### Changed
|
||||
- **Grafana dashboard layout fixes**:
|
||||
- Fixed overlapping dedup panels (y: 31/36 → 22/27/32)
|
||||
- Adjusted top row panel widths for better balance (5+5+5+4+5=24)
|
||||
|
||||
- **Added Makefile** for streamlined development workflow:
|
||||
- `make build` - optimized binary with ldflags
|
||||
- `make test`, `make race`, `make cover` - testing targets
|
||||
- `make lint` - runs vet + staticcheck
|
||||
- `make all-platforms` - cross-platform builds
|
||||
|
||||
### Fixed
|
||||
- Removed deprecated `netErr.Temporary()` call in cloud retry logic (Go 1.18+)
|
||||
- Fixed staticcheck warnings for redundant fmt.Sprintf calls
|
||||
- Logger optimizations: buffer pooling, early level check, pre-allocated maps
|
||||
- Clone engine now validates disk space before operations
|
||||
|
||||
## [3.42.108] - 2026-01-24
|
||||
|
||||
### Added - TUI Tools Expansion
|
||||
- **Table Sizes** - view top 100 tables sorted by size with row counts, data/index breakdown
|
||||
- Supports PostgreSQL (`pg_stat_user_tables`) and MySQL (`information_schema.TABLES`)
|
||||
- Shows total/data/index sizes, row counts, schema prefix for non-public schemas
|
||||
|
||||
- **Kill Connections** - manage active database connections
|
||||
- List all active connections with PID, user, database, state, query preview, duration
|
||||
- Kill single connection or all connections to a specific database
|
||||
- Useful before restore operations to clear blocking sessions
|
||||
- Supports PostgreSQL (`pg_terminate_backend`) and MySQL (`KILL`)
|
||||
|
||||
- **Drop Database** - safely drop databases with double confirmation
|
||||
- Lists user databases (system DBs hidden: postgres, template0/1, mysql, sys, etc.)
|
||||
- Requires two confirmations: y/n then type full database name
|
||||
- Auto-terminates connections before drop
|
||||
- Supports PostgreSQL and MySQL
|
||||
|
||||
## [3.42.107] - 2026-01-24
|
||||
|
||||
### Added - Tools Menu & Blob Statistics
|
||||
- **New "Tools" submenu in TUI** - centralized access to utility functions
|
||||
- Blob Statistics - scan database for bytea/blob columns with size analysis
|
||||
- Blob Extract - externalize large objects (coming soon)
|
||||
- Dedup Store Analyze - storage savings analysis (coming soon)
|
||||
- Verify Backup Integrity - backup verification
|
||||
- Catalog Sync - synchronize local catalog (coming soon)
|
||||
|
||||
- **New `dbbackup blob stats` CLI command** - analyze blob/bytea columns
|
||||
- Scans `information_schema` for binary column types
|
||||
- Shows row counts, total size, average size, max size per column
|
||||
- Identifies tables storing large binary data for optimization
|
||||
- Supports both PostgreSQL (bytea, oid) and MySQL (blob, mediumblob, longblob)
|
||||
- Provides recommendations for databases with >100MB blob data
|
||||
|
||||
## [3.42.106] - 2026-01-24
|
||||
|
||||
### Fixed - Cluster Restore Resilience & Performance
|
||||
- **Fixed cluster restore failing on missing roles** - harmless "role does not exist" errors no longer abort restore
|
||||
- Added role-related errors to `isIgnorableError()` with warning log
|
||||
- Removed `ON_ERROR_STOP=1` from psql commands (pre-validation catches real corruption)
|
||||
- Restore now continues gracefully when referenced roles don't exist in target cluster
|
||||
- Previously caused 12h+ restores to fail at 94% completion
|
||||
|
||||
- **Fixed TUI output scrambling in screen/tmux sessions** - added terminal detection
|
||||
- Uses `go-isatty` to detect non-interactive terminals (backgrounded screen sessions, pipes)
|
||||
- Added `viewSimple()` methods for clean line-by-line output without ANSI escape codes
|
||||
- TUI menu now shows warning when running in non-interactive terminal
|
||||
|
||||
### Changed - Consistent Parallel Compression (pgzip)
|
||||
- **Migrated all gzip operations to parallel pgzip** - 2-4x faster compression/decompression on multi-core systems
|
||||
- Systematic audit found 17 files using standard `compress/gzip`
|
||||
- All converted to `github.com/klauspost/pgzip` for consistent performance
|
||||
- **Files updated**:
|
||||
- `internal/backup/`: incremental_tar.go, incremental_extract.go, incremental_mysql.go
|
||||
- `internal/wal/`: compression.go (CompressWALFile, DecompressWALFile, VerifyCompressedFile)
|
||||
- `internal/engine/`: clone.go, snapshot_engine.go, mysqldump.go, binlog/file_target.go
|
||||
- `internal/restore/`: engine.go, safety.go, formats.go, error_report.go
|
||||
- `internal/pitr/`: mysql.go, binlog.go
|
||||
- `internal/dedup/`: store.go
|
||||
- `cmd/`: dedup.go, placeholder.go
|
||||
- **Benefit**: Large backup/restore operations now fully utilize available CPU cores
|
||||
|
||||
## [3.42.105] - 2026-01-23
|
||||
|
||||
### Changed - TUI Visual Cleanup
|
||||
- **Removed ASCII box characters** from backup/restore success/failure banners
|
||||
- Replaced `╔═╗║╚╝` boxes with clean `═══` horizontal line separators
|
||||
- Cleaner, more modern appearance in terminal output
|
||||
- **Consolidated duplicate styles** in TUI components
|
||||
- Unified check status styles (passed/failed/warning/pending) into global definitions
|
||||
- Reduces code duplication across restore preview and diagnose views
|
||||
|
||||
## [3.42.98] - 2025-01-23
|
||||
|
||||
### Fixed - Critical Bug Fixes for v3.42.97
|
||||
- **Fixed CGO/SQLite build issue** - binaries now work when compiled with `CGO_ENABLED=0`
|
||||
- Switched from `github.com/mattn/go-sqlite3` (requires CGO) to `modernc.org/sqlite` (pure Go)
|
||||
- All cross-compiled binaries now work correctly on all platforms
|
||||
- No more "Binary was compiled with 'CGO_ENABLED=0', go-sqlite3 requires cgo to work" errors
|
||||
|
||||
- **Fixed MySQL positional database argument being ignored**
|
||||
- `dbbackup backup single <dbname> --db-type mysql` now correctly uses `<dbname>`
|
||||
- Previously defaulted to 'postgres' regardless of positional argument
|
||||
- Also fixed in `backup sample` command
|
||||
|
||||
## [3.42.97] - 2025-01-23
|
||||
|
||||
### Added - Bandwidth Throttling for Cloud Uploads
|
||||
- **New `--bandwidth-limit` flag for cloud operations** - prevent network saturation during business hours
|
||||
- Works with S3, GCS, Azure Blob Storage, MinIO, Backblaze B2
|
||||
- Supports human-readable formats:
|
||||
- `10MB/s`, `50MiB/s` - megabytes per second
|
||||
- `100KB/s`, `500KiB/s` - kilobytes per second
|
||||
- `1GB/s` - gigabytes per second
|
||||
- `100Mbps` - megabits per second (for network-minded users)
|
||||
- `unlimited` or `0` - no limit (default)
|
||||
- Environment variable: `DBBACKUP_BANDWIDTH_LIMIT`
|
||||
- **Example usage**:
|
||||
```bash
|
||||
# Limit upload to 10 MB/s during business hours
|
||||
dbbackup cloud upload backup.dump --bandwidth-limit 10MB/s
|
||||
|
||||
# Environment variable for all operations
|
||||
export DBBACKUP_BANDWIDTH_LIMIT=50MiB/s
|
||||
```
|
||||
- **Implementation**: Token-bucket style throttling with 100ms windows for smooth rate limiting
|
||||
- **DBA requested feature**: Avoid saturating production network during scheduled backups
|
||||
|
||||
## [3.42.96] - 2025-02-01
|
||||
|
||||
### Changed - Complete Elimination of Shell tar/gzip Dependencies
|
||||
- **All tar/gzip operations now 100% in-process** - ZERO shell dependencies for backup/restore
|
||||
- Removed ALL remaining `exec.Command("tar", ...)` calls
|
||||
- Removed ALL remaining `exec.Command("gzip", ...)` calls
|
||||
- Systematic code audit found and eliminated:
|
||||
- `diagnose.go`: Replaced `tar -tzf` test with direct file open check
|
||||
- `large_restore_check.go`: Replaced `gzip -t` and `gzip -l` with in-process pgzip verification
|
||||
- `pitr/restore.go`: Replaced `tar -xf` with in-process tar extraction
|
||||
- **Benefits**:
|
||||
- No external tool dependencies (works in minimal containers)
|
||||
- 2-4x faster on multi-core systems using parallel pgzip
|
||||
- More reliable error handling with Go-native errors
|
||||
- Consistent behavior across all platforms
|
||||
- Reduced attack surface (no shell spawning)
|
||||
- **Verification**: `strace` and `ps aux` show no tar/gzip/gunzip processes during backup/restore
|
||||
- **Note**: Docker drill container commands still use gunzip for in-container operations (intentional)
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added - Single Database Extraction from Cluster Backups (CLI + TUI)
|
||||
@ -382,13 +58,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Reduces preflight validation time from minutes to seconds on large archives
|
||||
- Falls back to full extraction only when necessary (with `--diagnose`)
|
||||
|
||||
### Added - PostgreSQL lock verification (CLI + preflight)
|
||||
- **`dbbackup verify-locks`** — new CLI command that probes PostgreSQL GUCs (`max_locks_per_transaction`, `max_connections`, `max_prepared_transactions`) and prints total lock capacity plus actionable restore guidance.
|
||||
- **Integrated into preflight checks** — preflight now warns/fails when lock settings are insufficient and provides exact remediation commands and recommended restore flags (e.g. `--jobs 1 --parallel-dbs 1`).
|
||||
- **Implemented in Go (replaces `verify_postgres_locks.sh`)** with robust parsing, sudo/`psql` fallback and unit-tested decision logic.
|
||||
- **Files:** `cmd/verify_locks.go`, `internal/checks/locks.go`, `internal/checks/locks_test.go`, `internal/checks/preflight.go`.
|
||||
- **Why:** Prevents repeated parallel-restore failures by surfacing lock-capacity issues early and providing bulletproof guidance.
|
||||
|
||||
## [3.42.74] - 2026-01-20 "Resource Profile System + Critical Ctrl+C Fix"
|
||||
|
||||
### Critical Bug Fix
|
||||
@ -408,7 +77,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Good default for most scenarios
|
||||
- **Aggressive** (`--profile=aggressive`): Maximum parallelism, all available resources
|
||||
- Best for dedicated database servers with ample resources
|
||||
- **Potato** (`--profile=potato`): Easter egg, same as conservative
|
||||
- **Potato** (`--profile=potato`): Easter egg 🥔, same as conservative
|
||||
- **Profile system applies to both CLI and TUI**:
|
||||
- CLI: `dbbackup restore cluster backup.tar.gz --profile=conservative --confirm`
|
||||
- TUI: Automatically uses conservative profile for safer interactive operation
|
||||
@ -915,7 +584,7 @@ dbbackup metrics serve --port 9399
|
||||
|
||||
## [3.41.0] - 2026-01-07 "The Pre-Flight Check"
|
||||
|
||||
### Added - Pre-Restore Validation
|
||||
### Added - 🛡️ Pre-Restore Validation
|
||||
|
||||
**Automatic Dump Validation Before Restore:**
|
||||
- SQL dump files are now validated BEFORE attempting restore
|
||||
@ -1002,7 +671,7 @@ dbbackup metrics serve --port 9399
|
||||
|
||||
## [3.2.0] - 2025-12-13 "The Margin Eraser"
|
||||
|
||||
### Added - Physical Backup Revolution
|
||||
### Added - 🚀 Physical Backup Revolution
|
||||
|
||||
**MySQL Clone Plugin Integration:**
|
||||
- Native physical backup using MySQL 8.0.17+ Clone Plugin
|
||||
|
||||
@ -573,7 +573,7 @@ dbbackup cleanup minio://test-backups/test/ --retention-days 7 --dry-run
|
||||
|
||||
3. **Use compression:**
|
||||
```bash
|
||||
--compression 6 # Reduces upload size
|
||||
--compression gzip # Reduces upload size
|
||||
```
|
||||
|
||||
### Reliability
|
||||
@ -693,7 +693,7 @@ Error: checksum mismatch: expected abc123, got def456
|
||||
for db in db1 db2 db3; do
|
||||
dbbackup backup single $db \
|
||||
--cloud s3://production-backups/daily/$db/ \
|
||||
--compression 6
|
||||
--compression gzip
|
||||
done
|
||||
|
||||
# Cleanup old backups (keep 30 days, min 10 backups)
|
||||
@ -16,17 +16,17 @@ DBBackup now includes a modular backup engine system with multiple strategies:
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# List available engines for your MySQL/MariaDB environment
|
||||
# List available engines
|
||||
dbbackup engine list
|
||||
|
||||
# Get detailed information on a specific engine
|
||||
dbbackup engine info clone
|
||||
# Auto-select best engine for your environment
|
||||
dbbackup engine select
|
||||
|
||||
# Get engine info for current environment
|
||||
dbbackup engine info
|
||||
# Perform physical backup with auto-selection
|
||||
dbbackup physical-backup --output /backups/db.tar.gz
|
||||
|
||||
# Use engines with backup commands (auto-detection)
|
||||
dbbackup backup single mydb --db-type mysql
|
||||
# Stream directly to S3 (no local storage needed)
|
||||
dbbackup stream-backup --target s3://bucket/backups/db.tar.gz --workers 8
|
||||
```
|
||||
|
||||
## Engine Descriptions
|
||||
@ -36,7 +36,7 @@ dbbackup backup single mydb --db-type mysql
|
||||
Traditional logical backup using mysqldump. Works with all MySQL/MariaDB versions.
|
||||
|
||||
```bash
|
||||
dbbackup backup single mydb --db-type mysql
|
||||
dbbackup physical-backup --engine mysqldump --output backup.sql.gz
|
||||
```
|
||||
|
||||
Features:
|
||||
@ -293,8 +293,8 @@ dbbackup cloud download \
|
||||
# Manual delete
|
||||
dbbackup cloud delete "gs://prod-backups/postgres/old_backup.sql"
|
||||
|
||||
# Automatic cleanup (keep last 7 days, min 5 backups)
|
||||
dbbackup cleanup "gs://prod-backups/postgres/" --retention-days 7 --min-backups 5
|
||||
# Automatic cleanup (keep last 7 backups)
|
||||
dbbackup cleanup "gs://prod-backups/postgres/" --keep 7
|
||||
```
|
||||
|
||||
### Scheduled Backups
|
||||
@ -310,7 +310,7 @@ dbbackup backup single production_db \
|
||||
--compression 9
|
||||
|
||||
# Cleanup old backups
|
||||
dbbackup cleanup "gs://prod-backups/postgres/" --retention-days 30 --min-backups 5
|
||||
dbbackup cleanup "gs://prod-backups/postgres/" --keep 30
|
||||
```
|
||||
|
||||
**Crontab:**
|
||||
@ -482,7 +482,7 @@ Tests include:
|
||||
### 4. Reliability
|
||||
|
||||
- Test **restore procedures** regularly
|
||||
- Use **retention policies**: `--retention-days 30`
|
||||
- Use **retention policies**: `--keep 30`
|
||||
- Enable **object versioning** (30-day recovery)
|
||||
- Use **multi-region** buckets for disaster recovery
|
||||
- Monitor backup success with Cloud Monitoring
|
||||
126
Makefile
126
Makefile
@ -1,126 +0,0 @@
|
||||
# Makefile for dbbackup
|
||||
# Provides common development workflows
|
||||
|
||||
.PHONY: build test lint vet clean install-tools help race cover golangci-lint
|
||||
|
||||
# Build variables
|
||||
VERSION := $(shell grep 'version.*=' main.go | head -1 | sed 's/.*"\(.*\)".*/\1/')
|
||||
BUILD_TIME := $(shell date -u '+%Y-%m-%d_%H:%M:%S_UTC')
|
||||
GIT_COMMIT := $(shell git rev-parse --short HEAD 2>/dev/null || echo "unknown")
|
||||
LDFLAGS := -w -s -X main.version=$(VERSION) -X main.buildTime=$(BUILD_TIME) -X main.gitCommit=$(GIT_COMMIT)
|
||||
|
||||
# Default target
|
||||
all: lint test build
|
||||
|
||||
## build: Build the binary with optimizations
|
||||
build:
|
||||
@echo "🔨 Building dbbackup $(VERSION)..."
|
||||
CGO_ENABLED=0 go build -ldflags="$(LDFLAGS)" -o bin/dbbackup .
|
||||
@echo "✅ Built bin/dbbackup"
|
||||
|
||||
## build-debug: Build with debug symbols (for debugging)
|
||||
build-debug:
|
||||
@echo "🔨 Building dbbackup $(VERSION) with debug symbols..."
|
||||
go build -ldflags="-X main.version=$(VERSION) -X main.buildTime=$(BUILD_TIME) -X main.gitCommit=$(GIT_COMMIT)" -o bin/dbbackup-debug .
|
||||
@echo "✅ Built bin/dbbackup-debug"
|
||||
|
||||
## test: Run tests
|
||||
test:
|
||||
@echo "🧪 Running tests..."
|
||||
go test ./...
|
||||
|
||||
## race: Run tests with race detector
|
||||
race:
|
||||
@echo "🏃 Running tests with race detector..."
|
||||
go test -race ./...
|
||||
|
||||
## cover: Run tests with coverage report
|
||||
cover:
|
||||
@echo "📊 Running tests with coverage..."
|
||||
go test -cover ./... | tee coverage.txt
|
||||
@echo "📄 Coverage saved to coverage.txt"
|
||||
|
||||
## cover-html: Generate HTML coverage report
|
||||
cover-html:
|
||||
@echo "📊 Generating HTML coverage report..."
|
||||
go test -coverprofile=coverage.out ./...
|
||||
go tool cover -html=coverage.out -o coverage.html
|
||||
@echo "📄 Coverage report: coverage.html"
|
||||
|
||||
## lint: Run all linters
|
||||
lint: vet staticcheck golangci-lint
|
||||
|
||||
## vet: Run go vet
|
||||
vet:
|
||||
@echo "🔍 Running go vet..."
|
||||
go vet ./...
|
||||
|
||||
## staticcheck: Run staticcheck (install if missing)
|
||||
staticcheck:
|
||||
@echo "🔍 Running staticcheck..."
|
||||
@if ! command -v staticcheck >/dev/null 2>&1; then \
|
||||
echo "Installing staticcheck..."; \
|
||||
go install honnef.co/go/tools/cmd/staticcheck@latest; \
|
||||
fi
|
||||
$$(go env GOPATH)/bin/staticcheck ./...
|
||||
|
||||
## golangci-lint: Run golangci-lint (comprehensive linting)
|
||||
golangci-lint:
|
||||
@echo "🔍 Running golangci-lint..."
|
||||
@if ! command -v golangci-lint >/dev/null 2>&1; then \
|
||||
echo "Installing golangci-lint..."; \
|
||||
go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest; \
|
||||
fi
|
||||
$$(go env GOPATH)/bin/golangci-lint run --timeout 5m
|
||||
|
||||
## install-tools: Install development tools
|
||||
install-tools:
|
||||
@echo "📦 Installing development tools..."
|
||||
go install honnef.co/go/tools/cmd/staticcheck@latest
|
||||
go install golang.org/x/tools/cmd/goimports@latest
|
||||
go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest
|
||||
@echo "✅ Tools installed"
|
||||
|
||||
## fmt: Format code
|
||||
fmt:
|
||||
@echo "🎨 Formatting code..."
|
||||
gofmt -w -s .
|
||||
@which goimports > /dev/null && goimports -w . || true
|
||||
|
||||
## tidy: Tidy and verify go.mod
|
||||
tidy:
|
||||
@echo "🧹 Tidying go.mod..."
|
||||
go mod tidy
|
||||
go mod verify
|
||||
|
||||
## update: Update dependencies
|
||||
update:
|
||||
@echo "⬆️ Updating dependencies..."
|
||||
go get -u ./...
|
||||
go mod tidy
|
||||
|
||||
## clean: Clean build artifacts
|
||||
clean:
|
||||
@echo "🧹 Cleaning..."
|
||||
rm -rf bin/dbbackup bin/dbbackup-debug
|
||||
rm -f coverage.out coverage.txt coverage.html
|
||||
go clean -cache -testcache
|
||||
|
||||
## docker: Build Docker image
|
||||
docker:
|
||||
@echo "🐳 Building Docker image..."
|
||||
docker build -t dbbackup:$(VERSION) .
|
||||
|
||||
## all-platforms: Build for all platforms (uses build_all.sh)
|
||||
all-platforms:
|
||||
@echo "🌍 Building for all platforms..."
|
||||
./build_all.sh
|
||||
|
||||
## help: Show this help
|
||||
help:
|
||||
@echo "dbbackup Makefile"
|
||||
@echo ""
|
||||
@echo "Usage: make [target]"
|
||||
@echo ""
|
||||
@echo "Targets:"
|
||||
@grep -E '^## ' Makefile | sed 's/## / /'
|
||||
206
OPENSOURCE_ALTERNATIVE.md
Normal file
206
OPENSOURCE_ALTERNATIVE.md
Normal file
@ -0,0 +1,206 @@
|
||||
# dbbackup: The Real Open Source Alternative
|
||||
|
||||
## Killing Two Borgs with One Binary
|
||||
|
||||
You have two choices for database backups today:
|
||||
|
||||
1. **Pay $2,000-10,000/year per server** for Veeam, Commvault, or Veritas
|
||||
2. **Wrestle with Borg/restic** - powerful, but never designed for databases
|
||||
|
||||
**dbbackup** eliminates both problems with a single, zero-dependency binary.
|
||||
|
||||
## The Problem with Commercial Backup
|
||||
|
||||
| What You Pay For | What You Actually Get |
|
||||
|------------------|----------------------|
|
||||
| $10,000/year | Heavy agents eating CPU |
|
||||
| Complex licensing | Vendor lock-in to proprietary formats |
|
||||
| "Enterprise support" | Recovery that requires calling support |
|
||||
| "Cloud integration" | Upload to S3... eventually |
|
||||
|
||||
## The Problem with Borg/Restic
|
||||
|
||||
Great tools. Wrong use case.
|
||||
|
||||
| Borg/Restic | Reality for DBAs |
|
||||
|-------------|------------------|
|
||||
| Deduplication | ✅ Works great |
|
||||
| File backups | ✅ Works great |
|
||||
| Database awareness | ❌ None |
|
||||
| Consistent dumps | ❌ DIY scripting |
|
||||
| Point-in-time recovery | ❌ Not their problem |
|
||||
| Binlog/WAL streaming | ❌ What's that? |
|
||||
|
||||
You end up writing wrapper scripts. Then more scripts. Then a monitoring layer. Then you've built half a product anyway.
|
||||
|
||||
## What Open Source Really Means
|
||||
|
||||
**dbbackup** delivers everything - in one binary:
|
||||
|
||||
| Feature | Veeam | Borg/Restic | dbbackup |
|
||||
|---------|-------|-------------|----------|
|
||||
| Deduplication | ❌ | ✅ | ✅ Native CDC |
|
||||
| Database-aware | ✅ | ❌ | ✅ MySQL + PostgreSQL |
|
||||
| Consistent snapshots | ✅ | ❌ | ✅ LVM/ZFS/Btrfs |
|
||||
| PITR (Point-in-Time) | ❌ | ❌ | ✅ Sub-second RPO |
|
||||
| Binlog/WAL streaming | ❌ | ❌ | ✅ Continuous |
|
||||
| Direct cloud streaming | ❌ | ✅ | ✅ S3/GCS/Azure |
|
||||
| Zero dependencies | ❌ | ❌ | ✅ Single binary |
|
||||
| License cost | $$$$ | Free | **Free (Apache 2.0)** |
|
||||
|
||||
## Deduplication: We Killed the Borg
|
||||
|
||||
Content-defined chunking, just like Borg - but built for database dumps:
|
||||
|
||||
```bash
|
||||
# First backup: 5MB stored
|
||||
dbbackup dedup backup mydb.dump
|
||||
|
||||
# Second backup (modified): only 1.6KB new data!
|
||||
# 100% deduplication ratio
|
||||
dbbackup dedup backup mydb_modified.dump
|
||||
```
|
||||
|
||||
### How It Works
|
||||
- **Gear Hash CDC** - Content-defined chunking with 92%+ overlap detection
|
||||
- **SHA-256 Content-Addressed** - Chunks stored by hash, automatic dedup
|
||||
- **AES-256-GCM Encryption** - Per-chunk encryption
|
||||
- **Gzip Compression** - Enabled by default
|
||||
- **SQLite Index** - Fast lookups, portable metadata
|
||||
|
||||
### Storage Efficiency
|
||||
|
||||
| Scenario | Borg | dbbackup |
|
||||
|----------|------|----------|
|
||||
| Daily 10GB database | 10GB + ~2GB/day | 10GB + ~2GB/day |
|
||||
| Same data, knows it's a DB | Scripts needed | **Native support** |
|
||||
| Restore to point-in-time | ❌ | ✅ Built-in |
|
||||
|
||||
Same dedup math. Zero wrapper scripts.
|
||||
|
||||
## Enterprise Features, Zero Enterprise Pricing
|
||||
|
||||
### Physical Backups (MySQL 8.0.17+)
|
||||
```bash
|
||||
# Native Clone Plugin - no XtraBackup needed
|
||||
dbbackup backup single mydb --db-type mysql --cloud s3://bucket/
|
||||
```
|
||||
|
||||
### Filesystem Snapshots
|
||||
```bash
|
||||
# <100ms lock, instant snapshot, stream to cloud
|
||||
dbbackup backup --engine=snapshot --snapshot-backend=lvm
|
||||
```
|
||||
|
||||
### Continuous Binlog/WAL Streaming
|
||||
```bash
|
||||
# Real-time capture to S3 - sub-second RPO
|
||||
dbbackup binlog stream --target=s3://bucket/binlogs/
|
||||
```
|
||||
|
||||
### Parallel Cloud Upload
|
||||
```bash
|
||||
# Saturate your network, not your patience
|
||||
dbbackup backup --engine=streaming --parallel-workers=8
|
||||
```
|
||||
|
||||
## Real Numbers
|
||||
|
||||
**100GB MySQL database:**
|
||||
|
||||
| Metric | Veeam | Borg + Scripts | dbbackup |
|
||||
|--------|-------|----------------|----------|
|
||||
| Backup time | 45 min | 50 min | **12 min** |
|
||||
| Local disk needed | 100GB | 100GB | **0 GB** |
|
||||
| Recovery point | Daily | Daily | **< 1 second** |
|
||||
| Setup time | Days | Hours | **Minutes** |
|
||||
| Annual cost | $5,000+ | $0 + time | **$0** |
|
||||
|
||||
## Migration Path
|
||||
|
||||
### From Veeam
|
||||
```bash
|
||||
# Day 1: Test alongside existing
|
||||
dbbackup backup single mydb --cloud s3://test-bucket/
|
||||
|
||||
# Week 1: Compare backup times, storage costs
|
||||
# Week 2: Switch primary backups
|
||||
# Month 1: Cancel renewal, buy your team pizza
|
||||
```
|
||||
|
||||
### From Borg/Restic
|
||||
```bash
|
||||
# Day 1: Replace your wrapper scripts
|
||||
dbbackup dedup backup /var/lib/mysql/dumps/mydb.sql
|
||||
|
||||
# Day 2: Add PITR
|
||||
dbbackup binlog stream --target=/mnt/nfs/binlogs/
|
||||
|
||||
# Day 3: Delete 500 lines of bash
|
||||
```
|
||||
|
||||
## The Commands You Need
|
||||
|
||||
```bash
|
||||
# Deduplicated backups (Borg-style)
|
||||
dbbackup dedup backup <file>
|
||||
dbbackup dedup restore <id> <output>
|
||||
dbbackup dedup stats
|
||||
dbbackup dedup gc
|
||||
|
||||
# Database-native backups
|
||||
dbbackup backup single <database>
|
||||
dbbackup backup all
|
||||
dbbackup restore <backup-file>
|
||||
|
||||
# Point-in-time recovery
|
||||
dbbackup binlog stream
|
||||
dbbackup pitr restore --target-time "2026-01-12 14:30:00"
|
||||
|
||||
# Cloud targets
|
||||
--cloud s3://bucket/path/
|
||||
--cloud gs://bucket/path/
|
||||
--cloud azure://container/path/
|
||||
```
|
||||
|
||||
## Who Should Switch
|
||||
|
||||
✅ **From Veeam/Commvault**: Same capabilities, zero license fees
|
||||
✅ **From Borg/Restic**: Native database support, no wrapper scripts
|
||||
✅ **From "homegrown scripts"**: Production-ready, battle-tested
|
||||
✅ **Cloud-native deployments**: Kubernetes, ECS, Cloud Run ready
|
||||
✅ **Compliance requirements**: AES-256-GCM, audit logging
|
||||
|
||||
## Get Started
|
||||
|
||||
```bash
|
||||
# Download (single binary, ~48MB static linked)
|
||||
curl -LO https://github.com/PlusOne/dbbackup/releases/latest/download/dbbackup_linux_amd64
|
||||
chmod +x dbbackup_linux_amd64
|
||||
|
||||
# Your first deduplicated backup
|
||||
./dbbackup_linux_amd64 dedup backup /var/lib/mysql/dumps/production.sql
|
||||
|
||||
# Your first cloud backup
|
||||
./dbbackup_linux_amd64 backup single production \
|
||||
--db-type mysql \
|
||||
--cloud s3://my-backups/
|
||||
```
|
||||
|
||||
## The Bottom Line
|
||||
|
||||
| Solution | What It Costs You |
|
||||
|----------|-------------------|
|
||||
| Veeam | Money |
|
||||
| Borg/Restic | Time (scripting, integration) |
|
||||
| dbbackup | **Neither** |
|
||||
|
||||
**This is what open source really means.**
|
||||
|
||||
Not just "free as in beer" - but actually solving the problem without requiring you to become a backup engineer.
|
||||
|
||||
---
|
||||
|
||||
*Apache 2.0 Licensed. Free forever. No sales calls. No wrapper scripts.*
|
||||
|
||||
[GitHub](https://github.com/PlusOne/dbbackup) | [Releases](https://github.com/PlusOne/dbbackup/releases) | [Changelog](CHANGELOG.md)
|
||||
326
QUICK.md
326
QUICK.md
@ -1,326 +0,0 @@
|
||||
# dbbackup Quick Reference
|
||||
|
||||
Real examples, no fluff.
|
||||
|
||||
## Basic Backups
|
||||
|
||||
```bash
|
||||
# PostgreSQL cluster (all databases + globals)
|
||||
dbbackup backup cluster
|
||||
|
||||
# Single database
|
||||
dbbackup backup single myapp
|
||||
|
||||
# MySQL
|
||||
dbbackup backup single gitea --db-type mysql --host 127.0.0.1 --port 3306
|
||||
|
||||
# MySQL/MariaDB with Unix socket
|
||||
dbbackup backup single myapp --db-type mysql --socket /var/run/mysqld/mysqld.sock
|
||||
|
||||
# With compression level (0-9, default 6)
|
||||
dbbackup backup cluster --compression 9
|
||||
|
||||
# As root (requires flag)
|
||||
sudo dbbackup backup cluster --allow-root
|
||||
```
|
||||
|
||||
## PITR (Point-in-Time Recovery)
|
||||
|
||||
```bash
|
||||
# Enable WAL archiving for a database
|
||||
dbbackup pitr enable myapp /mnt/backups/wal
|
||||
|
||||
# Take base backup (required before PITR works)
|
||||
dbbackup pitr base myapp /mnt/backups/wal
|
||||
|
||||
# Check PITR status
|
||||
dbbackup pitr status myapp /mnt/backups/wal
|
||||
|
||||
# Restore to specific point in time
|
||||
dbbackup pitr restore myapp /mnt/backups/wal --target-time "2026-01-23 14:30:00"
|
||||
|
||||
# Restore to latest available
|
||||
dbbackup pitr restore myapp /mnt/backups/wal --target-time latest
|
||||
|
||||
# Disable PITR
|
||||
dbbackup pitr disable myapp
|
||||
```
|
||||
|
||||
## Deduplication
|
||||
|
||||
```bash
|
||||
# Backup with dedup (saves ~60-80% space on similar databases)
|
||||
dbbackup backup all /mnt/backups/databases --dedup
|
||||
|
||||
# Check dedup stats
|
||||
dbbackup dedup stats /mnt/backups/databases
|
||||
|
||||
# Prune orphaned chunks (after deleting old backups)
|
||||
dbbackup dedup prune /mnt/backups/databases
|
||||
|
||||
# Verify chunk integrity
|
||||
dbbackup dedup verify /mnt/backups/databases
|
||||
```
|
||||
|
||||
## Blob Statistics
|
||||
|
||||
```bash
|
||||
# Analyze blob/binary columns in a database (plan extraction strategies)
|
||||
dbbackup blob stats --database myapp
|
||||
|
||||
# Output shows tables with blob columns, row counts, and estimated sizes
|
||||
# Helps identify large binary data for separate extraction
|
||||
|
||||
# With explicit connection
|
||||
dbbackup blob stats --database myapp --host dbserver --user admin
|
||||
|
||||
# MySQL blob analysis
|
||||
dbbackup blob stats --database shopdb --db-type mysql
|
||||
```
|
||||
|
||||
## Blob Statistics
|
||||
|
||||
```bash
|
||||
# Analyze blob/binary columns in a database (plan extraction strategies)
|
||||
dbbackup blob stats --database myapp
|
||||
|
||||
# Output shows tables with blob columns, row counts, and estimated sizes
|
||||
# Helps identify large binary data for separate extraction
|
||||
|
||||
# With explicit connection
|
||||
dbbackup blob stats --database myapp --host dbserver --user admin
|
||||
|
||||
# MySQL blob analysis
|
||||
dbbackup blob stats --database shopdb --db-type mysql
|
||||
```
|
||||
|
||||
## Engine Management
|
||||
|
||||
```bash
|
||||
# List available backup engines for MySQL/MariaDB
|
||||
dbbackup engine list
|
||||
|
||||
# Get detailed info on a specific engine
|
||||
dbbackup engine info clone
|
||||
|
||||
# Get current environment info
|
||||
dbbackup engine info
|
||||
```
|
||||
|
||||
## Cloud Storage
|
||||
|
||||
```bash
|
||||
# Upload to S3
|
||||
dbbackup cloud upload /mnt/backups/databases/myapp_2026-01-23.sql.gz \
|
||||
--cloud-provider s3 \
|
||||
--cloud-bucket my-backups
|
||||
|
||||
# Upload to MinIO (self-hosted)
|
||||
dbbackup cloud upload backup.sql.gz \
|
||||
--cloud-provider minio \
|
||||
--cloud-bucket backups \
|
||||
--cloud-endpoint https://minio.internal:9000
|
||||
|
||||
# Upload to Backblaze B2
|
||||
dbbackup cloud upload backup.sql.gz \
|
||||
--cloud-provider b2 \
|
||||
--cloud-bucket my-b2-bucket
|
||||
|
||||
# With bandwidth limit (don't saturate the network)
|
||||
dbbackup cloud upload backup.sql.gz --cloud-provider s3 --cloud-bucket backups --bandwidth-limit 10MB/s
|
||||
|
||||
# List remote backups
|
||||
dbbackup cloud list --cloud-provider s3 --cloud-bucket my-backups
|
||||
|
||||
# Download
|
||||
dbbackup cloud download myapp_2026-01-23.sql.gz /tmp/ --cloud-provider s3 --cloud-bucket my-backups
|
||||
|
||||
# Delete old backup from cloud
|
||||
dbbackup cloud delete myapp_2026-01-01.sql.gz --cloud-provider s3 --cloud-bucket my-backups
|
||||
```
|
||||
|
||||
### Cloud Environment Variables
|
||||
|
||||
```bash
|
||||
# S3/MinIO
|
||||
export AWS_ACCESS_KEY_ID=AKIAXXXXXXXX
|
||||
export AWS_SECRET_ACCESS_KEY=xxxxxxxx
|
||||
export AWS_REGION=eu-central-1
|
||||
|
||||
# GCS
|
||||
export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
|
||||
|
||||
# Azure
|
||||
export AZURE_STORAGE_ACCOUNT=mystorageaccount
|
||||
export AZURE_STORAGE_KEY=xxxxxxxx
|
||||
```
|
||||
|
||||
## Encryption
|
||||
|
||||
```bash
|
||||
# Backup with encryption (AES-256-GCM)
|
||||
dbbackup backup single myapp --encrypt
|
||||
|
||||
# Use environment variable for key (recommended)
|
||||
export DBBACKUP_ENCRYPTION_KEY="my-secret-passphrase"
|
||||
dbbackup backup cluster --encrypt
|
||||
|
||||
# Or use key file
|
||||
dbbackup backup single myapp --encrypt --encryption-key-file /path/to/keyfile
|
||||
|
||||
# Restore encrypted backup (key from environment)
|
||||
dbbackup restore single myapp_2026-01-23.dump.gz.enc --confirm
|
||||
```
|
||||
|
||||
## Catalog (Backup Inventory)
|
||||
|
||||
```bash
|
||||
# Sync local backups to catalog
|
||||
dbbackup catalog sync /mnt/backups/databases
|
||||
|
||||
# List all backups
|
||||
dbbackup catalog list
|
||||
|
||||
# Show catalog statistics
|
||||
dbbackup catalog stats
|
||||
|
||||
# Show gaps (missing daily backups)
|
||||
dbbackup catalog gaps mydb --interval 24h
|
||||
|
||||
# Search backups
|
||||
dbbackup catalog search --database myapp --after 2026-01-01
|
||||
|
||||
# Show detailed info for a backup
|
||||
dbbackup catalog info myapp_2026-01-23.dump.gz
|
||||
```
|
||||
|
||||
## Restore
|
||||
|
||||
```bash
|
||||
# Preview restore (dry-run by default)
|
||||
dbbackup restore single myapp_2026-01-23.dump.gz
|
||||
|
||||
# Restore to new database
|
||||
dbbackup restore single myapp_2026-01-23.dump.gz --target myapp_restored --confirm
|
||||
|
||||
# Restore to existing database (clean first)
|
||||
dbbackup restore single myapp_2026-01-23.dump.gz --clean --confirm
|
||||
|
||||
# Restore MySQL
|
||||
dbbackup restore single gitea_2026-01-23.sql.gz --target gitea_restored \
|
||||
--db-type mysql --host 127.0.0.1 --confirm
|
||||
|
||||
# Verify restore (restores to temp db, runs checks, drops it)
|
||||
dbbackup verify-restore myapp_2026-01-23.dump.gz
|
||||
```
|
||||
|
||||
## Retention & Cleanup
|
||||
|
||||
```bash
|
||||
# Delete backups older than 30 days (keep at least 5)
|
||||
dbbackup cleanup /mnt/backups/databases --retention-days 30 --min-backups 5
|
||||
|
||||
# GFS retention: 7 daily, 4 weekly, 12 monthly
|
||||
dbbackup cleanup /mnt/backups/databases --gfs --gfs-daily 7 --gfs-weekly 4 --gfs-monthly 12
|
||||
|
||||
# Dry run (show what would be deleted)
|
||||
dbbackup cleanup /mnt/backups/databases --retention-days 7 --dry-run
|
||||
```
|
||||
|
||||
## Disaster Recovery Drill
|
||||
|
||||
```bash
|
||||
# Full DR test (restores random backup, verifies, cleans up)
|
||||
dbbackup drill /mnt/backups/databases
|
||||
|
||||
# Test specific database
|
||||
dbbackup drill /mnt/backups/databases --database myapp
|
||||
|
||||
# With email notification (configure via environment variables)
|
||||
export NOTIFY_SMTP_HOST="smtp.example.com"
|
||||
export NOTIFY_SMTP_TO="admin@example.com"
|
||||
dbbackup drill /mnt/backups/databases --database myapp
|
||||
```
|
||||
|
||||
## Monitoring & Metrics
|
||||
|
||||
```bash
|
||||
# Prometheus metrics endpoint
|
||||
dbbackup metrics serve --port 9101
|
||||
|
||||
# One-shot status check (for scripts)
|
||||
dbbackup status /mnt/backups/databases
|
||||
echo $? # 0 = OK, 1 = warnings, 2 = critical
|
||||
|
||||
# Generate HTML report
|
||||
dbbackup report /mnt/backups/databases --output backup-report.html
|
||||
```
|
||||
|
||||
## Systemd Timer (Recommended)
|
||||
|
||||
```bash
|
||||
# Install systemd units
|
||||
sudo dbbackup install systemd --backup-path /mnt/backups/databases --schedule "02:00"
|
||||
|
||||
# Creates:
|
||||
# /etc/systemd/system/dbbackup.service
|
||||
# /etc/systemd/system/dbbackup.timer
|
||||
|
||||
# Check timer
|
||||
systemctl status dbbackup.timer
|
||||
systemctl list-timers dbbackup.timer
|
||||
```
|
||||
|
||||
## Common Combinations
|
||||
|
||||
```bash
|
||||
# Full production setup: encrypted, with cloud auto-upload
|
||||
dbbackup backup cluster \
|
||||
--encrypt \
|
||||
--compression 9 \
|
||||
--cloud-auto-upload \
|
||||
--cloud-provider s3 \
|
||||
--cloud-bucket prod-backups
|
||||
|
||||
# Quick MySQL backup to S3
|
||||
dbbackup backup single shopdb --db-type mysql && \
|
||||
dbbackup cloud upload shopdb_*.sql.gz --cloud-provider s3 --cloud-bucket backups
|
||||
|
||||
# PITR-enabled PostgreSQL with cloud upload
|
||||
dbbackup pitr enable proddb /mnt/wal
|
||||
dbbackup pitr base proddb /mnt/wal
|
||||
dbbackup cloud upload /mnt/wal/*.gz --cloud-provider s3 --cloud-bucket wal-archive
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `DBBACKUP_ENCRYPTION_KEY` | Encryption passphrase |
|
||||
| `DBBACKUP_BANDWIDTH_LIMIT` | Cloud upload limit (e.g., `10MB/s`) |
|
||||
| `DBBACKUP_CLOUD_PROVIDER` | Cloud provider (s3, minio, b2) |
|
||||
| `DBBACKUP_CLOUD_BUCKET` | Cloud bucket name |
|
||||
| `DBBACKUP_CLOUD_ENDPOINT` | Custom endpoint (for MinIO) |
|
||||
| `AWS_ACCESS_KEY_ID` | S3/MinIO credentials |
|
||||
| `AWS_SECRET_ACCESS_KEY` | S3/MinIO secret key |
|
||||
| `PGHOST`, `PGPORT`, `PGUSER` | PostgreSQL connection |
|
||||
| `MYSQL_HOST`, `MYSQL_TCP_PORT` | MySQL connection |
|
||||
|
||||
## Quick Checks
|
||||
|
||||
```bash
|
||||
# What version?
|
||||
dbbackup --version
|
||||
|
||||
# Connection status
|
||||
dbbackup status
|
||||
|
||||
# Test database connection (dry-run)
|
||||
dbbackup backup single testdb --dry-run
|
||||
|
||||
# Verify a backup file
|
||||
dbbackup verify /mnt/backups/databases/myapp_2026-01-23.dump.gz
|
||||
|
||||
# Run preflight checks
|
||||
dbbackup preflight
|
||||
```
|
||||
133
QUICK_WINS.md
133
QUICK_WINS.md
@ -1,133 +0,0 @@
|
||||
# Quick Wins Shipped - January 30, 2026
|
||||
|
||||
## Summary
|
||||
|
||||
Shipped 3 high-value features in rapid succession, transforming dbbackup's analysis capabilities.
|
||||
|
||||
## Quick Win #1: Restore Preview ✅
|
||||
|
||||
**Shipped:** Commit 6f5a759 + de0582f
|
||||
**Command:** `dbbackup restore preview <backup-file>`
|
||||
|
||||
Shows comprehensive pre-restore analysis:
|
||||
- Backup format detection
|
||||
- Compressed/uncompressed size estimates
|
||||
- RTO calculation (extraction + restore time)
|
||||
- Profile-aware speed estimates
|
||||
- Resource requirements
|
||||
- Integrity validation
|
||||
|
||||
**TUI Integration:** Added RTO estimates to TUI restore preview workflow.
|
||||
|
||||
## Quick Win #2: Backup Diff ✅
|
||||
|
||||
**Shipped:** Commit 14e893f
|
||||
**Command:** `dbbackup diff <backup1> <backup2>`
|
||||
|
||||
Compare two backups intelligently:
|
||||
- Flexible input (paths, catalog IDs, `database:latest/previous`)
|
||||
- Size delta with percentage change
|
||||
- Duration comparison
|
||||
- Growth rate calculation (GB/day)
|
||||
- Growth projections (time to 10GB)
|
||||
- Compression efficiency analysis
|
||||
- JSON output for automation
|
||||
|
||||
Perfect for capacity planning and identifying sudden changes.
|
||||
|
||||
## Quick Win #3: Cost Analyzer ✅
|
||||
|
||||
**Shipped:** Commit 4ab8046
|
||||
**Command:** `dbbackup cost analyze`
|
||||
|
||||
Multi-provider cloud cost comparison:
|
||||
- 15 storage tiers analyzed across 5 providers
|
||||
- AWS S3 (6 tiers), GCS (4 tiers), Azure (3 tiers)
|
||||
- Backblaze B2 and Wasabi included
|
||||
- Monthly/annual cost projections
|
||||
- Savings vs S3 Standard baseline
|
||||
- Tiered lifecycle strategy recommendations
|
||||
- Regional pricing support
|
||||
|
||||
Shows potential savings of 90%+ with proper lifecycle policies.
|
||||
|
||||
## Impact
|
||||
|
||||
**Time to Ship:** ~3 hours total
|
||||
- Restore Preview: 1.5 hours (CLI + TUI)
|
||||
- Backup Diff: 1 hour
|
||||
- Cost Analyzer: 0.5 hours
|
||||
|
||||
**Lines of Code:**
|
||||
- Restore Preview: 328 lines (cmd/restore_preview.go)
|
||||
- Backup Diff: 419 lines (cmd/backup_diff.go)
|
||||
- Cost Analyzer: 423 lines (cmd/cost.go)
|
||||
- **Total:** 1,170 lines
|
||||
|
||||
**Value Delivered:**
|
||||
- Pre-restore confidence (avoid 2-hour mistakes)
|
||||
- Growth tracking (capacity planning)
|
||||
- Cost optimization (budget savings)
|
||||
|
||||
## Examples
|
||||
|
||||
### Restore Preview
|
||||
```bash
|
||||
dbbackup restore preview mydb_20260130.dump.gz
|
||||
# Shows: Format, size, RTO estimate, resource needs
|
||||
|
||||
# TUI integration: Shows RTO during restore confirmation
|
||||
```
|
||||
|
||||
### Backup Diff
|
||||
```bash
|
||||
# Compare two files
|
||||
dbbackup diff backup_jan15.dump.gz backup_jan30.dump.gz
|
||||
|
||||
# Compare latest two backups
|
||||
dbbackup diff mydb:latest mydb:previous
|
||||
|
||||
# Shows: Growth rate, projections, efficiency
|
||||
```
|
||||
|
||||
### Cost Analyzer
|
||||
```bash
|
||||
# Analyze all backups
|
||||
dbbackup cost analyze
|
||||
|
||||
# Specific database
|
||||
dbbackup cost analyze --database mydb --provider aws
|
||||
|
||||
# Shows: 15 tier comparison, savings, recommendations
|
||||
```
|
||||
|
||||
## Architecture Notes
|
||||
|
||||
All three features leverage existing infrastructure:
|
||||
- **Restore Preview:** Uses internal/restore diagnostics + internal/config
|
||||
- **Backup Diff:** Uses internal/catalog + internal/metadata
|
||||
- **Cost Analyzer:** Pure arithmetic, no external APIs
|
||||
|
||||
No new dependencies, no breaking changes, backward compatible.
|
||||
|
||||
## Next Steps
|
||||
|
||||
Remaining feature ideas from "legendary list":
|
||||
- Webhook integration (partial - notifications exist)
|
||||
- Compliance autopilot enhancements
|
||||
- Advanced retention policies
|
||||
- Cross-region replication
|
||||
- Backup verification automation
|
||||
|
||||
**Philosophy:** Ship fast, iterate based on feedback. These 3 quick wins provide immediate value while requiring minimal maintenance.
|
||||
|
||||
---
|
||||
|
||||
**Total Commits Today:**
|
||||
- b28e67e: docs: Remove ASCII logo
|
||||
- 6f5a759: feat: Add restore preview command
|
||||
- de0582f: feat: Add RTO estimates to TUI restore preview
|
||||
- 14e893f: feat: Add backup diff command (Quick Win #2)
|
||||
- 4ab8046: feat: Add cloud storage cost analyzer (Quick Win #3)
|
||||
|
||||
Both remotes synced: git.uuxo.net + GitHub
|
||||
160
README.md
160
README.md
@ -4,7 +4,6 @@ Database backup and restore utility for PostgreSQL, MySQL, and MariaDB.
|
||||
|
||||
[](https://opensource.org/licenses/Apache-2.0)
|
||||
[](https://golang.org/)
|
||||
[](https://github.com/PlusOne/dbbackup/releases/latest)
|
||||
|
||||
**Repository:** https://git.uuxo.net/UUXO/dbbackup
|
||||
**Mirror:** https://github.com/PlusOne/dbbackup
|
||||
@ -100,7 +99,6 @@ Database: postgres@localhost:5432 (PostgreSQL)
|
||||
Diagnose Backup File
|
||||
List & Manage Backups
|
||||
────────────────────────────────
|
||||
Tools
|
||||
View Active Operations
|
||||
Show Operation History
|
||||
Database Status & Health Check
|
||||
@ -109,22 +107,6 @@ Database: postgres@localhost:5432 (PostgreSQL)
|
||||
Quit
|
||||
```
|
||||
|
||||
**Tools Menu:**
|
||||
```
|
||||
Tools
|
||||
|
||||
Advanced utilities for database backup management
|
||||
|
||||
> Blob Statistics
|
||||
Blob Extract (externalize LOBs)
|
||||
────────────────────────────────
|
||||
Dedup Store Analyze
|
||||
Verify Backup Integrity
|
||||
Catalog Sync
|
||||
────────────────────────────────
|
||||
Back to Main Menu
|
||||
```
|
||||
|
||||
**Database Selection:**
|
||||
```
|
||||
Single Database Backup
|
||||
@ -313,12 +295,6 @@ dbbackup restore cluster backup.tar.gz --save-debug-log /tmp/restore-debug.json
|
||||
# Diagnose backup before restore
|
||||
dbbackup restore diagnose backup.dump.gz --deep
|
||||
|
||||
# Check PostgreSQL lock configuration (preflight for large restores)
|
||||
# - warns/fails when `max_locks_per_transaction` is insufficient and prints exact remediation
|
||||
# - safe to run before a restore to determine whether single-threaded restore is required
|
||||
# Example:
|
||||
# dbbackup verify-locks
|
||||
|
||||
# Cloud backup
|
||||
dbbackup backup single mydb --cloud s3://my-bucket/backups/
|
||||
|
||||
@ -338,7 +314,6 @@ dbbackup backup single mydb --dry-run
|
||||
| `restore pitr` | Point-in-Time Recovery |
|
||||
| `restore diagnose` | Diagnose backup file integrity |
|
||||
| `verify-backup` | Verify backup integrity |
|
||||
| `verify-locks` | Check PostgreSQL lock settings and get restore guidance |
|
||||
| `cleanup` | Remove old backups |
|
||||
| `status` | Check connection status |
|
||||
| `preflight` | Run pre-backup checks |
|
||||
@ -352,7 +327,6 @@ dbbackup backup single mydb --dry-run
|
||||
| `drill` | DR drill testing |
|
||||
| `report` | Compliance report generation |
|
||||
| `rto` | RTO/RPO analysis |
|
||||
| `blob stats` | Analyze blob/bytea columns in database |
|
||||
| `install` | Install as systemd service |
|
||||
| `uninstall` | Remove systemd service |
|
||||
| `metrics export` | Export Prometheus metrics to textfile |
|
||||
@ -655,87 +629,13 @@ dbbackup catalog stats
|
||||
# Detect backup gaps (missing scheduled backups)
|
||||
dbbackup catalog gaps --interval 24h --database mydb
|
||||
|
||||
# Search backups by date range
|
||||
dbbackup catalog search --database mydb --after 2024-01-01 --before 2024-12-31
|
||||
# Search backups
|
||||
dbbackup catalog search --database mydb --start 2024-01-01 --end 2024-12-31
|
||||
|
||||
# Get backup info by path
|
||||
dbbackup catalog info /backups/mydb_20240115.dump.gz
|
||||
|
||||
# Compare two backups to see what changed
|
||||
dbbackup diff /backups/mydb_20240115.dump.gz /backups/mydb_20240120.dump.gz
|
||||
|
||||
# Compare using catalog IDs
|
||||
dbbackup diff 123 456
|
||||
|
||||
# Compare latest two backups for a database
|
||||
dbbackup diff mydb:latest mydb:previous
|
||||
# Get backup info
|
||||
dbbackup catalog info 42
|
||||
```
|
||||
|
||||
## Cost Analysis
|
||||
|
||||
Analyze and optimize cloud storage costs:
|
||||
|
||||
```bash
|
||||
# Analyze current backup costs
|
||||
dbbackup cost analyze
|
||||
|
||||
# Specific database
|
||||
dbbackup cost analyze --database mydb
|
||||
|
||||
# Compare providers and tiers
|
||||
dbbackup cost analyze --provider aws --format table
|
||||
|
||||
# Get JSON for automation/reporting
|
||||
dbbackup cost analyze --format json
|
||||
```
|
||||
|
||||
**Providers analyzed:**
|
||||
- AWS S3 (Standard, IA, Glacier, Deep Archive)
|
||||
- Google Cloud Storage (Standard, Nearline, Coldline, Archive)
|
||||
- Azure Blob (Hot, Cool, Archive)
|
||||
- Backblaze B2
|
||||
- Wasabi
|
||||
|
||||
Shows tiered storage strategy recommendations with potential annual savings.
|
||||
|
||||
## Health Check
|
||||
|
||||
Comprehensive backup infrastructure health monitoring:
|
||||
|
||||
```bash
|
||||
# Quick health check
|
||||
dbbackup health
|
||||
|
||||
# Detailed output
|
||||
dbbackup health --verbose
|
||||
|
||||
# JSON for monitoring integration (Prometheus, Nagios, etc.)
|
||||
dbbackup health --format json
|
||||
|
||||
# Custom backup interval for gap detection
|
||||
dbbackup health --interval 12h
|
||||
|
||||
# Skip database connectivity (offline check)
|
||||
dbbackup health --skip-db
|
||||
```
|
||||
|
||||
**Checks performed:**
|
||||
- Configuration validity
|
||||
- Database connectivity
|
||||
- Backup directory accessibility
|
||||
- Catalog integrity
|
||||
- Backup freshness (is last backup recent?)
|
||||
- Gap detection (missed scheduled backups)
|
||||
- Verification status (% of backups verified)
|
||||
- File integrity (do files exist and match metadata?)
|
||||
- Orphaned entries (catalog entries for missing files)
|
||||
- Disk space
|
||||
|
||||
**Exit codes for automation:**
|
||||
- `0` = healthy (all checks passed)
|
||||
- `1` = warning (some checks need attention)
|
||||
- `2` = critical (immediate action required)
|
||||
|
||||
## DR Drill Testing
|
||||
|
||||
Automated disaster recovery testing restores backups to Docker containers:
|
||||
@ -744,8 +644,8 @@ Automated disaster recovery testing restores backups to Docker containers:
|
||||
# Run full DR drill
|
||||
dbbackup drill run /backups/mydb_latest.dump.gz \
|
||||
--database mydb \
|
||||
--type postgresql \
|
||||
--timeout 1800
|
||||
--db-type postgres \
|
||||
--timeout 30m
|
||||
|
||||
# Quick drill (restore + basic validation)
|
||||
dbbackup drill quick /backups/mydb_latest.dump.gz --database mydb
|
||||
@ -753,11 +653,11 @@ dbbackup drill quick /backups/mydb_latest.dump.gz --database mydb
|
||||
# List running drill containers
|
||||
dbbackup drill list
|
||||
|
||||
# Cleanup all drill containers
|
||||
dbbackup drill cleanup
|
||||
# Cleanup old drill containers
|
||||
dbbackup drill cleanup --age 24h
|
||||
|
||||
# Display a saved drill report
|
||||
dbbackup drill report drill_20240115_120000_report.json --format json
|
||||
# Generate drill report
|
||||
dbbackup drill report --format html --output drill-report.html
|
||||
```
|
||||
|
||||
**Drill phases:**
|
||||
@ -802,13 +702,16 @@ Calculate and monitor Recovery Time/Point Objectives:
|
||||
|
||||
```bash
|
||||
# Analyze RTO/RPO for a database
|
||||
dbbackup rto analyze --database mydb
|
||||
dbbackup rto analyze mydb
|
||||
|
||||
# Show status for all databases
|
||||
dbbackup rto status
|
||||
|
||||
# Check against targets
|
||||
dbbackup rto check --target-rto 4h --target-rpo 1h
|
||||
dbbackup rto check --rto 4h --rpo 1h
|
||||
|
||||
# Set target objectives
|
||||
dbbackup rto analyze mydb --target-rto 4h --target-rpo 1h
|
||||
```
|
||||
|
||||
**Analysis includes:**
|
||||
@ -856,8 +759,6 @@ sudo dbbackup uninstall cluster --purge
|
||||
|
||||
Export backup metrics for monitoring with Prometheus:
|
||||
|
||||
> **Migration Note (v1.x → v2.x):** The `--instance` flag was renamed to `--server` to avoid collision with Prometheus's reserved `instance` label. Update your cronjobs and scripts accordingly.
|
||||
|
||||
### Textfile Collector
|
||||
|
||||
For integration with node_exporter:
|
||||
@ -866,8 +767,8 @@ For integration with node_exporter:
|
||||
# Export metrics to textfile
|
||||
dbbackup metrics export --output /var/lib/node_exporter/textfile_collector/dbbackup.prom
|
||||
|
||||
# Export for specific server
|
||||
dbbackup metrics export --server production --output /var/lib/dbbackup/metrics/production.prom
|
||||
# Export for specific instance
|
||||
dbbackup metrics export --instance production --output /var/lib/dbbackup/metrics/production.prom
|
||||
```
|
||||
|
||||
Configure node_exporter:
|
||||
@ -999,29 +900,16 @@ Workload types:
|
||||
|
||||
## Documentation
|
||||
|
||||
**Quick Start:**
|
||||
- [QUICK.md](QUICK.md) - Real-world examples cheat sheet
|
||||
|
||||
**Guides:**
|
||||
- [docs/PITR.md](docs/PITR.md) - Point-in-Time Recovery (PostgreSQL)
|
||||
- [docs/MYSQL_PITR.md](docs/MYSQL_PITR.md) - Point-in-Time Recovery (MySQL)
|
||||
- [docs/ENGINES.md](docs/ENGINES.md) - Database engine configuration
|
||||
- [docs/RESTORE_PROFILES.md](docs/RESTORE_PROFILES.md) - Restore resource profiles
|
||||
|
||||
**Cloud Storage:**
|
||||
- [docs/CLOUD.md](docs/CLOUD.md) - Cloud storage overview
|
||||
- [docs/AZURE.md](docs/AZURE.md) - Azure Blob Storage
|
||||
- [docs/GCS.md](docs/GCS.md) - Google Cloud Storage
|
||||
|
||||
**Deployment:**
|
||||
- [docs/DOCKER.md](docs/DOCKER.md) - Docker deployment
|
||||
- [docs/SYSTEMD.md](docs/SYSTEMD.md) - Systemd installation & scheduling
|
||||
|
||||
**Reference:**
|
||||
- [RESTORE_PROFILES.md](RESTORE_PROFILES.md) - Restore resource profiles & troubleshooting
|
||||
- [SYSTEMD.md](SYSTEMD.md) - Systemd installation & scheduling
|
||||
- [DOCKER.md](DOCKER.md) - Docker deployment
|
||||
- [CLOUD.md](CLOUD.md) - Cloud storage configuration
|
||||
- [PITR.md](PITR.md) - Point-in-Time Recovery
|
||||
- [AZURE.md](AZURE.md) - Azure Blob Storage
|
||||
- [GCS.md](GCS.md) - Google Cloud Storage
|
||||
- [SECURITY.md](SECURITY.md) - Security considerations
|
||||
- [CONTRIBUTING.md](CONTRIBUTING.md) - Contribution guidelines
|
||||
- [CHANGELOG.md](CHANGELOG.md) - Version history
|
||||
- [docs/LOCK_DEBUGGING.md](docs/LOCK_DEBUGGING.md) - Lock troubleshooting
|
||||
|
||||
## License
|
||||
|
||||
|
||||
108
RELEASE_NOTES.md
Normal file
108
RELEASE_NOTES.md
Normal file
@ -0,0 +1,108 @@
|
||||
# v3.42.1 Release Notes
|
||||
|
||||
## What's New in v3.42.1
|
||||
|
||||
### Deduplication - Resistance is Futile
|
||||
|
||||
Content-defined chunking deduplication for space-efficient backups. Like restic/borgbackup but with **native database dump support**.
|
||||
|
||||
```bash
|
||||
# First backup: 5MB stored
|
||||
dbbackup dedup backup mydb.dump
|
||||
|
||||
# Second backup (modified): only 1.6KB new data stored!
|
||||
# 100% deduplication ratio
|
||||
dbbackup dedup backup mydb_modified.dump
|
||||
```
|
||||
|
||||
#### Features
|
||||
- **Gear Hash CDC** - Content-defined chunking with 92%+ overlap on shifted data
|
||||
- **SHA-256 Content-Addressed** - Chunks stored by hash, automatic deduplication
|
||||
- **AES-256-GCM Encryption** - Optional per-chunk encryption
|
||||
- **Gzip Compression** - Optional compression (enabled by default)
|
||||
- **SQLite Index** - Fast chunk lookups and statistics
|
||||
|
||||
#### Commands
|
||||
```bash
|
||||
dbbackup dedup backup <file> # Create deduplicated backup
|
||||
dbbackup dedup backup <file> --encrypt # With AES-256-GCM encryption
|
||||
dbbackup dedup restore <id> <output> # Restore from manifest
|
||||
dbbackup dedup list # List all backups
|
||||
dbbackup dedup stats # Show deduplication statistics
|
||||
dbbackup dedup delete <id> # Delete a backup
|
||||
dbbackup dedup gc # Garbage collect unreferenced chunks
|
||||
```
|
||||
|
||||
#### Storage Structure
|
||||
```
|
||||
<backup-dir>/dedup/
|
||||
chunks/ # Content-addressed chunk files
|
||||
ab/cdef1234... # Sharded by first 2 chars of hash
|
||||
manifests/ # JSON manifest per backup
|
||||
chunks.db # SQLite index
|
||||
```
|
||||
|
||||
### Also Included (from v3.41.x)
|
||||
- **Systemd Integration** - One-command install with `dbbackup install`
|
||||
- **Prometheus Metrics** - HTTP exporter on port 9399
|
||||
- **Backup Catalog** - SQLite-based tracking of all backup operations
|
||||
- **Prometheus Alerting Rules** - Added to SYSTEMD.md documentation
|
||||
|
||||
### Installation
|
||||
|
||||
#### Quick Install (Recommended)
|
||||
```bash
|
||||
# Download for your platform
|
||||
curl -LO https://git.uuxo.net/UUXO/dbbackup/releases/download/v3.42.1/dbbackup-linux-amd64
|
||||
|
||||
# Install with systemd service
|
||||
chmod +x dbbackup-linux-amd64
|
||||
sudo ./dbbackup-linux-amd64 install --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
#### Available Binaries
|
||||
| Platform | Architecture | Binary |
|
||||
|----------|--------------|--------|
|
||||
| Linux | amd64 | `dbbackup-linux-amd64` |
|
||||
| Linux | arm64 | `dbbackup-linux-arm64` |
|
||||
| macOS | Intel | `dbbackup-darwin-amd64` |
|
||||
| macOS | Apple Silicon | `dbbackup-darwin-arm64` |
|
||||
| FreeBSD | amd64 | `dbbackup-freebsd-amd64` |
|
||||
|
||||
### Systemd Commands
|
||||
```bash
|
||||
dbbackup install --config config.yaml # Install service + timer
|
||||
dbbackup install --status # Check service status
|
||||
dbbackup install --uninstall # Remove services
|
||||
```
|
||||
|
||||
### Prometheus Metrics
|
||||
Available at `http://localhost:9399/metrics`:
|
||||
|
||||
| Metric | Description |
|
||||
|--------|-------------|
|
||||
| `dbbackup_last_backup_timestamp` | Unix timestamp of last backup |
|
||||
| `dbbackup_last_backup_success` | 1 if successful, 0 if failed |
|
||||
| `dbbackup_last_backup_duration_seconds` | Duration of last backup |
|
||||
| `dbbackup_last_backup_size_bytes` | Size of last backup |
|
||||
| `dbbackup_backup_total` | Total number of backups |
|
||||
| `dbbackup_backup_errors_total` | Total number of failed backups |
|
||||
|
||||
### Security Features
|
||||
- Hardened systemd service with `ProtectSystem=strict`
|
||||
- `NoNewPrivileges=true` prevents privilege escalation
|
||||
- Dedicated `dbbackup` system user (optional)
|
||||
- Credential files with restricted permissions
|
||||
|
||||
### Documentation
|
||||
- [SYSTEMD.md](SYSTEMD.md) - Complete systemd installation guide
|
||||
- [README.md](README.md) - Full documentation
|
||||
- [CHANGELOG.md](CHANGELOG.md) - Version history
|
||||
|
||||
### Bug Fixes
|
||||
- Fixed SQLite time parsing in dedup stats
|
||||
- Fixed function name collision in cmd package
|
||||
|
||||
---
|
||||
|
||||
**Full Changelog**: https://git.uuxo.net/UUXO/dbbackup/compare/v3.41.1...v3.42.1
|
||||
@ -67,46 +67,18 @@ dbbackup restore cluster backup.tar.gz --profile=balanced --confirm
|
||||
dbbackup restore cluster backup.tar.gz --profile=aggressive --confirm
|
||||
```
|
||||
|
||||
### Potato Profile (`--profile=potato`)
|
||||
### Potato Profile (`--profile=potato`) 🥔
|
||||
**Easter egg:** Same as conservative, for servers running on a potato.
|
||||
|
||||
### Turbo Profile (`--profile=turbo`)
|
||||
**NEW! Best for:** Maximum restore speed - matches native pg_restore -j8 performance.
|
||||
|
||||
**Settings:**
|
||||
- Parallel databases: 2 (balanced I/O)
|
||||
- pg_restore jobs: 8 (like `pg_restore -j8`)
|
||||
- Buffered I/O: 32KB write buffers for faster extraction
|
||||
- Optimized for large databases
|
||||
|
||||
**When to use:**
|
||||
- Dedicated database server
|
||||
- Need fastest possible restore (DR scenarios)
|
||||
- Server has 16GB+ RAM, 4+ cores
|
||||
- Large databases (100GB+)
|
||||
- You want dbbackup to match pg_restore speed
|
||||
|
||||
**Example:**
|
||||
```bash
|
||||
dbbackup restore cluster backup.tar.gz --profile=turbo --confirm
|
||||
```
|
||||
|
||||
**TUI Usage:**
|
||||
1. Go to Settings → Resource Profile
|
||||
2. Press Enter to cycle until you see "turbo"
|
||||
3. Save settings and run restore
|
||||
|
||||
## Profile Comparison
|
||||
|
||||
| Setting | Conservative | Balanced | Performance | Turbo |
|
||||
|---------|-------------|----------|-------------|----------|
|
||||
| Parallel DBs | 1 | 2 | 4 | 2 |
|
||||
| pg_restore Jobs | 1 | 2 | 4 | 8 |
|
||||
| Buffered I/O | No | No | No | Yes (32KB) |
|
||||
| Memory Usage | Minimal | Moderate | High | Moderate |
|
||||
| Speed | Slowest | Medium | Fast | **Fastest** |
|
||||
| Stability | Most stable | Stable | Good | Good |
|
||||
| Best For | Small VMs | General use | Powerful servers | DR/Large DBs |
|
||||
| Setting | Conservative | Balanced | Aggressive |
|
||||
|---------|-------------|----------|-----------|
|
||||
| Parallel DBs | 1 (sequential) | Auto (2-4) | Auto (all CPUs) |
|
||||
| Jobs (decompression) | 1 | Auto (2-4) | Auto (all CPUs) |
|
||||
| Memory Usage | Minimal | Moderate | Maximum |
|
||||
| Speed | Slowest | Medium | Fastest |
|
||||
| Stability | Most stable | Stable | Requires resources |
|
||||
|
||||
## Overriding Profile Settings
|
||||
|
||||
171
RESTORE_PROGRESS_PROPOSAL.md
Normal file
171
RESTORE_PROGRESS_PROPOSAL.md
Normal file
@ -0,0 +1,171 @@
|
||||
# Restore Progress Bar Enhancement Proposal
|
||||
|
||||
## Problem
|
||||
During Phase 2 cluster restore, the progress bar is not real-time because:
|
||||
- `pg_restore` subprocess blocks until completion
|
||||
- Progress updates only happen **before** each database restore starts
|
||||
- No feedback during actual restore execution (which can take hours)
|
||||
- Users see frozen progress bar during large database restores
|
||||
|
||||
## Root Cause
|
||||
In `internal/restore/engine.go`:
|
||||
- `executeRestoreCommand()` blocks on `cmd.Wait()`
|
||||
- Progress is only reported at goroutine entry (line ~1315)
|
||||
- No streaming progress during pg_restore execution
|
||||
|
||||
## Proposed Solutions
|
||||
|
||||
### Option 1: Parse pg_restore stderr for progress (RECOMMENDED)
|
||||
**Pros:**
|
||||
- Real-time feedback during restore
|
||||
- Works with existing pg_restore
|
||||
- No external tools needed
|
||||
|
||||
**Implementation:**
|
||||
```go
|
||||
// In executeRestoreCommand, modify stderr reader:
|
||||
go func() {
|
||||
scanner := bufio.NewScanner(stderr)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
// Parse pg_restore progress lines
|
||||
// Format: "pg_restore: processing item 1234 TABLE public users"
|
||||
if strings.Contains(line, "processing item") {
|
||||
e.reportItemProgress(line) // Update progress bar
|
||||
}
|
||||
|
||||
// Capture errors
|
||||
if strings.Contains(line, "ERROR:") {
|
||||
lastError = line
|
||||
errorCount++
|
||||
}
|
||||
}
|
||||
}()
|
||||
```
|
||||
|
||||
**Add to RestoreCluster goroutine:**
|
||||
```go
|
||||
// Track sub-items within each database
|
||||
var currentDBItems, totalDBItems int
|
||||
e.setItemProgressCallback(func(current, total int) {
|
||||
currentDBItems = current
|
||||
totalDBItems = total
|
||||
// Update TUI with sub-progress
|
||||
e.reportDatabaseSubProgress(idx, totalDBs, dbName, current, total)
|
||||
})
|
||||
```
|
||||
|
||||
### Option 2: Verbose mode with line counting
|
||||
**Pros:**
|
||||
- More granular progress (row-level)
|
||||
- Shows exact operation being performed
|
||||
|
||||
**Cons:**
|
||||
- `--verbose` causes massive stderr output (OOM risk on huge DBs)
|
||||
- Currently disabled for memory safety
|
||||
- Requires careful memory management
|
||||
|
||||
### Option 3: Hybrid approach (BEST)
|
||||
**Combine both:**
|
||||
1. **Default**: Parse non-verbose pg_restore output for item counts
|
||||
2. **Small DBs** (<500MB): Enable verbose for detailed progress
|
||||
3. **Periodic updates**: Report progress every 5 seconds even without stderr changes
|
||||
|
||||
**Implementation:**
|
||||
```go
|
||||
// Add periodic progress ticker
|
||||
progressTicker := time.NewTicker(5 * time.Second)
|
||||
defer progressTicker.Stop()
|
||||
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-progressTicker.C:
|
||||
// Report heartbeat even if no stderr
|
||||
e.reportHeartbeat(dbName, time.Since(dbRestoreStart))
|
||||
case <-stderrDone:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
```
|
||||
|
||||
## Recommended Implementation Plan
|
||||
|
||||
### Phase 1: Quick Win (1-2 hours)
|
||||
1. Add heartbeat ticker in cluster restore goroutines
|
||||
2. Update TUI to show "Restoring database X... (elapsed: 3m 45s)"
|
||||
3. No code changes to pg_restore wrapper
|
||||
|
||||
### Phase 2: Parse pg_restore Output (4-6 hours)
|
||||
1. Parse stderr for "processing item" lines
|
||||
2. Extract current/total item counts
|
||||
3. Report sub-progress to TUI
|
||||
4. Update progress bar calculation:
|
||||
```
|
||||
dbProgress = baseProgress + (itemsDone/totalItems) * dbWeightedPercent
|
||||
```
|
||||
|
||||
### Phase 3: Smart Verbose Mode (optional)
|
||||
1. Detect database size before restore
|
||||
2. Enable verbose for DBs < 500MB
|
||||
3. Parse verbose output for detailed progress
|
||||
4. Automatic fallback to item-based for large DBs
|
||||
|
||||
## Files to Modify
|
||||
|
||||
1. **internal/restore/engine.go**:
|
||||
- `executeRestoreCommand()` - add progress parsing
|
||||
- `RestoreCluster()` - add heartbeat ticker
|
||||
- New: `reportItemProgress()`, `reportHeartbeat()`
|
||||
|
||||
2. **internal/tui/restore_exec.go**:
|
||||
- Update `RestoreExecModel` to handle sub-progress
|
||||
- Add "elapsed time" display during restore
|
||||
- Show item counts: "Restoring tables... (234/567)"
|
||||
|
||||
3. **internal/progress/indicator.go**:
|
||||
- Add `UpdateSubProgress(current, total int)` method
|
||||
- Add `ReportHeartbeat(elapsed time.Duration)` method
|
||||
|
||||
## Example Output
|
||||
|
||||
**Before (current):**
|
||||
```
|
||||
[====================] Phase 2/3: Restoring Databases (1/5)
|
||||
Restoring database myapp...
|
||||
[frozen for 30 minutes]
|
||||
```
|
||||
|
||||
**After (with heartbeat):**
|
||||
```
|
||||
[====================] Phase 2/3: Restoring Databases (1/5)
|
||||
Restoring database myapp... (elapsed: 4m 32s)
|
||||
[updates every 5 seconds]
|
||||
```
|
||||
|
||||
**After (with item parsing):**
|
||||
```
|
||||
[=========>-----------] Phase 2/3: Restoring Databases (1/5)
|
||||
Restoring database myapp... (processing item 1,234/5,678) (elapsed: 4m 32s)
|
||||
[smooth progress bar movement]
|
||||
```
|
||||
|
||||
## Testing Strategy
|
||||
1. Test with small DB (< 100MB) - verify heartbeat works
|
||||
2. Test with large DB (> 10GB) - verify no OOM, heartbeat works
|
||||
3. Test with BLOB-heavy DB - verify phased restore shows progress
|
||||
4. Test parallel cluster restore - verify multiple heartbeats don't conflict
|
||||
|
||||
## Risk Assessment
|
||||
- **Low risk**: Heartbeat ticker (Phase 1)
|
||||
- **Medium risk**: stderr parsing (Phase 2) - test thoroughly
|
||||
- **High risk**: Verbose mode (Phase 3) - can cause OOM
|
||||
|
||||
## Estimated Implementation Time
|
||||
- Phase 1 (heartbeat): 1-2 hours
|
||||
- Phase 2 (item parsing): 4-6 hours
|
||||
- Phase 3 (smart verbose): 8-10 hours (optional)
|
||||
|
||||
**Total for Phases 1+2: 5-8 hours**
|
||||
87
bin/README.md
Normal file
87
bin/README.md
Normal file
@ -0,0 +1,87 @@
|
||||
# DB Backup Tool - Pre-compiled Binaries
|
||||
|
||||
This directory contains pre-compiled binaries for the DB Backup Tool across multiple platforms and architectures.
|
||||
|
||||
## Build Information
|
||||
- **Version**: 3.42.81
|
||||
- **Build Time**: 2026-01-22_17:13:41_UTC
|
||||
- **Git Commit**: 6a7cf3c
|
||||
|
||||
## Recent Updates (v1.1.0)
|
||||
- ✅ Fixed TUI progress display with line-by-line output
|
||||
- ✅ Added interactive configuration settings menu
|
||||
- ✅ Improved menu navigation and responsiveness
|
||||
- ✅ Enhanced completion status handling
|
||||
- ✅ Better CPU detection and optimization
|
||||
- ✅ Silent mode support for TUI operations
|
||||
|
||||
## Available Binaries
|
||||
|
||||
### Linux
|
||||
- `dbbackup_linux_amd64` - Linux 64-bit (Intel/AMD)
|
||||
- `dbbackup_linux_arm64` - Linux 64-bit (ARM)
|
||||
- `dbbackup_linux_arm_armv7` - Linux 32-bit (ARMv7)
|
||||
|
||||
### macOS
|
||||
- `dbbackup_darwin_amd64` - macOS 64-bit (Intel)
|
||||
- `dbbackup_darwin_arm64` - macOS 64-bit (Apple Silicon)
|
||||
|
||||
### Windows
|
||||
- `dbbackup_windows_amd64.exe` - Windows 64-bit (Intel/AMD)
|
||||
- `dbbackup_windows_arm64.exe` - Windows 64-bit (ARM)
|
||||
|
||||
### BSD Systems
|
||||
- `dbbackup_freebsd_amd64` - FreeBSD 64-bit
|
||||
- `dbbackup_openbsd_amd64` - OpenBSD 64-bit
|
||||
- `dbbackup_netbsd_amd64` - NetBSD 64-bit
|
||||
|
||||
## Usage
|
||||
|
||||
1. Download the appropriate binary for your platform
|
||||
2. Make it executable (Unix-like systems): `chmod +x dbbackup_*`
|
||||
3. Run: `./dbbackup_* --help`
|
||||
|
||||
## Interactive Mode
|
||||
|
||||
Launch the interactive TUI menu for easy configuration and operation:
|
||||
|
||||
```bash
|
||||
# Interactive mode with TUI menu
|
||||
./dbbackup_linux_amd64
|
||||
|
||||
# Features:
|
||||
# - Interactive configuration settings
|
||||
# - Real-time progress display
|
||||
# - Operation history and status
|
||||
# - CPU detection and optimization
|
||||
```
|
||||
|
||||
## Command Line Mode
|
||||
|
||||
Direct command line usage with line-by-line progress:
|
||||
|
||||
```bash
|
||||
# Show CPU information and optimization settings
|
||||
./dbbackup_linux_amd64 cpu
|
||||
|
||||
# Auto-optimize for your hardware
|
||||
./dbbackup_linux_amd64 backup cluster --auto-detect-cores
|
||||
|
||||
# Manual CPU configuration
|
||||
./dbbackup_linux_amd64 backup single mydb --jobs 8 --dump-jobs 4
|
||||
|
||||
# Line-by-line progress output
|
||||
./dbbackup_linux_amd64 backup cluster --progress-type line
|
||||
```
|
||||
|
||||
## CPU Detection
|
||||
|
||||
All binaries include advanced CPU detection capabilities:
|
||||
- Automatic core detection for optimal parallelism
|
||||
- Support for different workload types (CPU-intensive, I/O-intensive, balanced)
|
||||
- Platform-specific optimizations for Linux, macOS, and Windows
|
||||
- Interactive CPU configuration in TUI mode
|
||||
|
||||
## Support
|
||||
|
||||
For issues or questions, please refer to the main project documentation.
|
||||
@ -33,7 +33,7 @@ CYAN='\033[0;36m'
|
||||
BOLD='\033[1m'
|
||||
NC='\033[0m'
|
||||
|
||||
# Platform configurations - Linux & macOS only
|
||||
# Platform configurations
|
||||
# Format: "GOOS/GOARCH:binary_suffix:description"
|
||||
PLATFORMS=(
|
||||
"linux/amd64::Linux 64-bit (Intel/AMD)"
|
||||
@ -41,6 +41,11 @@ PLATFORMS=(
|
||||
"linux/arm:_armv7:Linux 32-bit (ARMv7)"
|
||||
"darwin/amd64::macOS 64-bit (Intel)"
|
||||
"darwin/arm64::macOS 64-bit (Apple Silicon)"
|
||||
"windows/amd64:.exe:Windows 64-bit (Intel/AMD)"
|
||||
"windows/arm64:.exe:Windows 64-bit (ARM)"
|
||||
"freebsd/amd64::FreeBSD 64-bit (Intel/AMD)"
|
||||
"openbsd/amd64::OpenBSD 64-bit (Intel/AMD)"
|
||||
"netbsd/amd64::NetBSD 64-bit (Intel/AMD)"
|
||||
)
|
||||
|
||||
echo -e "${BOLD}${BLUE}🔨 Cross-Platform Build Script for ${APP_NAME}${NC}"
|
||||
|
||||
@ -58,13 +58,13 @@ var singleCmd = &cobra.Command{
|
||||
|
||||
Backup Types:
|
||||
--backup-type full - Complete full backup (default)
|
||||
--backup-type incremental - Incremental backup (only changed files since base)
|
||||
--backup-type incremental - Incremental backup (only changed files since base) [NOT IMPLEMENTED]
|
||||
|
||||
Examples:
|
||||
# Full backup (default)
|
||||
dbbackup backup single mydb
|
||||
|
||||
# Incremental backup (requires previous full backup)
|
||||
# Incremental backup (requires previous full backup) [COMING IN v2.2.1]
|
||||
dbbackup backup single mydb --backup-type incremental --base-backup mydb_20250126.tar.gz`,
|
||||
Args: cobra.MaximumNArgs(1),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
@ -114,7 +114,7 @@ func init() {
|
||||
backupCmd.AddCommand(sampleCmd)
|
||||
|
||||
// Incremental backup flags (single backup only) - using global vars to avoid initialization cycle
|
||||
singleCmd.Flags().StringVar(&backupTypeFlag, "backup-type", "full", "Backup type: full or incremental")
|
||||
singleCmd.Flags().StringVar(&backupTypeFlag, "backup-type", "full", "Backup type: full or incremental [incremental NOT IMPLEMENTED]")
|
||||
singleCmd.Flags().StringVar(&baseBackupFlag, "base-backup", "", "Path to base backup (required for incremental)")
|
||||
|
||||
// Encryption flags for all backup commands
|
||||
|
||||
@ -1,417 +0,0 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"dbbackup/internal/catalog"
|
||||
"dbbackup/internal/metadata"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
var (
|
||||
diffFormat string
|
||||
diffVerbose bool
|
||||
diffShowOnly string // changed, added, removed, all
|
||||
)
|
||||
|
||||
// diffCmd compares two backups
|
||||
var diffCmd = &cobra.Command{
|
||||
Use: "diff <backup1> <backup2>",
|
||||
Short: "Compare two backups and show differences",
|
||||
Long: `Compare two backups from the catalog and show what changed.
|
||||
|
||||
Shows:
|
||||
- New tables/databases added
|
||||
- Removed tables/databases
|
||||
- Size changes for existing tables
|
||||
- Total size delta
|
||||
- Compression ratio changes
|
||||
|
||||
Arguments can be:
|
||||
- Backup file paths (absolute or relative)
|
||||
- Backup IDs from catalog (e.g., "123", "456")
|
||||
- Database name with latest backup (e.g., "mydb:latest")
|
||||
|
||||
Examples:
|
||||
# Compare two backup files
|
||||
dbbackup diff backup1.dump.gz backup2.dump.gz
|
||||
|
||||
# Compare catalog entries by ID
|
||||
dbbackup diff 123 456
|
||||
|
||||
# Compare latest two backups for a database
|
||||
dbbackup diff mydb:latest mydb:previous
|
||||
|
||||
# Show only changes (ignore unchanged)
|
||||
dbbackup diff backup1.dump.gz backup2.dump.gz --show changed
|
||||
|
||||
# JSON output for automation
|
||||
dbbackup diff 123 456 --format json`,
|
||||
Args: cobra.ExactArgs(2),
|
||||
RunE: runDiff,
|
||||
}
|
||||
|
||||
func init() {
|
||||
rootCmd.AddCommand(diffCmd)
|
||||
|
||||
diffCmd.Flags().StringVar(&diffFormat, "format", "table", "Output format (table, json)")
|
||||
diffCmd.Flags().BoolVar(&diffVerbose, "verbose", false, "Show verbose output")
|
||||
diffCmd.Flags().StringVar(&diffShowOnly, "show", "all", "Show only: changed, added, removed, all")
|
||||
}
|
||||
|
||||
func runDiff(cmd *cobra.Command, args []string) error {
|
||||
backup1Path, err := resolveBackupArg(args[0])
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to resolve backup1: %w", err)
|
||||
}
|
||||
|
||||
backup2Path, err := resolveBackupArg(args[1])
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to resolve backup2: %w", err)
|
||||
}
|
||||
|
||||
// Load metadata for both backups
|
||||
meta1, err := metadata.Load(backup1Path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load metadata for backup1: %w", err)
|
||||
}
|
||||
|
||||
meta2, err := metadata.Load(backup2Path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load metadata for backup2: %w", err)
|
||||
}
|
||||
|
||||
// Validate same database
|
||||
if meta1.Database != meta2.Database {
|
||||
return fmt.Errorf("backups are from different databases: %s vs %s", meta1.Database, meta2.Database)
|
||||
}
|
||||
|
||||
// Calculate diff
|
||||
diff := calculateBackupDiff(meta1, meta2)
|
||||
|
||||
// Output
|
||||
if diffFormat == "json" {
|
||||
return outputDiffJSON(diff, meta1, meta2)
|
||||
}
|
||||
|
||||
return outputDiffTable(diff, meta1, meta2)
|
||||
}
|
||||
|
||||
// resolveBackupArg resolves various backup reference formats
|
||||
func resolveBackupArg(arg string) (string, error) {
|
||||
// If it looks like a file path, use it directly
|
||||
if strings.Contains(arg, "/") || strings.HasSuffix(arg, ".gz") || strings.HasSuffix(arg, ".dump") {
|
||||
if _, err := os.Stat(arg); err == nil {
|
||||
return arg, nil
|
||||
}
|
||||
return "", fmt.Errorf("backup file not found: %s", arg)
|
||||
}
|
||||
|
||||
// Try as catalog ID
|
||||
cat, err := openCatalog()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to open catalog: %w", err)
|
||||
}
|
||||
defer cat.Close()
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// Special syntax: "database:latest" or "database:previous"
|
||||
if strings.Contains(arg, ":") {
|
||||
parts := strings.Split(arg, ":")
|
||||
database := parts[0]
|
||||
position := parts[1]
|
||||
|
||||
query := &catalog.SearchQuery{
|
||||
Database: database,
|
||||
OrderBy: "created_at",
|
||||
OrderDesc: true,
|
||||
}
|
||||
|
||||
if position == "latest" {
|
||||
query.Limit = 1
|
||||
} else if position == "previous" {
|
||||
query.Limit = 2
|
||||
} else {
|
||||
return "", fmt.Errorf("invalid position: %s (use 'latest' or 'previous')", position)
|
||||
}
|
||||
|
||||
entries, err := cat.Search(ctx, query)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if len(entries) == 0 {
|
||||
return "", fmt.Errorf("no backups found for database: %s", database)
|
||||
}
|
||||
|
||||
if position == "previous" {
|
||||
if len(entries) < 2 {
|
||||
return "", fmt.Errorf("not enough backups for database: %s (need at least 2)", database)
|
||||
}
|
||||
return entries[1].BackupPath, nil
|
||||
}
|
||||
|
||||
return entries[0].BackupPath, nil
|
||||
}
|
||||
|
||||
// Try as numeric ID
|
||||
var id int64
|
||||
_, err = fmt.Sscanf(arg, "%d", &id)
|
||||
if err == nil {
|
||||
entry, err := cat.Get(ctx, id)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if entry == nil {
|
||||
return "", fmt.Errorf("backup not found with ID: %d", id)
|
||||
}
|
||||
return entry.BackupPath, nil
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("invalid backup reference: %s", arg)
|
||||
}
|
||||
|
||||
// BackupDiff represents the difference between two backups
|
||||
type BackupDiff struct {
|
||||
Database string
|
||||
Backup1Time time.Time
|
||||
Backup2Time time.Time
|
||||
TimeDelta time.Duration
|
||||
SizeDelta int64
|
||||
SizeDeltaPct float64
|
||||
DurationDelta float64
|
||||
|
||||
// Detailed changes (when metadata contains table info)
|
||||
AddedItems []DiffItem
|
||||
RemovedItems []DiffItem
|
||||
ChangedItems []DiffItem
|
||||
UnchangedItems []DiffItem
|
||||
}
|
||||
|
||||
type DiffItem struct {
|
||||
Name string
|
||||
Size1 int64
|
||||
Size2 int64
|
||||
SizeDelta int64
|
||||
DeltaPct float64
|
||||
}
|
||||
|
||||
func calculateBackupDiff(meta1, meta2 *metadata.BackupMetadata) *BackupDiff {
|
||||
diff := &BackupDiff{
|
||||
Database: meta1.Database,
|
||||
Backup1Time: meta1.Timestamp,
|
||||
Backup2Time: meta2.Timestamp,
|
||||
TimeDelta: meta2.Timestamp.Sub(meta1.Timestamp),
|
||||
SizeDelta: meta2.SizeBytes - meta1.SizeBytes,
|
||||
DurationDelta: meta2.Duration - meta1.Duration,
|
||||
}
|
||||
|
||||
if meta1.SizeBytes > 0 {
|
||||
diff.SizeDeltaPct = (float64(diff.SizeDelta) / float64(meta1.SizeBytes)) * 100.0
|
||||
}
|
||||
|
||||
// If metadata contains table-level info, compare tables
|
||||
// For now, we only have file-level comparison
|
||||
// Future enhancement: parse backup files for table sizes
|
||||
|
||||
return diff
|
||||
}
|
||||
|
||||
func outputDiffTable(diff *BackupDiff, meta1, meta2 *metadata.BackupMetadata) error {
|
||||
fmt.Println()
|
||||
fmt.Println("═══════════════════════════════════════════════════════════")
|
||||
fmt.Printf(" Backup Comparison: %s\n", diff.Database)
|
||||
fmt.Println("═══════════════════════════════════════════════════════════")
|
||||
fmt.Println()
|
||||
|
||||
// Backup info
|
||||
fmt.Printf("[BACKUP 1]\n")
|
||||
fmt.Printf(" Time: %s\n", meta1.Timestamp.Format("2006-01-02 15:04:05"))
|
||||
fmt.Printf(" Size: %s (%d bytes)\n", formatBytesForDiff(meta1.SizeBytes), meta1.SizeBytes)
|
||||
fmt.Printf(" Duration: %.2fs\n", meta1.Duration)
|
||||
fmt.Printf(" Compression: %s\n", meta1.Compression)
|
||||
fmt.Printf(" Type: %s\n", meta1.BackupType)
|
||||
fmt.Println()
|
||||
|
||||
fmt.Printf("[BACKUP 2]\n")
|
||||
fmt.Printf(" Time: %s\n", meta2.Timestamp.Format("2006-01-02 15:04:05"))
|
||||
fmt.Printf(" Size: %s (%d bytes)\n", formatBytesForDiff(meta2.SizeBytes), meta2.SizeBytes)
|
||||
fmt.Printf(" Duration: %.2fs\n", meta2.Duration)
|
||||
fmt.Printf(" Compression: %s\n", meta2.Compression)
|
||||
fmt.Printf(" Type: %s\n", meta2.BackupType)
|
||||
fmt.Println()
|
||||
|
||||
// Deltas
|
||||
fmt.Println("───────────────────────────────────────────────────────────")
|
||||
fmt.Println("[CHANGES]")
|
||||
fmt.Println("───────────────────────────────────────────────────────────")
|
||||
|
||||
// Time delta
|
||||
timeDelta := diff.TimeDelta
|
||||
fmt.Printf(" Time Between: %s\n", formatDurationForDiff(timeDelta))
|
||||
|
||||
// Size delta
|
||||
sizeIcon := "="
|
||||
if diff.SizeDelta > 0 {
|
||||
sizeIcon = "↑"
|
||||
fmt.Printf(" Size Change: %s %s (+%.1f%%)\n",
|
||||
sizeIcon, formatBytesForDiff(diff.SizeDelta), diff.SizeDeltaPct)
|
||||
} else if diff.SizeDelta < 0 {
|
||||
sizeIcon = "↓"
|
||||
fmt.Printf(" Size Change: %s %s (%.1f%%)\n",
|
||||
sizeIcon, formatBytesForDiff(-diff.SizeDelta), diff.SizeDeltaPct)
|
||||
} else {
|
||||
fmt.Printf(" Size Change: %s No change\n", sizeIcon)
|
||||
}
|
||||
|
||||
// Duration delta
|
||||
durDelta := diff.DurationDelta
|
||||
durIcon := "="
|
||||
if durDelta > 0 {
|
||||
durIcon = "↑"
|
||||
durPct := (durDelta / meta1.Duration) * 100.0
|
||||
fmt.Printf(" Duration: %s +%.2fs (+%.1f%%)\n", durIcon, durDelta, durPct)
|
||||
} else if durDelta < 0 {
|
||||
durIcon = "↓"
|
||||
durPct := (-durDelta / meta1.Duration) * 100.0
|
||||
fmt.Printf(" Duration: %s -%.2fs (-%.1f%%)\n", durIcon, -durDelta, durPct)
|
||||
} else {
|
||||
fmt.Printf(" Duration: %s No change\n", durIcon)
|
||||
}
|
||||
|
||||
// Compression efficiency
|
||||
if meta1.Compression != "none" && meta2.Compression != "none" {
|
||||
fmt.Println()
|
||||
fmt.Println("[COMPRESSION ANALYSIS]")
|
||||
// Note: We'd need uncompressed sizes to calculate actual compression ratio
|
||||
fmt.Printf(" Backup 1: %s\n", meta1.Compression)
|
||||
fmt.Printf(" Backup 2: %s\n", meta2.Compression)
|
||||
if meta1.Compression != meta2.Compression {
|
||||
fmt.Printf(" ⚠ Compression method changed\n")
|
||||
}
|
||||
}
|
||||
|
||||
// Database growth rate
|
||||
if diff.TimeDelta.Hours() > 0 {
|
||||
growthPerDay := float64(diff.SizeDelta) / diff.TimeDelta.Hours() * 24.0
|
||||
fmt.Println()
|
||||
fmt.Println("[GROWTH RATE]")
|
||||
if growthPerDay > 0 {
|
||||
fmt.Printf(" Database growing at ~%s/day\n", formatBytesForDiff(int64(growthPerDay)))
|
||||
|
||||
// Project forward
|
||||
daysTo10GB := (10*1024*1024*1024 - float64(meta2.SizeBytes)) / growthPerDay
|
||||
if daysTo10GB > 0 && daysTo10GB < 365 {
|
||||
fmt.Printf(" Will reach 10GB in ~%.0f days\n", daysTo10GB)
|
||||
}
|
||||
} else if growthPerDay < 0 {
|
||||
fmt.Printf(" Database shrinking at ~%s/day\n", formatBytesForDiff(int64(-growthPerDay)))
|
||||
} else {
|
||||
fmt.Printf(" Database size stable\n")
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
fmt.Println("═══════════════════════════════════════════════════════════")
|
||||
|
||||
if diffVerbose {
|
||||
fmt.Println()
|
||||
fmt.Println("[METADATA DIFF]")
|
||||
fmt.Printf(" Host: %s → %s\n", meta1.Host, meta2.Host)
|
||||
fmt.Printf(" Port: %d → %d\n", meta1.Port, meta2.Port)
|
||||
fmt.Printf(" DB Version: %s → %s\n", meta1.DatabaseVersion, meta2.DatabaseVersion)
|
||||
fmt.Printf(" Encrypted: %v → %v\n", meta1.Encrypted, meta2.Encrypted)
|
||||
fmt.Printf(" Checksum 1: %s\n", meta1.SHA256[:16]+"...")
|
||||
fmt.Printf(" Checksum 2: %s\n", meta2.SHA256[:16]+"...")
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
return nil
|
||||
}
|
||||
|
||||
func outputDiffJSON(diff *BackupDiff, meta1, meta2 *metadata.BackupMetadata) error {
|
||||
output := map[string]interface{}{
|
||||
"database": diff.Database,
|
||||
"backup1": map[string]interface{}{
|
||||
"timestamp": meta1.Timestamp,
|
||||
"size_bytes": meta1.SizeBytes,
|
||||
"duration": meta1.Duration,
|
||||
"compression": meta1.Compression,
|
||||
"type": meta1.BackupType,
|
||||
"version": meta1.DatabaseVersion,
|
||||
},
|
||||
"backup2": map[string]interface{}{
|
||||
"timestamp": meta2.Timestamp,
|
||||
"size_bytes": meta2.SizeBytes,
|
||||
"duration": meta2.Duration,
|
||||
"compression": meta2.Compression,
|
||||
"type": meta2.BackupType,
|
||||
"version": meta2.DatabaseVersion,
|
||||
},
|
||||
"diff": map[string]interface{}{
|
||||
"time_delta_hours": diff.TimeDelta.Hours(),
|
||||
"size_delta_bytes": diff.SizeDelta,
|
||||
"size_delta_pct": diff.SizeDeltaPct,
|
||||
"duration_delta": diff.DurationDelta,
|
||||
},
|
||||
}
|
||||
|
||||
// Calculate growth rate
|
||||
if diff.TimeDelta.Hours() > 0 {
|
||||
growthPerDay := float64(diff.SizeDelta) / diff.TimeDelta.Hours() * 24.0
|
||||
output["growth_rate_bytes_per_day"] = growthPerDay
|
||||
}
|
||||
|
||||
data, err := json.MarshalIndent(output, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Println(string(data))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Utility wrappers
|
||||
func formatBytesForDiff(bytes int64) string {
|
||||
if bytes < 0 {
|
||||
return "-" + formatBytesForDiff(-bytes)
|
||||
}
|
||||
|
||||
const unit = 1024
|
||||
if bytes < unit {
|
||||
return fmt.Sprintf("%d B", bytes)
|
||||
}
|
||||
|
||||
div, exp := int64(unit), 0
|
||||
for n := bytes / unit; n >= unit; n /= unit {
|
||||
div *= unit
|
||||
exp++
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%.2f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp])
|
||||
}
|
||||
|
||||
func formatDurationForDiff(d time.Duration) string {
|
||||
if d < 0 {
|
||||
return "-" + formatDurationForDiff(-d)
|
||||
}
|
||||
|
||||
days := int(d.Hours() / 24)
|
||||
hours := int(d.Hours()) % 24
|
||||
minutes := int(d.Minutes()) % 60
|
||||
|
||||
if days > 0 {
|
||||
return fmt.Sprintf("%dd %dh %dm", days, hours, minutes)
|
||||
}
|
||||
if hours > 0 {
|
||||
return fmt.Sprintf("%dh %dm", hours, minutes)
|
||||
}
|
||||
return fmt.Sprintf("%dm", minutes)
|
||||
}
|
||||
@ -12,7 +12,6 @@ import (
|
||||
"dbbackup/internal/checks"
|
||||
"dbbackup/internal/config"
|
||||
"dbbackup/internal/database"
|
||||
"dbbackup/internal/notify"
|
||||
"dbbackup/internal/security"
|
||||
)
|
||||
|
||||
@ -58,17 +57,6 @@ func runClusterBackup(ctx context.Context) error {
|
||||
user := security.GetCurrentUser()
|
||||
auditLogger.LogBackupStart(user, "all_databases", "cluster")
|
||||
|
||||
// Track start time for notifications
|
||||
backupStartTime := time.Now()
|
||||
|
||||
// Notify: backup started
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventBackupStarted, notify.SeverityInfo, "Cluster backup started").
|
||||
WithDatabase("all_databases").
|
||||
WithDetail("host", cfg.Host).
|
||||
WithDetail("backup_dir", cfg.BackupDir))
|
||||
}
|
||||
|
||||
// Rate limit connection attempts
|
||||
host := fmt.Sprintf("%s:%d", cfg.Host, cfg.Port)
|
||||
if err := rateLimiter.CheckAndWait(host); err != nil {
|
||||
@ -98,13 +86,6 @@ func runClusterBackup(ctx context.Context) error {
|
||||
// Perform cluster backup
|
||||
if err := engine.BackupCluster(ctx); err != nil {
|
||||
auditLogger.LogBackupFailed(user, "all_databases", err)
|
||||
// Notify: backup failed
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventBackupFailed, notify.SeverityError, "Cluster backup failed").
|
||||
WithDatabase("all_databases").
|
||||
WithError(err).
|
||||
WithDuration(time.Since(backupStartTime)))
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
@ -112,13 +93,6 @@ func runClusterBackup(ctx context.Context) error {
|
||||
if isEncryptionEnabled() {
|
||||
if err := encryptLatestClusterBackup(); err != nil {
|
||||
log.Error("Failed to encrypt backup", "error", err)
|
||||
// Notify: encryption failed
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventBackupFailed, notify.SeverityError, "Backup encryption failed").
|
||||
WithDatabase("all_databases").
|
||||
WithError(err).
|
||||
WithDuration(time.Since(backupStartTime)))
|
||||
}
|
||||
return fmt.Errorf("backup completed successfully but encryption failed. Unencrypted backup remains in %s: %w", cfg.BackupDir, err)
|
||||
}
|
||||
log.Info("Cluster backup encrypted successfully")
|
||||
@ -127,14 +101,6 @@ func runClusterBackup(ctx context.Context) error {
|
||||
// Audit log: backup success
|
||||
auditLogger.LogBackupComplete(user, "all_databases", cfg.BackupDir, 0)
|
||||
|
||||
// Notify: backup completed
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventBackupCompleted, notify.SeveritySuccess, "Cluster backup completed successfully").
|
||||
WithDatabase("all_databases").
|
||||
WithDuration(time.Since(backupStartTime)).
|
||||
WithDetail("backup_dir", cfg.BackupDir))
|
||||
}
|
||||
|
||||
// Cleanup old backups if retention policy is enabled
|
||||
if cfg.RetentionDays > 0 {
|
||||
retentionPolicy := security.NewRetentionPolicy(cfg.RetentionDays, cfg.MinBackups, log)
|
||||
@ -164,10 +130,6 @@ func runSingleBackup(ctx context.Context, databaseName string) error {
|
||||
// Update config from environment
|
||||
cfg.UpdateFromEnvironment()
|
||||
|
||||
// IMPORTANT: Set the database name from positional argument
|
||||
// This overrides the default 'postgres' when using MySQL
|
||||
cfg.Database = databaseName
|
||||
|
||||
// Validate configuration
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return fmt.Errorf("configuration error: %w", err)
|
||||
@ -178,9 +140,10 @@ func runSingleBackup(ctx context.Context, databaseName string) error {
|
||||
return runBackupPreflight(ctx, databaseName)
|
||||
}
|
||||
|
||||
// Get backup type and base backup from command line flags
|
||||
backupType := backupTypeFlag
|
||||
baseBackup := baseBackupFlag
|
||||
// Get backup type and base backup from command line flags (set via global vars in PreRunE)
|
||||
// These are populated by cobra flag binding in cmd/backup.go
|
||||
backupType := "full" // Default to full backup if not specified
|
||||
baseBackup := "" // Base backup path for incremental backups
|
||||
|
||||
// Validate backup type
|
||||
if backupType != "full" && backupType != "incremental" {
|
||||
@ -223,17 +186,6 @@ func runSingleBackup(ctx context.Context, databaseName string) error {
|
||||
user := security.GetCurrentUser()
|
||||
auditLogger.LogBackupStart(user, databaseName, "single")
|
||||
|
||||
// Track start time for notifications
|
||||
backupStartTime := time.Now()
|
||||
|
||||
// Notify: backup started
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventBackupStarted, notify.SeverityInfo, "Database backup started").
|
||||
WithDatabase(databaseName).
|
||||
WithDetail("host", cfg.Host).
|
||||
WithDetail("backup_type", backupType))
|
||||
}
|
||||
|
||||
// Rate limit connection attempts
|
||||
host := fmt.Sprintf("%s:%d", cfg.Host, cfg.Port)
|
||||
if err := rateLimiter.CheckAndWait(host); err != nil {
|
||||
@ -316,13 +268,6 @@ func runSingleBackup(ctx context.Context, databaseName string) error {
|
||||
|
||||
if backupErr != nil {
|
||||
auditLogger.LogBackupFailed(user, databaseName, backupErr)
|
||||
// Notify: backup failed
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventBackupFailed, notify.SeverityError, "Database backup failed").
|
||||
WithDatabase(databaseName).
|
||||
WithError(backupErr).
|
||||
WithDuration(time.Since(backupStartTime)))
|
||||
}
|
||||
return backupErr
|
||||
}
|
||||
|
||||
@ -330,13 +275,6 @@ func runSingleBackup(ctx context.Context, databaseName string) error {
|
||||
if isEncryptionEnabled() {
|
||||
if err := encryptLatestBackup(databaseName); err != nil {
|
||||
log.Error("Failed to encrypt backup", "error", err)
|
||||
// Notify: encryption failed
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventBackupFailed, notify.SeverityError, "Backup encryption failed").
|
||||
WithDatabase(databaseName).
|
||||
WithError(err).
|
||||
WithDuration(time.Since(backupStartTime)))
|
||||
}
|
||||
return fmt.Errorf("backup succeeded but encryption failed: %w", err)
|
||||
}
|
||||
log.Info("Backup encrypted successfully")
|
||||
@ -345,15 +283,6 @@ func runSingleBackup(ctx context.Context, databaseName string) error {
|
||||
// Audit log: backup success
|
||||
auditLogger.LogBackupComplete(user, databaseName, cfg.BackupDir, 0)
|
||||
|
||||
// Notify: backup completed
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventBackupCompleted, notify.SeveritySuccess, "Database backup completed successfully").
|
||||
WithDatabase(databaseName).
|
||||
WithDuration(time.Since(backupStartTime)).
|
||||
WithDetail("backup_dir", cfg.BackupDir).
|
||||
WithDetail("backup_type", backupType))
|
||||
}
|
||||
|
||||
// Cleanup old backups if retention policy is enabled
|
||||
if cfg.RetentionDays > 0 {
|
||||
retentionPolicy := security.NewRetentionPolicy(cfg.RetentionDays, cfg.MinBackups, log)
|
||||
@ -383,9 +312,6 @@ func runSampleBackup(ctx context.Context, databaseName string) error {
|
||||
// Update config from environment
|
||||
cfg.UpdateFromEnvironment()
|
||||
|
||||
// IMPORTANT: Set the database name from positional argument
|
||||
cfg.Database = databaseName
|
||||
|
||||
// Validate configuration
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return fmt.Errorf("configuration error: %w", err)
|
||||
|
||||
318
cmd/blob.go
318
cmd/blob.go
@ -1,318 +0,0 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"text/tabwriter"
|
||||
"time"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
|
||||
_ "github.com/go-sql-driver/mysql"
|
||||
_ "github.com/jackc/pgx/v5/stdlib" // PostgreSQL driver
|
||||
)
|
||||
|
||||
var blobCmd = &cobra.Command{
|
||||
Use: "blob",
|
||||
Short: "Large object (BLOB/BYTEA) operations",
|
||||
Long: `Analyze and manage large binary objects stored in databases.
|
||||
|
||||
Many applications store large binary data (images, PDFs, attachments) directly
|
||||
in the database. This can cause:
|
||||
- Slow backups and restores
|
||||
- Poor deduplication ratios
|
||||
- Excessive storage usage
|
||||
|
||||
The blob commands help you identify and manage this data.
|
||||
|
||||
Available Commands:
|
||||
stats Scan database for blob columns and show size statistics
|
||||
extract Extract blobs to external storage (coming soon)
|
||||
rehydrate Restore blobs from external storage (coming soon)`,
|
||||
}
|
||||
|
||||
var blobStatsCmd = &cobra.Command{
|
||||
Use: "stats",
|
||||
Short: "Show blob column statistics",
|
||||
Long: `Scan the database for BLOB/BYTEA columns and display size statistics.
|
||||
|
||||
This helps identify tables storing large binary data that might benefit
|
||||
from blob extraction for faster backups.
|
||||
|
||||
PostgreSQL column types detected:
|
||||
- bytea
|
||||
- oid (large objects)
|
||||
|
||||
MySQL/MariaDB column types detected:
|
||||
- blob, mediumblob, longblob, tinyblob
|
||||
- binary, varbinary
|
||||
|
||||
Example:
|
||||
dbbackup blob stats
|
||||
dbbackup blob stats -d myapp_production`,
|
||||
RunE: runBlobStats,
|
||||
}
|
||||
|
||||
func init() {
|
||||
rootCmd.AddCommand(blobCmd)
|
||||
blobCmd.AddCommand(blobStatsCmd)
|
||||
}
|
||||
|
||||
func runBlobStats(cmd *cobra.Command, args []string) error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
// Connect to database
|
||||
var db *sql.DB
|
||||
var err error
|
||||
|
||||
if cfg.IsPostgreSQL() {
|
||||
// PostgreSQL connection string
|
||||
connStr := fmt.Sprintf("host=%s port=%d user=%s dbname=%s sslmode=disable",
|
||||
cfg.Host, cfg.Port, cfg.User, cfg.Database)
|
||||
if cfg.Password != "" {
|
||||
connStr += fmt.Sprintf(" password=%s", cfg.Password)
|
||||
}
|
||||
db, err = sql.Open("pgx", connStr)
|
||||
} else {
|
||||
// MySQL DSN
|
||||
connStr := fmt.Sprintf("%s:%s@tcp(%s:%d)/%s",
|
||||
cfg.User, cfg.Password, cfg.Host, cfg.Port, cfg.Database)
|
||||
db, err = sql.Open("mysql", connStr)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to connect: %w", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
fmt.Printf("Scanning %s for blob columns...\n\n", cfg.DisplayDatabaseType())
|
||||
|
||||
// Discover blob columns
|
||||
type BlobColumn struct {
|
||||
Schema string
|
||||
Table string
|
||||
Column string
|
||||
DataType string
|
||||
RowCount int64
|
||||
TotalSize int64
|
||||
AvgSize int64
|
||||
MaxSize int64
|
||||
NullCount int64
|
||||
}
|
||||
|
||||
var columns []BlobColumn
|
||||
|
||||
if cfg.IsPostgreSQL() {
|
||||
query := `
|
||||
SELECT
|
||||
table_schema,
|
||||
table_name,
|
||||
column_name,
|
||||
data_type
|
||||
FROM information_schema.columns
|
||||
WHERE data_type IN ('bytea', 'oid')
|
||||
AND table_schema NOT IN ('pg_catalog', 'information_schema')
|
||||
ORDER BY table_schema, table_name, column_name
|
||||
`
|
||||
rows, err := db.QueryContext(ctx, query)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to query columns: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
for rows.Next() {
|
||||
var col BlobColumn
|
||||
if err := rows.Scan(&col.Schema, &col.Table, &col.Column, &col.DataType); err != nil {
|
||||
continue
|
||||
}
|
||||
columns = append(columns, col)
|
||||
}
|
||||
} else {
|
||||
query := `
|
||||
SELECT
|
||||
TABLE_SCHEMA,
|
||||
TABLE_NAME,
|
||||
COLUMN_NAME,
|
||||
DATA_TYPE
|
||||
FROM information_schema.COLUMNS
|
||||
WHERE DATA_TYPE IN ('blob', 'mediumblob', 'longblob', 'tinyblob', 'binary', 'varbinary')
|
||||
AND TABLE_SCHEMA NOT IN ('mysql', 'information_schema', 'performance_schema', 'sys')
|
||||
ORDER BY TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME
|
||||
`
|
||||
rows, err := db.QueryContext(ctx, query)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to query columns: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
for rows.Next() {
|
||||
var col BlobColumn
|
||||
if err := rows.Scan(&col.Schema, &col.Table, &col.Column, &col.DataType); err != nil {
|
||||
continue
|
||||
}
|
||||
columns = append(columns, col)
|
||||
}
|
||||
}
|
||||
|
||||
if len(columns) == 0 {
|
||||
fmt.Println("✓ No blob columns found in this database")
|
||||
return nil
|
||||
}
|
||||
|
||||
fmt.Printf("Found %d blob column(s), scanning sizes...\n\n", len(columns))
|
||||
|
||||
// Scan each column for size stats
|
||||
var totalBlobs, totalSize int64
|
||||
for i := range columns {
|
||||
col := &columns[i]
|
||||
|
||||
var query string
|
||||
var fullName, colName string
|
||||
|
||||
if cfg.IsPostgreSQL() {
|
||||
fullName = fmt.Sprintf(`"%s"."%s"`, col.Schema, col.Table)
|
||||
colName = fmt.Sprintf(`"%s"`, col.Column)
|
||||
query = fmt.Sprintf(`
|
||||
SELECT
|
||||
COUNT(*),
|
||||
COALESCE(SUM(COALESCE(octet_length(%s), 0)), 0),
|
||||
COALESCE(AVG(COALESCE(octet_length(%s), 0)), 0),
|
||||
COALESCE(MAX(COALESCE(octet_length(%s), 0)), 0),
|
||||
COUNT(*) - COUNT(%s)
|
||||
FROM %s
|
||||
`, colName, colName, colName, colName, fullName)
|
||||
} else {
|
||||
fullName = fmt.Sprintf("`%s`.`%s`", col.Schema, col.Table)
|
||||
colName = fmt.Sprintf("`%s`", col.Column)
|
||||
query = fmt.Sprintf(`
|
||||
SELECT
|
||||
COUNT(*),
|
||||
COALESCE(SUM(COALESCE(LENGTH(%s), 0)), 0),
|
||||
COALESCE(AVG(COALESCE(LENGTH(%s), 0)), 0),
|
||||
COALESCE(MAX(COALESCE(LENGTH(%s), 0)), 0),
|
||||
COUNT(*) - COUNT(%s)
|
||||
FROM %s
|
||||
`, colName, colName, colName, colName, fullName)
|
||||
}
|
||||
|
||||
scanCtx, scanCancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
row := db.QueryRowContext(scanCtx, query)
|
||||
var avgSize float64
|
||||
err := row.Scan(&col.RowCount, &col.TotalSize, &avgSize, &col.MaxSize, &col.NullCount)
|
||||
col.AvgSize = int64(avgSize)
|
||||
scanCancel()
|
||||
|
||||
if err != nil {
|
||||
log.Warn("Failed to scan column", "table", fullName, "column", col.Column, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
totalBlobs += col.RowCount - col.NullCount
|
||||
totalSize += col.TotalSize
|
||||
}
|
||||
|
||||
// Print summary
|
||||
fmt.Printf("═══════════════════════════════════════════════════════════════════\n")
|
||||
fmt.Printf("BLOB STATISTICS SUMMARY\n")
|
||||
fmt.Printf("═══════════════════════════════════════════════════════════════════\n")
|
||||
fmt.Printf("Total blob columns: %d\n", len(columns))
|
||||
fmt.Printf("Total blob values: %s\n", formatNumberWithCommas(totalBlobs))
|
||||
fmt.Printf("Total blob size: %s\n", formatBytesHuman(totalSize))
|
||||
fmt.Printf("═══════════════════════════════════════════════════════════════════\n\n")
|
||||
|
||||
// Print detailed table
|
||||
w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
|
||||
fmt.Fprintf(w, "SCHEMA\tTABLE\tCOLUMN\tTYPE\tROWS\tNON-NULL\tTOTAL SIZE\tAVG SIZE\tMAX SIZE\n")
|
||||
fmt.Fprintf(w, "──────\t─────\t──────\t────\t────\t────────\t──────────\t────────\t────────\n")
|
||||
|
||||
for _, col := range columns {
|
||||
nonNull := col.RowCount - col.NullCount
|
||||
fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
|
||||
truncateBlobStr(col.Schema, 15),
|
||||
truncateBlobStr(col.Table, 20),
|
||||
truncateBlobStr(col.Column, 15),
|
||||
col.DataType,
|
||||
formatNumberWithCommas(col.RowCount),
|
||||
formatNumberWithCommas(nonNull),
|
||||
formatBytesHuman(col.TotalSize),
|
||||
formatBytesHuman(col.AvgSize),
|
||||
formatBytesHuman(col.MaxSize),
|
||||
)
|
||||
}
|
||||
w.Flush()
|
||||
|
||||
// Show top tables by size
|
||||
if len(columns) > 1 {
|
||||
fmt.Println("\n───────────────────────────────────────────────────────────────────")
|
||||
fmt.Println("TOP TABLES BY BLOB SIZE:")
|
||||
|
||||
// Simple sort (bubble sort is fine for small lists)
|
||||
for i := 0; i < len(columns)-1; i++ {
|
||||
for j := i + 1; j < len(columns); j++ {
|
||||
if columns[j].TotalSize > columns[i].TotalSize {
|
||||
columns[i], columns[j] = columns[j], columns[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for i, col := range columns {
|
||||
if i >= 5 || col.TotalSize == 0 {
|
||||
break
|
||||
}
|
||||
pct := float64(col.TotalSize) / float64(totalSize) * 100
|
||||
fmt.Printf(" %d. %s.%s.%s: %s (%.1f%%)\n",
|
||||
i+1, col.Schema, col.Table, col.Column,
|
||||
formatBytesHuman(col.TotalSize), pct)
|
||||
}
|
||||
}
|
||||
|
||||
// Recommendations
|
||||
if totalSize > 100*1024*1024 { // > 100MB
|
||||
fmt.Println("\n───────────────────────────────────────────────────────────────────")
|
||||
fmt.Println("RECOMMENDATIONS:")
|
||||
fmt.Printf(" • You have %s of blob data which could benefit from extraction\n", formatBytesHuman(totalSize))
|
||||
fmt.Println(" • Consider using 'dbbackup blob extract' to externalize large objects")
|
||||
fmt.Println(" • This can improve backup speed and deduplication ratios")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func formatBytesHuman(bytes int64) string {
|
||||
const unit = 1024
|
||||
if bytes < unit {
|
||||
return fmt.Sprintf("%d B", bytes)
|
||||
}
|
||||
div, exp := int64(unit), 0
|
||||
for n := bytes / unit; n >= unit; n /= unit {
|
||||
div *= unit
|
||||
exp++
|
||||
}
|
||||
return fmt.Sprintf("%.1f %cB", float64(bytes)/float64(div), "KMGTPE"[exp])
|
||||
}
|
||||
|
||||
func formatNumberWithCommas(n int64) string {
|
||||
str := fmt.Sprintf("%d", n)
|
||||
if len(str) <= 3 {
|
||||
return str
|
||||
}
|
||||
|
||||
var result strings.Builder
|
||||
for i, c := range str {
|
||||
if i > 0 && (len(str)-i)%3 == 0 {
|
||||
result.WriteRune(',')
|
||||
}
|
||||
result.WriteRune(c)
|
||||
}
|
||||
return result.String()
|
||||
}
|
||||
|
||||
func truncateBlobStr(s string, max int) string {
|
||||
if len(s) <= max {
|
||||
return s
|
||||
}
|
||||
return s[:max-1] + "…"
|
||||
}
|
||||
@ -271,20 +271,12 @@ func runCatalogSync(cmd *cobra.Command, args []string) error {
|
||||
fmt.Printf(" [OK] Added: %d\n", result.Added)
|
||||
fmt.Printf(" [SYNC] Updated: %d\n", result.Updated)
|
||||
fmt.Printf(" [DEL] Removed: %d\n", result.Removed)
|
||||
if result.Skipped > 0 {
|
||||
fmt.Printf(" [SKIP] Skipped: %d (legacy files without metadata)\n", result.Skipped)
|
||||
}
|
||||
if result.Errors > 0 {
|
||||
fmt.Printf(" [FAIL] Errors: %d\n", result.Errors)
|
||||
}
|
||||
fmt.Printf(" [TIME] Duration: %.2fs\n", result.Duration)
|
||||
fmt.Printf("=====================================================\n")
|
||||
|
||||
// Show legacy backup warning
|
||||
if result.LegacyWarning != "" {
|
||||
fmt.Printf("\n[WARN] %s\n", result.LegacyWarning)
|
||||
}
|
||||
|
||||
// Show details if verbose
|
||||
if catalogVerbose && len(result.Details) > 0 {
|
||||
fmt.Printf("\nDetails:\n")
|
||||
|
||||
65
cmd/cloud.go
65
cmd/cloud.go
@ -30,12 +30,7 @@ Configuration via flags or environment variables:
|
||||
--cloud-region DBBACKUP_CLOUD_REGION
|
||||
--cloud-endpoint DBBACKUP_CLOUD_ENDPOINT
|
||||
--cloud-access-key DBBACKUP_CLOUD_ACCESS_KEY (or AWS_ACCESS_KEY_ID)
|
||||
--cloud-secret-key DBBACKUP_CLOUD_SECRET_KEY (or AWS_SECRET_ACCESS_KEY)
|
||||
--bandwidth-limit DBBACKUP_BANDWIDTH_LIMIT
|
||||
|
||||
Bandwidth Limiting:
|
||||
Limit upload/download speed to avoid saturating network during business hours.
|
||||
Examples: 10MB/s, 50MiB/s, 100Mbps, unlimited`,
|
||||
--cloud-secret-key DBBACKUP_CLOUD_SECRET_KEY (or AWS_SECRET_ACCESS_KEY)`,
|
||||
}
|
||||
|
||||
var cloudUploadCmd = &cobra.Command{
|
||||
@ -108,16 +103,15 @@ Examples:
|
||||
}
|
||||
|
||||
var (
|
||||
cloudProvider string
|
||||
cloudBucket string
|
||||
cloudRegion string
|
||||
cloudEndpoint string
|
||||
cloudAccessKey string
|
||||
cloudSecretKey string
|
||||
cloudPrefix string
|
||||
cloudVerbose bool
|
||||
cloudConfirm bool
|
||||
cloudBandwidthLimit string
|
||||
cloudProvider string
|
||||
cloudBucket string
|
||||
cloudRegion string
|
||||
cloudEndpoint string
|
||||
cloudAccessKey string
|
||||
cloudSecretKey string
|
||||
cloudPrefix string
|
||||
cloudVerbose bool
|
||||
cloudConfirm bool
|
||||
)
|
||||
|
||||
func init() {
|
||||
@ -133,7 +127,6 @@ func init() {
|
||||
cmd.Flags().StringVar(&cloudAccessKey, "cloud-access-key", getEnv("DBBACKUP_CLOUD_ACCESS_KEY", getEnv("AWS_ACCESS_KEY_ID", "")), "Access key")
|
||||
cmd.Flags().StringVar(&cloudSecretKey, "cloud-secret-key", getEnv("DBBACKUP_CLOUD_SECRET_KEY", getEnv("AWS_SECRET_ACCESS_KEY", "")), "Secret key")
|
||||
cmd.Flags().StringVar(&cloudPrefix, "cloud-prefix", getEnv("DBBACKUP_CLOUD_PREFIX", ""), "Key prefix")
|
||||
cmd.Flags().StringVar(&cloudBandwidthLimit, "bandwidth-limit", getEnv("DBBACKUP_BANDWIDTH_LIMIT", ""), "Bandwidth limit (e.g., 10MB/s, 100Mbps, 50MiB/s)")
|
||||
cmd.Flags().BoolVarP(&cloudVerbose, "verbose", "v", false, "Verbose output")
|
||||
}
|
||||
|
||||
@ -148,40 +141,24 @@ func getEnv(key, defaultValue string) string {
|
||||
}
|
||||
|
||||
func getCloudBackend() (cloud.Backend, error) {
|
||||
// Parse bandwidth limit
|
||||
var bandwidthLimit int64
|
||||
if cloudBandwidthLimit != "" {
|
||||
var err error
|
||||
bandwidthLimit, err = cloud.ParseBandwidth(cloudBandwidthLimit)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid bandwidth limit: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
cfg := &cloud.Config{
|
||||
Provider: cloudProvider,
|
||||
Bucket: cloudBucket,
|
||||
Region: cloudRegion,
|
||||
Endpoint: cloudEndpoint,
|
||||
AccessKey: cloudAccessKey,
|
||||
SecretKey: cloudSecretKey,
|
||||
Prefix: cloudPrefix,
|
||||
UseSSL: true,
|
||||
PathStyle: cloudProvider == "minio",
|
||||
Timeout: 300,
|
||||
MaxRetries: 3,
|
||||
BandwidthLimit: bandwidthLimit,
|
||||
Provider: cloudProvider,
|
||||
Bucket: cloudBucket,
|
||||
Region: cloudRegion,
|
||||
Endpoint: cloudEndpoint,
|
||||
AccessKey: cloudAccessKey,
|
||||
SecretKey: cloudSecretKey,
|
||||
Prefix: cloudPrefix,
|
||||
UseSSL: true,
|
||||
PathStyle: cloudProvider == "minio",
|
||||
Timeout: 300,
|
||||
MaxRetries: 3,
|
||||
}
|
||||
|
||||
if cfg.Bucket == "" {
|
||||
return nil, fmt.Errorf("bucket name is required (use --cloud-bucket or DBBACKUP_CLOUD_BUCKET)")
|
||||
}
|
||||
|
||||
// Log bandwidth limit if set
|
||||
if bandwidthLimit > 0 {
|
||||
fmt.Printf("📊 Bandwidth limit: %s\n", cloud.FormatBandwidth(bandwidthLimit))
|
||||
}
|
||||
|
||||
backend, err := cloud.NewBackend(cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create cloud backend: %w", err)
|
||||
|
||||
396
cmd/cost.go
396
cmd/cost.go
@ -1,396 +0,0 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"dbbackup/internal/catalog"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
var (
|
||||
costDatabase string
|
||||
costFormat string
|
||||
costRegion string
|
||||
costProvider string
|
||||
costDays int
|
||||
)
|
||||
|
||||
// costCmd analyzes backup storage costs
|
||||
var costCmd = &cobra.Command{
|
||||
Use: "cost",
|
||||
Short: "Analyze cloud storage costs for backups",
|
||||
Long: `Calculate and compare cloud storage costs for your backups.
|
||||
|
||||
Analyzes storage costs across providers:
|
||||
- AWS S3 (Standard, IA, Glacier, Deep Archive)
|
||||
- Google Cloud Storage (Standard, Nearline, Coldline, Archive)
|
||||
- Azure Blob Storage (Hot, Cool, Archive)
|
||||
- Backblaze B2
|
||||
- Wasabi
|
||||
|
||||
Pricing is based on standard rates and may vary by region.
|
||||
|
||||
Examples:
|
||||
# Analyze all backups
|
||||
dbbackup cost analyze
|
||||
|
||||
# Specific database
|
||||
dbbackup cost analyze --database mydb
|
||||
|
||||
# Compare providers for 90 days
|
||||
dbbackup cost analyze --days 90 --format table
|
||||
|
||||
# Estimate for specific region
|
||||
dbbackup cost analyze --region us-east-1
|
||||
|
||||
# JSON output for automation
|
||||
dbbackup cost analyze --format json`,
|
||||
}
|
||||
|
||||
var costAnalyzeCmd = &cobra.Command{
|
||||
Use: "analyze",
|
||||
Short: "Analyze backup storage costs",
|
||||
Args: cobra.NoArgs,
|
||||
RunE: runCostAnalyze,
|
||||
}
|
||||
|
||||
func init() {
|
||||
rootCmd.AddCommand(costCmd)
|
||||
costCmd.AddCommand(costAnalyzeCmd)
|
||||
|
||||
costAnalyzeCmd.Flags().StringVar(&costDatabase, "database", "", "Filter by database")
|
||||
costAnalyzeCmd.Flags().StringVar(&costFormat, "format", "table", "Output format (table, json)")
|
||||
costAnalyzeCmd.Flags().StringVar(&costRegion, "region", "us-east-1", "Cloud region for pricing")
|
||||
costAnalyzeCmd.Flags().StringVar(&costProvider, "provider", "all", "Show specific provider (all, aws, gcs, azure, b2, wasabi)")
|
||||
costAnalyzeCmd.Flags().IntVar(&costDays, "days", 30, "Number of days to calculate")
|
||||
}
|
||||
|
||||
func runCostAnalyze(cmd *cobra.Command, args []string) error {
|
||||
cat, err := openCatalog()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer cat.Close()
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// Get backup statistics
|
||||
var stats *catalog.Stats
|
||||
if costDatabase != "" {
|
||||
stats, err = cat.StatsByDatabase(ctx, costDatabase)
|
||||
} else {
|
||||
stats, err = cat.Stats(ctx)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if stats.TotalBackups == 0 {
|
||||
fmt.Println("No backups found in catalog. Run 'dbbackup catalog sync' first.")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Calculate costs
|
||||
analysis := calculateCosts(stats.TotalSize, costDays, costRegion)
|
||||
|
||||
if costFormat == "json" {
|
||||
return outputCostJSON(analysis, stats)
|
||||
}
|
||||
|
||||
return outputCostTable(analysis, stats)
|
||||
}
|
||||
|
||||
// StorageTier represents a storage class/tier
|
||||
type StorageTier struct {
|
||||
Provider string
|
||||
Tier string
|
||||
Description string
|
||||
StorageGB float64 // $ per GB/month
|
||||
RetrievalGB float64 // $ per GB retrieved
|
||||
Requests float64 // $ per 1000 requests
|
||||
MinDays int // Minimum storage duration
|
||||
}
|
||||
|
||||
// CostAnalysis represents the cost breakdown
|
||||
type CostAnalysis struct {
|
||||
TotalSizeGB float64
|
||||
Days int
|
||||
Region string
|
||||
Recommendations []TierRecommendation
|
||||
}
|
||||
|
||||
type TierRecommendation struct {
|
||||
Provider string
|
||||
Tier string
|
||||
Description string
|
||||
MonthlyStorage float64
|
||||
AnnualStorage float64
|
||||
RetrievalCost float64
|
||||
TotalMonthly float64
|
||||
TotalAnnual float64
|
||||
SavingsVsS3 float64
|
||||
SavingsPct float64
|
||||
BestFor string
|
||||
}
|
||||
|
||||
func calculateCosts(totalBytes int64, days int, region string) *CostAnalysis {
|
||||
sizeGB := float64(totalBytes) / (1024 * 1024 * 1024)
|
||||
|
||||
analysis := &CostAnalysis{
|
||||
TotalSizeGB: sizeGB,
|
||||
Days: days,
|
||||
Region: region,
|
||||
}
|
||||
|
||||
// Define storage tiers (pricing as of 2026, approximate)
|
||||
tiers := []StorageTier{
|
||||
// AWS S3
|
||||
{Provider: "AWS S3", Tier: "Standard", Description: "Frequent access",
|
||||
StorageGB: 0.023, RetrievalGB: 0.0, Requests: 0.0004, MinDays: 0},
|
||||
{Provider: "AWS S3", Tier: "Intelligent-Tiering", Description: "Auto-optimization",
|
||||
StorageGB: 0.023, RetrievalGB: 0.0, Requests: 0.0004, MinDays: 0},
|
||||
{Provider: "AWS S3", Tier: "Standard-IA", Description: "Infrequent access",
|
||||
StorageGB: 0.0125, RetrievalGB: 0.01, Requests: 0.001, MinDays: 30},
|
||||
{Provider: "AWS S3", Tier: "Glacier Instant", Description: "Archive instant",
|
||||
StorageGB: 0.004, RetrievalGB: 0.03, Requests: 0.01, MinDays: 90},
|
||||
{Provider: "AWS S3", Tier: "Glacier Flexible", Description: "Archive flexible",
|
||||
StorageGB: 0.0036, RetrievalGB: 0.02, Requests: 0.05, MinDays: 90},
|
||||
{Provider: "AWS S3", Tier: "Deep Archive", Description: "Long-term archive",
|
||||
StorageGB: 0.00099, RetrievalGB: 0.02, Requests: 0.05, MinDays: 180},
|
||||
|
||||
// Google Cloud Storage
|
||||
{Provider: "GCS", Tier: "Standard", Description: "Frequent access",
|
||||
StorageGB: 0.020, RetrievalGB: 0.0, Requests: 0.0004, MinDays: 0},
|
||||
{Provider: "GCS", Tier: "Nearline", Description: "Monthly access",
|
||||
StorageGB: 0.010, RetrievalGB: 0.01, Requests: 0.001, MinDays: 30},
|
||||
{Provider: "GCS", Tier: "Coldline", Description: "Quarterly access",
|
||||
StorageGB: 0.004, RetrievalGB: 0.02, Requests: 0.005, MinDays: 90},
|
||||
{Provider: "GCS", Tier: "Archive", Description: "Annual access",
|
||||
StorageGB: 0.0012, RetrievalGB: 0.05, Requests: 0.05, MinDays: 365},
|
||||
|
||||
// Azure Blob Storage
|
||||
{Provider: "Azure", Tier: "Hot", Description: "Frequent access",
|
||||
StorageGB: 0.0184, RetrievalGB: 0.0, Requests: 0.0004, MinDays: 0},
|
||||
{Provider: "Azure", Tier: "Cool", Description: "Infrequent access",
|
||||
StorageGB: 0.010, RetrievalGB: 0.01, Requests: 0.001, MinDays: 30},
|
||||
{Provider: "Azure", Tier: "Archive", Description: "Long-term archive",
|
||||
StorageGB: 0.00099, RetrievalGB: 0.02, Requests: 0.05, MinDays: 180},
|
||||
|
||||
// Backblaze B2
|
||||
{Provider: "Backblaze B2", Tier: "Standard", Description: "Affordable cloud",
|
||||
StorageGB: 0.005, RetrievalGB: 0.01, Requests: 0.0004, MinDays: 0},
|
||||
|
||||
// Wasabi
|
||||
{Provider: "Wasabi", Tier: "Hot Cloud", Description: "No egress fees",
|
||||
StorageGB: 0.0059, RetrievalGB: 0.0, Requests: 0.0, MinDays: 90},
|
||||
}
|
||||
|
||||
// Calculate costs for each tier
|
||||
s3StandardCost := 0.0
|
||||
for _, tier := range tiers {
|
||||
if costProvider != "all" {
|
||||
providerLower := strings.ToLower(tier.Provider)
|
||||
filterLower := strings.ToLower(costProvider)
|
||||
if !strings.Contains(providerLower, filterLower) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
rec := TierRecommendation{
|
||||
Provider: tier.Provider,
|
||||
Tier: tier.Tier,
|
||||
Description: tier.Description,
|
||||
}
|
||||
|
||||
// Monthly storage cost
|
||||
rec.MonthlyStorage = sizeGB * tier.StorageGB
|
||||
|
||||
// Annual storage cost
|
||||
rec.AnnualStorage = rec.MonthlyStorage * 12
|
||||
|
||||
// Estimate retrieval cost (assume 1 retrieval per month for DR testing)
|
||||
rec.RetrievalCost = sizeGB * tier.RetrievalGB
|
||||
|
||||
// Total costs
|
||||
rec.TotalMonthly = rec.MonthlyStorage + rec.RetrievalCost
|
||||
rec.TotalAnnual = rec.AnnualStorage + (rec.RetrievalCost * 12)
|
||||
|
||||
// Track S3 Standard for comparison
|
||||
if tier.Provider == "AWS S3" && tier.Tier == "Standard" {
|
||||
s3StandardCost = rec.TotalMonthly
|
||||
}
|
||||
|
||||
// Recommendations
|
||||
switch {
|
||||
case tier.MinDays >= 180:
|
||||
rec.BestFor = "Long-term archives (6+ months)"
|
||||
case tier.MinDays >= 90:
|
||||
rec.BestFor = "Compliance archives (3+ months)"
|
||||
case tier.MinDays >= 30:
|
||||
rec.BestFor = "Recent backups (monthly rotation)"
|
||||
default:
|
||||
rec.BestFor = "Active/hot backups (daily access)"
|
||||
}
|
||||
|
||||
analysis.Recommendations = append(analysis.Recommendations, rec)
|
||||
}
|
||||
|
||||
// Calculate savings vs S3 Standard
|
||||
if s3StandardCost > 0 {
|
||||
for i := range analysis.Recommendations {
|
||||
rec := &analysis.Recommendations[i]
|
||||
rec.SavingsVsS3 = s3StandardCost - rec.TotalMonthly
|
||||
if s3StandardCost > 0 {
|
||||
rec.SavingsPct = (rec.SavingsVsS3 / s3StandardCost) * 100.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return analysis
|
||||
}
|
||||
|
||||
func outputCostTable(analysis *CostAnalysis, stats *catalog.Stats) error {
|
||||
fmt.Println()
|
||||
fmt.Println("═══════════════════════════════════════════════════════════════════════════")
|
||||
fmt.Printf(" Cloud Storage Cost Analysis\n")
|
||||
fmt.Println("═══════════════════════════════════════════════════════════════════════════")
|
||||
fmt.Println()
|
||||
|
||||
fmt.Printf("[CURRENT BACKUP INVENTORY]\n")
|
||||
fmt.Printf(" Total Backups: %d\n", stats.TotalBackups)
|
||||
fmt.Printf(" Total Size: %.2f GB (%s)\n", analysis.TotalSizeGB, stats.TotalSizeHuman)
|
||||
if costDatabase != "" {
|
||||
fmt.Printf(" Database: %s\n", costDatabase)
|
||||
} else {
|
||||
fmt.Printf(" Databases: %d\n", len(stats.ByDatabase))
|
||||
}
|
||||
fmt.Printf(" Region: %s\n", analysis.Region)
|
||||
fmt.Printf(" Analysis Period: %d days\n", analysis.Days)
|
||||
fmt.Println()
|
||||
|
||||
fmt.Println("───────────────────────────────────────────────────────────────────────────")
|
||||
fmt.Printf("%-20s %-20s %12s %12s %12s\n",
|
||||
"PROVIDER", "TIER", "MONTHLY", "ANNUAL", "SAVINGS")
|
||||
fmt.Println("───────────────────────────────────────────────────────────────────────────")
|
||||
|
||||
for _, rec := range analysis.Recommendations {
|
||||
savings := ""
|
||||
if rec.SavingsVsS3 > 0 {
|
||||
savings = fmt.Sprintf("↓ $%.2f (%.0f%%)", rec.SavingsVsS3, rec.SavingsPct)
|
||||
} else if rec.SavingsVsS3 < 0 {
|
||||
savings = fmt.Sprintf("↑ $%.2f", -rec.SavingsVsS3)
|
||||
} else {
|
||||
savings = "baseline"
|
||||
}
|
||||
|
||||
fmt.Printf("%-20s %-20s $%10.2f $%10.2f %s\n",
|
||||
rec.Provider,
|
||||
rec.Tier,
|
||||
rec.TotalMonthly,
|
||||
rec.TotalAnnual,
|
||||
savings,
|
||||
)
|
||||
}
|
||||
|
||||
fmt.Println("───────────────────────────────────────────────────────────────────────────")
|
||||
fmt.Println()
|
||||
|
||||
// Top recommendations
|
||||
fmt.Println("[COST OPTIMIZATION RECOMMENDATIONS]")
|
||||
fmt.Println()
|
||||
|
||||
// Find cheapest option
|
||||
cheapest := analysis.Recommendations[0]
|
||||
for _, rec := range analysis.Recommendations {
|
||||
if rec.TotalAnnual < cheapest.TotalAnnual {
|
||||
cheapest = rec
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("💰 CHEAPEST OPTION: %s %s\n", cheapest.Provider, cheapest.Tier)
|
||||
fmt.Printf(" Annual Cost: $%.2f (save $%.2f/year vs S3 Standard)\n",
|
||||
cheapest.TotalAnnual, cheapest.SavingsVsS3*12)
|
||||
fmt.Printf(" Best For: %s\n", cheapest.BestFor)
|
||||
fmt.Println()
|
||||
|
||||
// Find best balance
|
||||
fmt.Printf("⚖️ BALANCED OPTION: AWS S3 Standard-IA or GCS Nearline\n")
|
||||
fmt.Printf(" Good balance of cost and accessibility\n")
|
||||
fmt.Printf(" Suitable for 30-day retention backups\n")
|
||||
fmt.Println()
|
||||
|
||||
// Find hot storage
|
||||
fmt.Printf("🔥 HOT STORAGE: Wasabi or Backblaze B2\n")
|
||||
fmt.Printf(" No egress fees (Wasabi) or low retrieval costs\n")
|
||||
fmt.Printf(" Perfect for frequent restore testing\n")
|
||||
fmt.Println()
|
||||
|
||||
// Strategy recommendation
|
||||
fmt.Println("[TIERED STORAGE STRATEGY]")
|
||||
fmt.Println()
|
||||
fmt.Printf(" Day 0-7: S3 Standard or Wasabi (frequent access)\n")
|
||||
fmt.Printf(" Day 8-30: S3 Standard-IA or GCS Nearline (weekly access)\n")
|
||||
fmt.Printf(" Day 31-90: S3 Glacier or GCS Coldline (monthly access)\n")
|
||||
fmt.Printf(" Day 90+: S3 Deep Archive or GCS Archive (compliance)\n")
|
||||
fmt.Println()
|
||||
|
||||
potentialSaving := 0.0
|
||||
for _, rec := range analysis.Recommendations {
|
||||
if rec.Provider == "AWS S3" && rec.Tier == "Deep Archive" {
|
||||
potentialSaving = rec.SavingsVsS3 * 12
|
||||
}
|
||||
}
|
||||
|
||||
if potentialSaving > 0 {
|
||||
fmt.Printf("💡 With tiered lifecycle policies, you could save ~$%.2f/year\n", potentialSaving)
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
fmt.Println("═══════════════════════════════════════════════════════════════════════════")
|
||||
fmt.Println()
|
||||
fmt.Println("Note: Costs are estimates based on standard pricing.")
|
||||
fmt.Println("Actual costs may vary by region, usage patterns, and current pricing.")
|
||||
fmt.Println()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func outputCostJSON(analysis *CostAnalysis, stats *catalog.Stats) error {
|
||||
output := map[string]interface{}{
|
||||
"inventory": map[string]interface{}{
|
||||
"total_backups": stats.TotalBackups,
|
||||
"total_size_gb": analysis.TotalSizeGB,
|
||||
"total_size_human": stats.TotalSizeHuman,
|
||||
"region": analysis.Region,
|
||||
"analysis_days": analysis.Days,
|
||||
},
|
||||
"recommendations": analysis.Recommendations,
|
||||
}
|
||||
|
||||
// Find cheapest
|
||||
cheapest := analysis.Recommendations[0]
|
||||
for _, rec := range analysis.Recommendations {
|
||||
if rec.TotalAnnual < cheapest.TotalAnnual {
|
||||
cheapest = rec
|
||||
}
|
||||
}
|
||||
|
||||
output["cheapest"] = map[string]interface{}{
|
||||
"provider": cheapest.Provider,
|
||||
"tier": cheapest.Tier,
|
||||
"annual_cost": cheapest.TotalAnnual,
|
||||
"monthly_cost": cheapest.TotalMonthly,
|
||||
}
|
||||
|
||||
data, err := json.MarshalIndent(output, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Println(string(data))
|
||||
return nil
|
||||
}
|
||||
26
cmd/dedup.go
26
cmd/dedup.go
@ -1,6 +1,7 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"compress/gzip"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
@ -13,7 +14,6 @@ import (
|
||||
|
||||
"dbbackup/internal/dedup"
|
||||
|
||||
"github.com/klauspost/pgzip"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
@ -164,8 +164,8 @@ var (
|
||||
|
||||
// metrics flags
|
||||
var (
|
||||
dedupMetricsOutput string
|
||||
dedupMetricsServer string
|
||||
dedupMetricsOutput string
|
||||
dedupMetricsInstance string
|
||||
)
|
||||
|
||||
var dedupMetricsCmd = &cobra.Command{
|
||||
@ -241,7 +241,7 @@ func init() {
|
||||
|
||||
// Metrics flags
|
||||
dedupMetricsCmd.Flags().StringVarP(&dedupMetricsOutput, "output", "o", "", "Output file path (default: stdout)")
|
||||
dedupMetricsCmd.Flags().StringVar(&dedupMetricsServer, "server", "", "Server label for metrics (default: hostname)")
|
||||
dedupMetricsCmd.Flags().StringVar(&dedupMetricsInstance, "instance", "", "Instance label for metrics (default: hostname)")
|
||||
}
|
||||
|
||||
func getDedupDir() string {
|
||||
@ -295,7 +295,7 @@ func runDedupBackup(cmd *cobra.Command, args []string) error {
|
||||
|
||||
if isGzipped && dedupDecompress {
|
||||
fmt.Printf("Auto-decompressing gzip input for better dedup ratio...\n")
|
||||
gzReader, err := pgzip.NewReader(file)
|
||||
gzReader, err := gzip.NewReader(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to decompress gzip input: %w", err)
|
||||
}
|
||||
@ -596,7 +596,7 @@ func runDedupList(cmd *cobra.Command, args []string) error {
|
||||
func runDedupStats(cmd *cobra.Command, args []string) error {
|
||||
basePath := getDedupDir()
|
||||
|
||||
index, err := dedup.NewChunkIndexAt(getIndexDBPath())
|
||||
index, err := dedup.NewChunkIndex(basePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open chunk index: %w", err)
|
||||
}
|
||||
@ -642,7 +642,7 @@ func runDedupStats(cmd *cobra.Command, args []string) error {
|
||||
func runDedupGC(cmd *cobra.Command, args []string) error {
|
||||
basePath := getDedupDir()
|
||||
|
||||
index, err := dedup.NewChunkIndexAt(getIndexDBPath())
|
||||
index, err := dedup.NewChunkIndex(basePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open chunk index: %w", err)
|
||||
}
|
||||
@ -702,7 +702,7 @@ func runDedupDelete(cmd *cobra.Command, args []string) error {
|
||||
return fmt.Errorf("failed to open manifest store: %w", err)
|
||||
}
|
||||
|
||||
index, err := dedup.NewChunkIndexAt(getIndexDBPath())
|
||||
index, err := dedup.NewChunkIndex(basePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open chunk index: %w", err)
|
||||
}
|
||||
@ -1258,10 +1258,10 @@ func runDedupMetrics(cmd *cobra.Command, args []string) error {
|
||||
basePath := getDedupDir()
|
||||
indexPath := getIndexDBPath()
|
||||
|
||||
server := dedupMetricsServer
|
||||
if server == "" {
|
||||
instance := dedupMetricsInstance
|
||||
if instance == "" {
|
||||
hostname, _ := os.Hostname()
|
||||
server = hostname
|
||||
instance = hostname
|
||||
}
|
||||
|
||||
metrics, err := dedup.CollectMetrics(basePath, indexPath)
|
||||
@ -1269,10 +1269,10 @@ func runDedupMetrics(cmd *cobra.Command, args []string) error {
|
||||
return fmt.Errorf("failed to collect metrics: %w", err)
|
||||
}
|
||||
|
||||
output := dedup.FormatPrometheusMetrics(metrics, server)
|
||||
output := dedup.FormatPrometheusMetrics(metrics, instance)
|
||||
|
||||
if dedupMetricsOutput != "" {
|
||||
if err := dedup.WritePrometheusTextfile(dedupMetricsOutput, server, basePath, indexPath); err != nil {
|
||||
if err := dedup.WritePrometheusTextfile(dedupMetricsOutput, instance, basePath, indexPath); err != nil {
|
||||
return fmt.Errorf("failed to write metrics: %w", err)
|
||||
}
|
||||
fmt.Printf("Wrote metrics to %s\n", dedupMetricsOutput)
|
||||
|
||||
@ -16,7 +16,8 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
drillDatabaseName string
|
||||
drillBackupPath string
|
||||
drillDatabaseName string
|
||||
drillDatabaseType string
|
||||
drillImage string
|
||||
drillPort int
|
||||
|
||||
@ -65,3 +65,13 @@ func loadEncryptionKey(keyFile, keyEnvVar string) ([]byte, error) {
|
||||
func isEncryptionEnabled() bool {
|
||||
return encryptBackupFlag
|
||||
}
|
||||
|
||||
// generateEncryptionKey generates a new random encryption key
|
||||
func generateEncryptionKey() ([]byte, error) {
|
||||
salt, err := crypto.GenerateSalt()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// For key generation, use salt as both password and salt (random)
|
||||
return crypto.DeriveKey(salt, salt), nil
|
||||
}
|
||||
|
||||
699
cmd/health.go
699
cmd/health.go
@ -1,699 +0,0 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"dbbackup/internal/catalog"
|
||||
"dbbackup/internal/database"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
var (
|
||||
healthFormat string
|
||||
healthVerbose bool
|
||||
healthInterval string
|
||||
healthSkipDB bool
|
||||
)
|
||||
|
||||
// HealthStatus represents overall health
|
||||
type HealthStatus string
|
||||
|
||||
const (
|
||||
StatusHealthy HealthStatus = "healthy"
|
||||
StatusWarning HealthStatus = "warning"
|
||||
StatusCritical HealthStatus = "critical"
|
||||
)
|
||||
|
||||
// HealthReport contains the complete health check results
|
||||
type HealthReport struct {
|
||||
Status HealthStatus `json:"status"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Summary string `json:"summary"`
|
||||
Checks []HealthCheck `json:"checks"`
|
||||
Recommendations []string `json:"recommendations,omitempty"`
|
||||
}
|
||||
|
||||
// HealthCheck represents a single health check
|
||||
type HealthCheck struct {
|
||||
Name string `json:"name"`
|
||||
Status HealthStatus `json:"status"`
|
||||
Message string `json:"message"`
|
||||
Details string `json:"details,omitempty"`
|
||||
}
|
||||
|
||||
// healthCmd is the health check command
|
||||
var healthCmd = &cobra.Command{
|
||||
Use: "health",
|
||||
Short: "Check backup system health",
|
||||
Long: `Comprehensive health check for your backup infrastructure.
|
||||
|
||||
Checks:
|
||||
- Database connectivity (can we reach the database?)
|
||||
- Catalog integrity (is the backup database healthy?)
|
||||
- Backup freshness (are backups up to date?)
|
||||
- Gap detection (any missed scheduled backups?)
|
||||
- Verification status (are backups verified?)
|
||||
- File integrity (do backup files exist and match metadata?)
|
||||
- Disk space (sufficient space for operations?)
|
||||
- Configuration (valid settings?)
|
||||
|
||||
Exit codes for automation:
|
||||
0 = healthy (all checks passed)
|
||||
1 = warning (some checks need attention)
|
||||
2 = critical (immediate action required)
|
||||
|
||||
Examples:
|
||||
# Quick health check
|
||||
dbbackup health
|
||||
|
||||
# Detailed output
|
||||
dbbackup health --verbose
|
||||
|
||||
# JSON for monitoring integration
|
||||
dbbackup health --format json
|
||||
|
||||
# Custom backup interval for gap detection
|
||||
dbbackup health --interval 12h
|
||||
|
||||
# Skip database connectivity (offline check)
|
||||
dbbackup health --skip-db`,
|
||||
RunE: runHealthCheck,
|
||||
}
|
||||
|
||||
func init() {
|
||||
rootCmd.AddCommand(healthCmd)
|
||||
|
||||
healthCmd.Flags().StringVar(&healthFormat, "format", "table", "Output format (table, json)")
|
||||
healthCmd.Flags().BoolVarP(&healthVerbose, "verbose", "v", false, "Show detailed output")
|
||||
healthCmd.Flags().StringVar(&healthInterval, "interval", "24h", "Expected backup interval for gap detection")
|
||||
healthCmd.Flags().BoolVar(&healthSkipDB, "skip-db", false, "Skip database connectivity check")
|
||||
}
|
||||
|
||||
func runHealthCheck(cmd *cobra.Command, args []string) error {
|
||||
report := &HealthReport{
|
||||
Status: StatusHealthy,
|
||||
Timestamp: time.Now(),
|
||||
Checks: []HealthCheck{},
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// Parse interval for gap detection
|
||||
interval, err := time.ParseDuration(healthInterval)
|
||||
if err != nil {
|
||||
interval = 24 * time.Hour
|
||||
}
|
||||
|
||||
// 1. Configuration check
|
||||
report.addCheck(checkConfiguration())
|
||||
|
||||
// 2. Database connectivity (unless skipped)
|
||||
if !healthSkipDB {
|
||||
report.addCheck(checkDatabaseConnectivity(ctx))
|
||||
}
|
||||
|
||||
// 3. Backup directory check
|
||||
report.addCheck(checkBackupDir())
|
||||
|
||||
// 4. Catalog integrity check
|
||||
catalogCheck, cat := checkCatalogIntegrity(ctx)
|
||||
report.addCheck(catalogCheck)
|
||||
|
||||
if cat != nil {
|
||||
defer cat.Close()
|
||||
|
||||
// 5. Backup freshness check
|
||||
report.addCheck(checkBackupFreshness(ctx, cat, interval))
|
||||
|
||||
// 6. Gap detection
|
||||
report.addCheck(checkBackupGaps(ctx, cat, interval))
|
||||
|
||||
// 7. Verification status
|
||||
report.addCheck(checkVerificationStatus(ctx, cat))
|
||||
|
||||
// 8. File integrity (sampling)
|
||||
report.addCheck(checkFileIntegrity(ctx, cat))
|
||||
|
||||
// 9. Orphaned entries
|
||||
report.addCheck(checkOrphanedEntries(ctx, cat))
|
||||
}
|
||||
|
||||
// 10. Disk space
|
||||
report.addCheck(checkDiskSpace())
|
||||
|
||||
// Calculate overall status
|
||||
report.calculateOverallStatus()
|
||||
|
||||
// Generate recommendations
|
||||
report.generateRecommendations()
|
||||
|
||||
// Output
|
||||
if healthFormat == "json" {
|
||||
return outputHealthJSON(report)
|
||||
}
|
||||
|
||||
outputHealthTable(report)
|
||||
|
||||
// Exit code based on status
|
||||
switch report.Status {
|
||||
case StatusWarning:
|
||||
os.Exit(1)
|
||||
case StatusCritical:
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *HealthReport) addCheck(check HealthCheck) {
|
||||
r.Checks = append(r.Checks, check)
|
||||
}
|
||||
|
||||
func (r *HealthReport) calculateOverallStatus() {
|
||||
criticalCount := 0
|
||||
warningCount := 0
|
||||
healthyCount := 0
|
||||
|
||||
for _, check := range r.Checks {
|
||||
switch check.Status {
|
||||
case StatusCritical:
|
||||
criticalCount++
|
||||
case StatusWarning:
|
||||
warningCount++
|
||||
case StatusHealthy:
|
||||
healthyCount++
|
||||
}
|
||||
}
|
||||
|
||||
if criticalCount > 0 {
|
||||
r.Status = StatusCritical
|
||||
r.Summary = fmt.Sprintf("%d critical, %d warning, %d healthy", criticalCount, warningCount, healthyCount)
|
||||
} else if warningCount > 0 {
|
||||
r.Status = StatusWarning
|
||||
r.Summary = fmt.Sprintf("%d warning, %d healthy", warningCount, healthyCount)
|
||||
} else {
|
||||
r.Status = StatusHealthy
|
||||
r.Summary = fmt.Sprintf("All %d checks passed", healthyCount)
|
||||
}
|
||||
}
|
||||
|
||||
func (r *HealthReport) generateRecommendations() {
|
||||
for _, check := range r.Checks {
|
||||
switch {
|
||||
case check.Name == "Backup Freshness" && check.Status != StatusHealthy:
|
||||
r.Recommendations = append(r.Recommendations, "Run a backup immediately: dbbackup backup cluster")
|
||||
case check.Name == "Verification Status" && check.Status != StatusHealthy:
|
||||
r.Recommendations = append(r.Recommendations, "Verify recent backups: dbbackup verify-backup /path/to/backup")
|
||||
case check.Name == "Disk Space" && check.Status != StatusHealthy:
|
||||
r.Recommendations = append(r.Recommendations, "Free up disk space or run cleanup: dbbackup cleanup")
|
||||
case check.Name == "Backup Gaps" && check.Status == StatusCritical:
|
||||
r.Recommendations = append(r.Recommendations, "Review backup schedule and cron configuration")
|
||||
case check.Name == "Orphaned Entries" && check.Status != StatusHealthy:
|
||||
r.Recommendations = append(r.Recommendations, "Clean orphaned entries: dbbackup catalog cleanup --orphaned")
|
||||
case check.Name == "Database Connectivity" && check.Status != StatusHealthy:
|
||||
r.Recommendations = append(r.Recommendations, "Check database connection settings in .dbbackup.conf")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Individual health checks
|
||||
|
||||
func checkConfiguration() HealthCheck {
|
||||
check := HealthCheck{
|
||||
Name: "Configuration",
|
||||
Status: StatusHealthy,
|
||||
}
|
||||
|
||||
if err := cfg.Validate(); err != nil {
|
||||
check.Status = StatusCritical
|
||||
check.Message = "Configuration invalid"
|
||||
check.Details = err.Error()
|
||||
return check
|
||||
}
|
||||
|
||||
check.Message = "Configuration valid"
|
||||
return check
|
||||
}
|
||||
|
||||
func checkDatabaseConnectivity(ctx context.Context) HealthCheck {
|
||||
check := HealthCheck{
|
||||
Name: "Database Connectivity",
|
||||
Status: StatusHealthy,
|
||||
}
|
||||
|
||||
db, err := database.New(cfg, log)
|
||||
if err != nil {
|
||||
check.Status = StatusCritical
|
||||
check.Message = "Failed to create database instance"
|
||||
check.Details = err.Error()
|
||||
return check
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
if err := db.Connect(ctx); err != nil {
|
||||
check.Status = StatusCritical
|
||||
check.Message = "Cannot connect to database"
|
||||
check.Details = err.Error()
|
||||
return check
|
||||
}
|
||||
|
||||
version, _ := db.GetVersion(ctx)
|
||||
check.Message = "Connected successfully"
|
||||
check.Details = version
|
||||
|
||||
return check
|
||||
}
|
||||
|
||||
func checkBackupDir() HealthCheck {
|
||||
check := HealthCheck{
|
||||
Name: "Backup Directory",
|
||||
Status: StatusHealthy,
|
||||
}
|
||||
|
||||
info, err := os.Stat(cfg.BackupDir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
check.Status = StatusWarning
|
||||
check.Message = "Backup directory does not exist"
|
||||
check.Details = cfg.BackupDir
|
||||
} else {
|
||||
check.Status = StatusCritical
|
||||
check.Message = "Cannot access backup directory"
|
||||
check.Details = err.Error()
|
||||
}
|
||||
return check
|
||||
}
|
||||
|
||||
if !info.IsDir() {
|
||||
check.Status = StatusCritical
|
||||
check.Message = "Backup path is not a directory"
|
||||
check.Details = cfg.BackupDir
|
||||
return check
|
||||
}
|
||||
|
||||
// Check writability
|
||||
testFile := filepath.Join(cfg.BackupDir, ".health_check_test")
|
||||
if err := os.WriteFile(testFile, []byte("test"), 0644); err != nil {
|
||||
check.Status = StatusCritical
|
||||
check.Message = "Backup directory is not writable"
|
||||
check.Details = err.Error()
|
||||
return check
|
||||
}
|
||||
os.Remove(testFile)
|
||||
|
||||
check.Message = "Backup directory accessible"
|
||||
check.Details = cfg.BackupDir
|
||||
|
||||
return check
|
||||
}
|
||||
|
||||
func checkCatalogIntegrity(ctx context.Context) (HealthCheck, *catalog.SQLiteCatalog) {
|
||||
check := HealthCheck{
|
||||
Name: "Catalog Integrity",
|
||||
Status: StatusHealthy,
|
||||
}
|
||||
|
||||
cat, err := openCatalog()
|
||||
if err != nil {
|
||||
check.Status = StatusWarning
|
||||
check.Message = "Catalog not available"
|
||||
check.Details = err.Error()
|
||||
return check, nil
|
||||
}
|
||||
|
||||
// Try a simple query to verify integrity
|
||||
stats, err := cat.Stats(ctx)
|
||||
if err != nil {
|
||||
check.Status = StatusCritical
|
||||
check.Message = "Catalog corrupted or inaccessible"
|
||||
check.Details = err.Error()
|
||||
cat.Close()
|
||||
return check, nil
|
||||
}
|
||||
|
||||
check.Message = fmt.Sprintf("Catalog healthy (%d backups tracked)", stats.TotalBackups)
|
||||
check.Details = fmt.Sprintf("Size: %s", stats.TotalSizeHuman)
|
||||
|
||||
return check, cat
|
||||
}
|
||||
|
||||
func checkBackupFreshness(ctx context.Context, cat *catalog.SQLiteCatalog, interval time.Duration) HealthCheck {
|
||||
check := HealthCheck{
|
||||
Name: "Backup Freshness",
|
||||
Status: StatusHealthy,
|
||||
}
|
||||
|
||||
stats, err := cat.Stats(ctx)
|
||||
if err != nil {
|
||||
check.Status = StatusWarning
|
||||
check.Message = "Cannot determine backup freshness"
|
||||
check.Details = err.Error()
|
||||
return check
|
||||
}
|
||||
|
||||
if stats.NewestBackup == nil {
|
||||
check.Status = StatusCritical
|
||||
check.Message = "No backups found in catalog"
|
||||
return check
|
||||
}
|
||||
|
||||
age := time.Since(*stats.NewestBackup)
|
||||
|
||||
if age > interval*3 {
|
||||
check.Status = StatusCritical
|
||||
check.Message = fmt.Sprintf("Last backup is %s old (critical)", formatDurationHealth(age))
|
||||
check.Details = stats.NewestBackup.Format("2006-01-02 15:04:05")
|
||||
} else if age > interval {
|
||||
check.Status = StatusWarning
|
||||
check.Message = fmt.Sprintf("Last backup is %s old", formatDurationHealth(age))
|
||||
check.Details = stats.NewestBackup.Format("2006-01-02 15:04:05")
|
||||
} else {
|
||||
check.Message = fmt.Sprintf("Last backup %s ago", formatDurationHealth(age))
|
||||
check.Details = stats.NewestBackup.Format("2006-01-02 15:04:05")
|
||||
}
|
||||
|
||||
return check
|
||||
}
|
||||
|
||||
func checkBackupGaps(ctx context.Context, cat *catalog.SQLiteCatalog, interval time.Duration) HealthCheck {
|
||||
check := HealthCheck{
|
||||
Name: "Backup Gaps",
|
||||
Status: StatusHealthy,
|
||||
}
|
||||
|
||||
config := &catalog.GapDetectionConfig{
|
||||
ExpectedInterval: interval,
|
||||
Tolerance: interval / 4,
|
||||
RPOThreshold: interval * 2,
|
||||
}
|
||||
|
||||
allGaps, err := cat.DetectAllGaps(ctx, config)
|
||||
if err != nil {
|
||||
check.Status = StatusWarning
|
||||
check.Message = "Gap detection failed"
|
||||
check.Details = err.Error()
|
||||
return check
|
||||
}
|
||||
|
||||
totalGaps := 0
|
||||
criticalGaps := 0
|
||||
for _, gaps := range allGaps {
|
||||
totalGaps += len(gaps)
|
||||
for _, gap := range gaps {
|
||||
if gap.Severity == catalog.SeverityCritical {
|
||||
criticalGaps++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if criticalGaps > 0 {
|
||||
check.Status = StatusCritical
|
||||
check.Message = fmt.Sprintf("%d critical gaps detected", criticalGaps)
|
||||
check.Details = fmt.Sprintf("%d total gaps across %d databases", totalGaps, len(allGaps))
|
||||
} else if totalGaps > 0 {
|
||||
check.Status = StatusWarning
|
||||
check.Message = fmt.Sprintf("%d gaps detected", totalGaps)
|
||||
check.Details = fmt.Sprintf("Across %d databases", len(allGaps))
|
||||
} else {
|
||||
check.Message = "No backup gaps detected"
|
||||
}
|
||||
|
||||
return check
|
||||
}
|
||||
|
||||
func checkVerificationStatus(ctx context.Context, cat *catalog.SQLiteCatalog) HealthCheck {
|
||||
check := HealthCheck{
|
||||
Name: "Verification Status",
|
||||
Status: StatusHealthy,
|
||||
}
|
||||
|
||||
stats, err := cat.Stats(ctx)
|
||||
if err != nil {
|
||||
check.Status = StatusWarning
|
||||
check.Message = "Cannot check verification status"
|
||||
return check
|
||||
}
|
||||
|
||||
if stats.TotalBackups == 0 {
|
||||
check.Message = "No backups to verify"
|
||||
return check
|
||||
}
|
||||
|
||||
verifiedPct := float64(stats.VerifiedCount) / float64(stats.TotalBackups) * 100
|
||||
|
||||
if verifiedPct < 25 {
|
||||
check.Status = StatusWarning
|
||||
check.Message = fmt.Sprintf("Only %.0f%% of backups verified", verifiedPct)
|
||||
check.Details = fmt.Sprintf("%d/%d verified", stats.VerifiedCount, stats.TotalBackups)
|
||||
} else {
|
||||
check.Message = fmt.Sprintf("%.0f%% of backups verified", verifiedPct)
|
||||
check.Details = fmt.Sprintf("%d/%d verified", stats.VerifiedCount, stats.TotalBackups)
|
||||
}
|
||||
|
||||
// Check drill testing status too
|
||||
if stats.DrillTestedCount > 0 {
|
||||
check.Details += fmt.Sprintf(", %d drill tested", stats.DrillTestedCount)
|
||||
}
|
||||
|
||||
return check
|
||||
}
|
||||
|
||||
func checkFileIntegrity(ctx context.Context, cat *catalog.SQLiteCatalog) HealthCheck {
|
||||
check := HealthCheck{
|
||||
Name: "File Integrity",
|
||||
Status: StatusHealthy,
|
||||
}
|
||||
|
||||
// Sample recent backups for file existence
|
||||
entries, err := cat.Search(ctx, &catalog.SearchQuery{
|
||||
Limit: 10,
|
||||
OrderBy: "created_at",
|
||||
OrderDesc: true,
|
||||
})
|
||||
if err != nil || len(entries) == 0 {
|
||||
check.Message = "No backups to check"
|
||||
return check
|
||||
}
|
||||
|
||||
missingCount := 0
|
||||
checksumMismatch := 0
|
||||
|
||||
for _, entry := range entries {
|
||||
// Skip cloud backups
|
||||
if entry.CloudLocation != "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check file exists
|
||||
info, err := os.Stat(entry.BackupPath)
|
||||
if err != nil {
|
||||
missingCount++
|
||||
continue
|
||||
}
|
||||
|
||||
// Quick size check
|
||||
if info.Size() != entry.SizeBytes {
|
||||
checksumMismatch++
|
||||
}
|
||||
}
|
||||
|
||||
totalChecked := len(entries)
|
||||
|
||||
if missingCount > 0 {
|
||||
check.Status = StatusCritical
|
||||
check.Message = fmt.Sprintf("%d/%d backup files missing", missingCount, totalChecked)
|
||||
} else if checksumMismatch > 0 {
|
||||
check.Status = StatusWarning
|
||||
check.Message = fmt.Sprintf("%d/%d backups have size mismatch", checksumMismatch, totalChecked)
|
||||
} else {
|
||||
check.Message = fmt.Sprintf("Sampled %d recent backups - all present", totalChecked)
|
||||
}
|
||||
|
||||
return check
|
||||
}
|
||||
|
||||
func checkOrphanedEntries(ctx context.Context, cat *catalog.SQLiteCatalog) HealthCheck {
|
||||
check := HealthCheck{
|
||||
Name: "Orphaned Entries",
|
||||
Status: StatusHealthy,
|
||||
}
|
||||
|
||||
// Check for catalog entries pointing to missing files
|
||||
entries, err := cat.Search(ctx, &catalog.SearchQuery{
|
||||
Limit: 50,
|
||||
OrderBy: "created_at",
|
||||
OrderDesc: true,
|
||||
})
|
||||
if err != nil {
|
||||
check.Message = "Cannot check for orphaned entries"
|
||||
return check
|
||||
}
|
||||
|
||||
orphanCount := 0
|
||||
for _, entry := range entries {
|
||||
if entry.CloudLocation != "" {
|
||||
continue // Skip cloud backups
|
||||
}
|
||||
if _, err := os.Stat(entry.BackupPath); os.IsNotExist(err) {
|
||||
orphanCount++
|
||||
}
|
||||
}
|
||||
|
||||
if orphanCount > 0 {
|
||||
check.Status = StatusWarning
|
||||
check.Message = fmt.Sprintf("%d orphaned catalog entries", orphanCount)
|
||||
check.Details = "Files deleted but entries remain in catalog"
|
||||
} else {
|
||||
check.Message = "No orphaned entries detected"
|
||||
}
|
||||
|
||||
return check
|
||||
}
|
||||
|
||||
func checkDiskSpace() HealthCheck {
|
||||
check := HealthCheck{
|
||||
Name: "Disk Space",
|
||||
Status: StatusHealthy,
|
||||
}
|
||||
|
||||
// Simple approach: check if we can write a test file
|
||||
testPath := filepath.Join(cfg.BackupDir, ".space_check")
|
||||
|
||||
// Create a 1MB test to ensure we have space
|
||||
testData := make([]byte, 1024*1024)
|
||||
if err := os.WriteFile(testPath, testData, 0644); err != nil {
|
||||
check.Status = StatusCritical
|
||||
check.Message = "Insufficient disk space or write error"
|
||||
check.Details = err.Error()
|
||||
return check
|
||||
}
|
||||
os.Remove(testPath)
|
||||
|
||||
// Try to get actual free space (Linux-specific)
|
||||
info, err := os.Stat(cfg.BackupDir)
|
||||
if err == nil && info.IsDir() {
|
||||
// Walk the backup directory to get size
|
||||
var totalSize int64
|
||||
filepath.Walk(cfg.BackupDir, func(path string, info os.FileInfo, err error) error {
|
||||
if err == nil && !info.IsDir() {
|
||||
totalSize += info.Size()
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
check.Message = "Disk space available"
|
||||
check.Details = fmt.Sprintf("Backup directory using %s", formatBytesHealth(totalSize))
|
||||
} else {
|
||||
check.Message = "Disk space available"
|
||||
}
|
||||
|
||||
return check
|
||||
}
|
||||
|
||||
// Output functions
|
||||
|
||||
func outputHealthTable(report *HealthReport) {
|
||||
fmt.Println()
|
||||
|
||||
statusIcon := "✅"
|
||||
statusColor := "\033[32m" // green
|
||||
if report.Status == StatusWarning {
|
||||
statusIcon = "⚠️"
|
||||
statusColor = "\033[33m" // yellow
|
||||
} else if report.Status == StatusCritical {
|
||||
statusIcon = "🚨"
|
||||
statusColor = "\033[31m" // red
|
||||
}
|
||||
|
||||
fmt.Println("═══════════════════════════════════════════════════════════════")
|
||||
fmt.Printf(" %s Backup Health Check\n", statusIcon)
|
||||
fmt.Println("═══════════════════════════════════════════════════════════════")
|
||||
fmt.Println()
|
||||
|
||||
fmt.Printf("Status: %s%s\033[0m\n", statusColor, strings.ToUpper(string(report.Status)))
|
||||
fmt.Printf("Time: %s\n", report.Timestamp.Format("2006-01-02 15:04:05"))
|
||||
fmt.Println()
|
||||
|
||||
fmt.Println("───────────────────────────────────────────────────────────────")
|
||||
fmt.Println("CHECKS")
|
||||
fmt.Println("───────────────────────────────────────────────────────────────")
|
||||
|
||||
for _, check := range report.Checks {
|
||||
icon := "✓"
|
||||
color := "\033[32m"
|
||||
if check.Status == StatusWarning {
|
||||
icon = "!"
|
||||
color = "\033[33m"
|
||||
} else if check.Status == StatusCritical {
|
||||
icon = "✗"
|
||||
color = "\033[31m"
|
||||
}
|
||||
|
||||
fmt.Printf("%s[%s]\033[0m %-22s %s\n", color, icon, check.Name, check.Message)
|
||||
|
||||
if healthVerbose && check.Details != "" {
|
||||
fmt.Printf(" └─ %s\n", check.Details)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
fmt.Println("───────────────────────────────────────────────────────────────")
|
||||
fmt.Printf("Summary: %s\n", report.Summary)
|
||||
fmt.Println("───────────────────────────────────────────────────────────────")
|
||||
|
||||
if len(report.Recommendations) > 0 {
|
||||
fmt.Println()
|
||||
fmt.Println("RECOMMENDATIONS")
|
||||
for _, rec := range report.Recommendations {
|
||||
fmt.Printf(" → %s\n", rec)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
func outputHealthJSON(report *HealthReport) error {
|
||||
data, err := json.MarshalIndent(report, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Println(string(data))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Helpers
|
||||
|
||||
func formatDurationHealth(d time.Duration) string {
|
||||
if d < time.Minute {
|
||||
return fmt.Sprintf("%.0fs", d.Seconds())
|
||||
}
|
||||
if d < time.Hour {
|
||||
return fmt.Sprintf("%.0fm", d.Minutes())
|
||||
}
|
||||
hours := int(d.Hours())
|
||||
if hours < 24 {
|
||||
return fmt.Sprintf("%dh", hours)
|
||||
}
|
||||
days := hours / 24
|
||||
return fmt.Sprintf("%dd %dh", days, hours%24)
|
||||
}
|
||||
|
||||
func formatBytesHealth(bytes int64) string {
|
||||
const unit = 1024
|
||||
if bytes < unit {
|
||||
return fmt.Sprintf("%d B", bytes)
|
||||
}
|
||||
div, exp := int64(unit), 0
|
||||
for n := bytes / unit; n >= unit; n /= unit {
|
||||
div *= unit
|
||||
exp++
|
||||
}
|
||||
return fmt.Sprintf("%.1f %cB", float64(bytes)/float64(div), "KMGTPE"[exp])
|
||||
}
|
||||
@ -5,19 +5,17 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
|
||||
"dbbackup/internal/catalog"
|
||||
"dbbackup/internal/prometheus"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
var (
|
||||
metricsServer string
|
||||
metricsOutput string
|
||||
metricsPort int
|
||||
metricsInstance string
|
||||
metricsOutput string
|
||||
metricsPort int
|
||||
)
|
||||
|
||||
// metricsCmd represents the metrics command
|
||||
@ -47,7 +45,7 @@ Examples:
|
||||
dbbackup metrics export --output /var/lib/dbbackup/metrics/dbbackup.prom
|
||||
|
||||
# Export for specific instance
|
||||
dbbackup metrics export --server production --output /var/lib/dbbackup/metrics/production.prom
|
||||
dbbackup metrics export --instance production --output /var/lib/dbbackup/metrics/production.prom
|
||||
|
||||
After export, configure node_exporter with:
|
||||
--collector.textfile.directory=/var/lib/dbbackup/metrics/
|
||||
@ -86,56 +84,37 @@ Endpoints:
|
||||
},
|
||||
}
|
||||
|
||||
var metricsCatalogDB string
|
||||
|
||||
func init() {
|
||||
rootCmd.AddCommand(metricsCmd)
|
||||
metricsCmd.AddCommand(metricsExportCmd)
|
||||
metricsCmd.AddCommand(metricsServeCmd)
|
||||
|
||||
// Default catalog path (same as catalog command)
|
||||
home, _ := os.UserHomeDir()
|
||||
defaultCatalogPath := filepath.Join(home, ".dbbackup", "catalog.db")
|
||||
|
||||
// Export flags
|
||||
metricsExportCmd.Flags().StringVar(&metricsServer, "server", "", "Server name for metrics labels (default: hostname)")
|
||||
metricsExportCmd.Flags().StringVar(&metricsInstance, "instance", "default", "Instance name for metrics labels")
|
||||
metricsExportCmd.Flags().StringVarP(&metricsOutput, "output", "o", "/var/lib/dbbackup/metrics/dbbackup.prom", "Output file path")
|
||||
metricsExportCmd.Flags().StringVar(&metricsCatalogDB, "catalog-db", defaultCatalogPath, "Path to catalog SQLite database")
|
||||
|
||||
// Serve flags
|
||||
metricsServeCmd.Flags().StringVar(&metricsServer, "server", "", "Server name for metrics labels (default: hostname)")
|
||||
metricsServeCmd.Flags().StringVar(&metricsInstance, "instance", "default", "Instance name for metrics labels")
|
||||
metricsServeCmd.Flags().IntVarP(&metricsPort, "port", "p", 9399, "HTTP server port")
|
||||
metricsServeCmd.Flags().StringVar(&metricsCatalogDB, "catalog-db", defaultCatalogPath, "Path to catalog SQLite database")
|
||||
}
|
||||
|
||||
func runMetricsExport(ctx context.Context) error {
|
||||
// Auto-detect hostname if server not specified
|
||||
server := metricsServer
|
||||
if server == "" {
|
||||
hostname, err := os.Hostname()
|
||||
if err != nil {
|
||||
server = "unknown"
|
||||
} else {
|
||||
server = hostname
|
||||
}
|
||||
}
|
||||
|
||||
// Open catalog using specified path
|
||||
cat, err := catalog.NewSQLiteCatalog(metricsCatalogDB)
|
||||
// Open catalog
|
||||
cat, err := openCatalog()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open catalog: %w", err)
|
||||
}
|
||||
defer cat.Close()
|
||||
|
||||
// Create metrics writer with version info
|
||||
writer := prometheus.NewMetricsWriterWithVersion(log, cat, server, cfg.Version, cfg.GitCommit)
|
||||
// Create metrics writer
|
||||
writer := prometheus.NewMetricsWriter(log, cat, metricsInstance)
|
||||
|
||||
// Write textfile
|
||||
if err := writer.WriteTextfile(metricsOutput); err != nil {
|
||||
return fmt.Errorf("failed to write metrics: %w", err)
|
||||
}
|
||||
|
||||
log.Info("Exported metrics to textfile", "path", metricsOutput, "server", server)
|
||||
log.Info("Exported metrics to textfile", "path", metricsOutput, "instance", metricsInstance)
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -144,26 +123,15 @@ func runMetricsServe(ctx context.Context) error {
|
||||
ctx, cancel := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM)
|
||||
defer cancel()
|
||||
|
||||
// Auto-detect hostname if server not specified
|
||||
server := metricsServer
|
||||
if server == "" {
|
||||
hostname, err := os.Hostname()
|
||||
if err != nil {
|
||||
server = "unknown"
|
||||
} else {
|
||||
server = hostname
|
||||
}
|
||||
}
|
||||
|
||||
// Open catalog using specified path
|
||||
cat, err := catalog.NewSQLiteCatalog(metricsCatalogDB)
|
||||
// Open catalog
|
||||
cat, err := openCatalog()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open catalog: %w", err)
|
||||
}
|
||||
defer cat.Close()
|
||||
|
||||
// Create exporter with version info
|
||||
exporter := prometheus.NewExporterWithVersion(log, cat, server, metricsPort, cfg.Version, cfg.GitCommit)
|
||||
// Create exporter
|
||||
exporter := prometheus.NewExporter(log, cat, metricsInstance, metricsPort)
|
||||
|
||||
// Run server (blocks until context is cancelled)
|
||||
return exporter.Serve(ctx)
|
||||
|
||||
13
cmd/pitr.go
13
cmd/pitr.go
@ -5,6 +5,7 @@ import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
@ -43,6 +44,7 @@ var (
|
||||
mysqlArchiveInterval string
|
||||
mysqlRequireRowFormat bool
|
||||
mysqlRequireGTID bool
|
||||
mysqlWatchMode bool
|
||||
)
|
||||
|
||||
// pitrCmd represents the pitr command group
|
||||
@ -1309,3 +1311,14 @@ func runMySQLPITREnable(cmd *cobra.Command, args []string) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// getMySQLBinlogDir attempts to determine the binlog directory from MySQL
|
||||
func getMySQLBinlogDir(ctx context.Context, db *sql.DB) (string, error) {
|
||||
var logBinBasename string
|
||||
err := db.QueryRowContext(ctx, "SELECT @@log_bin_basename").Scan(&logBinBasename)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return filepath.Dir(logBinBasename), nil
|
||||
}
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
@ -14,7 +15,6 @@ import (
|
||||
"dbbackup/internal/logger"
|
||||
"dbbackup/internal/tui"
|
||||
|
||||
"github.com/klauspost/pgzip"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
@ -66,21 +66,14 @@ TUI Automation Flags (for testing and CI/CD):
|
||||
cfg.TUIVerbose, _ = cmd.Flags().GetBool("verbose-tui")
|
||||
cfg.TUILogFile, _ = cmd.Flags().GetString("tui-log-file")
|
||||
|
||||
// FIXED: Only set default profile if user hasn't configured one
|
||||
// Previously this forced conservative mode, ignoring user's saved settings
|
||||
if cfg.ResourceProfile == "" {
|
||||
// No profile configured at all - use balanced as sensible default
|
||||
cfg.ResourceProfile = "balanced"
|
||||
// Set conservative profile as default for TUI mode (safer for interactive users)
|
||||
if cfg.ResourceProfile == "" || cfg.ResourceProfile == "balanced" {
|
||||
cfg.ResourceProfile = "conservative"
|
||||
cfg.LargeDBMode = true
|
||||
if cfg.Debug {
|
||||
log.Info("TUI mode: no profile configured, using 'balanced' default")
|
||||
}
|
||||
} else {
|
||||
// User has a configured profile - RESPECT IT!
|
||||
if cfg.Debug {
|
||||
log.Info("TUI mode: respecting user-configured profile", "profile", cfg.ResourceProfile)
|
||||
log.Info("TUI mode: using conservative profile by default")
|
||||
}
|
||||
}
|
||||
// Note: LargeDBMode is no longer forced - user controls it via settings
|
||||
|
||||
// Check authentication before starting TUI
|
||||
if cfg.IsPostgreSQL() {
|
||||
@ -281,7 +274,7 @@ func runPreflight(ctx context.Context) error {
|
||||
|
||||
// 4. Disk space check
|
||||
fmt.Print("[4] Available disk space... ")
|
||||
if err := checkPreflightDiskSpace(); err != nil {
|
||||
if err := checkDiskSpace(); err != nil {
|
||||
fmt.Printf("[FAIL] FAILED: %v\n", err)
|
||||
} else {
|
||||
fmt.Println("[OK] PASSED")
|
||||
@ -361,7 +354,7 @@ func checkBackupDirectory() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func checkPreflightDiskSpace() error {
|
||||
func checkDiskSpace() error {
|
||||
// Basic disk space check - this is a simplified version
|
||||
// In a real implementation, you'd use syscall.Statfs or similar
|
||||
if _, err := os.Stat(cfg.BackupDir); os.IsNotExist(err) {
|
||||
@ -398,6 +391,92 @@ func checkSystemResources() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// runRestore restores database from backup archive
|
||||
func runRestore(ctx context.Context, archiveName string) error {
|
||||
fmt.Println("==============================================================")
|
||||
fmt.Println(" Database Restore")
|
||||
fmt.Println("==============================================================")
|
||||
|
||||
// Construct full path to archive
|
||||
archivePath := filepath.Join(cfg.BackupDir, archiveName)
|
||||
|
||||
// Check if archive exists
|
||||
if _, err := os.Stat(archivePath); os.IsNotExist(err) {
|
||||
return fmt.Errorf("backup archive not found: %s", archivePath)
|
||||
}
|
||||
|
||||
// Detect archive type
|
||||
archiveType := detectArchiveType(archiveName)
|
||||
fmt.Printf("Archive: %s\n", archiveName)
|
||||
fmt.Printf("Type: %s\n", archiveType)
|
||||
fmt.Printf("Location: %s\n", archivePath)
|
||||
fmt.Println()
|
||||
|
||||
// Get archive info
|
||||
stat, err := os.Stat(archivePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot access archive: %w", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Size: %s\n", formatFileSize(stat.Size()))
|
||||
fmt.Printf("Created: %s\n", stat.ModTime().Format("2006-01-02 15:04:05"))
|
||||
fmt.Println()
|
||||
|
||||
// Show warning
|
||||
fmt.Println("[WARN] WARNING: This will restore data to the target database.")
|
||||
fmt.Println(" Existing data may be overwritten or merged depending on the restore method.")
|
||||
fmt.Println()
|
||||
|
||||
// For safety, show what would be done without actually doing it
|
||||
switch archiveType {
|
||||
case "Single Database (.dump)":
|
||||
fmt.Println("[EXEC] Would execute: pg_restore to restore single database")
|
||||
fmt.Printf(" Command: pg_restore -h %s -p %d -U %s -d %s --verbose %s\n",
|
||||
cfg.Host, cfg.Port, cfg.User, cfg.Database, archivePath)
|
||||
case "Single Database (.dump.gz)":
|
||||
fmt.Println("[EXEC] Would execute: gunzip and pg_restore to restore single database")
|
||||
fmt.Printf(" Command: gunzip -c %s | pg_restore -h %s -p %d -U %s -d %s --verbose\n",
|
||||
archivePath, cfg.Host, cfg.Port, cfg.User, cfg.Database)
|
||||
case "SQL Script (.sql)":
|
||||
if cfg.IsPostgreSQL() {
|
||||
fmt.Println("[EXEC] Would execute: psql to run SQL script")
|
||||
fmt.Printf(" Command: psql -h %s -p %d -U %s -d %s -f %s\n",
|
||||
cfg.Host, cfg.Port, cfg.User, cfg.Database, archivePath)
|
||||
} else if cfg.IsMySQL() {
|
||||
fmt.Println("[EXEC] Would execute: mysql to run SQL script")
|
||||
fmt.Printf(" Command: %s\n", mysqlRestoreCommand(archivePath, false))
|
||||
} else {
|
||||
fmt.Println("[EXEC] Would execute: SQL client to run script (database type unknown)")
|
||||
}
|
||||
case "SQL Script (.sql.gz)":
|
||||
if cfg.IsPostgreSQL() {
|
||||
fmt.Println("[EXEC] Would execute: gunzip and psql to run SQL script")
|
||||
fmt.Printf(" Command: gunzip -c %s | psql -h %s -p %d -U %s -d %s\n",
|
||||
archivePath, cfg.Host, cfg.Port, cfg.User, cfg.Database)
|
||||
} else if cfg.IsMySQL() {
|
||||
fmt.Println("[EXEC] Would execute: gunzip and mysql to run SQL script")
|
||||
fmt.Printf(" Command: %s\n", mysqlRestoreCommand(archivePath, true))
|
||||
} else {
|
||||
fmt.Println("[EXEC] Would execute: gunzip and SQL client to run script (database type unknown)")
|
||||
}
|
||||
case "Cluster Backup (.tar.gz)":
|
||||
fmt.Println("[EXEC] Would execute: Extract and restore cluster backup")
|
||||
fmt.Println(" Steps:")
|
||||
fmt.Println(" 1. Extract tar.gz archive")
|
||||
fmt.Println(" 2. Restore global objects (roles, tablespaces)")
|
||||
fmt.Println(" 3. Restore individual databases")
|
||||
default:
|
||||
return fmt.Errorf("unsupported archive type: %s", archiveType)
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
fmt.Println("[SAFETY] SAFETY MODE: Restore command is in preview mode.")
|
||||
fmt.Println(" This shows what would be executed without making changes.")
|
||||
fmt.Println(" To enable actual restore, add --confirm flag (not yet implemented).")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func detectArchiveType(filename string) string {
|
||||
switch {
|
||||
case strings.HasSuffix(filename, ".dump.gz"):
|
||||
@ -579,7 +658,7 @@ func verifyPgDumpGzip(path string) error {
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
gz, err := pgzip.NewReader(file)
|
||||
gz, err := gzip.NewReader(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open gzip stream: %w", err)
|
||||
}
|
||||
@ -628,7 +707,7 @@ func verifyGzipSqlScript(path string) error {
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
gz, err := pgzip.NewReader(file)
|
||||
gz, err := gzip.NewReader(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open gzip stream: %w", err)
|
||||
}
|
||||
@ -696,3 +775,33 @@ func containsSQLKeywords(content string) bool {
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func mysqlRestoreCommand(archivePath string, compressed bool) string {
|
||||
parts := []string{"mysql"}
|
||||
|
||||
// Only add -h flag if host is not localhost (to use Unix socket)
|
||||
if cfg.Host != "localhost" && cfg.Host != "127.0.0.1" && cfg.Host != "" {
|
||||
parts = append(parts, "-h", cfg.Host)
|
||||
}
|
||||
|
||||
parts = append(parts,
|
||||
"-P", fmt.Sprintf("%d", cfg.Port),
|
||||
"-u", cfg.User,
|
||||
)
|
||||
|
||||
if cfg.Password != "" {
|
||||
parts = append(parts, fmt.Sprintf("-p'%s'", cfg.Password))
|
||||
}
|
||||
|
||||
if cfg.Database != "" {
|
||||
parts = append(parts, cfg.Database)
|
||||
}
|
||||
|
||||
command := strings.Join(parts, " ")
|
||||
|
||||
if compressed {
|
||||
return fmt.Sprintf("gunzip -c %s | %s", archivePath, command)
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%s < %s", command, archivePath)
|
||||
}
|
||||
|
||||
@ -15,7 +15,6 @@ import (
|
||||
"dbbackup/internal/cloud"
|
||||
"dbbackup/internal/config"
|
||||
"dbbackup/internal/database"
|
||||
"dbbackup/internal/notify"
|
||||
"dbbackup/internal/pitr"
|
||||
"dbbackup/internal/progress"
|
||||
"dbbackup/internal/restore"
|
||||
@ -25,24 +24,22 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
restoreConfirm bool
|
||||
restoreDryRun bool
|
||||
restoreForce bool
|
||||
restoreClean bool
|
||||
restoreCreate bool
|
||||
restoreJobs int
|
||||
restoreParallelDBs int // Number of parallel database restores
|
||||
restoreProfile string // Resource profile: conservative, balanced, aggressive
|
||||
restoreTarget string
|
||||
restoreVerbose bool
|
||||
restoreNoProgress bool
|
||||
restoreWorkdir string
|
||||
restoreCleanCluster bool
|
||||
restoreDiagnose bool // Run diagnosis before restore
|
||||
restoreSaveDebugLog string // Path to save debug log on failure
|
||||
restoreDebugLocks bool // Enable detailed lock debugging
|
||||
restoreOOMProtection bool // Enable OOM protection for large restores
|
||||
restoreLowMemory bool // Force low-memory mode for constrained systems
|
||||
restoreConfirm bool
|
||||
restoreDryRun bool
|
||||
restoreForce bool
|
||||
restoreClean bool
|
||||
restoreCreate bool
|
||||
restoreJobs int
|
||||
restoreParallelDBs int // Number of parallel database restores
|
||||
restoreProfile string // Resource profile: conservative, balanced, aggressive
|
||||
restoreTarget string
|
||||
restoreVerbose bool
|
||||
restoreNoProgress bool
|
||||
restoreWorkdir string
|
||||
restoreCleanCluster bool
|
||||
restoreDiagnose bool // Run diagnosis before restore
|
||||
restoreSaveDebugLog string // Path to save debug log on failure
|
||||
restoreDebugLocks bool // Enable detailed lock debugging
|
||||
|
||||
// Single database extraction from cluster flags
|
||||
restoreDatabase string // Single database to extract/restore from cluster
|
||||
@ -282,7 +279,7 @@ Use this when:
|
||||
Checks performed:
|
||||
- File format detection (custom dump vs SQL)
|
||||
- PGDMP signature verification
|
||||
- Compression integrity validation (pgzip)
|
||||
- Gzip integrity validation
|
||||
- COPY block termination check
|
||||
- pg_restore --list verification
|
||||
- Cluster archive structure validation
|
||||
@ -350,8 +347,6 @@ func init() {
|
||||
restoreClusterCmd.Flags().BoolVar(&restoreDebugLocks, "debug-locks", false, "Enable detailed lock debugging (captures PostgreSQL config, Guard decisions, boost attempts)")
|
||||
restoreClusterCmd.Flags().BoolVar(&restoreClean, "clean", false, "Drop and recreate target database (for single DB restore)")
|
||||
restoreClusterCmd.Flags().BoolVar(&restoreCreate, "create", false, "Create target database if it doesn't exist (for single DB restore)")
|
||||
restoreClusterCmd.Flags().BoolVar(&restoreOOMProtection, "oom-protection", false, "Enable OOM protection: disable swap, tune PostgreSQL memory, protect from OOM killer")
|
||||
restoreClusterCmd.Flags().BoolVar(&restoreLowMemory, "low-memory", false, "Force low-memory mode: single-threaded restore with minimal memory (use for <8GB RAM or very large backups)")
|
||||
|
||||
// PITR restore flags
|
||||
restorePITRCmd.Flags().StringVar(&pitrBaseBackup, "base-backup", "", "Path to base backup file (.tar.gz) (required)")
|
||||
@ -697,36 +692,14 @@ func runRestoreSingle(cmd *cobra.Command, args []string) error {
|
||||
startTime := time.Now()
|
||||
auditLogger.LogRestoreStart(user, targetDB, archivePath)
|
||||
|
||||
// Notify: restore started
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventRestoreStarted, notify.SeverityInfo, "Database restore started").
|
||||
WithDatabase(targetDB).
|
||||
WithDetail("archive", filepath.Base(archivePath)))
|
||||
}
|
||||
|
||||
if err := engine.RestoreSingle(ctx, archivePath, targetDB, restoreClean, restoreCreate); err != nil {
|
||||
auditLogger.LogRestoreFailed(user, targetDB, err)
|
||||
// Notify: restore failed
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventRestoreFailed, notify.SeverityError, "Database restore failed").
|
||||
WithDatabase(targetDB).
|
||||
WithError(err).
|
||||
WithDuration(time.Since(startTime)))
|
||||
}
|
||||
return fmt.Errorf("restore failed: %w", err)
|
||||
}
|
||||
|
||||
// Audit log: restore success
|
||||
auditLogger.LogRestoreComplete(user, targetDB, time.Since(startTime))
|
||||
|
||||
// Notify: restore completed
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventRestoreCompleted, notify.SeveritySuccess, "Database restore completed successfully").
|
||||
WithDatabase(targetDB).
|
||||
WithDuration(time.Since(startTime)).
|
||||
WithDetail("archive", filepath.Base(archivePath)))
|
||||
}
|
||||
|
||||
log.Info("[OK] Restore completed successfully", "database", targetDB)
|
||||
return nil
|
||||
}
|
||||
@ -1218,36 +1191,15 @@ func runFullClusterRestore(archivePath string) error {
|
||||
startTime := time.Now()
|
||||
auditLogger.LogRestoreStart(user, "all_databases", archivePath)
|
||||
|
||||
// Notify: restore started
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventRestoreStarted, notify.SeverityInfo, "Cluster restore started").
|
||||
WithDatabase("all_databases").
|
||||
WithDetail("archive", filepath.Base(archivePath)))
|
||||
}
|
||||
|
||||
// Pass pre-extracted directory to avoid double extraction
|
||||
if err := engine.RestoreCluster(ctx, archivePath, extractedDir); err != nil {
|
||||
auditLogger.LogRestoreFailed(user, "all_databases", err)
|
||||
// Notify: restore failed
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventRestoreFailed, notify.SeverityError, "Cluster restore failed").
|
||||
WithDatabase("all_databases").
|
||||
WithError(err).
|
||||
WithDuration(time.Since(startTime)))
|
||||
}
|
||||
return fmt.Errorf("cluster restore failed: %w", err)
|
||||
}
|
||||
|
||||
// Audit log: restore success
|
||||
auditLogger.LogRestoreComplete(user, "all_databases", time.Since(startTime))
|
||||
|
||||
// Notify: restore completed
|
||||
if notifyManager != nil {
|
||||
notifyManager.Notify(notify.NewEvent(notify.EventRestoreCompleted, notify.SeveritySuccess, "Cluster restore completed successfully").
|
||||
WithDatabase("all_databases").
|
||||
WithDuration(time.Since(startTime)))
|
||||
}
|
||||
|
||||
log.Info("[OK] Cluster restore completed successfully")
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -1,328 +0,0 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/dustin/go-humanize"
|
||||
"github.com/spf13/cobra"
|
||||
|
||||
"dbbackup/internal/restore"
|
||||
)
|
||||
|
||||
var (
|
||||
previewCompareSchema bool
|
||||
previewEstimate bool
|
||||
)
|
||||
|
||||
var restorePreviewCmd = &cobra.Command{
|
||||
Use: "preview [archive-file]",
|
||||
Short: "Preview backup contents before restoring",
|
||||
Long: `Show detailed information about what a backup contains before actually restoring it.
|
||||
|
||||
This command analyzes backup archives and provides:
|
||||
- Database name, version, and size information
|
||||
- Table count and largest tables
|
||||
- Estimated restore time based on system resources
|
||||
- Required disk space
|
||||
- Schema comparison with current database (optional)
|
||||
- Resource recommendations
|
||||
|
||||
Use this to:
|
||||
- See what you'll get before committing to a long restore
|
||||
- Estimate restore time and resource requirements
|
||||
- Identify schema changes since backup was created
|
||||
- Verify backup contains expected data
|
||||
|
||||
Examples:
|
||||
# Preview a backup
|
||||
dbbackup restore preview mydb.dump.gz
|
||||
|
||||
# Preview with restore time estimation
|
||||
dbbackup restore preview mydb.dump.gz --estimate
|
||||
|
||||
# Preview with schema comparison to current database
|
||||
dbbackup restore preview mydb.dump.gz --compare-schema
|
||||
|
||||
# Preview cluster backup
|
||||
dbbackup restore preview cluster_backup.tar.gz
|
||||
`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: runRestorePreview,
|
||||
}
|
||||
|
||||
func init() {
|
||||
restoreCmd.AddCommand(restorePreviewCmd)
|
||||
|
||||
restorePreviewCmd.Flags().BoolVar(&previewCompareSchema, "compare-schema", false, "Compare backup schema with current database")
|
||||
restorePreviewCmd.Flags().BoolVar(&previewEstimate, "estimate", true, "Estimate restore time and resource requirements")
|
||||
restorePreviewCmd.Flags().BoolVar(&restoreVerbose, "verbose", false, "Show detailed analysis")
|
||||
}
|
||||
|
||||
func runRestorePreview(cmd *cobra.Command, args []string) error {
|
||||
archivePath := args[0]
|
||||
|
||||
// Convert to absolute path
|
||||
if !filepath.IsAbs(archivePath) {
|
||||
absPath, err := filepath.Abs(archivePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid archive path: %w", err)
|
||||
}
|
||||
archivePath = absPath
|
||||
}
|
||||
|
||||
// Check if file exists
|
||||
stat, err := os.Stat(archivePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("archive not found: %s", archivePath)
|
||||
}
|
||||
|
||||
fmt.Printf("\n%s\n", strings.Repeat("=", 70))
|
||||
fmt.Printf("BACKUP PREVIEW: %s\n", filepath.Base(archivePath))
|
||||
fmt.Printf("%s\n\n", strings.Repeat("=", 70))
|
||||
|
||||
// Get file info
|
||||
fileSize := stat.Size()
|
||||
fmt.Printf("File Information:\n")
|
||||
fmt.Printf(" Path: %s\n", archivePath)
|
||||
fmt.Printf(" Size: %s (%d bytes)\n", humanize.Bytes(uint64(fileSize)), fileSize)
|
||||
fmt.Printf(" Modified: %s\n", stat.ModTime().Format("2006-01-02 15:04:05"))
|
||||
fmt.Printf(" Age: %s\n", humanize.Time(stat.ModTime()))
|
||||
fmt.Println()
|
||||
|
||||
// Detect format
|
||||
format := restore.DetectArchiveFormat(archivePath)
|
||||
fmt.Printf("Format Detection:\n")
|
||||
fmt.Printf(" Type: %s\n", format.String())
|
||||
|
||||
if format.IsCompressed() {
|
||||
fmt.Printf(" Compressed: Yes\n")
|
||||
} else {
|
||||
fmt.Printf(" Compressed: No\n")
|
||||
}
|
||||
fmt.Println()
|
||||
|
||||
// Run diagnosis
|
||||
diagnoser := restore.NewDiagnoser(log, restoreVerbose)
|
||||
result, err := diagnoser.DiagnoseFile(archivePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to analyze backup: %w", err)
|
||||
}
|
||||
|
||||
// Database information
|
||||
fmt.Printf("Database Information:\n")
|
||||
|
||||
if format.IsClusterBackup() {
|
||||
// For cluster backups, extract database list
|
||||
fmt.Printf(" Type: Cluster Backup (multiple databases)\n")
|
||||
|
||||
// Try to list databases
|
||||
if dbList, err := listDatabasesInCluster(archivePath); err == nil && len(dbList) > 0 {
|
||||
fmt.Printf(" Databases: %d\n", len(dbList))
|
||||
fmt.Printf("\n Database List:\n")
|
||||
for _, db := range dbList {
|
||||
fmt.Printf(" - %s\n", db)
|
||||
}
|
||||
} else {
|
||||
fmt.Printf(" Databases: Multiple (use --list-databases to see all)\n")
|
||||
}
|
||||
} else {
|
||||
// Single database backup
|
||||
dbName := extractDatabaseName(archivePath, result)
|
||||
fmt.Printf(" Database: %s\n", dbName)
|
||||
|
||||
if result.Details != nil && result.Details.TableCount > 0 {
|
||||
fmt.Printf(" Tables: %d\n", result.Details.TableCount)
|
||||
|
||||
if len(result.Details.TableList) > 0 {
|
||||
fmt.Printf("\n Largest Tables (top 5):\n")
|
||||
displayCount := 5
|
||||
if len(result.Details.TableList) < displayCount {
|
||||
displayCount = len(result.Details.TableList)
|
||||
}
|
||||
for i := 0; i < displayCount; i++ {
|
||||
fmt.Printf(" - %s\n", result.Details.TableList[i])
|
||||
}
|
||||
if len(result.Details.TableList) > 5 {
|
||||
fmt.Printf(" ... and %d more\n", len(result.Details.TableList)-5)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
fmt.Println()
|
||||
|
||||
// Size estimation
|
||||
if result.Details != nil && result.Details.ExpandedSize > 0 {
|
||||
fmt.Printf("Size Estimates:\n")
|
||||
fmt.Printf(" Compressed: %s\n", humanize.Bytes(uint64(fileSize)))
|
||||
fmt.Printf(" Uncompressed: %s\n", humanize.Bytes(uint64(result.Details.ExpandedSize)))
|
||||
|
||||
if result.Details.CompressionRatio > 0 {
|
||||
fmt.Printf(" Ratio: %.1f%% (%.2fx compression)\n",
|
||||
result.Details.CompressionRatio*100,
|
||||
float64(result.Details.ExpandedSize)/float64(fileSize))
|
||||
}
|
||||
|
||||
// Estimate disk space needed (uncompressed + indexes + temp space)
|
||||
estimatedDisk := int64(float64(result.Details.ExpandedSize) * 1.5) // 1.5x for indexes and temp
|
||||
fmt.Printf(" Disk needed: %s (including indexes and temporary space)\n",
|
||||
humanize.Bytes(uint64(estimatedDisk)))
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
// Restore time estimation
|
||||
if previewEstimate {
|
||||
fmt.Printf("Restore Estimates:\n")
|
||||
|
||||
// Apply current profile
|
||||
profile := cfg.GetCurrentProfile()
|
||||
if profile != nil {
|
||||
fmt.Printf(" Profile: %s (P:%d J:%d)\n",
|
||||
profile.Name, profile.ClusterParallelism, profile.Jobs)
|
||||
}
|
||||
|
||||
// Estimate extraction time
|
||||
extractionSpeed := int64(500 * 1024 * 1024) // 500 MB/s typical
|
||||
extractionTime := time.Duration(fileSize/extractionSpeed) * time.Second
|
||||
|
||||
fmt.Printf(" Extract time: ~%s\n", formatDuration(extractionTime))
|
||||
|
||||
// Estimate restore time (depends on data size and parallelism)
|
||||
if result.Details != nil && result.Details.ExpandedSize > 0 {
|
||||
// Rough estimate: 50MB/s per job for PostgreSQL restore
|
||||
restoreSpeed := int64(50 * 1024 * 1024)
|
||||
if profile != nil {
|
||||
restoreSpeed *= int64(profile.Jobs)
|
||||
}
|
||||
restoreTime := time.Duration(result.Details.ExpandedSize/restoreSpeed) * time.Second
|
||||
|
||||
fmt.Printf(" Restore time: ~%s\n", formatDuration(restoreTime))
|
||||
|
||||
// Validation time (10% of restore)
|
||||
validationTime := restoreTime / 10
|
||||
fmt.Printf(" Validation: ~%s\n", formatDuration(validationTime))
|
||||
|
||||
// Total
|
||||
totalTime := extractionTime + restoreTime + validationTime
|
||||
fmt.Printf(" Total (RTO): ~%s\n", formatDuration(totalTime))
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
// Validation status
|
||||
fmt.Printf("Validation Status:\n")
|
||||
if result.IsValid {
|
||||
fmt.Printf(" Status: ✓ VALID - Backup appears intact\n")
|
||||
} else {
|
||||
fmt.Printf(" Status: ✗ INVALID - Backup has issues\n")
|
||||
}
|
||||
|
||||
if result.IsTruncated {
|
||||
fmt.Printf(" Truncation: ✗ File appears truncated\n")
|
||||
}
|
||||
if result.IsCorrupted {
|
||||
fmt.Printf(" Corruption: ✗ Corruption detected\n")
|
||||
}
|
||||
|
||||
if len(result.Errors) > 0 {
|
||||
fmt.Printf("\n Errors:\n")
|
||||
for _, err := range result.Errors {
|
||||
fmt.Printf(" - %s\n", err)
|
||||
}
|
||||
}
|
||||
|
||||
if len(result.Warnings) > 0 {
|
||||
fmt.Printf("\n Warnings:\n")
|
||||
for _, warn := range result.Warnings {
|
||||
fmt.Printf(" - %s\n", warn)
|
||||
}
|
||||
}
|
||||
fmt.Println()
|
||||
|
||||
// Schema comparison
|
||||
if previewCompareSchema {
|
||||
fmt.Printf("Schema Comparison:\n")
|
||||
fmt.Printf(" Status: Not yet implemented\n")
|
||||
fmt.Printf(" (Compare with current database schema)\n")
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
// Recommendations
|
||||
fmt.Printf("Recommendations:\n")
|
||||
|
||||
if !result.IsValid {
|
||||
fmt.Printf(" - ✗ DO NOT restore this backup - validation failed\n")
|
||||
fmt.Printf(" - Run 'dbbackup restore diagnose %s' for detailed analysis\n", filepath.Base(archivePath))
|
||||
} else {
|
||||
fmt.Printf(" - ✓ Backup is valid and ready to restore\n")
|
||||
|
||||
// Resource recommendations
|
||||
if result.Details != nil && result.Details.ExpandedSize > 0 {
|
||||
estimatedRAM := result.Details.ExpandedSize / (1024 * 1024 * 1024) / 10 // Rough: 10% of data size
|
||||
if estimatedRAM < 4 {
|
||||
estimatedRAM = 4
|
||||
}
|
||||
fmt.Printf(" - Recommended RAM: %dGB or more\n", estimatedRAM)
|
||||
|
||||
// Disk space
|
||||
estimatedDisk := int64(float64(result.Details.ExpandedSize) * 1.5)
|
||||
fmt.Printf(" - Ensure %s free disk space\n", humanize.Bytes(uint64(estimatedDisk)))
|
||||
}
|
||||
|
||||
// Profile recommendation
|
||||
if result.Details != nil && result.Details.TableCount > 100 {
|
||||
fmt.Printf(" - Use 'conservative' profile for databases with many tables\n")
|
||||
} else {
|
||||
fmt.Printf(" - Use 'turbo' profile for fastest restore\n")
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("\n%s\n", strings.Repeat("=", 70))
|
||||
|
||||
if result.IsValid {
|
||||
fmt.Printf("Ready to restore? Run:\n")
|
||||
if format.IsClusterBackup() {
|
||||
fmt.Printf(" dbbackup restore cluster %s --confirm\n", filepath.Base(archivePath))
|
||||
} else {
|
||||
fmt.Printf(" dbbackup restore single %s --confirm\n", filepath.Base(archivePath))
|
||||
}
|
||||
} else {
|
||||
fmt.Printf("Fix validation errors before attempting restore.\n")
|
||||
}
|
||||
fmt.Printf("%s\n\n", strings.Repeat("=", 70))
|
||||
|
||||
if !result.IsValid {
|
||||
return fmt.Errorf("backup validation failed")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
|
||||
func extractDatabaseName(archivePath string, result *restore.DiagnoseResult) string {
|
||||
// Try to extract from filename
|
||||
baseName := filepath.Base(archivePath)
|
||||
baseName = strings.TrimSuffix(baseName, ".gz")
|
||||
baseName = strings.TrimSuffix(baseName, ".dump")
|
||||
baseName = strings.TrimSuffix(baseName, ".sql")
|
||||
baseName = strings.TrimSuffix(baseName, ".tar")
|
||||
|
||||
// Remove timestamp patterns
|
||||
parts := strings.Split(baseName, "_")
|
||||
if len(parts) > 0 {
|
||||
return parts[0]
|
||||
}
|
||||
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
func listDatabasesInCluster(archivePath string) ([]string, error) {
|
||||
// This would extract and list databases from tar.gz
|
||||
// For now, return empty to indicate it needs implementation
|
||||
return nil, fmt.Errorf("not implemented")
|
||||
}
|
||||
50
cmd/root.go
50
cmd/root.go
@ -3,11 +3,9 @@ package cmd
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"dbbackup/internal/config"
|
||||
"dbbackup/internal/logger"
|
||||
"dbbackup/internal/notify"
|
||||
"dbbackup/internal/security"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
@ -15,11 +13,10 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
cfg *config.Config
|
||||
log logger.Logger
|
||||
auditLogger *security.AuditLogger
|
||||
rateLimiter *security.RateLimiter
|
||||
notifyManager *notify.Manager
|
||||
cfg *config.Config
|
||||
log logger.Logger
|
||||
auditLogger *security.AuditLogger
|
||||
rateLimiter *security.RateLimiter
|
||||
)
|
||||
|
||||
// rootCmd represents the base command when called without any subcommands
|
||||
@ -55,26 +52,9 @@ For help with specific commands, use: dbbackup [command] --help`,
|
||||
|
||||
// Load local config if not disabled
|
||||
if !cfg.NoLoadConfig {
|
||||
// Use custom config path if specified, otherwise default to current directory
|
||||
var localCfg *config.LocalConfig
|
||||
var err error
|
||||
if cfg.ConfigPath != "" {
|
||||
localCfg, err = config.LoadLocalConfigFromPath(cfg.ConfigPath)
|
||||
if err != nil {
|
||||
log.Warn("Failed to load config from specified path", "path", cfg.ConfigPath, "error", err)
|
||||
} else if localCfg != nil {
|
||||
log.Info("Loaded configuration", "path", cfg.ConfigPath)
|
||||
}
|
||||
} else {
|
||||
localCfg, err = config.LoadLocalConfig()
|
||||
if err != nil {
|
||||
log.Warn("Failed to load local config", "error", err)
|
||||
} else if localCfg != nil {
|
||||
log.Info("Loaded configuration from .dbbackup.conf")
|
||||
}
|
||||
}
|
||||
|
||||
if localCfg != nil {
|
||||
if localCfg, err := config.LoadLocalConfig(); err != nil {
|
||||
log.Warn("Failed to load local config", "error", err)
|
||||
} else if localCfg != nil {
|
||||
// Save current flag values that were explicitly set
|
||||
savedBackupDir := cfg.BackupDir
|
||||
savedHost := cfg.Host
|
||||
@ -89,6 +69,7 @@ For help with specific commands, use: dbbackup [command] --help`,
|
||||
|
||||
// Apply config from file
|
||||
config.ApplyLocalConfig(cfg, localCfg)
|
||||
log.Info("Loaded configuration from .dbbackup.conf")
|
||||
|
||||
// Restore explicitly set flag values (flags have priority)
|
||||
if flagsSet["backup-dir"] {
|
||||
@ -124,12 +105,6 @@ For help with specific commands, use: dbbackup [command] --help`,
|
||||
}
|
||||
}
|
||||
|
||||
// Auto-detect socket from --host path (if host starts with /)
|
||||
if strings.HasPrefix(cfg.Host, "/") && cfg.Socket == "" {
|
||||
cfg.Socket = cfg.Host
|
||||
cfg.Host = "localhost" // Reset host for socket connections
|
||||
}
|
||||
|
||||
return cfg.SetDatabaseType(cfg.DatabaseType)
|
||||
},
|
||||
}
|
||||
@ -145,22 +120,13 @@ func Execute(ctx context.Context, config *config.Config, logger logger.Logger) e
|
||||
// Initialize rate limiter
|
||||
rateLimiter = security.NewRateLimiter(config.MaxRetries, logger)
|
||||
|
||||
// Initialize notification manager from environment variables
|
||||
notifyCfg := notify.ConfigFromEnv()
|
||||
notifyManager = notify.NewManager(notifyCfg)
|
||||
if notifyManager.HasEnabledNotifiers() {
|
||||
logger.Info("Notifications enabled", "smtp", notifyCfg.SMTPEnabled, "webhook", notifyCfg.WebhookEnabled)
|
||||
}
|
||||
|
||||
// Set version info
|
||||
rootCmd.Version = fmt.Sprintf("%s (built: %s, commit: %s)",
|
||||
cfg.Version, cfg.BuildTime, cfg.GitCommit)
|
||||
|
||||
// Add persistent flags
|
||||
rootCmd.PersistentFlags().StringVarP(&cfg.ConfigPath, "config", "c", "", "Path to config file (default: .dbbackup.conf in current directory)")
|
||||
rootCmd.PersistentFlags().StringVar(&cfg.Host, "host", cfg.Host, "Database host")
|
||||
rootCmd.PersistentFlags().IntVar(&cfg.Port, "port", cfg.Port, "Database port")
|
||||
rootCmd.PersistentFlags().StringVar(&cfg.Socket, "socket", cfg.Socket, "Unix socket path for MySQL/MariaDB (e.g., /var/run/mysqld/mysqld.sock)")
|
||||
rootCmd.PersistentFlags().StringVar(&cfg.User, "user", cfg.User, "Database user")
|
||||
rootCmd.PersistentFlags().StringVar(&cfg.Database, "database", cfg.Database, "Database name")
|
||||
rootCmd.PersistentFlags().StringVar(&cfg.Password, "password", cfg.Password, "Database password")
|
||||
|
||||
@ -1,64 +0,0 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"dbbackup/internal/checks"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
var verifyLocksCmd = &cobra.Command{
|
||||
Use: "verify-locks",
|
||||
Short: "Check PostgreSQL lock settings and print restore guidance",
|
||||
Long: `Probe PostgreSQL for lock-related GUCs (max_locks_per_transaction, max_connections, max_prepared_transactions) and print capacity + recommended restore options.`,
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
return runVerifyLocks(cmd.Context())
|
||||
},
|
||||
}
|
||||
|
||||
func runVerifyLocks(ctx context.Context) error {
|
||||
p := checks.NewPreflightChecker(cfg, log)
|
||||
res, err := p.RunAllChecks(ctx, cfg.Database)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Find the Postgres lock check in the preflight results
|
||||
var chk checks.PreflightCheck
|
||||
found := false
|
||||
for _, c := range res.Checks {
|
||||
if c.Name == "PostgreSQL lock configuration" {
|
||||
chk = c
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
fmt.Println("No PostgreSQL lock check available (skipped)")
|
||||
return nil
|
||||
}
|
||||
|
||||
fmt.Printf("%s\n", chk.Name)
|
||||
fmt.Printf("Status: %s\n", chk.Status.String())
|
||||
fmt.Printf("%s\n\n", chk.Message)
|
||||
if chk.Details != "" {
|
||||
fmt.Println(chk.Details)
|
||||
}
|
||||
|
||||
// exit non-zero for failures so scripts can react
|
||||
if chk.Status == checks.StatusFailed {
|
||||
os.Exit(2)
|
||||
}
|
||||
if chk.Status == checks.StatusWarning {
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
rootCmd.AddCommand(verifyLocksCmd)
|
||||
}
|
||||
@ -1,371 +0,0 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"dbbackup/internal/logger"
|
||||
"dbbackup/internal/verification"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
var verifyRestoreCmd = &cobra.Command{
|
||||
Use: "verify-restore",
|
||||
Short: "Systematic verification for large database restores",
|
||||
Long: `Comprehensive verification tool for large database restores with BLOB support.
|
||||
|
||||
This tool performs systematic checks to ensure 100% data integrity after restore:
|
||||
- Table counts and row counts verification
|
||||
- BLOB/Large Object integrity (PostgreSQL large objects, bytea columns)
|
||||
- Table checksums (for non-BLOB tables)
|
||||
- Database-specific integrity checks
|
||||
- Orphaned object detection
|
||||
- Index validity checks
|
||||
|
||||
Designed to work with VERY LARGE databases and BLOBs with 100% reliability.
|
||||
|
||||
Examples:
|
||||
# Verify a restored PostgreSQL database
|
||||
dbbackup verify-restore --engine postgres --database mydb
|
||||
|
||||
# Verify with connection details
|
||||
dbbackup verify-restore --engine postgres --host localhost --port 5432 \
|
||||
--user postgres --password secret --database mydb
|
||||
|
||||
# Verify a MySQL database
|
||||
dbbackup verify-restore --engine mysql --database mydb
|
||||
|
||||
# Verify and output JSON report
|
||||
dbbackup verify-restore --engine postgres --database mydb --json
|
||||
|
||||
# Compare source and restored database
|
||||
dbbackup verify-restore --engine postgres --database source_db --compare restored_db
|
||||
|
||||
# Verify a backup file before restore
|
||||
dbbackup verify-restore --backup-file /backups/mydb.dump
|
||||
|
||||
# Verify multiple databases in parallel
|
||||
dbbackup verify-restore --engine postgres --databases "db1,db2,db3" --parallel 4`,
|
||||
RunE: runVerifyRestore,
|
||||
}
|
||||
|
||||
var (
|
||||
verifyEngine string
|
||||
verifyHost string
|
||||
verifyPort int
|
||||
verifyUser string
|
||||
verifyPassword string
|
||||
verifyDatabase string
|
||||
verifyDatabases string
|
||||
verifyCompareDB string
|
||||
verifyBackupFile string
|
||||
verifyJSON bool
|
||||
verifyParallel int
|
||||
)
|
||||
|
||||
func init() {
|
||||
rootCmd.AddCommand(verifyRestoreCmd)
|
||||
|
||||
verifyRestoreCmd.Flags().StringVar(&verifyEngine, "engine", "postgres", "Database engine (postgres, mysql)")
|
||||
verifyRestoreCmd.Flags().StringVar(&verifyHost, "host", "localhost", "Database host")
|
||||
verifyRestoreCmd.Flags().IntVar(&verifyPort, "port", 5432, "Database port")
|
||||
verifyRestoreCmd.Flags().StringVar(&verifyUser, "user", "", "Database user")
|
||||
verifyRestoreCmd.Flags().StringVar(&verifyPassword, "password", "", "Database password")
|
||||
verifyRestoreCmd.Flags().StringVar(&verifyDatabase, "database", "", "Database to verify")
|
||||
verifyRestoreCmd.Flags().StringVar(&verifyDatabases, "databases", "", "Comma-separated list of databases to verify")
|
||||
verifyRestoreCmd.Flags().StringVar(&verifyCompareDB, "compare", "", "Compare with another database (source vs restored)")
|
||||
verifyRestoreCmd.Flags().StringVar(&verifyBackupFile, "backup-file", "", "Verify backup file integrity before restore")
|
||||
verifyRestoreCmd.Flags().BoolVar(&verifyJSON, "json", false, "Output results as JSON")
|
||||
verifyRestoreCmd.Flags().IntVar(&verifyParallel, "parallel", 1, "Number of parallel verification workers")
|
||||
}
|
||||
|
||||
func runVerifyRestore(cmd *cobra.Command, args []string) error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 24*time.Hour) // Long timeout for large DBs
|
||||
defer cancel()
|
||||
|
||||
log := logger.New("INFO", "text")
|
||||
|
||||
// Get credentials from environment if not provided
|
||||
if verifyUser == "" {
|
||||
verifyUser = os.Getenv("PGUSER")
|
||||
if verifyUser == "" {
|
||||
verifyUser = os.Getenv("MYSQL_USER")
|
||||
}
|
||||
if verifyUser == "" {
|
||||
verifyUser = "postgres"
|
||||
}
|
||||
}
|
||||
|
||||
if verifyPassword == "" {
|
||||
verifyPassword = os.Getenv("PGPASSWORD")
|
||||
if verifyPassword == "" {
|
||||
verifyPassword = os.Getenv("MYSQL_PASSWORD")
|
||||
}
|
||||
}
|
||||
|
||||
// Set default port based on engine
|
||||
if verifyPort == 5432 && (verifyEngine == "mysql" || verifyEngine == "mariadb") {
|
||||
verifyPort = 3306
|
||||
}
|
||||
|
||||
checker := verification.NewLargeRestoreChecker(log, verifyEngine, verifyHost, verifyPort, verifyUser, verifyPassword)
|
||||
|
||||
// Mode 1: Verify backup file
|
||||
if verifyBackupFile != "" {
|
||||
return verifyBackupFileMode(ctx, checker)
|
||||
}
|
||||
|
||||
// Mode 2: Compare two databases
|
||||
if verifyCompareDB != "" {
|
||||
return verifyCompareMode(ctx, checker)
|
||||
}
|
||||
|
||||
// Mode 3: Verify multiple databases in parallel
|
||||
if verifyDatabases != "" {
|
||||
return verifyMultipleDatabases(ctx, log)
|
||||
}
|
||||
|
||||
// Mode 4: Verify single database
|
||||
if verifyDatabase == "" {
|
||||
return fmt.Errorf("--database is required")
|
||||
}
|
||||
|
||||
return verifySingleDatabase(ctx, checker)
|
||||
}
|
||||
|
||||
func verifyBackupFileMode(ctx context.Context, checker *verification.LargeRestoreChecker) error {
|
||||
fmt.Println()
|
||||
fmt.Println("╔══════════════════════════════════════════════════════════════╗")
|
||||
fmt.Println("║ 🔍 BACKUP FILE VERIFICATION ║")
|
||||
fmt.Println("╚══════════════════════════════════════════════════════════════╝")
|
||||
fmt.Println()
|
||||
|
||||
result, err := checker.VerifyBackupFile(ctx, verifyBackupFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("verification failed: %w", err)
|
||||
}
|
||||
|
||||
if verifyJSON {
|
||||
return outputJSON(result, "")
|
||||
}
|
||||
|
||||
fmt.Printf(" File: %s\n", result.Path)
|
||||
fmt.Printf(" Size: %s\n", formatBytes(result.SizeBytes))
|
||||
fmt.Printf(" Format: %s\n", result.Format)
|
||||
fmt.Printf(" Checksum: %s\n", result.Checksum)
|
||||
|
||||
if result.TableCount > 0 {
|
||||
fmt.Printf(" Tables: %d\n", result.TableCount)
|
||||
}
|
||||
if result.LargeObjectCount > 0 {
|
||||
fmt.Printf(" Large Objects: %d\n", result.LargeObjectCount)
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
|
||||
if result.Valid {
|
||||
fmt.Println(" ✅ Backup file verification PASSED")
|
||||
} else {
|
||||
fmt.Printf(" ❌ Backup file verification FAILED: %s\n", result.Error)
|
||||
return fmt.Errorf("verification failed")
|
||||
}
|
||||
|
||||
if len(result.Warnings) > 0 {
|
||||
fmt.Println()
|
||||
fmt.Println(" Warnings:")
|
||||
for _, w := range result.Warnings {
|
||||
fmt.Printf(" ⚠️ %s\n", w)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyCompareMode(ctx context.Context, checker *verification.LargeRestoreChecker) error {
|
||||
if verifyDatabase == "" {
|
||||
return fmt.Errorf("--database (source) is required for comparison")
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
fmt.Println("╔══════════════════════════════════════════════════════════════╗")
|
||||
fmt.Println("║ 🔍 DATABASE COMPARISON ║")
|
||||
fmt.Println("╚══════════════════════════════════════════════════════════════╝")
|
||||
fmt.Println()
|
||||
fmt.Printf(" Source: %s\n", verifyDatabase)
|
||||
fmt.Printf(" Target: %s\n", verifyCompareDB)
|
||||
fmt.Println()
|
||||
|
||||
result, err := checker.CompareSourceTarget(ctx, verifyDatabase, verifyCompareDB)
|
||||
if err != nil {
|
||||
return fmt.Errorf("comparison failed: %w", err)
|
||||
}
|
||||
|
||||
if verifyJSON {
|
||||
return outputJSON(result, "")
|
||||
}
|
||||
|
||||
if result.Match {
|
||||
fmt.Println(" ✅ Databases MATCH - restore verified successfully")
|
||||
} else {
|
||||
fmt.Println(" ❌ Databases DO NOT MATCH")
|
||||
fmt.Println()
|
||||
fmt.Println(" Differences:")
|
||||
for _, d := range result.Differences {
|
||||
fmt.Printf(" • %s\n", d)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyMultipleDatabases(ctx context.Context, log logger.Logger) error {
|
||||
databases := splitDatabases(verifyDatabases)
|
||||
if len(databases) == 0 {
|
||||
return fmt.Errorf("no databases specified")
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
fmt.Println("╔══════════════════════════════════════════════════════════════╗")
|
||||
fmt.Println("║ 🔍 PARALLEL DATABASE VERIFICATION ║")
|
||||
fmt.Println("╚══════════════════════════════════════════════════════════════╝")
|
||||
fmt.Println()
|
||||
fmt.Printf(" Databases: %d\n", len(databases))
|
||||
fmt.Printf(" Workers: %d\n", verifyParallel)
|
||||
fmt.Println()
|
||||
|
||||
results, err := verification.ParallelVerify(ctx, log, verifyEngine, verifyHost, verifyPort, verifyUser, verifyPassword, databases, verifyParallel)
|
||||
if err != nil {
|
||||
return fmt.Errorf("parallel verification failed: %w", err)
|
||||
}
|
||||
|
||||
if verifyJSON {
|
||||
return outputJSON(results, "")
|
||||
}
|
||||
|
||||
allValid := true
|
||||
for _, r := range results {
|
||||
if r == nil {
|
||||
continue
|
||||
}
|
||||
status := "✅"
|
||||
if !r.Valid {
|
||||
status = "❌"
|
||||
allValid = false
|
||||
}
|
||||
fmt.Printf(" %s %s: %d tables, %d rows, %d BLOBs (%s)\n",
|
||||
status, r.Database, r.TotalTables, r.TotalRows, r.TotalBlobCount, r.Duration.Round(time.Millisecond))
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
if allValid {
|
||||
fmt.Println(" ✅ All databases verified successfully")
|
||||
} else {
|
||||
fmt.Println(" ❌ Some databases failed verification")
|
||||
return fmt.Errorf("verification failed")
|
||||
}
|
||||
|
||||
fmt.Println()
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifySingleDatabase(ctx context.Context, checker *verification.LargeRestoreChecker) error {
|
||||
fmt.Println()
|
||||
fmt.Println("╔══════════════════════════════════════════════════════════════╗")
|
||||
fmt.Println("║ 🔍 SYSTEMATIC RESTORE VERIFICATION ║")
|
||||
fmt.Println("║ For Large Databases & BLOBs ║")
|
||||
fmt.Println("╚══════════════════════════════════════════════════════════════╝")
|
||||
fmt.Println()
|
||||
fmt.Printf(" Database: %s\n", verifyDatabase)
|
||||
fmt.Printf(" Engine: %s\n", verifyEngine)
|
||||
fmt.Printf(" Host: %s:%d\n", verifyHost, verifyPort)
|
||||
fmt.Println()
|
||||
|
||||
result, err := checker.CheckDatabase(ctx, verifyDatabase)
|
||||
if err != nil {
|
||||
return fmt.Errorf("verification failed: %w", err)
|
||||
}
|
||||
|
||||
if verifyJSON {
|
||||
return outputJSON(result, "")
|
||||
}
|
||||
|
||||
// Summary
|
||||
fmt.Println(" ═══════════════════════════════════════════════════════════")
|
||||
fmt.Println(" VERIFICATION SUMMARY")
|
||||
fmt.Println(" ═══════════════════════════════════════════════════════════")
|
||||
fmt.Println()
|
||||
fmt.Printf(" Tables: %d\n", result.TotalTables)
|
||||
fmt.Printf(" Total Rows: %d\n", result.TotalRows)
|
||||
fmt.Printf(" Large Objects: %d\n", result.TotalBlobCount)
|
||||
fmt.Printf(" BLOB Size: %s\n", formatBytes(result.TotalBlobBytes))
|
||||
fmt.Printf(" Duration: %s\n", result.Duration.Round(time.Millisecond))
|
||||
fmt.Println()
|
||||
|
||||
// Table details
|
||||
if len(result.TableChecks) > 0 && len(result.TableChecks) <= 50 {
|
||||
fmt.Println(" Tables:")
|
||||
for _, t := range result.TableChecks {
|
||||
blobIndicator := ""
|
||||
if t.HasBlobColumn {
|
||||
blobIndicator = " [BLOB]"
|
||||
}
|
||||
status := "✓"
|
||||
if !t.Valid {
|
||||
status = "✗"
|
||||
}
|
||||
fmt.Printf(" %s %s.%s: %d rows%s\n", status, t.Schema, t.TableName, t.RowCount, blobIndicator)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
// Integrity errors
|
||||
if len(result.IntegrityErrors) > 0 {
|
||||
fmt.Println(" ❌ INTEGRITY ERRORS:")
|
||||
for _, e := range result.IntegrityErrors {
|
||||
fmt.Printf(" • %s\n", e)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
// Warnings
|
||||
if len(result.Warnings) > 0 {
|
||||
fmt.Println(" ⚠️ WARNINGS:")
|
||||
for _, w := range result.Warnings {
|
||||
fmt.Printf(" • %s\n", w)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
|
||||
// Final verdict
|
||||
fmt.Println(" ═══════════════════════════════════════════════════════════")
|
||||
if result.Valid {
|
||||
fmt.Println(" ✅ RESTORE VERIFICATION PASSED - Data integrity confirmed")
|
||||
} else {
|
||||
fmt.Println(" ❌ RESTORE VERIFICATION FAILED - See errors above")
|
||||
return fmt.Errorf("verification failed")
|
||||
}
|
||||
fmt.Println(" ═══════════════════════════════════════════════════════════")
|
||||
fmt.Println()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func splitDatabases(s string) []string {
|
||||
if s == "" {
|
||||
return nil
|
||||
}
|
||||
var dbs []string
|
||||
for _, db := range strings.Split(s, ",") {
|
||||
db = strings.TrimSpace(db)
|
||||
if db != "" {
|
||||
dbs = append(dbs, db)
|
||||
}
|
||||
}
|
||||
return dbs
|
||||
}
|
||||
@ -1,62 +0,0 @@
|
||||
# Deployment Examples for dbbackup
|
||||
|
||||
Enterprise deployment configurations for various platforms and orchestration tools.
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
deploy/
|
||||
├── README.md
|
||||
├── ansible/ # Ansible roles and playbooks
|
||||
│ ├── basic.yml # Simple installation
|
||||
│ ├── with-exporter.yml # With Prometheus metrics
|
||||
│ ├── with-notifications.yml # With email/Slack alerts
|
||||
│ └── enterprise.yml # Full enterprise setup
|
||||
├── kubernetes/ # Kubernetes manifests
|
||||
│ ├── cronjob.yaml # Scheduled backup CronJob
|
||||
│ ├── configmap.yaml # Configuration
|
||||
│ └── helm/ # Helm chart
|
||||
├── terraform/ # Infrastructure as Code
|
||||
│ ├── aws/ # AWS deployment
|
||||
│ └── gcp/ # GCP deployment
|
||||
└── scripts/ # Helper scripts
|
||||
├── backup-rotation.sh
|
||||
└── health-check.sh
|
||||
```
|
||||
|
||||
## Quick Start by Platform
|
||||
|
||||
### Ansible
|
||||
```bash
|
||||
cd ansible
|
||||
cp inventory.example inventory
|
||||
ansible-playbook -i inventory enterprise.yml
|
||||
```
|
||||
|
||||
### Kubernetes
|
||||
```bash
|
||||
kubectl apply -f kubernetes/
|
||||
# or with Helm
|
||||
helm install dbbackup kubernetes/helm/dbbackup
|
||||
```
|
||||
|
||||
### Terraform (AWS)
|
||||
```bash
|
||||
cd terraform/aws
|
||||
terraform init
|
||||
terraform apply
|
||||
```
|
||||
|
||||
## Feature Matrix
|
||||
|
||||
| Feature | basic | with-exporter | with-notifications | enterprise |
|
||||
|---------|:-----:|:-------------:|:------------------:|:----------:|
|
||||
| Scheduled Backups | ✓ | ✓ | ✓ | ✓ |
|
||||
| Retention Policy | ✓ | ✓ | ✓ | ✓ |
|
||||
| GFS Rotation | | | | ✓ |
|
||||
| Prometheus Metrics | | ✓ | | ✓ |
|
||||
| Email Notifications | | | ✓ | ✓ |
|
||||
| Slack/Webhook | | | ✓ | ✓ |
|
||||
| Encryption | | | | ✓ |
|
||||
| Cloud Upload | | | | ✓ |
|
||||
| Catalog Sync | | | | ✓ |
|
||||
@ -1,75 +0,0 @@
|
||||
# Ansible Deployment for dbbackup
|
||||
|
||||
Ansible roles and playbooks for deploying dbbackup in enterprise environments.
|
||||
|
||||
## Playbooks
|
||||
|
||||
| Playbook | Description |
|
||||
|----------|-------------|
|
||||
| `basic.yml` | Simple installation without monitoring |
|
||||
| `with-exporter.yml` | Installation with Prometheus metrics exporter |
|
||||
| `with-notifications.yml` | Installation with SMTP/webhook notifications |
|
||||
| `enterprise.yml` | Full enterprise setup (exporter + notifications + GFS retention) |
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Edit inventory
|
||||
cp inventory.example inventory
|
||||
vim inventory
|
||||
|
||||
# Edit variables
|
||||
vim group_vars/all.yml
|
||||
|
||||
# Deploy basic setup
|
||||
ansible-playbook -i inventory basic.yml
|
||||
|
||||
# Deploy enterprise setup
|
||||
ansible-playbook -i inventory enterprise.yml
|
||||
```
|
||||
|
||||
## Variables
|
||||
|
||||
See `group_vars/all.yml` for all configurable options.
|
||||
|
||||
### Required Variables
|
||||
|
||||
| Variable | Description | Example |
|
||||
|----------|-------------|---------|
|
||||
| `dbbackup_version` | Version to install | `3.42.74` |
|
||||
| `dbbackup_db_type` | Database type | `postgres` or `mysql` |
|
||||
| `dbbackup_backup_dir` | Backup storage path | `/var/backups/databases` |
|
||||
|
||||
### Optional Variables
|
||||
|
||||
| Variable | Description | Default |
|
||||
|----------|-------------|---------|
|
||||
| `dbbackup_schedule` | Backup schedule | `daily` |
|
||||
| `dbbackup_compression` | Compression level | `6` |
|
||||
| `dbbackup_retention_days` | Retention period | `30` |
|
||||
| `dbbackup_min_backups` | Minimum backups to keep | `5` |
|
||||
| `dbbackup_exporter_port` | Prometheus exporter port | `9399` |
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
ansible/
|
||||
├── README.md
|
||||
├── inventory.example
|
||||
├── group_vars/
|
||||
│ └── all.yml
|
||||
├── roles/
|
||||
│ └── dbbackup/
|
||||
│ ├── tasks/
|
||||
│ │ └── main.yml
|
||||
│ ├── templates/
|
||||
│ │ ├── dbbackup.conf.j2
|
||||
│ │ ├── env.j2
|
||||
│ │ └── systemd-override.conf.j2
|
||||
│ └── handlers/
|
||||
│ └── main.yml
|
||||
├── basic.yml
|
||||
├── with-exporter.yml
|
||||
├── with-notifications.yml
|
||||
└── enterprise.yml
|
||||
```
|
||||
@ -1,42 +0,0 @@
|
||||
---
|
||||
# dbbackup Basic Deployment
|
||||
# Simple installation without monitoring or notifications
|
||||
#
|
||||
# Usage:
|
||||
# ansible-playbook -i inventory basic.yml
|
||||
#
|
||||
# Features:
|
||||
# ✓ Automated daily backups
|
||||
# ✓ Retention policy (30 days default)
|
||||
# ✗ No Prometheus exporter
|
||||
# ✗ No notifications
|
||||
|
||||
- name: Deploy dbbackup (basic)
|
||||
hosts: db_servers
|
||||
become: yes
|
||||
|
||||
vars:
|
||||
dbbackup_exporter_enabled: false
|
||||
dbbackup_notify_enabled: false
|
||||
|
||||
roles:
|
||||
- dbbackup
|
||||
|
||||
post_tasks:
|
||||
- name: Verify installation
|
||||
command: "{{ dbbackup_install_dir }}/dbbackup --version"
|
||||
register: version_check
|
||||
changed_when: false
|
||||
|
||||
- name: Display version
|
||||
debug:
|
||||
msg: "Installed: {{ version_check.stdout }}"
|
||||
|
||||
- name: Show timer status
|
||||
command: systemctl status dbbackup-{{ dbbackup_backup_type }}.timer --no-pager
|
||||
register: timer_status
|
||||
changed_when: false
|
||||
|
||||
- name: Display next backup time
|
||||
debug:
|
||||
msg: "{{ timer_status.stdout_lines | select('search', 'Trigger') | list }}"
|
||||
@ -1,153 +0,0 @@
|
||||
---
|
||||
# dbbackup Enterprise Deployment
|
||||
# Full-featured installation with all enterprise capabilities
|
||||
#
|
||||
# Usage:
|
||||
# ansible-playbook -i inventory enterprise.yml
|
||||
#
|
||||
# Features:
|
||||
# ✓ Automated scheduled backups
|
||||
# ✓ GFS retention policy (Grandfather-Father-Son)
|
||||
# ✓ Prometheus metrics exporter
|
||||
# ✓ SMTP email notifications
|
||||
# ✓ Webhook/Slack notifications
|
||||
# ✓ Encrypted backups (optional)
|
||||
# ✓ Cloud storage upload (optional)
|
||||
# ✓ Catalog for backup tracking
|
||||
#
|
||||
# Required Vault Variables:
|
||||
# dbbackup_db_password
|
||||
# dbbackup_encryption_key (if encryption enabled)
|
||||
# dbbackup_notify_smtp_password (if SMTP enabled)
|
||||
# dbbackup_cloud_access_key (if cloud enabled)
|
||||
# dbbackup_cloud_secret_key (if cloud enabled)
|
||||
|
||||
- name: Deploy dbbackup (Enterprise)
|
||||
hosts: db_servers
|
||||
become: yes
|
||||
|
||||
vars:
|
||||
# Full feature set
|
||||
dbbackup_exporter_enabled: true
|
||||
dbbackup_exporter_port: 9399
|
||||
dbbackup_notify_enabled: true
|
||||
|
||||
# GFS Retention
|
||||
dbbackup_gfs_enabled: true
|
||||
dbbackup_gfs_daily: 7
|
||||
dbbackup_gfs_weekly: 4
|
||||
dbbackup_gfs_monthly: 12
|
||||
dbbackup_gfs_yearly: 3
|
||||
|
||||
pre_tasks:
|
||||
- name: Check for required secrets
|
||||
assert:
|
||||
that:
|
||||
- dbbackup_db_password is defined
|
||||
fail_msg: "Required secrets not provided. Use ansible-vault for dbbackup_db_password"
|
||||
|
||||
- name: Validate encryption key if enabled
|
||||
assert:
|
||||
that:
|
||||
- dbbackup_encryption_key is defined
|
||||
- dbbackup_encryption_key | length >= 16
|
||||
fail_msg: "Encryption enabled but key not provided or too short"
|
||||
when: dbbackup_encryption_enabled | default(false)
|
||||
|
||||
roles:
|
||||
- dbbackup
|
||||
|
||||
post_tasks:
|
||||
# Verify exporter
|
||||
- name: Wait for exporter to start
|
||||
wait_for:
|
||||
port: "{{ dbbackup_exporter_port }}"
|
||||
timeout: 30
|
||||
when: dbbackup_exporter_enabled
|
||||
|
||||
- name: Test metrics endpoint
|
||||
uri:
|
||||
url: "http://localhost:{{ dbbackup_exporter_port }}/metrics"
|
||||
return_content: yes
|
||||
register: metrics_response
|
||||
when: dbbackup_exporter_enabled
|
||||
|
||||
# Initialize catalog
|
||||
- name: Sync existing backups to catalog
|
||||
command: "{{ dbbackup_install_dir }}/dbbackup catalog sync {{ dbbackup_backup_dir }}"
|
||||
become_user: dbbackup
|
||||
changed_when: false
|
||||
|
||||
# Run preflight check
|
||||
- name: Run preflight checks
|
||||
command: "{{ dbbackup_install_dir }}/dbbackup preflight"
|
||||
become_user: dbbackup
|
||||
register: preflight_result
|
||||
changed_when: false
|
||||
failed_when: preflight_result.rc > 1 # rc=1 is warnings, rc=2 is failure
|
||||
|
||||
- name: Display preflight result
|
||||
debug:
|
||||
msg: "{{ preflight_result.stdout_lines }}"
|
||||
|
||||
# Summary
|
||||
- name: Display deployment summary
|
||||
debug:
|
||||
msg: |
|
||||
╔══════════════════════════════════════════════════════════════╗
|
||||
║ dbbackup Enterprise Deployment Complete ║
|
||||
╚══════════════════════════════════════════════════════════════╝
|
||||
|
||||
Host: {{ inventory_hostname }}
|
||||
Version: {{ dbbackup_version }}
|
||||
|
||||
┌─ Backup Configuration ─────────────────────────────────────────
|
||||
│ Type: {{ dbbackup_backup_type }}
|
||||
│ Schedule: {{ dbbackup_schedule }}
|
||||
│ Directory: {{ dbbackup_backup_dir }}
|
||||
│ Encryption: {{ 'Enabled' if dbbackup_encryption_enabled else 'Disabled' }}
|
||||
└────────────────────────────────────────────────────────────────
|
||||
|
||||
┌─ Retention Policy (GFS) ───────────────────────────────────────
|
||||
│ Daily: {{ dbbackup_gfs_daily }} backups
|
||||
│ Weekly: {{ dbbackup_gfs_weekly }} backups
|
||||
│ Monthly: {{ dbbackup_gfs_monthly }} backups
|
||||
│ Yearly: {{ dbbackup_gfs_yearly }} backups
|
||||
└────────────────────────────────────────────────────────────────
|
||||
|
||||
┌─ Monitoring ───────────────────────────────────────────────────
|
||||
│ Prometheus: http://{{ inventory_hostname }}:{{ dbbackup_exporter_port }}/metrics
|
||||
└────────────────────────────────────────────────────────────────
|
||||
|
||||
┌─ Notifications ────────────────────────────────────────────────
|
||||
{% if dbbackup_notify_smtp_enabled | default(false) %}
|
||||
│ SMTP: {{ dbbackup_notify_smtp_to | join(', ') }}
|
||||
{% endif %}
|
||||
{% if dbbackup_notify_slack_enabled | default(false) %}
|
||||
│ Slack: Enabled
|
||||
{% endif %}
|
||||
└────────────────────────────────────────────────────────────────
|
||||
|
||||
- name: Configure Prometheus scrape targets
|
||||
hosts: monitoring
|
||||
become: yes
|
||||
tasks:
|
||||
- name: Add dbbackup targets to prometheus
|
||||
blockinfile:
|
||||
path: /etc/prometheus/targets/dbbackup.yml
|
||||
create: yes
|
||||
block: |
|
||||
- targets:
|
||||
{% for host in groups['db_servers'] %}
|
||||
- {{ host }}:{{ hostvars[host]['dbbackup_exporter_port'] | default(9399) }}
|
||||
{% endfor %}
|
||||
labels:
|
||||
job: dbbackup
|
||||
notify: reload prometheus
|
||||
when: "'monitoring' in group_names"
|
||||
|
||||
handlers:
|
||||
- name: reload prometheus
|
||||
systemd:
|
||||
name: prometheus
|
||||
state: reloaded
|
||||
@ -1,71 +0,0 @@
|
||||
# dbbackup Ansible Variables
|
||||
# =========================
|
||||
|
||||
# Version and Installation
|
||||
dbbackup_version: "3.42.74"
|
||||
dbbackup_download_url: "https://git.uuxo.net/UUXO/dbbackup/releases/download/v{{ dbbackup_version }}"
|
||||
dbbackup_install_dir: "/usr/local/bin"
|
||||
dbbackup_config_dir: "/etc/dbbackup"
|
||||
dbbackup_data_dir: "/var/lib/dbbackup"
|
||||
dbbackup_log_dir: "/var/log/dbbackup"
|
||||
|
||||
# Database Configuration
|
||||
dbbackup_db_type: "postgres" # postgres, mysql, mariadb
|
||||
dbbackup_db_host: "localhost"
|
||||
dbbackup_db_port: 5432 # 5432 for postgres, 3306 for mysql
|
||||
dbbackup_db_user: "postgres"
|
||||
# dbbackup_db_password: "" # Use vault for passwords!
|
||||
|
||||
# Backup Configuration
|
||||
dbbackup_backup_dir: "/var/backups/databases"
|
||||
dbbackup_backup_type: "cluster" # cluster, single
|
||||
dbbackup_compression: 6
|
||||
dbbackup_encryption_enabled: false
|
||||
# dbbackup_encryption_key: "" # Use vault!
|
||||
|
||||
# Schedule (systemd OnCalendar format)
|
||||
dbbackup_schedule: "daily" # daily, weekly, *-*-* 02:00:00
|
||||
|
||||
# Retention Policy
|
||||
dbbackup_retention_days: 30
|
||||
dbbackup_min_backups: 5
|
||||
|
||||
# GFS Retention (enterprise.yml)
|
||||
dbbackup_gfs_enabled: false
|
||||
dbbackup_gfs_daily: 7
|
||||
dbbackup_gfs_weekly: 4
|
||||
dbbackup_gfs_monthly: 12
|
||||
dbbackup_gfs_yearly: 3
|
||||
|
||||
# Prometheus Exporter (with-exporter.yml, enterprise.yml)
|
||||
dbbackup_exporter_enabled: false
|
||||
dbbackup_exporter_port: 9399
|
||||
|
||||
# Cloud Storage (optional)
|
||||
dbbackup_cloud_enabled: false
|
||||
dbbackup_cloud_provider: "s3" # s3, minio, b2, azure, gcs
|
||||
dbbackup_cloud_bucket: ""
|
||||
dbbackup_cloud_endpoint: "" # For MinIO/B2
|
||||
# dbbackup_cloud_access_key: "" # Use vault!
|
||||
# dbbackup_cloud_secret_key: "" # Use vault!
|
||||
|
||||
# Notifications (with-notifications.yml, enterprise.yml)
|
||||
dbbackup_notify_enabled: false
|
||||
|
||||
# SMTP Notifications
|
||||
dbbackup_notify_smtp_enabled: false
|
||||
dbbackup_notify_smtp_host: ""
|
||||
dbbackup_notify_smtp_port: 587
|
||||
dbbackup_notify_smtp_user: ""
|
||||
# dbbackup_notify_smtp_password: "" # Use vault!
|
||||
dbbackup_notify_smtp_from: ""
|
||||
dbbackup_notify_smtp_to: [] # List of recipients
|
||||
|
||||
# Webhook Notifications
|
||||
dbbackup_notify_webhook_enabled: false
|
||||
dbbackup_notify_webhook_url: ""
|
||||
# dbbackup_notify_webhook_secret: "" # Use vault for HMAC secret!
|
||||
|
||||
# Slack Integration (uses webhook)
|
||||
dbbackup_notify_slack_enabled: false
|
||||
dbbackup_notify_slack_webhook: ""
|
||||
@ -1,25 +0,0 @@
|
||||
# dbbackup Ansible Inventory Example
|
||||
# Copy to 'inventory' and customize
|
||||
|
||||
[db_servers]
|
||||
# PostgreSQL servers
|
||||
pg-primary.example.com dbbackup_db_type=postgres
|
||||
pg-replica.example.com dbbackup_db_type=postgres dbbackup_backup_from_replica=true
|
||||
|
||||
# MySQL servers
|
||||
mysql-01.example.com dbbackup_db_type=mysql
|
||||
|
||||
[db_servers:vars]
|
||||
ansible_user=deploy
|
||||
ansible_become=yes
|
||||
|
||||
# Group-level defaults
|
||||
dbbackup_backup_dir=/var/backups/databases
|
||||
dbbackup_schedule=daily
|
||||
|
||||
[monitoring]
|
||||
prometheus.example.com
|
||||
|
||||
[monitoring:vars]
|
||||
# Servers where metrics are scraped
|
||||
dbbackup_exporter_enabled=true
|
||||
@ -1,12 +0,0 @@
|
||||
---
|
||||
# dbbackup Ansible Role - Handlers
|
||||
|
||||
- name: reload systemd
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
|
||||
- name: restart dbbackup
|
||||
systemd:
|
||||
name: "dbbackup-{{ dbbackup_backup_type }}.service"
|
||||
state: restarted
|
||||
when: ansible_service_mgr == 'systemd'
|
||||
@ -1,116 +0,0 @@
|
||||
---
|
||||
# dbbackup Ansible Role - Main Tasks
|
||||
|
||||
- name: Create dbbackup group
|
||||
group:
|
||||
name: dbbackup
|
||||
system: yes
|
||||
|
||||
- name: Create dbbackup user
|
||||
user:
|
||||
name: dbbackup
|
||||
group: dbbackup
|
||||
system: yes
|
||||
home: "{{ dbbackup_data_dir }}"
|
||||
shell: /usr/sbin/nologin
|
||||
create_home: no
|
||||
|
||||
- name: Create directories
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: directory
|
||||
owner: dbbackup
|
||||
group: dbbackup
|
||||
mode: "0755"
|
||||
loop:
|
||||
- "{{ dbbackup_config_dir }}"
|
||||
- "{{ dbbackup_data_dir }}"
|
||||
- "{{ dbbackup_data_dir }}/catalog"
|
||||
- "{{ dbbackup_log_dir }}"
|
||||
- "{{ dbbackup_backup_dir }}"
|
||||
|
||||
- name: Create env.d directory
|
||||
file:
|
||||
path: "{{ dbbackup_config_dir }}/env.d"
|
||||
state: directory
|
||||
owner: root
|
||||
group: dbbackup
|
||||
mode: "0750"
|
||||
|
||||
- name: Detect architecture
|
||||
set_fact:
|
||||
dbbackup_arch: "{{ 'arm64' if ansible_architecture == 'aarch64' else 'amd64' }}"
|
||||
|
||||
- name: Download dbbackup binary
|
||||
get_url:
|
||||
url: "{{ dbbackup_download_url }}/dbbackup-linux-{{ dbbackup_arch }}"
|
||||
dest: "{{ dbbackup_install_dir }}/dbbackup"
|
||||
mode: "0755"
|
||||
owner: root
|
||||
group: root
|
||||
notify: restart dbbackup
|
||||
|
||||
- name: Deploy configuration file
|
||||
template:
|
||||
src: dbbackup.conf.j2
|
||||
dest: "{{ dbbackup_config_dir }}/dbbackup.conf"
|
||||
owner: root
|
||||
group: dbbackup
|
||||
mode: "0640"
|
||||
notify: restart dbbackup
|
||||
|
||||
- name: Deploy environment file
|
||||
template:
|
||||
src: env.j2
|
||||
dest: "{{ dbbackup_config_dir }}/env.d/{{ dbbackup_backup_type }}.conf"
|
||||
owner: root
|
||||
group: dbbackup
|
||||
mode: "0600"
|
||||
notify: restart dbbackup
|
||||
|
||||
- name: Install systemd service
|
||||
command: >
|
||||
{{ dbbackup_install_dir }}/dbbackup install
|
||||
--backup-type {{ dbbackup_backup_type }}
|
||||
--schedule "{{ dbbackup_schedule }}"
|
||||
{% if dbbackup_exporter_enabled %}--with-metrics --metrics-port {{ dbbackup_exporter_port }}{% endif %}
|
||||
args:
|
||||
creates: "/etc/systemd/system/dbbackup-{{ dbbackup_backup_type }}.service"
|
||||
notify:
|
||||
- reload systemd
|
||||
- restart dbbackup
|
||||
|
||||
- name: Deploy systemd override (if customizations needed)
|
||||
template:
|
||||
src: systemd-override.conf.j2
|
||||
dest: "/etc/systemd/system/dbbackup-{{ dbbackup_backup_type }}.service.d/override.conf"
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0644"
|
||||
when: dbbackup_notify_enabled or dbbackup_cloud_enabled
|
||||
notify:
|
||||
- reload systemd
|
||||
- restart dbbackup
|
||||
|
||||
- name: Create systemd override directory
|
||||
file:
|
||||
path: "/etc/systemd/system/dbbackup-{{ dbbackup_backup_type }}.service.d"
|
||||
state: directory
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0755"
|
||||
when: dbbackup_notify_enabled or dbbackup_cloud_enabled
|
||||
|
||||
- name: Enable and start dbbackup timer
|
||||
systemd:
|
||||
name: "dbbackup-{{ dbbackup_backup_type }}.timer"
|
||||
enabled: yes
|
||||
state: started
|
||||
daemon_reload: yes
|
||||
|
||||
- name: Enable dbbackup exporter service
|
||||
systemd:
|
||||
name: dbbackup-exporter
|
||||
enabled: yes
|
||||
state: started
|
||||
when: dbbackup_exporter_enabled
|
||||
@ -1,39 +0,0 @@
|
||||
# dbbackup Configuration
|
||||
# Managed by Ansible - do not edit manually
|
||||
|
||||
# Database
|
||||
db-type = {{ dbbackup_db_type }}
|
||||
host = {{ dbbackup_db_host }}
|
||||
port = {{ dbbackup_db_port }}
|
||||
user = {{ dbbackup_db_user }}
|
||||
|
||||
# Backup
|
||||
backup-dir = {{ dbbackup_backup_dir }}
|
||||
compression = {{ dbbackup_compression }}
|
||||
|
||||
# Retention
|
||||
retention-days = {{ dbbackup_retention_days }}
|
||||
min-backups = {{ dbbackup_min_backups }}
|
||||
|
||||
{% if dbbackup_gfs_enabled %}
|
||||
# GFS Retention Policy
|
||||
gfs = true
|
||||
gfs-daily = {{ dbbackup_gfs_daily }}
|
||||
gfs-weekly = {{ dbbackup_gfs_weekly }}
|
||||
gfs-monthly = {{ dbbackup_gfs_monthly }}
|
||||
gfs-yearly = {{ dbbackup_gfs_yearly }}
|
||||
{% endif %}
|
||||
|
||||
{% if dbbackup_encryption_enabled %}
|
||||
# Encryption
|
||||
encrypt = true
|
||||
{% endif %}
|
||||
|
||||
{% if dbbackup_cloud_enabled %}
|
||||
# Cloud Storage
|
||||
cloud-provider = {{ dbbackup_cloud_provider }}
|
||||
cloud-bucket = {{ dbbackup_cloud_bucket }}
|
||||
{% if dbbackup_cloud_endpoint %}
|
||||
cloud-endpoint = {{ dbbackup_cloud_endpoint }}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
@ -1,57 +0,0 @@
|
||||
# dbbackup Environment Variables
|
||||
# Managed by Ansible - do not edit manually
|
||||
# Permissions: 0600 (secrets inside)
|
||||
|
||||
{% if dbbackup_db_password is defined %}
|
||||
# Database Password
|
||||
{% if dbbackup_db_type == 'postgres' %}
|
||||
PGPASSWORD={{ dbbackup_db_password }}
|
||||
{% else %}
|
||||
MYSQL_PWD={{ dbbackup_db_password }}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{% if dbbackup_encryption_enabled and dbbackup_encryption_key is defined %}
|
||||
# Encryption Key
|
||||
DBBACKUP_ENCRYPTION_KEY={{ dbbackup_encryption_key }}
|
||||
{% endif %}
|
||||
|
||||
{% if dbbackup_cloud_enabled %}
|
||||
# Cloud Storage Credentials
|
||||
{% if dbbackup_cloud_provider in ['s3', 'minio', 'b2'] %}
|
||||
AWS_ACCESS_KEY_ID={{ dbbackup_cloud_access_key | default('') }}
|
||||
AWS_SECRET_ACCESS_KEY={{ dbbackup_cloud_secret_key | default('') }}
|
||||
{% endif %}
|
||||
{% if dbbackup_cloud_provider == 'azure' %}
|
||||
AZURE_STORAGE_ACCOUNT={{ dbbackup_cloud_access_key | default('') }}
|
||||
AZURE_STORAGE_KEY={{ dbbackup_cloud_secret_key | default('') }}
|
||||
{% endif %}
|
||||
{% if dbbackup_cloud_provider == 'gcs' %}
|
||||
GOOGLE_APPLICATION_CREDENTIALS={{ dbbackup_cloud_credentials_file | default('/etc/dbbackup/gcs-credentials.json') }}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{% if dbbackup_notify_smtp_enabled %}
|
||||
# SMTP Notifications
|
||||
NOTIFY_SMTP_HOST={{ dbbackup_notify_smtp_host }}
|
||||
NOTIFY_SMTP_PORT={{ dbbackup_notify_smtp_port }}
|
||||
NOTIFY_SMTP_USER={{ dbbackup_notify_smtp_user }}
|
||||
{% if dbbackup_notify_smtp_password is defined %}
|
||||
NOTIFY_SMTP_PASSWORD={{ dbbackup_notify_smtp_password }}
|
||||
{% endif %}
|
||||
NOTIFY_SMTP_FROM={{ dbbackup_notify_smtp_from }}
|
||||
NOTIFY_SMTP_TO={{ dbbackup_notify_smtp_to | join(',') }}
|
||||
{% endif %}
|
||||
|
||||
{% if dbbackup_notify_webhook_enabled %}
|
||||
# Webhook Notifications
|
||||
NOTIFY_WEBHOOK_URL={{ dbbackup_notify_webhook_url }}
|
||||
{% if dbbackup_notify_webhook_secret is defined %}
|
||||
NOTIFY_WEBHOOK_SECRET={{ dbbackup_notify_webhook_secret }}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{% if dbbackup_notify_slack_enabled %}
|
||||
# Slack Notifications
|
||||
NOTIFY_WEBHOOK_URL={{ dbbackup_notify_slack_webhook }}
|
||||
{% endif %}
|
||||
@ -1,6 +0,0 @@
|
||||
# dbbackup Systemd Override
|
||||
# Managed by Ansible
|
||||
|
||||
[Service]
|
||||
# Load environment from secure file
|
||||
EnvironmentFile=-{{ dbbackup_config_dir }}/env.d/{{ dbbackup_backup_type }}.conf
|
||||
@ -1,52 +0,0 @@
|
||||
---
|
||||
# dbbackup with Prometheus Exporter
|
||||
# Installation with metrics endpoint for monitoring
|
||||
#
|
||||
# Usage:
|
||||
# ansible-playbook -i inventory with-exporter.yml
|
||||
#
|
||||
# Features:
|
||||
# ✓ Automated daily backups
|
||||
# ✓ Retention policy
|
||||
# ✓ Prometheus exporter on port 9399
|
||||
# ✗ No notifications
|
||||
|
||||
- name: Deploy dbbackup with Prometheus exporter
|
||||
hosts: db_servers
|
||||
become: yes
|
||||
|
||||
vars:
|
||||
dbbackup_exporter_enabled: true
|
||||
dbbackup_exporter_port: 9399
|
||||
dbbackup_notify_enabled: false
|
||||
|
||||
roles:
|
||||
- dbbackup
|
||||
|
||||
post_tasks:
|
||||
- name: Wait for exporter to start
|
||||
wait_for:
|
||||
port: "{{ dbbackup_exporter_port }}"
|
||||
timeout: 30
|
||||
|
||||
- name: Test metrics endpoint
|
||||
uri:
|
||||
url: "http://localhost:{{ dbbackup_exporter_port }}/metrics"
|
||||
return_content: yes
|
||||
register: metrics_response
|
||||
|
||||
- name: Verify metrics available
|
||||
assert:
|
||||
that:
|
||||
- "'dbbackup_' in metrics_response.content"
|
||||
fail_msg: "Metrics endpoint not returning dbbackup metrics"
|
||||
success_msg: "Prometheus exporter running on port {{ dbbackup_exporter_port }}"
|
||||
|
||||
- name: Display Prometheus scrape config
|
||||
debug:
|
||||
msg: |
|
||||
Add to prometheus.yml:
|
||||
|
||||
- job_name: 'dbbackup'
|
||||
static_configs:
|
||||
- targets: ['{{ inventory_hostname }}:{{ dbbackup_exporter_port }}']
|
||||
@ -1,84 +0,0 @@
|
||||
---
|
||||
# dbbackup with Notifications
|
||||
# Installation with SMTP email and/or webhook notifications
|
||||
#
|
||||
# Usage:
|
||||
# # With SMTP notifications
|
||||
# ansible-playbook -i inventory with-notifications.yml \
|
||||
# -e dbbackup_notify_smtp_enabled=true \
|
||||
# -e dbbackup_notify_smtp_host=smtp.example.com \
|
||||
# -e dbbackup_notify_smtp_from=backups@example.com \
|
||||
# -e '{"dbbackup_notify_smtp_to": ["admin@example.com", "dba@example.com"]}'
|
||||
#
|
||||
# # With Slack notifications
|
||||
# ansible-playbook -i inventory with-notifications.yml \
|
||||
# -e dbbackup_notify_slack_enabled=true \
|
||||
# -e dbbackup_notify_slack_webhook=https://hooks.slack.com/services/XXX
|
||||
#
|
||||
# Features:
|
||||
# ✓ Automated daily backups
|
||||
# ✓ Retention policy
|
||||
# ✗ No Prometheus exporter
|
||||
# ✓ Email notifications (optional)
|
||||
# ✓ Webhook/Slack notifications (optional)
|
||||
|
||||
- name: Deploy dbbackup with notifications
|
||||
hosts: db_servers
|
||||
become: yes
|
||||
|
||||
vars:
|
||||
dbbackup_exporter_enabled: false
|
||||
dbbackup_notify_enabled: true
|
||||
# Enable one or more notification methods:
|
||||
# dbbackup_notify_smtp_enabled: true
|
||||
# dbbackup_notify_webhook_enabled: true
|
||||
# dbbackup_notify_slack_enabled: true
|
||||
|
||||
pre_tasks:
|
||||
- name: Validate notification configuration
|
||||
assert:
|
||||
that:
|
||||
- dbbackup_notify_smtp_enabled or dbbackup_notify_webhook_enabled or dbbackup_notify_slack_enabled
|
||||
fail_msg: "At least one notification method must be enabled"
|
||||
success_msg: "Notification configuration valid"
|
||||
|
||||
- name: Validate SMTP configuration
|
||||
assert:
|
||||
that:
|
||||
- dbbackup_notify_smtp_host != ''
|
||||
- dbbackup_notify_smtp_from != ''
|
||||
- dbbackup_notify_smtp_to | length > 0
|
||||
fail_msg: "SMTP configuration incomplete"
|
||||
when: dbbackup_notify_smtp_enabled | default(false)
|
||||
|
||||
- name: Validate webhook configuration
|
||||
assert:
|
||||
that:
|
||||
- dbbackup_notify_webhook_url != ''
|
||||
fail_msg: "Webhook URL required"
|
||||
when: dbbackup_notify_webhook_enabled | default(false)
|
||||
|
||||
- name: Validate Slack configuration
|
||||
assert:
|
||||
that:
|
||||
- dbbackup_notify_slack_webhook != ''
|
||||
fail_msg: "Slack webhook URL required"
|
||||
when: dbbackup_notify_slack_enabled | default(false)
|
||||
|
||||
roles:
|
||||
- dbbackup
|
||||
|
||||
post_tasks:
|
||||
- name: Display notification configuration
|
||||
debug:
|
||||
msg: |
|
||||
Notifications configured:
|
||||
{% if dbbackup_notify_smtp_enabled | default(false) %}
|
||||
- SMTP: {{ dbbackup_notify_smtp_to | join(', ') }}
|
||||
{% endif %}
|
||||
{% if dbbackup_notify_webhook_enabled | default(false) %}
|
||||
- Webhook: {{ dbbackup_notify_webhook_url }}
|
||||
{% endif %}
|
||||
{% if dbbackup_notify_slack_enabled | default(false) %}
|
||||
- Slack: Enabled
|
||||
{% endif %}
|
||||
@ -1,48 +0,0 @@
|
||||
# dbbackup Kubernetes Deployment
|
||||
|
||||
Kubernetes manifests for running dbbackup as scheduled CronJobs.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Create namespace
|
||||
kubectl create namespace dbbackup
|
||||
|
||||
# Create secrets
|
||||
kubectl create secret generic dbbackup-db-credentials \
|
||||
--namespace dbbackup \
|
||||
--from-literal=password=your-db-password
|
||||
|
||||
# Apply manifests
|
||||
kubectl apply -f . --namespace dbbackup
|
||||
|
||||
# Check CronJob
|
||||
kubectl get cronjobs -n dbbackup
|
||||
```
|
||||
|
||||
## Components
|
||||
|
||||
- `configmap.yaml` - Configuration settings
|
||||
- `secret.yaml` - Credentials template (use kubectl create secret instead)
|
||||
- `cronjob.yaml` - Scheduled backup job
|
||||
- `pvc.yaml` - Persistent volume for backup storage
|
||||
- `servicemonitor.yaml` - Prometheus ServiceMonitor (optional)
|
||||
|
||||
## Customization
|
||||
|
||||
Edit `configmap.yaml` to configure:
|
||||
- Database connection
|
||||
- Backup schedule
|
||||
- Retention policy
|
||||
- Cloud storage
|
||||
|
||||
## Helm Chart
|
||||
|
||||
For more complex deployments, use the Helm chart:
|
||||
|
||||
```bash
|
||||
helm install dbbackup ./helm/dbbackup \
|
||||
--set database.host=postgres.default.svc \
|
||||
--set database.password=secret \
|
||||
--set schedule="0 2 * * *"
|
||||
```
|
||||
@ -1,27 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: dbbackup-config
|
||||
labels:
|
||||
app: dbbackup
|
||||
data:
|
||||
# Database Configuration
|
||||
DB_TYPE: "postgres"
|
||||
DB_HOST: "postgres.default.svc.cluster.local"
|
||||
DB_PORT: "5432"
|
||||
DB_USER: "postgres"
|
||||
|
||||
# Backup Configuration
|
||||
BACKUP_DIR: "/backups"
|
||||
COMPRESSION: "6"
|
||||
|
||||
# Retention
|
||||
RETENTION_DAYS: "30"
|
||||
MIN_BACKUPS: "5"
|
||||
|
||||
# GFS Retention (enterprise)
|
||||
GFS_ENABLED: "false"
|
||||
GFS_DAILY: "7"
|
||||
GFS_WEEKLY: "4"
|
||||
GFS_MONTHLY: "12"
|
||||
GFS_YEARLY: "3"
|
||||
@ -1,140 +0,0 @@
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: dbbackup-cluster
|
||||
labels:
|
||||
app: dbbackup
|
||||
component: backup
|
||||
spec:
|
||||
# Daily at 2:00 AM UTC
|
||||
schedule: "0 2 * * *"
|
||||
|
||||
# Keep last 3 successful and 1 failed job
|
||||
successfulJobsHistoryLimit: 3
|
||||
failedJobsHistoryLimit: 1
|
||||
|
||||
# Don't run if previous job is still running
|
||||
concurrencyPolicy: Forbid
|
||||
|
||||
# Start job within 5 minutes of scheduled time or skip
|
||||
startingDeadlineSeconds: 300
|
||||
|
||||
jobTemplate:
|
||||
spec:
|
||||
# Retry up to 2 times on failure
|
||||
backoffLimit: 2
|
||||
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: dbbackup
|
||||
component: backup
|
||||
spec:
|
||||
restartPolicy: OnFailure
|
||||
|
||||
# Security context
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
fsGroup: 1000
|
||||
|
||||
containers:
|
||||
- name: dbbackup
|
||||
image: git.uuxo.net/uuxo/dbbackup:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
|
||||
args:
|
||||
- backup
|
||||
- cluster
|
||||
- --compression
|
||||
- "$(COMPRESSION)"
|
||||
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: dbbackup-config
|
||||
- secretRef:
|
||||
name: dbbackup-secrets
|
||||
|
||||
env:
|
||||
- name: BACKUP_DIR
|
||||
value: /backups
|
||||
|
||||
volumeMounts:
|
||||
- name: backup-storage
|
||||
mountPath: /backups
|
||||
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "2Gi"
|
||||
cpu: "2000m"
|
||||
|
||||
volumes:
|
||||
- name: backup-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: dbbackup-storage
|
||||
|
||||
---
|
||||
# Cleanup CronJob - runs weekly
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: dbbackup-cleanup
|
||||
labels:
|
||||
app: dbbackup
|
||||
component: cleanup
|
||||
spec:
|
||||
# Weekly on Sunday at 3:00 AM UTC
|
||||
schedule: "0 3 * * 0"
|
||||
successfulJobsHistoryLimit: 1
|
||||
failedJobsHistoryLimit: 1
|
||||
concurrencyPolicy: Forbid
|
||||
|
||||
jobTemplate:
|
||||
spec:
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: dbbackup
|
||||
component: cleanup
|
||||
spec:
|
||||
restartPolicy: OnFailure
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
fsGroup: 1000
|
||||
|
||||
containers:
|
||||
- name: dbbackup
|
||||
image: git.uuxo.net/uuxo/dbbackup:latest
|
||||
|
||||
args:
|
||||
- cleanup
|
||||
- /backups
|
||||
- --retention-days
|
||||
- "$(RETENTION_DAYS)"
|
||||
- --min-backups
|
||||
- "$(MIN_BACKUPS)"
|
||||
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: dbbackup-config
|
||||
|
||||
volumeMounts:
|
||||
- name: backup-storage
|
||||
mountPath: /backups
|
||||
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "50m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
|
||||
volumes:
|
||||
- name: backup-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: dbbackup-storage
|
||||
@ -1,13 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: dbbackup-storage
|
||||
labels:
|
||||
app: dbbackup
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 100Gi # Adjust based on database size
|
||||
# storageClassName: fast-ssd # Uncomment for specific storage class
|
||||
@ -1,27 +0,0 @@
|
||||
# dbbackup Secrets Template
|
||||
# DO NOT commit this file with real credentials!
|
||||
# Use: kubectl create secret generic dbbackup-secrets --from-literal=...
|
||||
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: dbbackup-secrets
|
||||
labels:
|
||||
app: dbbackup
|
||||
type: Opaque
|
||||
stringData:
|
||||
# Database Password (required)
|
||||
PGPASSWORD: "CHANGE_ME"
|
||||
|
||||
# Encryption Key (optional - 32+ characters recommended)
|
||||
# DBBACKUP_ENCRYPTION_KEY: "your-encryption-key-here"
|
||||
|
||||
# Cloud Storage Credentials (optional)
|
||||
# AWS_ACCESS_KEY_ID: "AKIAXXXXXXXX"
|
||||
# AWS_SECRET_ACCESS_KEY: "your-secret-key"
|
||||
|
||||
# SMTP Notifications (optional)
|
||||
# NOTIFY_SMTP_PASSWORD: "smtp-password"
|
||||
|
||||
# Webhook Secret (optional)
|
||||
# NOTIFY_WEBHOOK_SECRET: "hmac-signing-secret"
|
||||
@ -1,114 +0,0 @@
|
||||
# Prometheus ServiceMonitor for dbbackup
|
||||
# Requires prometheus-operator
|
||||
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: dbbackup
|
||||
labels:
|
||||
app: dbbackup
|
||||
release: prometheus # Match your Prometheus operator release
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: dbbackup
|
||||
component: exporter
|
||||
endpoints:
|
||||
- port: metrics
|
||||
interval: 60s
|
||||
path: /metrics
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- dbbackup
|
||||
|
||||
---
|
||||
# Metrics exporter deployment (optional - for continuous metrics)
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: dbbackup-exporter
|
||||
labels:
|
||||
app: dbbackup
|
||||
component: exporter
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: dbbackup
|
||||
component: exporter
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: dbbackup
|
||||
component: exporter
|
||||
spec:
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
|
||||
containers:
|
||||
- name: exporter
|
||||
image: git.uuxo.net/uuxo/dbbackup:latest
|
||||
args:
|
||||
- metrics
|
||||
- serve
|
||||
- --port
|
||||
- "9399"
|
||||
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 9399
|
||||
protocol: TCP
|
||||
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: dbbackup-config
|
||||
|
||||
volumeMounts:
|
||||
- name: backup-storage
|
||||
mountPath: /backups
|
||||
readOnly: true
|
||||
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: metrics
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: metrics
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
|
||||
resources:
|
||||
requests:
|
||||
memory: "64Mi"
|
||||
cpu: "10m"
|
||||
limits:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
|
||||
volumes:
|
||||
- name: backup-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: dbbackup-storage
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: dbbackup-exporter
|
||||
labels:
|
||||
app: dbbackup
|
||||
component: exporter
|
||||
spec:
|
||||
ports:
|
||||
- name: metrics
|
||||
port: 9399
|
||||
targetPort: metrics
|
||||
selector:
|
||||
app: dbbackup
|
||||
component: exporter
|
||||
@ -1,168 +0,0 @@
|
||||
# Prometheus Alerting Rules for dbbackup
|
||||
# Import into your Prometheus/Alertmanager configuration
|
||||
|
||||
groups:
|
||||
- name: dbbackup
|
||||
rules:
|
||||
# RPO Alerts - Recovery Point Objective violations
|
||||
- alert: DBBackupRPOWarning
|
||||
expr: dbbackup_rpo_seconds > 43200 # 12 hours
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Database backup RPO warning on {{ $labels.server }}"
|
||||
description: "No successful backup for {{ $labels.database }} in {{ $value | humanizeDuration }}. RPO threshold: 12 hours."
|
||||
|
||||
- alert: DBBackupRPOCritical
|
||||
expr: dbbackup_rpo_seconds > 86400 # 24 hours
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Database backup RPO critical on {{ $labels.server }}"
|
||||
description: "No successful backup for {{ $labels.database }} in {{ $value | humanizeDuration }}. Immediate attention required!"
|
||||
runbook_url: "https://wiki.example.com/runbooks/dbbackup-rpo-violation"
|
||||
|
||||
# Backup Failure Alerts
|
||||
- alert: DBBackupFailed
|
||||
expr: increase(dbbackup_backup_total{status="failure"}[1h]) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Database backup failed on {{ $labels.server }}"
|
||||
description: "Backup for {{ $labels.database }} failed. Check logs for details."
|
||||
|
||||
- alert: DBBackupFailureRateHigh
|
||||
expr: |
|
||||
rate(dbbackup_backup_total{status="failure"}[24h])
|
||||
/
|
||||
rate(dbbackup_backup_total[24h]) > 0.1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High backup failure rate on {{ $labels.server }}"
|
||||
description: "More than 10% of backups are failing over the last 24 hours."
|
||||
|
||||
# Backup Size Anomalies
|
||||
- alert: DBBackupSizeAnomaly
|
||||
expr: |
|
||||
abs(
|
||||
dbbackup_last_backup_size_bytes
|
||||
- avg_over_time(dbbackup_last_backup_size_bytes[7d])
|
||||
)
|
||||
/ avg_over_time(dbbackup_last_backup_size_bytes[7d]) > 0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Backup size anomaly for {{ $labels.database }}"
|
||||
description: "Backup size changed by more than 50% compared to 7-day average. Current: {{ $value | humanize1024 }}B"
|
||||
|
||||
- alert: DBBackupSizeZero
|
||||
expr: dbbackup_last_backup_size_bytes == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Zero-size backup detected for {{ $labels.database }}"
|
||||
description: "Last backup file is empty. Backup likely failed silently."
|
||||
|
||||
# Duration Alerts
|
||||
- alert: DBBackupDurationHigh
|
||||
expr: dbbackup_last_backup_duration_seconds > 3600 # 1 hour
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Backup taking too long for {{ $labels.database }}"
|
||||
description: "Last backup took {{ $value | humanizeDuration }}. Consider optimizing backup strategy."
|
||||
|
||||
# Verification Alerts
|
||||
- alert: DBBackupNotVerified
|
||||
expr: dbbackup_backup_verified == 0
|
||||
for: 24h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Backup not verified for {{ $labels.database }}"
|
||||
description: "Last backup was not verified. Run dbbackup verify to check integrity."
|
||||
|
||||
# PITR Alerts
|
||||
- alert: DBBackupPITRArchiveLag
|
||||
expr: dbbackup_pitr_archive_lag_seconds > 600
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PITR archive lag on {{ $labels.server }}"
|
||||
description: "WAL/binlog archiving for {{ $labels.database }} is {{ $value | humanizeDuration }} behind."
|
||||
|
||||
- alert: DBBackupPITRArchiveCritical
|
||||
expr: dbbackup_pitr_archive_lag_seconds > 1800
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR archive critically behind on {{ $labels.server }}"
|
||||
description: "WAL/binlog archiving for {{ $labels.database }} is {{ $value | humanizeDuration }} behind. PITR capability at risk!"
|
||||
|
||||
- alert: DBBackupPITRChainBroken
|
||||
expr: dbbackup_pitr_chain_valid == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR chain broken for {{ $labels.database }}"
|
||||
description: "WAL/binlog chain has gaps. Point-in-time recovery NOT possible. New base backup required."
|
||||
|
||||
- alert: DBBackupPITRGaps
|
||||
expr: dbbackup_pitr_gap_count > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PITR chain gaps for {{ $labels.database }}"
|
||||
description: "{{ $value }} gaps in WAL/binlog chain. Recovery to points within gaps will fail."
|
||||
|
||||
# Backup Type Alerts
|
||||
- alert: DBBackupNoRecentFull
|
||||
expr: time() - dbbackup_last_success_timestamp{backup_type="full"} > 604800
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "No full backup in 7+ days for {{ $labels.database }}"
|
||||
description: "Consider taking a full backup. Incremental chains depend on valid base."
|
||||
|
||||
# Exporter Health
|
||||
- alert: DBBackupExporterDown
|
||||
expr: up{job="dbbackup"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "dbbackup exporter is down on {{ $labels.instance }}"
|
||||
description: "Cannot scrape metrics from dbbackup exporter. Monitoring is impaired."
|
||||
|
||||
# Deduplication Alerts
|
||||
- alert: DBBackupDedupRatioLow
|
||||
expr: dbbackup_dedup_ratio < 0.2
|
||||
for: 24h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: "Low deduplication ratio on {{ $labels.server }}"
|
||||
description: "Dedup ratio is {{ $value | printf \"%.1f%%\" }}. Consider if dedup is beneficial."
|
||||
|
||||
# Storage Alerts
|
||||
- alert: DBBackupStorageHigh
|
||||
expr: dbbackup_dedup_disk_usage_bytes > 1099511627776 # 1 TB
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High backup storage usage on {{ $labels.server }}"
|
||||
description: "Backup storage using {{ $value | humanize1024 }}B. Review retention policy."
|
||||
@ -1,48 +0,0 @@
|
||||
# Prometheus scrape configuration for dbbackup
|
||||
# Add to your prometheus.yml
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'dbbackup'
|
||||
# Scrape interval - backup metrics don't change frequently
|
||||
scrape_interval: 60s
|
||||
scrape_timeout: 10s
|
||||
|
||||
# Static targets - list your database servers
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'db-server-01:9399'
|
||||
- 'db-server-02:9399'
|
||||
- 'db-server-03:9399'
|
||||
labels:
|
||||
environment: 'production'
|
||||
|
||||
- targets:
|
||||
- 'db-staging:9399'
|
||||
labels:
|
||||
environment: 'staging'
|
||||
|
||||
# Relabeling (optional)
|
||||
relabel_configs:
|
||||
# Extract hostname from target
|
||||
- source_labels: [__address__]
|
||||
target_label: instance
|
||||
regex: '([^:]+):\d+'
|
||||
replacement: '$1'
|
||||
|
||||
# Alternative: File-based service discovery
|
||||
# Useful when targets are managed by Ansible/Terraform
|
||||
|
||||
- job_name: 'dbbackup-sd'
|
||||
scrape_interval: 60s
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets/dbbackup/*.yml'
|
||||
refresh_interval: 5m
|
||||
|
||||
# Example target file (/etc/prometheus/targets/dbbackup/production.yml):
|
||||
# - targets:
|
||||
# - db-server-01:9399
|
||||
# - db-server-02:9399
|
||||
# labels:
|
||||
# environment: production
|
||||
# datacenter: us-east-1
|
||||
@ -1,65 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Backup Rotation Script for dbbackup
|
||||
# Implements GFS (Grandfather-Father-Son) retention policy
|
||||
#
|
||||
# Usage: backup-rotation.sh /path/to/backups [--dry-run]
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
BACKUP_DIR="${1:-/var/backups/databases}"
|
||||
DRY_RUN="${2:-}"
|
||||
|
||||
# GFS Configuration
|
||||
DAILY_KEEP=7
|
||||
WEEKLY_KEEP=4
|
||||
MONTHLY_KEEP=12
|
||||
YEARLY_KEEP=3
|
||||
|
||||
# Minimum backups to always keep
|
||||
MIN_BACKUPS=5
|
||||
|
||||
echo "═══════════════════════════════════════════════════════════════"
|
||||
echo " dbbackup GFS Rotation"
|
||||
echo "═══════════════════════════════════════════════════════════════"
|
||||
echo ""
|
||||
echo " Backup Directory: $BACKUP_DIR"
|
||||
echo " Retention Policy:"
|
||||
echo " Daily: $DAILY_KEEP backups"
|
||||
echo " Weekly: $WEEKLY_KEEP backups"
|
||||
echo " Monthly: $MONTHLY_KEEP backups"
|
||||
echo " Yearly: $YEARLY_KEEP backups"
|
||||
echo ""
|
||||
|
||||
if [[ "$DRY_RUN" == "--dry-run" ]]; then
|
||||
echo " [DRY RUN MODE - No files will be deleted]"
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# Check if dbbackup is available
|
||||
if ! command -v dbbackup &> /dev/null; then
|
||||
echo "ERROR: dbbackup command not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Build cleanup command
|
||||
CLEANUP_CMD="dbbackup cleanup $BACKUP_DIR \
|
||||
--gfs \
|
||||
--gfs-daily $DAILY_KEEP \
|
||||
--gfs-weekly $WEEKLY_KEEP \
|
||||
--gfs-monthly $MONTHLY_KEEP \
|
||||
--gfs-yearly $YEARLY_KEEP \
|
||||
--min-backups $MIN_BACKUPS"
|
||||
|
||||
if [[ "$DRY_RUN" == "--dry-run" ]]; then
|
||||
CLEANUP_CMD="$CLEANUP_CMD --dry-run"
|
||||
fi
|
||||
|
||||
echo "Running: $CLEANUP_CMD"
|
||||
echo ""
|
||||
|
||||
$CLEANUP_CMD
|
||||
|
||||
echo ""
|
||||
echo "═══════════════════════════════════════════════════════════════"
|
||||
echo " Rotation complete"
|
||||
echo "═══════════════════════════════════════════════════════════════"
|
||||
@ -1,92 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Health Check Script for dbbackup
|
||||
# Returns exit codes for monitoring systems:
|
||||
# 0 = OK (backup within RPO)
|
||||
# 1 = WARNING (backup older than warning threshold)
|
||||
# 2 = CRITICAL (backup older than critical threshold or missing)
|
||||
#
|
||||
# Usage: health-check.sh [backup-dir] [warning-hours] [critical-hours]
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
BACKUP_DIR="${1:-/var/backups/databases}"
|
||||
WARNING_HOURS="${2:-24}"
|
||||
CRITICAL_HOURS="${3:-48}"
|
||||
|
||||
# Convert to seconds
|
||||
WARNING_SECONDS=$((WARNING_HOURS * 3600))
|
||||
CRITICAL_SECONDS=$((CRITICAL_HOURS * 3600))
|
||||
|
||||
echo "dbbackup Health Check"
|
||||
echo "====================="
|
||||
echo "Backup directory: $BACKUP_DIR"
|
||||
echo "Warning threshold: ${WARNING_HOURS}h"
|
||||
echo "Critical threshold: ${CRITICAL_HOURS}h"
|
||||
echo ""
|
||||
|
||||
# Check if backup directory exists
|
||||
if [[ ! -d "$BACKUP_DIR" ]]; then
|
||||
echo "CRITICAL: Backup directory does not exist"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Find most recent backup file
|
||||
LATEST_BACKUP=$(find "$BACKUP_DIR" -type f \( -name "*.dump" -o -name "*.dump.gz" -o -name "*.sql" -o -name "*.sql.gz" -o -name "*.tar.gz" \) -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1)
|
||||
|
||||
if [[ -z "$LATEST_BACKUP" ]]; then
|
||||
echo "CRITICAL: No backup files found in $BACKUP_DIR"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Extract timestamp and path
|
||||
BACKUP_TIMESTAMP=$(echo "$LATEST_BACKUP" | cut -d' ' -f1 | cut -d'.' -f1)
|
||||
BACKUP_PATH=$(echo "$LATEST_BACKUP" | cut -d' ' -f2-)
|
||||
BACKUP_NAME=$(basename "$BACKUP_PATH")
|
||||
|
||||
# Calculate age
|
||||
NOW=$(date +%s)
|
||||
AGE_SECONDS=$((NOW - BACKUP_TIMESTAMP))
|
||||
AGE_HOURS=$((AGE_SECONDS / 3600))
|
||||
AGE_DAYS=$((AGE_HOURS / 24))
|
||||
|
||||
# Format age string
|
||||
if [[ $AGE_DAYS -gt 0 ]]; then
|
||||
AGE_STR="${AGE_DAYS}d $((AGE_HOURS % 24))h"
|
||||
else
|
||||
AGE_STR="${AGE_HOURS}h $((AGE_SECONDS % 3600 / 60))m"
|
||||
fi
|
||||
|
||||
# Get backup size
|
||||
BACKUP_SIZE=$(du -h "$BACKUP_PATH" 2>/dev/null | cut -f1)
|
||||
|
||||
echo "Latest backup:"
|
||||
echo " File: $BACKUP_NAME"
|
||||
echo " Size: $BACKUP_SIZE"
|
||||
echo " Age: $AGE_STR"
|
||||
echo ""
|
||||
|
||||
# Verify backup integrity if dbbackup is available
|
||||
if command -v dbbackup &> /dev/null; then
|
||||
echo "Verifying backup integrity..."
|
||||
if dbbackup verify "$BACKUP_PATH" --quiet 2>/dev/null; then
|
||||
echo " ✓ Backup integrity verified"
|
||||
else
|
||||
echo " ✗ Backup verification failed"
|
||||
echo ""
|
||||
echo "CRITICAL: Latest backup is corrupted"
|
||||
exit 2
|
||||
fi
|
||||
echo ""
|
||||
fi
|
||||
|
||||
# Check thresholds
|
||||
if [[ $AGE_SECONDS -ge $CRITICAL_SECONDS ]]; then
|
||||
echo "CRITICAL: Last backup is ${AGE_STR} old (threshold: ${CRITICAL_HOURS}h)"
|
||||
exit 2
|
||||
elif [[ $AGE_SECONDS -ge $WARNING_SECONDS ]]; then
|
||||
echo "WARNING: Last backup is ${AGE_STR} old (threshold: ${WARNING_HOURS}h)"
|
||||
exit 1
|
||||
else
|
||||
echo "OK: Last backup is ${AGE_STR} old"
|
||||
exit 0
|
||||
fi
|
||||
@ -1,26 +0,0 @@
|
||||
# dbbackup Terraform - AWS Example
|
||||
|
||||
variable "aws_region" {
|
||||
default = "us-east-1"
|
||||
}
|
||||
|
||||
provider "aws" {
|
||||
region = var.aws_region
|
||||
}
|
||||
|
||||
module "dbbackup_storage" {
|
||||
source = "./main.tf"
|
||||
|
||||
environment = "production"
|
||||
bucket_name = "mycompany-database-backups"
|
||||
retention_days = 30
|
||||
glacier_days = 365
|
||||
}
|
||||
|
||||
output "bucket_name" {
|
||||
value = module.dbbackup_storage.bucket_name
|
||||
}
|
||||
|
||||
output "setup_instructions" {
|
||||
value = module.dbbackup_storage.dbbackup_cloud_config
|
||||
}
|
||||
@ -1,202 +0,0 @@
|
||||
# dbbackup Terraform Module - AWS Deployment
|
||||
# Creates S3 bucket for backup storage with proper security
|
||||
|
||||
terraform {
|
||||
required_version = ">= 1.0"
|
||||
required_providers {
|
||||
aws = {
|
||||
source = "hashicorp/aws"
|
||||
version = ">= 4.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Variables
|
||||
variable "environment" {
|
||||
description = "Environment name (e.g., production, staging)"
|
||||
type = string
|
||||
default = "production"
|
||||
}
|
||||
|
||||
variable "bucket_name" {
|
||||
description = "S3 bucket name for backups"
|
||||
type = string
|
||||
}
|
||||
|
||||
variable "retention_days" {
|
||||
description = "Days to keep backups before transitioning to Glacier"
|
||||
type = number
|
||||
default = 30
|
||||
}
|
||||
|
||||
variable "glacier_days" {
|
||||
description = "Days to keep in Glacier before deletion (0 = keep forever)"
|
||||
type = number
|
||||
default = 365
|
||||
}
|
||||
|
||||
variable "enable_encryption" {
|
||||
description = "Enable server-side encryption"
|
||||
type = bool
|
||||
default = true
|
||||
}
|
||||
|
||||
variable "kms_key_arn" {
|
||||
description = "KMS key ARN for encryption (leave empty for aws/s3 managed key)"
|
||||
type = string
|
||||
default = ""
|
||||
}
|
||||
|
||||
# S3 Bucket
|
||||
resource "aws_s3_bucket" "backups" {
|
||||
bucket = var.bucket_name
|
||||
|
||||
tags = {
|
||||
Name = "Database Backups"
|
||||
Environment = var.environment
|
||||
ManagedBy = "terraform"
|
||||
Application = "dbbackup"
|
||||
}
|
||||
}
|
||||
|
||||
# Versioning
|
||||
resource "aws_s3_bucket_versioning" "backups" {
|
||||
bucket = aws_s3_bucket.backups.id
|
||||
versioning_configuration {
|
||||
status = "Enabled"
|
||||
}
|
||||
}
|
||||
|
||||
# Encryption
|
||||
resource "aws_s3_bucket_server_side_encryption_configuration" "backups" {
|
||||
count = var.enable_encryption ? 1 : 0
|
||||
bucket = aws_s3_bucket.backups.id
|
||||
|
||||
rule {
|
||||
apply_server_side_encryption_by_default {
|
||||
sse_algorithm = var.kms_key_arn != "" ? "aws:kms" : "AES256"
|
||||
kms_master_key_id = var.kms_key_arn != "" ? var.kms_key_arn : null
|
||||
}
|
||||
bucket_key_enabled = true
|
||||
}
|
||||
}
|
||||
|
||||
# Lifecycle Rules
|
||||
resource "aws_s3_bucket_lifecycle_configuration" "backups" {
|
||||
bucket = aws_s3_bucket.backups.id
|
||||
|
||||
rule {
|
||||
id = "transition-to-glacier"
|
||||
status = "Enabled"
|
||||
|
||||
filter {
|
||||
prefix = ""
|
||||
}
|
||||
|
||||
transition {
|
||||
days = var.retention_days
|
||||
storage_class = "GLACIER"
|
||||
}
|
||||
|
||||
dynamic "expiration" {
|
||||
for_each = var.glacier_days > 0 ? [1] : []
|
||||
content {
|
||||
days = var.retention_days + var.glacier_days
|
||||
}
|
||||
}
|
||||
|
||||
noncurrent_version_transition {
|
||||
noncurrent_days = 30
|
||||
storage_class = "GLACIER"
|
||||
}
|
||||
|
||||
noncurrent_version_expiration {
|
||||
noncurrent_days = 90
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Block Public Access
|
||||
resource "aws_s3_bucket_public_access_block" "backups" {
|
||||
bucket = aws_s3_bucket.backups.id
|
||||
|
||||
block_public_acls = true
|
||||
block_public_policy = true
|
||||
ignore_public_acls = true
|
||||
restrict_public_buckets = true
|
||||
}
|
||||
|
||||
# IAM User for dbbackup
|
||||
resource "aws_iam_user" "dbbackup" {
|
||||
name = "dbbackup-${var.environment}"
|
||||
path = "/service-accounts/"
|
||||
|
||||
tags = {
|
||||
Application = "dbbackup"
|
||||
Environment = var.environment
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_iam_access_key" "dbbackup" {
|
||||
user = aws_iam_user.dbbackup.name
|
||||
}
|
||||
|
||||
# IAM Policy
|
||||
resource "aws_iam_user_policy" "dbbackup" {
|
||||
name = "dbbackup-s3-access"
|
||||
user = aws_iam_user.dbbackup.name
|
||||
|
||||
policy = jsonencode({
|
||||
Version = "2012-10-17"
|
||||
Statement = [
|
||||
{
|
||||
Effect = "Allow"
|
||||
Action = [
|
||||
"s3:GetObject",
|
||||
"s3:PutObject",
|
||||
"s3:DeleteObject",
|
||||
"s3:ListBucket",
|
||||
"s3:GetBucketLocation"
|
||||
]
|
||||
Resource = [
|
||||
aws_s3_bucket.backups.arn,
|
||||
"${aws_s3_bucket.backups.arn}/*"
|
||||
]
|
||||
}
|
||||
]
|
||||
})
|
||||
}
|
||||
|
||||
# Outputs
|
||||
output "bucket_name" {
|
||||
description = "S3 bucket name"
|
||||
value = aws_s3_bucket.backups.id
|
||||
}
|
||||
|
||||
output "bucket_arn" {
|
||||
description = "S3 bucket ARN"
|
||||
value = aws_s3_bucket.backups.arn
|
||||
}
|
||||
|
||||
output "access_key_id" {
|
||||
description = "IAM access key ID for dbbackup"
|
||||
value = aws_iam_access_key.dbbackup.id
|
||||
}
|
||||
|
||||
output "secret_access_key" {
|
||||
description = "IAM secret access key for dbbackup"
|
||||
value = aws_iam_access_key.dbbackup.secret
|
||||
sensitive = true
|
||||
}
|
||||
|
||||
output "dbbackup_cloud_config" {
|
||||
description = "Cloud configuration for dbbackup"
|
||||
value = <<-EOT
|
||||
# Add to dbbackup environment:
|
||||
export AWS_ACCESS_KEY_ID="${aws_iam_access_key.dbbackup.id}"
|
||||
export AWS_SECRET_ACCESS_KEY="<run: terraform output -raw secret_access_key>"
|
||||
|
||||
# Use with dbbackup:
|
||||
dbbackup backup cluster --cloud s3://${aws_s3_bucket.backups.id}/backups/
|
||||
EOT
|
||||
}
|
||||
359
diagnose_postgres_memory.sh
Executable file
359
diagnose_postgres_memory.sh
Executable file
@ -0,0 +1,359 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# PostgreSQL Memory and Resource Diagnostic Tool
|
||||
# Analyzes memory usage, locks, and system resources to identify restore issues
|
||||
#
|
||||
|
||||
set -e
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
echo "════════════════════════════════════════════════════════════"
|
||||
echo " PostgreSQL Memory & Resource Diagnostics"
|
||||
echo " $(date '+%Y-%m-%d %H:%M:%S')"
|
||||
echo "════════════════════════════════════════════════════════════"
|
||||
echo
|
||||
|
||||
# Function to format bytes to human readable
|
||||
bytes_to_human() {
|
||||
local bytes=$1
|
||||
if [ "$bytes" -ge 1073741824 ]; then
|
||||
echo "$(awk "BEGIN {printf \"%.2f GB\", $bytes/1073741824}")"
|
||||
elif [ "$bytes" -ge 1048576 ]; then
|
||||
echo "$(awk "BEGIN {printf \"%.2f MB\", $bytes/1048576}")"
|
||||
else
|
||||
echo "$(awk "BEGIN {printf \"%.2f KB\", $bytes/1024}")"
|
||||
fi
|
||||
}
|
||||
|
||||
# 1. SYSTEM MEMORY OVERVIEW
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE}📊 SYSTEM MEMORY OVERVIEW${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo
|
||||
|
||||
if command -v free &> /dev/null; then
|
||||
free -h
|
||||
echo
|
||||
|
||||
# Calculate percentages
|
||||
MEM_TOTAL=$(free -b | awk '/^Mem:/ {print $2}')
|
||||
MEM_USED=$(free -b | awk '/^Mem:/ {print $3}')
|
||||
MEM_FREE=$(free -b | awk '/^Mem:/ {print $4}')
|
||||
MEM_AVAILABLE=$(free -b | awk '/^Mem:/ {print $7}')
|
||||
|
||||
MEM_PERCENT=$(awk "BEGIN {printf \"%.1f\", ($MEM_USED/$MEM_TOTAL)*100}")
|
||||
|
||||
echo "Memory Utilization: ${MEM_PERCENT}%"
|
||||
echo "Total: $(bytes_to_human $MEM_TOTAL)"
|
||||
echo "Used: $(bytes_to_human $MEM_USED)"
|
||||
echo "Available: $(bytes_to_human $MEM_AVAILABLE)"
|
||||
|
||||
if (( $(echo "$MEM_PERCENT > 90" | bc -l) )); then
|
||||
echo -e "${RED}⚠️ WARNING: Memory usage is critically high (>90%)${NC}"
|
||||
elif (( $(echo "$MEM_PERCENT > 70" | bc -l) )); then
|
||||
echo -e "${YELLOW}⚠️ CAUTION: Memory usage is high (>70%)${NC}"
|
||||
else
|
||||
echo -e "${GREEN}✓ Memory usage is acceptable${NC}"
|
||||
fi
|
||||
fi
|
||||
echo
|
||||
|
||||
# 2. TOP MEMORY CONSUMERS
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE}🔍 TOP 15 MEMORY CONSUMING PROCESSES${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo
|
||||
ps aux --sort=-%mem | head -16 | awk 'NR==1 {print $0} NR>1 {printf "%-8s %5s%% %7s %s\n", $1, $4, $6/1024"M", $11}'
|
||||
echo
|
||||
|
||||
# 3. POSTGRESQL PROCESSES
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE}🐘 POSTGRESQL PROCESSES${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo
|
||||
|
||||
PG_PROCS=$(ps aux | grep -E "postgres.*:" | grep -v grep || true)
|
||||
if [ -z "$PG_PROCS" ]; then
|
||||
echo "No PostgreSQL processes found"
|
||||
else
|
||||
echo "$PG_PROCS" | awk '{printf "%-8s %5s%% %7s %s\n", $1, $4, $6/1024"M", $11}'
|
||||
echo
|
||||
|
||||
# Sum up PostgreSQL memory
|
||||
PG_MEM_TOTAL=$(echo "$PG_PROCS" | awk '{sum+=$6} END {print sum/1024}')
|
||||
echo "Total PostgreSQL Memory: ${PG_MEM_TOTAL} MB"
|
||||
fi
|
||||
echo
|
||||
|
||||
# 4. POSTGRESQL CONFIGURATION
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE}⚙️ POSTGRESQL MEMORY CONFIGURATION${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo
|
||||
|
||||
if command -v psql &> /dev/null; then
|
||||
PSQL_CMD="psql -t -A -c"
|
||||
|
||||
# Try as postgres user first, then current user
|
||||
if sudo -u postgres $PSQL_CMD "SELECT 1" &> /dev/null; then
|
||||
PSQL_PREFIX="sudo -u postgres"
|
||||
elif $PSQL_CMD "SELECT 1" &> /dev/null; then
|
||||
PSQL_PREFIX=""
|
||||
else
|
||||
echo "❌ Cannot connect to PostgreSQL"
|
||||
PSQL_PREFIX="NONE"
|
||||
fi
|
||||
|
||||
if [ "$PSQL_PREFIX" != "NONE" ]; then
|
||||
echo "Key Memory Settings:"
|
||||
echo "────────────────────────────────────────────────────────────"
|
||||
|
||||
# Get all relevant settings (strip timing output)
|
||||
SHARED_BUFFERS=$($PSQL_PREFIX psql -t -A -c "SHOW shared_buffers;" 2>/dev/null | head -1 || echo "unknown")
|
||||
WORK_MEM=$($PSQL_PREFIX psql -t -A -c "SHOW work_mem;" 2>/dev/null | head -1 || echo "unknown")
|
||||
MAINT_WORK_MEM=$($PSQL_PREFIX psql -t -A -c "SHOW maintenance_work_mem;" 2>/dev/null | head -1 || echo "unknown")
|
||||
EFFECTIVE_CACHE=$($PSQL_PREFIX psql -t -A -c "SHOW effective_cache_size;" 2>/dev/null | head -1 || echo "unknown")
|
||||
MAX_CONNECTIONS=$($PSQL_PREFIX psql -t -A -c "SHOW max_connections;" 2>/dev/null | head -1 || echo "unknown")
|
||||
MAX_LOCKS=$($PSQL_PREFIX psql -t -A -c "SHOW max_locks_per_transaction;" 2>/dev/null | head -1 || echo "unknown")
|
||||
MAX_PREPARED=$($PSQL_PREFIX psql -t -A -c "SHOW max_prepared_transactions;" 2>/dev/null | head -1 || echo "unknown")
|
||||
|
||||
echo "shared_buffers: $SHARED_BUFFERS"
|
||||
echo "work_mem: $WORK_MEM"
|
||||
echo "maintenance_work_mem: $MAINT_WORK_MEM"
|
||||
echo "effective_cache_size: $EFFECTIVE_CACHE"
|
||||
echo "max_connections: $MAX_CONNECTIONS"
|
||||
echo "max_locks_per_transaction: $MAX_LOCKS"
|
||||
echo "max_prepared_transactions: $MAX_PREPARED"
|
||||
echo
|
||||
|
||||
# Calculate lock capacity
|
||||
if [ "$MAX_LOCKS" != "unknown" ] && [ "$MAX_CONNECTIONS" != "unknown" ] && [ "$MAX_PREPARED" != "unknown" ]; then
|
||||
# Ensure values are numeric
|
||||
if [[ "$MAX_LOCKS" =~ ^[0-9]+$ ]] && [[ "$MAX_CONNECTIONS" =~ ^[0-9]+$ ]] && [[ "$MAX_PREPARED" =~ ^[0-9]+$ ]]; then
|
||||
LOCK_CAPACITY=$((MAX_LOCKS * (MAX_CONNECTIONS + MAX_PREPARED)))
|
||||
echo "Total Lock Capacity: $LOCK_CAPACITY locks"
|
||||
|
||||
if [ "$MAX_LOCKS" -lt 1000 ]; then
|
||||
echo -e "${RED}⚠️ WARNING: max_locks_per_transaction is too low for large restores${NC}"
|
||||
echo -e "${YELLOW} Recommended: 4096 or higher${NC}"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
echo
|
||||
fi
|
||||
else
|
||||
echo "❌ psql not found"
|
||||
fi
|
||||
|
||||
# 5. CURRENT LOCKS AND CONNECTIONS
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE}🔒 CURRENT LOCKS AND CONNECTIONS${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo
|
||||
|
||||
if [ "$PSQL_PREFIX" != "NONE" ] && command -v psql &> /dev/null; then
|
||||
# Active connections
|
||||
ACTIVE_CONNS=$($PSQL_PREFIX psql -t -A -c "SELECT count(*) FROM pg_stat_activity;" 2>/dev/null | head -1 || echo "0")
|
||||
echo "Active Connections: $ACTIVE_CONNS / $MAX_CONNECTIONS"
|
||||
echo
|
||||
|
||||
# Lock statistics
|
||||
echo "Current Lock Usage:"
|
||||
echo "────────────────────────────────────────────────────────────"
|
||||
$PSQL_PREFIX psql -c "
|
||||
SELECT
|
||||
mode,
|
||||
COUNT(*) as count
|
||||
FROM pg_locks
|
||||
GROUP BY mode
|
||||
ORDER BY count DESC;
|
||||
" 2>/dev/null || echo "Unable to query locks"
|
||||
echo
|
||||
|
||||
# Total locks
|
||||
TOTAL_LOCKS=$($PSQL_PREFIX psql -t -A -c "SELECT COUNT(*) FROM pg_locks;" 2>/dev/null | head -1 || echo "0")
|
||||
echo "Total Active Locks: $TOTAL_LOCKS"
|
||||
|
||||
if [ ! -z "$LOCK_CAPACITY" ] && [ ! -z "$TOTAL_LOCKS" ] && [[ "$TOTAL_LOCKS" =~ ^[0-9]+$ ]] && [ "$TOTAL_LOCKS" -gt 0 ] 2>/dev/null; then
|
||||
LOCK_PERCENT=$((TOTAL_LOCKS * 100 / LOCK_CAPACITY))
|
||||
echo "Lock Usage: ${LOCK_PERCENT}%"
|
||||
|
||||
if [ "$LOCK_PERCENT" -gt 80 ]; then
|
||||
echo -e "${RED}⚠️ WARNING: Lock table usage is critically high${NC}"
|
||||
elif [ "$LOCK_PERCENT" -gt 60 ]; then
|
||||
echo -e "${YELLOW}⚠️ CAUTION: Lock table usage is elevated${NC}"
|
||||
fi
|
||||
fi
|
||||
echo
|
||||
|
||||
# Blocking queries
|
||||
echo "Blocking Queries:"
|
||||
echo "────────────────────────────────────────────────────────────"
|
||||
$PSQL_PREFIX psql -c "
|
||||
SELECT
|
||||
blocked_locks.pid AS blocked_pid,
|
||||
blocking_locks.pid AS blocking_pid,
|
||||
blocked_activity.usename AS blocked_user,
|
||||
blocking_activity.usename AS blocking_user,
|
||||
blocked_activity.query AS blocked_query
|
||||
FROM pg_catalog.pg_locks blocked_locks
|
||||
JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
|
||||
JOIN pg_catalog.pg_locks blocking_locks
|
||||
ON blocking_locks.locktype = blocked_locks.locktype
|
||||
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
|
||||
AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
|
||||
AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
|
||||
AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
|
||||
AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
|
||||
AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
|
||||
AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
|
||||
AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
|
||||
AND blocking_locks.pid != blocked_locks.pid
|
||||
JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
|
||||
WHERE NOT blocked_locks.granted;
|
||||
" 2>/dev/null || echo "No blocking queries or unable to query"
|
||||
echo
|
||||
fi
|
||||
|
||||
# 6. SHARED MEMORY USAGE
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE}💾 SHARED MEMORY SEGMENTS${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo
|
||||
|
||||
if command -v ipcs &> /dev/null; then
|
||||
ipcs -m
|
||||
echo
|
||||
|
||||
# Sum up shared memory
|
||||
TOTAL_SHM=$(ipcs -m | awk '/^0x/ {sum+=$5} END {print sum}')
|
||||
if [ ! -z "$TOTAL_SHM" ]; then
|
||||
echo "Total Shared Memory: $(bytes_to_human $TOTAL_SHM)"
|
||||
fi
|
||||
else
|
||||
echo "ipcs command not available"
|
||||
fi
|
||||
echo
|
||||
|
||||
# 7. DISK SPACE (relevant for temp files)
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE}💿 DISK SPACE${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo
|
||||
|
||||
df -h | grep -E "Filesystem|/$|/var|/tmp|/postgres"
|
||||
echo
|
||||
|
||||
# Check for PostgreSQL temp files
|
||||
if [ "$PSQL_PREFIX" != "NONE" ] && command -v psql &> /dev/null; then
|
||||
TEMP_FILES=$($PSQL_PREFIX psql -t -A -c "SELECT count(*) FROM pg_stat_database WHERE temp_files > 0;" 2>/dev/null | head -1 || echo "0")
|
||||
if [ ! -z "$TEMP_FILES" ] && [ "$TEMP_FILES" -gt 0 ] 2>/dev/null; then
|
||||
echo -e "${YELLOW}⚠️ Databases are using temporary files (work_mem may be too low)${NC}"
|
||||
$PSQL_PREFIX psql -c "SELECT datname, temp_files, pg_size_pretty(temp_bytes) as temp_size FROM pg_stat_database WHERE temp_files > 0;" 2>/dev/null
|
||||
echo
|
||||
fi
|
||||
fi
|
||||
|
||||
# 8. OTHER RESOURCE CONSUMERS
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE}🔍 OTHER POTENTIAL MEMORY CONSUMERS${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo
|
||||
|
||||
# Check for common memory hogs
|
||||
echo "Checking for common memory-intensive services..."
|
||||
echo
|
||||
|
||||
for service in "mysqld" "mongodb" "redis" "elasticsearch" "java" "docker" "containerd"; do
|
||||
MEM=$(ps aux | grep "$service" | grep -v grep | awk '{sum+=$4} END {printf "%.1f", sum}')
|
||||
if [ ! -z "$MEM" ] && (( $(echo "$MEM > 0" | bc -l) )); then
|
||||
echo " ${service}: ${MEM}%"
|
||||
fi
|
||||
done
|
||||
echo
|
||||
|
||||
# 9. SWAP USAGE
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE}🔄 SWAP USAGE${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo
|
||||
|
||||
if command -v free &> /dev/null; then
|
||||
SWAP_TOTAL=$(free -b | awk '/^Swap:/ {print $2}')
|
||||
SWAP_USED=$(free -b | awk '/^Swap:/ {print $3}')
|
||||
|
||||
if [ "$SWAP_TOTAL" -gt 0 ]; then
|
||||
SWAP_PERCENT=$(awk "BEGIN {printf \"%.1f\", ($SWAP_USED/$SWAP_TOTAL)*100}")
|
||||
echo "Swap Total: $(bytes_to_human $SWAP_TOTAL)"
|
||||
echo "Swap Used: $(bytes_to_human $SWAP_USED) (${SWAP_PERCENT}%)"
|
||||
|
||||
if (( $(echo "$SWAP_PERCENT > 50" | bc -l) )); then
|
||||
echo -e "${RED}⚠️ WARNING: Heavy swap usage detected - system may be thrashing${NC}"
|
||||
elif (( $(echo "$SWAP_PERCENT > 20" | bc -l) )); then
|
||||
echo -e "${YELLOW}⚠️ CAUTION: System is using swap${NC}"
|
||||
else
|
||||
echo -e "${GREEN}✓ Swap usage is low${NC}"
|
||||
fi
|
||||
else
|
||||
echo "No swap configured"
|
||||
fi
|
||||
fi
|
||||
echo
|
||||
|
||||
# 10. RECOMMENDATIONS
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo -e "${BLUE}💡 RECOMMENDATIONS${NC}"
|
||||
echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
|
||||
echo
|
||||
|
||||
echo "Based on the diagnostics:"
|
||||
echo
|
||||
|
||||
# Memory recommendations
|
||||
if [ ! -z "$MEM_PERCENT" ]; then
|
||||
if (( $(echo "$MEM_PERCENT > 80" | bc -l) )); then
|
||||
echo "1. ⚠️ Memory Pressure:"
|
||||
echo " • System memory is ${MEM_PERCENT}% utilized"
|
||||
echo " • Stop non-essential services before restore"
|
||||
echo " • Consider increasing system RAM"
|
||||
echo " • Use 'dbbackup restore --parallel=1' to reduce memory usage"
|
||||
echo
|
||||
fi
|
||||
fi
|
||||
|
||||
# Lock recommendations
|
||||
if [ "$MAX_LOCKS" != "unknown" ] && [ ! -z "$MAX_LOCKS" ] && [[ "$MAX_LOCKS" =~ ^[0-9]+$ ]]; then
|
||||
if [ "$MAX_LOCKS" -lt 1000 ] 2>/dev/null; then
|
||||
echo "2. ⚠️ Lock Configuration:"
|
||||
echo " • max_locks_per_transaction is too low: $MAX_LOCKS"
|
||||
echo " • Run: ./fix_postgres_locks.sh"
|
||||
echo " • Or manually: ALTER SYSTEM SET max_locks_per_transaction = 4096;"
|
||||
echo " • Then restart PostgreSQL"
|
||||
echo
|
||||
fi
|
||||
fi
|
||||
|
||||
# Other recommendations
|
||||
echo "3. 🔧 Before Large Restores:"
|
||||
echo " • Stop unnecessary services (web servers, cron jobs, etc.)"
|
||||
echo " • Clear PostgreSQL idle connections"
|
||||
echo " • Ensure adequate disk space for temp files"
|
||||
echo " • Consider using --large-db mode for very large databases"
|
||||
echo
|
||||
|
||||
echo "4. 📊 Monitor During Restore:"
|
||||
echo " • Watch: watch -n 2 'ps aux | grep postgres | head -20'"
|
||||
echo " • Locks: watch -n 5 'psql -c \"SELECT COUNT(*) FROM pg_locks;\"'"
|
||||
echo " • Memory: watch -n 2 free -h"
|
||||
echo
|
||||
|
||||
echo "════════════════════════════════════════════════════════════"
|
||||
echo " Report generated: $(date '+%Y-%m-%d %H:%M:%S')"
|
||||
echo " Save this output: $0 > diagnosis_$(date +%Y%m%d_%H%M%S).log"
|
||||
echo "════════════════════════════════════════════════════════════"
|
||||
339
docs/CATALOG.md
339
docs/CATALOG.md
@ -1,339 +0,0 @@
|
||||
# Backup Catalog
|
||||
|
||||
Complete reference for the dbbackup catalog system for tracking, managing, and analyzing backup inventory.
|
||||
|
||||
## Overview
|
||||
|
||||
The catalog is a SQLite database that tracks all backups, providing:
|
||||
- Backup gap detection (missing scheduled backups)
|
||||
- Retention policy compliance verification
|
||||
- Backup integrity tracking
|
||||
- Historical retention enforcement
|
||||
- Full-text search over backup metadata
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Initialize catalog (automatic on first use)
|
||||
dbbackup catalog sync /mnt/backups/databases
|
||||
|
||||
# List all backups in catalog
|
||||
dbbackup catalog list
|
||||
|
||||
# Show catalog statistics
|
||||
dbbackup catalog stats
|
||||
|
||||
# View backup details
|
||||
dbbackup catalog info mydb_2026-01-23.dump.gz
|
||||
|
||||
# Search for backups
|
||||
dbbackup catalog search --database myapp --after 2026-01-01
|
||||
```
|
||||
|
||||
## Catalog Sync
|
||||
|
||||
Syncs local backup directory with catalog database.
|
||||
|
||||
```bash
|
||||
# Sync all backups in directory
|
||||
dbbackup catalog sync /mnt/backups/databases
|
||||
|
||||
# Force rescan (useful if backups were added manually)
|
||||
dbbackup catalog sync /mnt/backups/databases --force
|
||||
|
||||
# Sync specific database backups
|
||||
dbbackup catalog sync /mnt/backups/databases --database myapp
|
||||
|
||||
# Dry-run to see what would be synced
|
||||
dbbackup catalog sync /mnt/backups/databases --dry-run
|
||||
```
|
||||
|
||||
Catalog entries include:
|
||||
- Backup filename
|
||||
- Database name
|
||||
- Backup timestamp
|
||||
- Size (bytes)
|
||||
- Compression ratio
|
||||
- Encryption status
|
||||
- Backup type (full/incremental/pitr_base)
|
||||
- Retention status
|
||||
- Checksum/hash
|
||||
|
||||
## Listing Backups
|
||||
|
||||
### Show All Backups
|
||||
|
||||
```bash
|
||||
dbbackup catalog list
|
||||
```
|
||||
|
||||
Output format:
|
||||
```
|
||||
Database Timestamp Size Compressed Encrypted Verified Type
|
||||
myapp 2026-01-23 14:30:00 2.5 GB 62% yes yes full
|
||||
myapp 2026-01-23 02:00:00 1.2 GB 58% yes yes incremental
|
||||
mydb 2026-01-23 22:15:00 856 MB 64% no no full
|
||||
```
|
||||
|
||||
### Filter by Database
|
||||
|
||||
```bash
|
||||
dbbackup catalog list --database myapp
|
||||
```
|
||||
|
||||
### Filter by Date Range
|
||||
|
||||
```bash
|
||||
dbbackup catalog list --after 2026-01-01 --before 2026-01-31
|
||||
```
|
||||
|
||||
### Sort Results
|
||||
|
||||
```bash
|
||||
dbbackup catalog list --sort size --reverse # Largest first
|
||||
dbbackup catalog list --sort date # Oldest first
|
||||
dbbackup catalog list --sort verified # Verified first
|
||||
```
|
||||
|
||||
## Statistics and Gaps
|
||||
|
||||
### Show Catalog Statistics
|
||||
|
||||
```bash
|
||||
dbbackup catalog stats
|
||||
```
|
||||
|
||||
Output includes:
|
||||
- Total backups
|
||||
- Total size stored
|
||||
- Unique databases
|
||||
- Success/failure ratio
|
||||
- Oldest/newest backup
|
||||
- Average backup size
|
||||
|
||||
### Detect Backup Gaps
|
||||
|
||||
Gaps are missing expected backups based on schedule.
|
||||
|
||||
```bash
|
||||
# Show gaps in mydb backups (assuming daily schedule)
|
||||
dbbackup catalog gaps mydb --interval 24h
|
||||
|
||||
# 12-hour interval
|
||||
dbbackup catalog gaps mydb --interval 12h
|
||||
|
||||
# Show as calendar grid
|
||||
dbbackup catalog gaps mydb --interval 24h --calendar
|
||||
|
||||
# Define custom work hours (backup only weekdays 02:00)
|
||||
dbbackup catalog gaps mydb --interval 24h --workdays-only
|
||||
```
|
||||
|
||||
Output shows:
|
||||
- Dates with missing backups
|
||||
- Expected backup count
|
||||
- Actual backup count
|
||||
- Gap duration
|
||||
- Reasons (if known)
|
||||
|
||||
## Searching
|
||||
|
||||
Full-text search across backup metadata.
|
||||
|
||||
```bash
|
||||
# Search by database name
|
||||
dbbackup catalog search --database myapp
|
||||
|
||||
# Search by date
|
||||
dbbackup catalog search --after 2026-01-01 --before 2026-01-31
|
||||
|
||||
# Search by size range (GB)
|
||||
dbbackup catalog search --min-size 0.5 --max-size 5.0
|
||||
|
||||
# Search by backup type
|
||||
dbbackup catalog search --backup-type incremental
|
||||
|
||||
# Search by encryption status
|
||||
dbbackup catalog search --encrypted
|
||||
|
||||
# Search by verification status
|
||||
dbbackup catalog search --verified
|
||||
|
||||
# Combine filters
|
||||
dbbackup catalog search --database myapp --encrypted --after 2026-01-01
|
||||
```
|
||||
|
||||
## Backup Details
|
||||
|
||||
```bash
|
||||
# Show full details for a specific backup
|
||||
dbbackup catalog info mydb_2026-01-23.dump.gz
|
||||
|
||||
# Output includes:
|
||||
# - Filename and path
|
||||
# - Database name and version
|
||||
# - Backup timestamp
|
||||
# - Backup type (full/incremental/pitr_base)
|
||||
# - Size (compressed/uncompressed)
|
||||
# - Compression ratio
|
||||
# - Encryption (algorithm, key hash)
|
||||
# - Checksums (md5, sha256)
|
||||
# - Verification status and date
|
||||
# - Retention classification (daily/weekly/monthly)
|
||||
# - Comments/notes
|
||||
```
|
||||
|
||||
## Retention Classification
|
||||
|
||||
The catalog classifies backups according to retention policies.
|
||||
|
||||
### GFS (Grandfather-Father-Son) Classification
|
||||
|
||||
```
|
||||
Daily: Last 7 backups
|
||||
Weekly: One backup per week for 4 weeks
|
||||
Monthly: One backup per month for 12 months
|
||||
```
|
||||
|
||||
Example:
|
||||
```bash
|
||||
dbbackup catalog list --show-retention
|
||||
|
||||
# Output shows:
|
||||
# myapp_2026-01-23.dump.gz daily (retain 6 more days)
|
||||
# myapp_2026-01-16.dump.gz weekly (retain 3 more weeks)
|
||||
# myapp_2026-01-01.dump.gz monthly (retain 11 more months)
|
||||
```
|
||||
|
||||
## Compliance Reports
|
||||
|
||||
Generate compliance reports based on catalog data.
|
||||
|
||||
```bash
|
||||
# Backup compliance report
|
||||
dbbackup catalog compliance-report
|
||||
|
||||
# Shows:
|
||||
# - All backups compliant with retention policy
|
||||
# - Gaps exceeding SLA
|
||||
# - Failed backups
|
||||
# - Unverified backups
|
||||
# - Encryption status
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Catalog settings in `.dbbackup.conf`:
|
||||
|
||||
```ini
|
||||
[catalog]
|
||||
# Enable catalog (default: true)
|
||||
enabled = true
|
||||
|
||||
# Catalog database path (default: ~/.dbbackup/catalog.db)
|
||||
db_path = /var/lib/dbbackup/catalog.db
|
||||
|
||||
# Retention days (default: 30)
|
||||
retention_days = 30
|
||||
|
||||
# Minimum backups to keep (default: 5)
|
||||
min_backups = 5
|
||||
|
||||
# Enable gap detection (default: true)
|
||||
gap_detection = true
|
||||
|
||||
# Gap alert threshold (hours, default: 36)
|
||||
gap_threshold_hours = 36
|
||||
|
||||
# Verify backups automatically (default: true)
|
||||
auto_verify = true
|
||||
```
|
||||
|
||||
## Maintenance
|
||||
|
||||
### Rebuild Catalog
|
||||
|
||||
Rebuild from scratch (useful if corrupted):
|
||||
|
||||
```bash
|
||||
dbbackup catalog rebuild /mnt/backups/databases
|
||||
```
|
||||
|
||||
### Export Catalog
|
||||
|
||||
Export to CSV for analysis in spreadsheet/BI tools:
|
||||
|
||||
```bash
|
||||
dbbackup catalog export --format csv --output catalog.csv
|
||||
```
|
||||
|
||||
Supported formats:
|
||||
- csv (Excel compatible)
|
||||
- json (structured data)
|
||||
- html (browseable report)
|
||||
|
||||
### Cleanup Orphaned Entries
|
||||
|
||||
Remove catalog entries for deleted backups:
|
||||
|
||||
```bash
|
||||
dbbackup catalog cleanup --orphaned
|
||||
|
||||
# Dry-run
|
||||
dbbackup catalog cleanup --orphaned --dry-run
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Find All Encrypted Backups from Last Week
|
||||
|
||||
```bash
|
||||
dbbackup catalog search \
|
||||
--after "$(date -d '7 days ago' +%Y-%m-%d)" \
|
||||
--encrypted
|
||||
```
|
||||
|
||||
### Generate Weekly Compliance Report
|
||||
|
||||
```bash
|
||||
dbbackup catalog search \
|
||||
--after "$(date -d '7 days ago' +%Y-%m-%d)" \
|
||||
--show-retention \
|
||||
--verified
|
||||
```
|
||||
|
||||
### Monitor Backup Size Growth
|
||||
|
||||
```bash
|
||||
dbbackup catalog stats | grep "Average backup size"
|
||||
|
||||
# Track over time
|
||||
for week in $(seq 1 4); do
|
||||
DATE=$(date -d "$((week*7)) days ago" +%Y-%m-%d)
|
||||
echo "Week of $DATE:"
|
||||
dbbackup catalog stats --after "$DATE" | grep "Average backup size"
|
||||
done
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Catalog Shows Wrong Count
|
||||
|
||||
Resync the catalog:
|
||||
```bash
|
||||
dbbackup catalog sync /mnt/backups/databases --force
|
||||
```
|
||||
|
||||
### Gaps Detected But Backups Exist
|
||||
|
||||
Manual backups not in catalog - sync them:
|
||||
```bash
|
||||
dbbackup catalog sync /mnt/backups/databases
|
||||
```
|
||||
|
||||
### Corruption Error
|
||||
|
||||
Rebuild catalog:
|
||||
```bash
|
||||
dbbackup catalog rebuild /mnt/backups/databases
|
||||
```
|
||||
365
docs/DRILL.md
365
docs/DRILL.md
@ -1,365 +0,0 @@
|
||||
# Disaster Recovery Drilling
|
||||
|
||||
Complete guide for automated disaster recovery testing with dbbackup.
|
||||
|
||||
## Overview
|
||||
|
||||
DR drills automate the process of validating backup integrity through actual restore testing. Instead of hoping backups work when needed, automated drills regularly restore backups in isolated containers to verify:
|
||||
|
||||
- Backup file integrity
|
||||
- Database compatibility
|
||||
- Restore time estimates (RTO)
|
||||
- Schema validation
|
||||
- Data consistency
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Run single DR drill on latest backup
|
||||
dbbackup drill /mnt/backups/databases
|
||||
|
||||
# Drill specific database
|
||||
dbbackup drill /mnt/backups/databases --database myapp
|
||||
|
||||
# Drill multiple databases
|
||||
dbbackup drill /mnt/backups/databases --database myapp,mydb
|
||||
|
||||
# Schedule daily drills
|
||||
dbbackup drill /mnt/backups/databases --schedule daily
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Select backup** - Picks latest or specified backup
|
||||
2. **Create container** - Starts isolated database container
|
||||
3. **Extract backup** - Decompresses to temporary storage
|
||||
4. **Restore** - Imports data to test database
|
||||
5. **Validate** - Runs integrity checks
|
||||
6. **Cleanup** - Removes test container
|
||||
7. **Report** - Stores results in catalog
|
||||
|
||||
## Drill Configuration
|
||||
|
||||
### Select Specific Backup
|
||||
|
||||
```bash
|
||||
# Latest backup for database
|
||||
dbbackup drill /mnt/backups/databases --database myapp
|
||||
|
||||
# Backup from specific date
|
||||
dbbackup drill /mnt/backups/databases --database myapp --date 2026-01-23
|
||||
|
||||
# Oldest backup (best test)
|
||||
dbbackup drill /mnt/backups/databases --database myapp --oldest
|
||||
```
|
||||
|
||||
### Drill Options
|
||||
|
||||
```bash
|
||||
# Full validation (slower)
|
||||
dbbackup drill /mnt/backups/databases --full-validation
|
||||
|
||||
# Quick validation (schema only, faster)
|
||||
dbbackup drill /mnt/backups/databases --quick-validation
|
||||
|
||||
# Store results in catalog
|
||||
dbbackup drill /mnt/backups/databases --catalog
|
||||
|
||||
# Send notification on failure
|
||||
dbbackup drill /mnt/backups/databases --notify-on-failure
|
||||
|
||||
# Custom test database name
|
||||
dbbackup drill /mnt/backups/databases --test-database dr_test_prod
|
||||
```
|
||||
|
||||
## Scheduled Drills
|
||||
|
||||
Run drills automatically on a schedule.
|
||||
|
||||
### Configure Schedule
|
||||
|
||||
```bash
|
||||
# Daily drill at 03:00
|
||||
dbbackup drill /mnt/backups/databases --schedule "03:00"
|
||||
|
||||
# Weekly drill (Sunday 02:00)
|
||||
dbbackup drill /mnt/backups/databases --schedule "sun 02:00"
|
||||
|
||||
# Monthly drill (1st of month)
|
||||
dbbackup drill /mnt/backups/databases --schedule "monthly"
|
||||
|
||||
# Install as systemd timer
|
||||
sudo dbbackup install drill \
|
||||
--backup-path /mnt/backups/databases \
|
||||
--schedule "03:00"
|
||||
```
|
||||
|
||||
### Verify Schedule
|
||||
|
||||
```bash
|
||||
# Show next 5 scheduled drills
|
||||
dbbackup drill list --upcoming
|
||||
|
||||
# Check drill history
|
||||
dbbackup drill list --history
|
||||
|
||||
# Show drill statistics
|
||||
dbbackup drill stats
|
||||
```
|
||||
|
||||
## Drill Results
|
||||
|
||||
### View Drill History
|
||||
|
||||
```bash
|
||||
# All drill results
|
||||
dbbackup drill list
|
||||
|
||||
# Recent 10 drills
|
||||
dbbackup drill list --limit 10
|
||||
|
||||
# Drills from last week
|
||||
dbbackup drill list --after "$(date -d '7 days ago' +%Y-%m-%d)"
|
||||
|
||||
# Failed drills only
|
||||
dbbackup drill list --status failed
|
||||
|
||||
# Passed drills only
|
||||
dbbackup drill list --status passed
|
||||
```
|
||||
|
||||
### Detailed Drill Report
|
||||
|
||||
```bash
|
||||
dbbackup drill report myapp_2026-01-23.dump.gz
|
||||
|
||||
# Output includes:
|
||||
# - Backup filename
|
||||
# - Database version
|
||||
# - Extract time
|
||||
# - Restore time
|
||||
# - Row counts (before/after)
|
||||
# - Table verification results
|
||||
# - Data integrity status
|
||||
# - Pass/Fail verdict
|
||||
# - Warnings/errors
|
||||
```
|
||||
|
||||
## Validation Types
|
||||
|
||||
### Full Validation
|
||||
|
||||
Deep integrity checks on restored data.
|
||||
|
||||
```bash
|
||||
dbbackup drill /mnt/backups/databases --full-validation
|
||||
|
||||
# Checks:
|
||||
# - All tables restored
|
||||
# - Row counts match original
|
||||
# - Indexes present and valid
|
||||
# - Constraints enforced
|
||||
# - Foreign key references valid
|
||||
# - Sequence values correct (PostgreSQL)
|
||||
# - Triggers present (if not system-generated)
|
||||
```
|
||||
|
||||
### Quick Validation
|
||||
|
||||
Schema-only validation (fast).
|
||||
|
||||
```bash
|
||||
dbbackup drill /mnt/backups/databases --quick-validation
|
||||
|
||||
# Checks:
|
||||
# - Database connects
|
||||
# - All tables present
|
||||
# - Column definitions correct
|
||||
# - Indexes exist
|
||||
```
|
||||
|
||||
### Custom Validation
|
||||
|
||||
Run custom SQL checks.
|
||||
|
||||
```bash
|
||||
# Add custom validation query
|
||||
dbbackup drill /mnt/backups/databases \
|
||||
--validation-query "SELECT COUNT(*) FROM users" \
|
||||
--validation-expected 15000
|
||||
|
||||
# Example for multiple tables
|
||||
dbbackup drill /mnt/backups/databases \
|
||||
--validation-query "SELECT COUNT(*) FROM orders WHERE status='completed'" \
|
||||
--validation-expected 42000
|
||||
```
|
||||
|
||||
## Reporting
|
||||
|
||||
### Generate Drill Report
|
||||
|
||||
```bash
|
||||
# HTML report (email-friendly)
|
||||
dbbackup drill report --format html --output drill-report.html
|
||||
|
||||
# JSON report (for CI/CD pipelines)
|
||||
dbbackup drill report --format json --output drill-results.json
|
||||
|
||||
# Markdown report (GitHub integration)
|
||||
dbbackup drill report --format markdown --output drill-results.md
|
||||
```
|
||||
|
||||
### Example Report Format
|
||||
|
||||
```
|
||||
Disaster Recovery Drill Results
|
||||
================================
|
||||
|
||||
Backup: myapp_2026-01-23_14-30-00.dump.gz
|
||||
Date: 2026-01-25 03:15:00
|
||||
Duration: 5m 32s
|
||||
Status: PASSED
|
||||
|
||||
Details:
|
||||
Extract Time: 1m 15s
|
||||
Restore Time: 3m 42s
|
||||
Validation Time: 34s
|
||||
|
||||
Tables Restored: 42
|
||||
Rows Verified: 1,234,567
|
||||
Total Size: 2.5 GB
|
||||
|
||||
Validation:
|
||||
Schema Check: OK
|
||||
Row Count Check: OK (all tables)
|
||||
Index Check: OK (all 28 indexes present)
|
||||
Constraint Check: OK (all 5 foreign keys valid)
|
||||
|
||||
Warnings: None
|
||||
Errors: None
|
||||
```
|
||||
|
||||
## Integration with CI/CD
|
||||
|
||||
### GitHub Actions
|
||||
|
||||
```yaml
|
||||
name: Daily DR Drill
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 3 * * *' # Daily at 03:00
|
||||
|
||||
jobs:
|
||||
dr-drill:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Run DR drill
|
||||
run: |
|
||||
dbbackup drill /backups/databases \
|
||||
--full-validation \
|
||||
--format json \
|
||||
--output results.json
|
||||
|
||||
- name: Check results
|
||||
run: |
|
||||
if grep -q '"status":"failed"' results.json; then
|
||||
echo "DR drill failed!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Upload report
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: drill-results
|
||||
path: results.json
|
||||
```
|
||||
|
||||
### Jenkins Pipeline
|
||||
|
||||
```groovy
|
||||
pipeline {
|
||||
triggers {
|
||||
cron('H 3 * * *') // Daily at 03:00
|
||||
}
|
||||
|
||||
stages {
|
||||
stage('DR Drill') {
|
||||
steps {
|
||||
sh 'dbbackup drill /backups/databases --full-validation --format json --output drill.json'
|
||||
}
|
||||
}
|
||||
|
||||
stage('Validate Results') {
|
||||
steps {
|
||||
script {
|
||||
def results = readJSON file: 'drill.json'
|
||||
if (results.status != 'passed') {
|
||||
error("DR drill failed!")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Drill Fails with "Out of Space"
|
||||
|
||||
```bash
|
||||
# Check available disk space
|
||||
df -h
|
||||
|
||||
# Clean up old test databases
|
||||
docker system prune -a
|
||||
|
||||
# Use faster storage for test
|
||||
dbbackup drill /mnt/backups/databases --temp-dir /ssd/drill-temp
|
||||
```
|
||||
|
||||
### Drill Times Out
|
||||
|
||||
```bash
|
||||
# Increase timeout (minutes)
|
||||
dbbackup drill /mnt/backups/databases --timeout 30
|
||||
|
||||
# Skip certain validations to speed up
|
||||
dbbackup drill /mnt/backups/databases --quick-validation
|
||||
```
|
||||
|
||||
### Drill Shows Data Mismatch
|
||||
|
||||
Indicates a problem with the backup - investigate immediately:
|
||||
|
||||
```bash
|
||||
# Get detailed diff report
|
||||
dbbackup drill report --show-diffs myapp_2026-01-23.dump.gz
|
||||
|
||||
# Regenerate backup
|
||||
dbbackup backup single myapp --force-full
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Run weekly drills minimum** - Catch issues early
|
||||
|
||||
2. **Test oldest backups** - Verify full retention chain works
|
||||
```bash
|
||||
dbbackup drill /mnt/backups/databases --oldest
|
||||
```
|
||||
|
||||
3. **Test critical databases first** - Prioritize by impact
|
||||
|
||||
4. **Store results in catalog** - Track historical pass/fail rates
|
||||
|
||||
5. **Alert on failures** - Automatic notification via email/Slack
|
||||
|
||||
6. **Document RTO** - Use drill times to refine recovery objectives
|
||||
|
||||
7. **Test cross-major-versions** - Use test environment with different DB version
|
||||
```bash
|
||||
# Test PostgreSQL 15 backup on PostgreSQL 16
|
||||
dbbackup drill /mnt/backups/databases --target-version 16
|
||||
```
|
||||
537
docs/EXPORTER.md
537
docs/EXPORTER.md
@ -1,537 +0,0 @@
|
||||
# DBBackup Prometheus Exporter & Grafana Dashboard
|
||||
|
||||
This document provides complete reference for the DBBackup Prometheus exporter, including all exported metrics, setup instructions, and Grafana dashboard configuration.
|
||||
|
||||
## What's New (January 2026)
|
||||
|
||||
### New Features
|
||||
- **Backup Type Tracking**: All backup metrics now include a `backup_type` label (`full`, `incremental`, or `pitr_base` for PITR base backups)
|
||||
- **Note**: CLI `--backup-type` flag only accepts `full` or `incremental`. The `pitr_base` label is auto-assigned when using `dbbackup pitr base`
|
||||
- **PITR Metrics**: Complete Point-in-Time Recovery monitoring for PostgreSQL WAL and MySQL binlog archiving
|
||||
- **New Alerts**: PITR-specific alerts for archive lag, chain integrity, and gap detection
|
||||
|
||||
### New Metrics Added
|
||||
| Metric | Description |
|
||||
|--------|-------------|
|
||||
| `dbbackup_build_info` | Build info with version and commit labels |
|
||||
| `dbbackup_backup_by_type` | Count backups by type (full/incremental/pitr_base) |
|
||||
| `dbbackup_pitr_enabled` | Whether PITR is enabled (1/0) |
|
||||
| `dbbackup_pitr_archive_lag_seconds` | Seconds since last WAL/binlog archived |
|
||||
| `dbbackup_pitr_chain_valid` | WAL/binlog chain integrity (1=valid) |
|
||||
| `dbbackup_pitr_gap_count` | Number of gaps in archive chain |
|
||||
| `dbbackup_pitr_archive_count` | Total archived segments |
|
||||
| `dbbackup_pitr_archive_size_bytes` | Total archive storage |
|
||||
| `dbbackup_pitr_recovery_window_minutes` | Estimated PITR coverage |
|
||||
|
||||
### Label Changes
|
||||
- `backup_type` label added to: `dbbackup_rpo_seconds`, `dbbackup_last_success_timestamp`, `dbbackup_last_backup_duration_seconds`, `dbbackup_last_backup_size_bytes`
|
||||
- `dbbackup_backup_total` type changed from counter to gauge (more accurate for snapshot-based collection)
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Quick Start](#quick-start)
|
||||
- [Exporter Modes](#exporter-modes)
|
||||
- [Complete Metrics Reference](#complete-metrics-reference)
|
||||
- [Grafana Dashboard Setup](#grafana-dashboard-setup)
|
||||
- [Alerting Rules](#alerting-rules)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
|
||||
---
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Start the Metrics Server
|
||||
|
||||
```bash
|
||||
# Start HTTP exporter on default port 9399 (auto-detects hostname for server label)
|
||||
dbbackup metrics serve
|
||||
|
||||
# Custom port
|
||||
dbbackup metrics serve --port 9100
|
||||
|
||||
# Specify server name for labels (overrides auto-detection)
|
||||
dbbackup metrics serve --server production-db-01
|
||||
|
||||
# Specify custom catalog database location
|
||||
dbbackup metrics serve --catalog-db /path/to/catalog.db
|
||||
```
|
||||
|
||||
### Export to Textfile (for node_exporter)
|
||||
|
||||
```bash
|
||||
# Export to default location
|
||||
dbbackup metrics export
|
||||
|
||||
# Custom output path
|
||||
dbbackup metrics export --output /var/lib/node_exporter/textfile_collector/dbbackup.prom
|
||||
|
||||
# Specify catalog database and server name
|
||||
dbbackup metrics export --catalog-db /root/.dbbackup/catalog.db --server myhost
|
||||
```
|
||||
|
||||
### Install as Systemd Service
|
||||
|
||||
```bash
|
||||
# Install with metrics exporter
|
||||
sudo dbbackup install --with-metrics
|
||||
|
||||
# Start the service
|
||||
sudo systemctl start dbbackup-exporter
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Exporter Modes
|
||||
|
||||
### HTTP Server Mode (`metrics serve`)
|
||||
|
||||
Runs a standalone HTTP server exposing metrics for direct Prometheus scraping.
|
||||
|
||||
| Endpoint | Description |
|
||||
|-------------|----------------------------------|
|
||||
| `/metrics` | Prometheus metrics |
|
||||
| `/health` | Health check (returns 200 OK) |
|
||||
| `/` | Service info page |
|
||||
|
||||
**Default Port:** 9399
|
||||
|
||||
**Server Label:** Auto-detected from hostname (use `--server` to override)
|
||||
|
||||
**Catalog Location:** `~/.dbbackup/catalog.db` (use `--catalog-db` to override)
|
||||
|
||||
**Configuration:**
|
||||
```bash
|
||||
dbbackup metrics serve [--server <instance-name>] [--port <port>] [--catalog-db <path>]
|
||||
```
|
||||
|
||||
| Flag | Default | Description |
|
||||
|------|---------|-------------|
|
||||
| `--server` | hostname | Server label for metrics (auto-detected if not set) |
|
||||
| `--port` | 9399 | HTTP server port |
|
||||
| `--catalog-db` | ~/.dbbackup/catalog.db | Path to catalog SQLite database |
|
||||
|
||||
### Textfile Mode (`metrics export`)
|
||||
|
||||
Writes metrics to a file for collection by node_exporter's textfile collector.
|
||||
|
||||
**Default Path:** `/var/lib/dbbackup/metrics/dbbackup.prom`
|
||||
|
||||
| Flag | Default | Description |
|
||||
|------|---------|-------------|
|
||||
| `--server` | hostname | Server label for metrics (auto-detected if not set) |
|
||||
| `--output` | /var/lib/dbbackup/metrics/dbbackup.prom | Output file path |
|
||||
| `--catalog-db` | ~/.dbbackup/catalog.db | Path to catalog SQLite database |
|
||||
|
||||
**node_exporter Configuration:**
|
||||
```bash
|
||||
node_exporter --collector.textfile.directory=/var/lib/dbbackup/metrics/
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Complete Metrics Reference
|
||||
|
||||
All metrics use the `dbbackup_` prefix. Below is the **validated** list of metrics exported by DBBackup.
|
||||
|
||||
### Backup Status Metrics
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|-------------|------|--------|-------------|
|
||||
| `dbbackup_last_success_timestamp` | gauge | `server`, `database`, `engine`, `backup_type` | Unix timestamp of last successful backup |
|
||||
| `dbbackup_last_backup_duration_seconds` | gauge | `server`, `database`, `engine`, `backup_type` | Duration of last successful backup in seconds |
|
||||
| `dbbackup_last_backup_size_bytes` | gauge | `server`, `database`, `engine`, `backup_type` | Size of last successful backup in bytes |
|
||||
| `dbbackup_backup_total` | gauge | `server`, `database`, `status` | Total backup attempts (status: `success` or `failure`) |
|
||||
| `dbbackup_backup_by_type` | gauge | `server`, `database`, `backup_type` | Backup count by type (`full`, `incremental`, `pitr_base`) |
|
||||
| `dbbackup_rpo_seconds` | gauge | `server`, `database`, `backup_type` | Seconds since last successful backup (RPO) |
|
||||
| `dbbackup_backup_verified` | gauge | `server`, `database` | Whether last backup was verified (1=yes, 0=no) |
|
||||
| `dbbackup_scrape_timestamp` | gauge | `server` | Unix timestamp when metrics were collected |
|
||||
|
||||
### PITR (Point-in-Time Recovery) Metrics
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|-------------|------|--------|-------------|
|
||||
| `dbbackup_pitr_enabled` | gauge | `server`, `database`, `engine` | Whether PITR is enabled (1=yes, 0=no) |
|
||||
| `dbbackup_pitr_last_archived_timestamp` | gauge | `server`, `database`, `engine` | Unix timestamp of last archived WAL/binlog |
|
||||
| `dbbackup_pitr_archive_lag_seconds` | gauge | `server`, `database`, `engine` | Seconds since last archive (lower is better) |
|
||||
| `dbbackup_pitr_archive_count` | gauge | `server`, `database`, `engine` | Total archived WAL segments or binlog files |
|
||||
| `dbbackup_pitr_archive_size_bytes` | gauge | `server`, `database`, `engine` | Total size of archived logs in bytes |
|
||||
| `dbbackup_pitr_chain_valid` | gauge | `server`, `database`, `engine` | Whether archive chain is valid (1=yes, 0=gaps) |
|
||||
| `dbbackup_pitr_gap_count` | gauge | `server`, `database`, `engine` | Number of gaps in archive chain |
|
||||
| `dbbackup_pitr_recovery_window_minutes` | gauge | `server`, `database`, `engine` | Estimated PITR coverage window in minutes |
|
||||
| `dbbackup_pitr_scrape_timestamp` | gauge | `server` | PITR metrics collection timestamp |
|
||||
|
||||
### Deduplication Metrics
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|-------------|------|--------|-------------|
|
||||
| `dbbackup_dedup_chunks_total` | gauge | `server` | Total unique chunks stored |
|
||||
| `dbbackup_dedup_manifests_total` | gauge | `server` | Total number of deduplicated backups |
|
||||
| `dbbackup_dedup_backup_bytes_total` | gauge | `server` | Total logical size of all backups (bytes) |
|
||||
| `dbbackup_dedup_stored_bytes_total` | gauge | `server` | Total unique data stored after dedup (bytes) |
|
||||
| `dbbackup_dedup_space_saved_bytes` | gauge | `server` | Bytes saved by deduplication |
|
||||
| `dbbackup_dedup_ratio` | gauge | `server` | Dedup efficiency (0-1, higher = better) |
|
||||
| `dbbackup_dedup_disk_usage_bytes` | gauge | `server` | Actual disk usage of chunk store |
|
||||
| `dbbackup_dedup_compression_ratio` | gauge | `server` | Compression ratio (0-1, higher = better) |
|
||||
| `dbbackup_dedup_oldest_chunk_timestamp` | gauge | `server` | Unix timestamp of oldest chunk |
|
||||
| `dbbackup_dedup_newest_chunk_timestamp` | gauge | `server` | Unix timestamp of newest chunk |
|
||||
| `dbbackup_dedup_scrape_timestamp` | gauge | `server` | Dedup metrics collection timestamp |
|
||||
|
||||
### Per-Database Dedup Metrics
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|-------------|------|--------|-------------|
|
||||
| `dbbackup_dedup_database_backup_count` | gauge | `server`, `database` | Deduplicated backups per database |
|
||||
| `dbbackup_dedup_database_ratio` | gauge | `server`, `database` | Per-database dedup ratio |
|
||||
| `dbbackup_dedup_database_last_backup_timestamp` | gauge | `server`, `database` | Last backup timestamp per database |
|
||||
| `dbbackup_dedup_database_total_bytes` | gauge | `server`, `database` | Total logical size per database |
|
||||
| `dbbackup_dedup_database_stored_bytes` | gauge | `server`, `database` | Stored bytes per database (after dedup) |
|
||||
| `dbbackup_rpo_seconds` | gauge | `server`, `database` | Seconds since last backup (same as regular backups for unified alerting) |
|
||||
|
||||
> **Note:** The `dbbackup_rpo_seconds` metric is exported by both regular backups and dedup backups, enabling unified alerting without complex PromQL expressions.
|
||||
|
||||
---
|
||||
|
||||
## Example Metrics Output
|
||||
|
||||
```prometheus
|
||||
# DBBackup Prometheus Metrics
|
||||
# Generated at: 2026-01-27T10:30:00Z
|
||||
# Server: production
|
||||
|
||||
# HELP dbbackup_last_success_timestamp Unix timestamp of last successful backup
|
||||
# TYPE dbbackup_last_success_timestamp gauge
|
||||
dbbackup_last_success_timestamp{server="production",database="myapp",engine="postgres",backup_type="full"} 1737884600
|
||||
|
||||
# HELP dbbackup_last_backup_duration_seconds Duration of last successful backup in seconds
|
||||
# TYPE dbbackup_last_backup_duration_seconds gauge
|
||||
dbbackup_last_backup_duration_seconds{server="production",database="myapp",engine="postgres",backup_type="full"} 125.50
|
||||
|
||||
# HELP dbbackup_last_backup_size_bytes Size of last successful backup in bytes
|
||||
# TYPE dbbackup_last_backup_size_bytes gauge
|
||||
dbbackup_last_backup_size_bytes{server="production",database="myapp",engine="postgres",backup_type="full"} 1073741824
|
||||
|
||||
# HELP dbbackup_backup_total Total number of backup attempts by type and status
|
||||
# TYPE dbbackup_backup_total gauge
|
||||
dbbackup_backup_total{server="production",database="myapp",status="success"} 42
|
||||
dbbackup_backup_total{server="production",database="myapp",status="failure"} 2
|
||||
|
||||
# HELP dbbackup_backup_by_type Total number of backups by backup type
|
||||
# TYPE dbbackup_backup_by_type gauge
|
||||
dbbackup_backup_by_type{server="production",database="myapp",backup_type="full"} 30
|
||||
dbbackup_backup_by_type{server="production",database="myapp",backup_type="incremental"} 12
|
||||
|
||||
# HELP dbbackup_rpo_seconds Recovery Point Objective - seconds since last successful backup
|
||||
# TYPE dbbackup_rpo_seconds gauge
|
||||
dbbackup_rpo_seconds{server="production",database="myapp",backup_type="full"} 3600
|
||||
|
||||
# HELP dbbackup_backup_verified Whether the last backup was verified (1=yes, 0=no)
|
||||
# TYPE dbbackup_backup_verified gauge
|
||||
dbbackup_backup_verified{server="production",database="myapp"} 1
|
||||
|
||||
# HELP dbbackup_pitr_enabled Whether PITR is enabled for database (1=enabled, 0=disabled)
|
||||
# TYPE dbbackup_pitr_enabled gauge
|
||||
dbbackup_pitr_enabled{server="production",database="myapp",engine="postgres"} 1
|
||||
|
||||
# HELP dbbackup_pitr_archive_lag_seconds Seconds since last WAL/binlog was archived
|
||||
# TYPE dbbackup_pitr_archive_lag_seconds gauge
|
||||
dbbackup_pitr_archive_lag_seconds{server="production",database="myapp",engine="postgres"} 45
|
||||
|
||||
# HELP dbbackup_pitr_chain_valid Whether the WAL/binlog chain is valid (1=valid, 0=gaps detected)
|
||||
# TYPE dbbackup_pitr_chain_valid gauge
|
||||
dbbackup_pitr_chain_valid{server="production",database="myapp",engine="postgres"} 1
|
||||
|
||||
# HELP dbbackup_pitr_recovery_window_minutes Estimated recovery window in minutes
|
||||
# TYPE dbbackup_pitr_recovery_window_minutes gauge
|
||||
dbbackup_pitr_recovery_window_minutes{server="production",database="myapp",engine="postgres"} 10080
|
||||
|
||||
# HELP dbbackup_dedup_ratio Deduplication ratio (0-1, higher is better)
|
||||
# TYPE dbbackup_dedup_ratio gauge
|
||||
dbbackup_dedup_ratio{server="production"} 0.6500
|
||||
|
||||
# HELP dbbackup_dedup_space_saved_bytes Bytes saved by deduplication
|
||||
# TYPE dbbackup_dedup_space_saved_bytes gauge
|
||||
dbbackup_dedup_space_saved_bytes{server="production"} 5368709120
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Prometheus Scrape Configuration
|
||||
|
||||
Add to your `prometheus.yml`:
|
||||
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'dbbackup'
|
||||
scrape_interval: 60s
|
||||
scrape_timeout: 10s
|
||||
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'db-server-01:9399'
|
||||
- 'db-server-02:9399'
|
||||
labels:
|
||||
environment: 'production'
|
||||
|
||||
- targets:
|
||||
- 'db-staging:9399'
|
||||
labels:
|
||||
environment: 'staging'
|
||||
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: instance
|
||||
regex: '([^:]+):\d+'
|
||||
replacement: '$1'
|
||||
```
|
||||
|
||||
### File-based Service Discovery
|
||||
|
||||
```yaml
|
||||
- job_name: 'dbbackup-sd'
|
||||
scrape_interval: 60s
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets/dbbackup/*.yml'
|
||||
refresh_interval: 5m
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Grafana Dashboard Setup
|
||||
|
||||
### Import Dashboard
|
||||
|
||||
1. Open Grafana → **Dashboards** → **Import**
|
||||
2. Upload `grafana/dbbackup-dashboard.json` or paste the JSON
|
||||
3. Select your Prometheus data source
|
||||
4. Click **Import**
|
||||
|
||||
### Dashboard Panels
|
||||
|
||||
The dashboard includes the following panels:
|
||||
|
||||
#### Backup Overview Row
|
||||
| Panel | Metric Used | Description |
|
||||
|-------|-------------|-------------|
|
||||
| Last Backup Status | `dbbackup_rpo_seconds < bool 604800` | SUCCESS/FAILED indicator |
|
||||
| Time Since Last Backup | `dbbackup_rpo_seconds` | Time elapsed since last backup |
|
||||
| Verification Status | `dbbackup_backup_verified` | VERIFIED/NOT VERIFIED |
|
||||
| Total Successful Backups | `dbbackup_backup_total{status="success"}` | Counter |
|
||||
| Total Failed Backups | `dbbackup_backup_total{status="failure"}` | Counter |
|
||||
| RPO Over Time | `dbbackup_rpo_seconds` | Time series graph |
|
||||
| Backup Size | `dbbackup_last_backup_size_bytes` | Bar chart |
|
||||
| Backup Duration | `dbbackup_last_backup_duration_seconds` | Time series |
|
||||
| Backup Status Overview | Multiple metrics | Table with color-coded status |
|
||||
|
||||
#### Deduplication Statistics Row
|
||||
| Panel | Metric Used | Description |
|
||||
|-------|-------------|-------------|
|
||||
| Dedup Ratio | `dbbackup_dedup_ratio` | Percentage efficiency |
|
||||
| Space Saved | `dbbackup_dedup_space_saved_bytes` | Total bytes saved |
|
||||
| Disk Usage | `dbbackup_dedup_disk_usage_bytes` | Actual storage used |
|
||||
| Total Chunks | `dbbackup_dedup_chunks_total` | Chunk count |
|
||||
| Compression Ratio | `dbbackup_dedup_compression_ratio` | Compression efficiency |
|
||||
| Oldest Chunk | `dbbackup_dedup_oldest_chunk_timestamp` | Age of oldest data |
|
||||
| Newest Chunk | `dbbackup_dedup_newest_chunk_timestamp` | Most recent chunk |
|
||||
| Dedup Ratio by Database | `dbbackup_dedup_database_ratio` | Per-database efficiency |
|
||||
| Dedup Storage Over Time | `dbbackup_dedup_space_saved_bytes`, `dbbackup_dedup_disk_usage_bytes` | Storage trends |
|
||||
|
||||
### Dashboard Variables
|
||||
|
||||
| Variable | Query | Description |
|
||||
|----------|-------|-------------|
|
||||
| `$server` | `label_values(dbbackup_rpo_seconds, server)` | Filter by server |
|
||||
| `$DS_PROMETHEUS` | datasource | Prometheus data source |
|
||||
|
||||
### Dashboard Thresholds
|
||||
|
||||
#### RPO Thresholds
|
||||
- **Green:** < 12 hours (43200 seconds)
|
||||
- **Yellow:** 12-24 hours
|
||||
- **Red:** > 24 hours (86400 seconds)
|
||||
|
||||
#### Backup Status Thresholds
|
||||
- **1 (Green):** SUCCESS
|
||||
- **0 (Red):** FAILED
|
||||
|
||||
---
|
||||
|
||||
## Alerting Rules
|
||||
|
||||
### Pre-configured Alerts
|
||||
|
||||
Import `deploy/prometheus/alerting-rules.yaml` into Prometheus/Alertmanager.
|
||||
|
||||
#### Backup Status Alerts
|
||||
| Alert | Expression | Severity | Description |
|
||||
|-------|------------|----------|-------------|
|
||||
| `DBBackupRPOWarning` | `dbbackup_rpo_seconds > 43200` | warning | No backup for 12+ hours |
|
||||
| `DBBackupRPOCritical` | `dbbackup_rpo_seconds > 86400` | critical | No backup for 24+ hours |
|
||||
| `DBBackupFailed` | `increase(dbbackup_backup_total{status="failure"}[1h]) > 0` | critical | Backup failed |
|
||||
| `DBBackupFailureRateHigh` | Failure rate > 10% in 24h | warning | High failure rate |
|
||||
| `DBBackupSizeAnomaly` | Size changed > 50% vs 7-day avg | warning | Unusual backup size |
|
||||
| `DBBackupSizeZero` | `dbbackup_last_backup_size_bytes == 0` | critical | Empty backup file |
|
||||
| `DBBackupDurationHigh` | `dbbackup_last_backup_duration_seconds > 3600` | warning | Backup taking > 1 hour |
|
||||
| `DBBackupNotVerified` | `dbbackup_backup_verified == 0` for 24h | warning | Backup not verified |
|
||||
| `DBBackupNoRecentFull` | No full backup in 7+ days | warning | Need full backup for incremental chain |
|
||||
|
||||
#### PITR Alerts (New)
|
||||
| Alert | Expression | Severity | Description |
|
||||
|-------|------------|----------|-------------|
|
||||
| `DBBackupPITRArchiveLag` | `dbbackup_pitr_archive_lag_seconds > 600` | warning | Archive 10+ min behind |
|
||||
| `DBBackupPITRArchiveCritical` | `dbbackup_pitr_archive_lag_seconds > 1800` | critical | Archive 30+ min behind |
|
||||
| `DBBackupPITRChainBroken` | `dbbackup_pitr_chain_valid == 0` | critical | Gaps in WAL/binlog chain |
|
||||
| `DBBackupPITRGaps` | `dbbackup_pitr_gap_count > 0` | warning | Gaps detected in archive chain |
|
||||
| `DBBackupPITRDisabled` | PITR unexpectedly disabled | critical | PITR was enabled but now off |
|
||||
|
||||
#### Infrastructure Alerts
|
||||
| Alert | Expression | Severity | Description |
|
||||
|-------|------------|----------|-------------|
|
||||
| `DBBackupExporterDown` | `up{job="dbbackup"} == 0` | critical | Exporter unreachable |
|
||||
| `DBBackupDedupRatioLow` | `dbbackup_dedup_ratio < 0.2` for 24h | info | Low dedup efficiency |
|
||||
| `DBBackupStorageHigh` | `dbbackup_dedup_disk_usage_bytes > 1TB` | warning | High storage usage |
|
||||
|
||||
### Example Alert Configuration
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
- name: dbbackup
|
||||
rules:
|
||||
- alert: DBBackupRPOCritical
|
||||
expr: dbbackup_rpo_seconds > 86400
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "No backup for {{ $labels.database }} in 24+ hours"
|
||||
description: "RPO violation on {{ $labels.server }}. Last backup: {{ $value | humanizeDuration }} ago."
|
||||
|
||||
- alert: DBBackupPITRChainBroken
|
||||
expr: dbbackup_pitr_chain_valid == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR chain broken for {{ $labels.database }}"
|
||||
description: "WAL/binlog chain has gaps. Point-in-time recovery is NOT possible. New base backup required."
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Exporter Not Returning Metrics
|
||||
|
||||
1. **Check catalog access:**
|
||||
```bash
|
||||
dbbackup catalog list
|
||||
```
|
||||
|
||||
2. **Verify port is open:**
|
||||
```bash
|
||||
curl -v http://localhost:9399/metrics
|
||||
```
|
||||
|
||||
3. **Check logs:**
|
||||
```bash
|
||||
journalctl -u dbbackup-exporter -f
|
||||
```
|
||||
|
||||
### Missing Dedup Metrics
|
||||
|
||||
Dedup metrics are only exported when using deduplication:
|
||||
```bash
|
||||
# Ensure dedup is enabled
|
||||
dbbackup dedup status
|
||||
```
|
||||
|
||||
### Metrics Not Updating
|
||||
|
||||
The exporter caches metrics for 30 seconds. The `/health` endpoint can confirm the exporter is running.
|
||||
|
||||
### Stale or Empty Metrics (Catalog Location Mismatch)
|
||||
|
||||
If the exporter shows stale or no backup data, verify the catalog database location:
|
||||
|
||||
```bash
|
||||
# Check where catalog sync writes
|
||||
dbbackup catalog sync /path/to/backups
|
||||
# Output shows: [STATS] Catalog database: /root/.dbbackup/catalog.db
|
||||
|
||||
# Ensure exporter reads from the same location
|
||||
dbbackup metrics serve --catalog-db /root/.dbbackup/catalog.db
|
||||
```
|
||||
|
||||
**Common Issue:** If backup scripts run as root but the exporter runs as a different user, they may use different catalog locations. Use `--catalog-db` to ensure consistency.
|
||||
|
||||
### Dashboard Shows "No Data"
|
||||
|
||||
1. Verify Prometheus is scraping successfully:
|
||||
```bash
|
||||
curl http://prometheus:9090/api/v1/targets | grep dbbackup
|
||||
```
|
||||
|
||||
2. Check metric names match (case-sensitive):
|
||||
```promql
|
||||
{__name__=~"dbbackup_.*"}
|
||||
```
|
||||
|
||||
3. Verify `server` label matches dashboard variable.
|
||||
|
||||
### Label Mismatch Issues
|
||||
|
||||
Ensure the `--server` flag matches across all instances:
|
||||
```bash
|
||||
# Consistent naming (or let it auto-detect from hostname)
|
||||
dbbackup metrics serve --server prod-db-01
|
||||
```
|
||||
|
||||
> **Note:** As of v3.x, the exporter auto-detects hostname if `--server` is not specified. This ensures unique server labels in multi-host deployments.
|
||||
|
||||
---
|
||||
|
||||
## Metrics Validation Checklist
|
||||
|
||||
Use this checklist to validate your exporter setup:
|
||||
|
||||
- [ ] `/metrics` endpoint returns HTTP 200
|
||||
- [ ] `/health` endpoint returns `{"status":"ok"}`
|
||||
- [ ] `dbbackup_rpo_seconds` shows correct RPO values
|
||||
- [ ] `dbbackup_backup_total` increments after backups
|
||||
- [ ] `dbbackup_backup_verified` reflects verification status
|
||||
- [ ] `dbbackup_last_backup_size_bytes` matches actual backup sizes
|
||||
- [ ] Prometheus scrape succeeds (check targets page)
|
||||
- [ ] Grafana dashboard loads without errors
|
||||
- [ ] Dashboard variables populate correctly
|
||||
- [ ] All panels show data (no "No Data" messages)
|
||||
|
||||
---
|
||||
|
||||
## Files Reference
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `grafana/dbbackup-dashboard.json` | Grafana dashboard JSON |
|
||||
| `grafana/alerting-rules.yaml` | Grafana alerting rules |
|
||||
| `deploy/prometheus/alerting-rules.yaml` | Prometheus alerting rules |
|
||||
| `deploy/prometheus/scrape-config.yaml` | Prometheus scrape configuration |
|
||||
| `docs/METRICS.md` | Metrics documentation |
|
||||
|
||||
---
|
||||
|
||||
## Version Compatibility
|
||||
|
||||
| DBBackup Version | Metrics Version | Dashboard UID |
|
||||
|------------------|-----------------|---------------|
|
||||
| 1.0.0+ | v1 | `dbbackup-overview` |
|
||||
|
||||
---
|
||||
|
||||
## Support
|
||||
|
||||
For issues with the exporter or dashboard:
|
||||
1. Check the [troubleshooting section](#troubleshooting)
|
||||
2. Review logs: `journalctl -u dbbackup-exporter`
|
||||
3. Open an issue with metrics output and dashboard screenshots
|
||||
314
docs/METRICS.md
314
docs/METRICS.md
@ -1,314 +0,0 @@
|
||||
# DBBackup Prometheus Metrics
|
||||
|
||||
This document describes all Prometheus metrics exposed by DBBackup for monitoring and alerting.
|
||||
|
||||
## Backup Status Metrics
|
||||
|
||||
### `dbbackup_rpo_seconds`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `backup_type`
|
||||
**Description:** Time in seconds since the last successful backup (Recovery Point Objective).
|
||||
|
||||
**Recommended Thresholds:**
|
||||
- Green: < 43200 (12 hours)
|
||||
- Yellow: 43200-86400 (12-24 hours)
|
||||
- Red: > 86400 (24+ hours)
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
dbbackup_rpo_seconds{server="prod-db-01"} > 86400
|
||||
|
||||
# RPO by backup type
|
||||
dbbackup_rpo_seconds{backup_type="full"}
|
||||
dbbackup_rpo_seconds{backup_type="incremental"}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_backup_total`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `status`
|
||||
**Description:** Total count of backup attempts, labeled by status (`success` or `failure`).
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Total successful backups
|
||||
dbbackup_backup_total{status="success"}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_backup_by_type`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `backup_type`
|
||||
**Description:** Total count of backups by backup type (`full`, `incremental`, `pitr_base`).
|
||||
|
||||
> **Note:** The `backup_type` label values are:
|
||||
> - `full` - Created with `--backup-type full` (default)
|
||||
> - `incremental` - Created with `--backup-type incremental`
|
||||
> - `pitr_base` - Auto-assigned when using `dbbackup pitr base` command
|
||||
>
|
||||
> The CLI `--backup-type` flag only accepts `full` or `incremental`.
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Count of each backup type
|
||||
dbbackup_backup_by_type{backup_type="full"}
|
||||
dbbackup_backup_by_type{backup_type="incremental"}
|
||||
dbbackup_backup_by_type{backup_type="pitr_base"}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_backup_verified`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`
|
||||
**Description:** Whether the most recent backup was verified successfully (1 = verified, 0 = not verified).
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_last_backup_size_bytes`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`, `backup_type`
|
||||
**Description:** Size of the last successful backup in bytes.
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Total backup storage across all databases
|
||||
sum(dbbackup_last_backup_size_bytes)
|
||||
|
||||
# Size by backup type
|
||||
dbbackup_last_backup_size_bytes{backup_type="full"}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_last_backup_duration_seconds`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`, `backup_type`
|
||||
**Description:** Duration of the last backup operation in seconds.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_last_success_timestamp`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`, `backup_type`
|
||||
**Description:** Unix timestamp of the last successful backup.
|
||||
|
||||
---
|
||||
|
||||
## PITR (Point-in-Time Recovery) Metrics
|
||||
|
||||
### `dbbackup_pitr_enabled`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Whether PITR is enabled for the database (1 = enabled, 0 = disabled).
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Check if PITR is enabled
|
||||
dbbackup_pitr_enabled{database="production"} == 1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_last_archived_timestamp`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Unix timestamp of the last archived WAL segment (PostgreSQL) or binlog file (MySQL).
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_archive_lag_seconds`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Seconds since the last WAL/binlog was archived. High values indicate archiving issues.
|
||||
|
||||
**Recommended Thresholds:**
|
||||
- Green: < 300 (5 minutes)
|
||||
- Yellow: 300-600 (5-10 minutes)
|
||||
- Red: > 600 (10+ minutes)
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Alert on high archive lag
|
||||
dbbackup_pitr_archive_lag_seconds > 600
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_archive_count`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Total number of archived WAL segments or binlog files.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_archive_size_bytes`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Total size of archived logs in bytes.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_chain_valid`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Whether the WAL/binlog chain is valid (1 = valid, 0 = gaps detected).
|
||||
|
||||
**Example Query:**
|
||||
```promql
|
||||
# Alert on broken chain
|
||||
dbbackup_pitr_chain_valid == 0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_gap_count`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Number of gaps detected in the WAL/binlog chain. Any value > 0 requires investigation.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_pitr_recovery_window_minutes`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`, `engine`
|
||||
**Description:** Estimated recovery window in minutes - the time span covered by archived logs.
|
||||
|
||||
---
|
||||
|
||||
## Deduplication Metrics
|
||||
|
||||
### `dbbackup_dedup_ratio`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`
|
||||
**Description:** Overall deduplication efficiency (0-1). A ratio of 0.5 means 50% space savings.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_dedup_database_ratio`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `database`
|
||||
**Description:** Per-database deduplication ratio.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_dedup_space_saved_bytes`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`
|
||||
**Description:** Total bytes saved by deduplication across all backups.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_dedup_disk_usage_bytes`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`
|
||||
**Description:** Actual disk usage of the chunk store after deduplication.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_dedup_chunks_total`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`
|
||||
**Description:** Total number of unique content-addressed chunks in the dedup store.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_dedup_compression_ratio`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`
|
||||
**Description:** Compression ratio achieved on chunk data (0-1). Higher = better compression.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_dedup_oldest_chunk_timestamp`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`
|
||||
**Description:** Unix timestamp of the oldest chunk. Useful for monitoring retention policy.
|
||||
|
||||
---
|
||||
|
||||
### `dbbackup_dedup_newest_chunk_timestamp`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`
|
||||
**Description:** Unix timestamp of the newest chunk. Confirms dedup is working on recent backups.
|
||||
|
||||
---
|
||||
|
||||
## Build Information Metrics
|
||||
|
||||
### `dbbackup_build_info`
|
||||
**Type:** Gauge
|
||||
**Labels:** `server`, `version`, `commit`, `build_time`
|
||||
**Description:** Build information for the dbbackup exporter. Value is always 1.
|
||||
|
||||
This metric is useful for:
|
||||
- Tracking which version is deployed across your fleet
|
||||
- Alerting when versions drift between servers
|
||||
- Correlating behavior changes with deployments
|
||||
|
||||
**Example Queries:**
|
||||
```promql
|
||||
# Show all deployed versions
|
||||
group by (version) (dbbackup_build_info)
|
||||
|
||||
# Find servers not on latest version
|
||||
dbbackup_build_info{version!="4.1.4"}
|
||||
|
||||
# Alert on version drift
|
||||
count(count by (version) (dbbackup_build_info)) > 1
|
||||
|
||||
# PITR archive lag
|
||||
dbbackup_pitr_archive_lag_seconds > 600
|
||||
|
||||
# Check PITR chain integrity
|
||||
dbbackup_pitr_chain_valid == 1
|
||||
|
||||
# Estimate available PITR window (in minutes)
|
||||
dbbackup_pitr_recovery_window_minutes
|
||||
|
||||
# PITR gaps detected
|
||||
dbbackup_pitr_gap_count > 0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Alerting Rules
|
||||
|
||||
See [alerting-rules.yaml](../grafana/alerting-rules.yaml) for pre-configured Prometheus alerting rules.
|
||||
|
||||
### Recommended Alerts
|
||||
|
||||
| Alert Name | Condition | Severity |
|
||||
|------------|-----------|----------|
|
||||
| BackupStale | `dbbackup_rpo_seconds > 86400` | Critical |
|
||||
| BackupFailed | `increase(dbbackup_backup_total{status="failure"}[1h]) > 0` | Warning |
|
||||
| BackupNotVerified | `dbbackup_backup_verified == 0` | Warning |
|
||||
| DedupDegraded | `dbbackup_dedup_ratio < 0.1` | Info |
|
||||
| PITRArchiveLag | `dbbackup_pitr_archive_lag_seconds > 600` | Warning |
|
||||
| PITRChainBroken | `dbbackup_pitr_chain_valid == 0` | Critical |
|
||||
| PITRDisabled | `dbbackup_pitr_enabled == 0` (unexpected) | Critical |
|
||||
| NoIncrementalBackups | `dbbackup_backup_by_type{backup_type="incremental"} == 0` for 7d | Info |
|
||||
|
||||
---
|
||||
|
||||
## Dashboard
|
||||
|
||||
Import the [Grafana dashboard](../grafana/dbbackup-dashboard.json) for visualization of all metrics.
|
||||
|
||||
## Exporting Metrics
|
||||
|
||||
Metrics are exposed at `/metrics` when running with `--metrics` flag:
|
||||
|
||||
```bash
|
||||
dbbackup backup cluster --metrics --metrics-port 9090
|
||||
```
|
||||
|
||||
Or configure in `.dbbackup.conf`:
|
||||
```ini
|
||||
[metrics]
|
||||
enabled = true
|
||||
port = 9090
|
||||
path = /metrics
|
||||
```
|
||||
364
docs/RTO.md
364
docs/RTO.md
@ -1,364 +0,0 @@
|
||||
# RTO/RPO Analysis
|
||||
|
||||
Complete reference for Recovery Time Objective (RTO) and Recovery Point Objective (RPO) analysis and calculation.
|
||||
|
||||
## Overview
|
||||
|
||||
RTO and RPO are critical metrics for disaster recovery planning:
|
||||
|
||||
- **RTO (Recovery Time Objective)** - Maximum acceptable time to restore systems
|
||||
- **RPO (Recovery Point Objective)** - Maximum acceptable data loss (time)
|
||||
|
||||
dbbackup calculates these based on:
|
||||
- Backup size and compression
|
||||
- Database size and transaction rate
|
||||
- Network bandwidth
|
||||
- Hardware resources
|
||||
- Retention policy
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Show RTO/RPO analysis
|
||||
dbbackup rto show
|
||||
|
||||
# Show recommendations
|
||||
dbbackup rto recommendations
|
||||
|
||||
# Export for disaster recovery plan
|
||||
dbbackup rto export --format pdf --output drp.pdf
|
||||
```
|
||||
|
||||
## RTO Calculation
|
||||
|
||||
RTO depends on restore operations:
|
||||
|
||||
```
|
||||
RTO = Time to: Extract + Restore + Validation
|
||||
|
||||
Extract Time = Backup Size / Extraction Speed (~500 MB/s typical)
|
||||
Restore Time = Total Operations / Database Write Speed (~10-100K rows/sec)
|
||||
Validation = Backup Verify (~10% of restore time)
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
```
|
||||
Backup: myapp_production
|
||||
- Size on disk: 2.5 GB
|
||||
- Compressed: 850 MB
|
||||
|
||||
Extract Time = 850 MB / 500 MB/s = 1.7 minutes
|
||||
Restore Time = 1.5M rows / 50K rows/sec = 30 minutes
|
||||
Validation = 3 minutes
|
||||
|
||||
Total RTO = 34.7 minutes
|
||||
```
|
||||
|
||||
## RPO Calculation
|
||||
|
||||
RPO depends on backup frequency and transaction rate:
|
||||
|
||||
```
|
||||
RPO = Backup Interval + WAL Replay Time
|
||||
|
||||
Example with daily backups:
|
||||
- Backup interval: 24 hours
|
||||
- WAL available for PITR: +6 hours
|
||||
|
||||
RPO = 24-30 hours (worst case)
|
||||
```
|
||||
|
||||
### Optimizing RPO
|
||||
|
||||
Reduce RPO by:
|
||||
|
||||
```bash
|
||||
# More frequent backups (hourly vs daily)
|
||||
dbbackup backup single myapp --schedule "0 * * * *" # Every hour
|
||||
|
||||
# Enable PITR (Point-in-Time Recovery)
|
||||
dbbackup pitr enable myapp /mnt/wal
|
||||
dbbackup pitr base myapp /mnt/wal
|
||||
|
||||
# Continuous WAL archiving
|
||||
dbbackup pitr status myapp /mnt/wal
|
||||
```
|
||||
|
||||
With PITR enabled:
|
||||
```
|
||||
RPO = Time since last transaction (typically < 5 minutes)
|
||||
```
|
||||
|
||||
## Analysis Command
|
||||
|
||||
### Show Current Metrics
|
||||
|
||||
```bash
|
||||
dbbackup rto show
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
Database: production
|
||||
Engine: PostgreSQL 15
|
||||
|
||||
Current Status:
|
||||
Last Backup: 2026-01-23 02:00:00 (22 hours ago)
|
||||
Backup Size: 2.5 GB (compressed: 850 MB)
|
||||
RTO Estimate: 35 minutes
|
||||
RPO Current: 22 hours
|
||||
PITR Enabled: yes
|
||||
PITR Window: 6 hours
|
||||
|
||||
Recommendations:
|
||||
- RTO is acceptable (< 1 hour)
|
||||
- RPO could be improved with hourly backups (currently 22h)
|
||||
- PITR reduces RPO to 6 hours in case of full backup loss
|
||||
|
||||
Recovery Plans:
|
||||
Scenario 1: Full database loss
|
||||
RTO: 35 minutes (restore from latest backup)
|
||||
RPO: 22 hours (data since last backup lost)
|
||||
|
||||
Scenario 2: Point-in-time recovery
|
||||
RTO: 45 minutes (restore backup + replay WAL)
|
||||
RPO: 5 minutes (last transaction available)
|
||||
|
||||
Scenario 3: Table-level recovery (single table drop)
|
||||
RTO: 30 minutes (restore to temp DB, extract table)
|
||||
RPO: 22 hours
|
||||
```
|
||||
|
||||
### Get Recommendations
|
||||
|
||||
```bash
|
||||
dbbackup rto recommendations
|
||||
|
||||
# Output includes:
|
||||
# - Suggested backup frequency
|
||||
# - PITR recommendations
|
||||
# - Parallelism recommendations
|
||||
# - Resource utilization tips
|
||||
# - Cost-benefit analysis
|
||||
```
|
||||
|
||||
## Scenarios
|
||||
|
||||
### Scenario Analysis
|
||||
|
||||
Calculate RTO/RPO for different failure modes.
|
||||
|
||||
```bash
|
||||
# Full database loss (use latest backup)
|
||||
dbbackup rto scenario --type full-loss
|
||||
|
||||
# Point-in-time recovery (specific time before incident)
|
||||
dbbackup rto scenario --type point-in-time --time "2026-01-23 14:30:00"
|
||||
|
||||
# Table-level recovery
|
||||
dbbackup rto scenario --type table-level --table users
|
||||
|
||||
# Multiple databases
|
||||
dbbackup rto scenario --type multi-db --databases myapp,mydb
|
||||
```
|
||||
|
||||
### Custom Scenario
|
||||
|
||||
```bash
|
||||
# Network bandwidth constraint
|
||||
dbbackup rto scenario \
|
||||
--type full-loss \
|
||||
--bandwidth 10MB/s \
|
||||
--storage-type s3
|
||||
|
||||
# Limited resources (small restore server)
|
||||
dbbackup rto scenario \
|
||||
--type full-loss \
|
||||
--cpu-cores 4 \
|
||||
--memory-gb 8
|
||||
|
||||
# High transaction rate database
|
||||
dbbackup rto scenario \
|
||||
--type point-in-time \
|
||||
--tps 100000
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Track RTO/RPO Trends
|
||||
|
||||
```bash
|
||||
# Show trend over time
|
||||
dbbackup rto history
|
||||
|
||||
# Export metrics for trending
|
||||
dbbackup rto export --format csv
|
||||
|
||||
# Output:
|
||||
# Date,Database,RTO_Minutes,RPO_Hours,Backup_Size_GB,Status
|
||||
# 2026-01-15,production,35,22,2.5,ok
|
||||
# 2026-01-16,production,35,22,2.5,ok
|
||||
# 2026-01-17,production,38,24,2.6,warning
|
||||
```
|
||||
|
||||
### Alert on RTO/RPO Violations
|
||||
|
||||
```bash
|
||||
# Alert if RTO > 1 hour
|
||||
dbbackup rto alert --type rto-violation --threshold 60
|
||||
|
||||
# Alert if RPO > 24 hours
|
||||
dbbackup rto alert --type rpo-violation --threshold 24
|
||||
|
||||
# Email on violations
|
||||
dbbackup rto alert \
|
||||
--type rpo-violation \
|
||||
--threshold 24 \
|
||||
--notify-email admin@example.com
|
||||
```
|
||||
|
||||
## Detailed Calculations
|
||||
|
||||
### Backup Time Components
|
||||
|
||||
```bash
|
||||
# Analyze last backup performance
|
||||
dbbackup rto backup-analysis
|
||||
|
||||
# Output:
|
||||
# Database: production
|
||||
# Backup Date: 2026-01-23 02:00:00
|
||||
# Total Duration: 45 minutes
|
||||
#
|
||||
# Components:
|
||||
# - Data extraction: 25m 30s (56%)
|
||||
# - Compression: 12m 15s (27%)
|
||||
# - Encryption: 5m 45s (13%)
|
||||
# - Upload to cloud: 1m 30s (3%)
|
||||
#
|
||||
# Throughput: 95 MB/s
|
||||
# Compression Ratio: 65%
|
||||
```
|
||||
|
||||
### Restore Time Components
|
||||
|
||||
```bash
|
||||
# Analyze restore performance from a test drill
|
||||
dbbackup rto restore-analysis myapp_2026-01-23.dump.gz
|
||||
|
||||
# Output:
|
||||
# Extract Time: 1m 45s
|
||||
# Restore Time: 28m 30s
|
||||
# Validation: 3m 15s
|
||||
# Total RTO: 33m 30s
|
||||
#
|
||||
# Restore Speed: 2.8M rows/minute
|
||||
# Objects Created: 4200
|
||||
# Indexes Built: 145
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Configure RTO/RPO targets in `.dbbackup.conf`:
|
||||
|
||||
```ini
|
||||
[rto_rpo]
|
||||
# Target RTO (minutes)
|
||||
target_rto_minutes = 60
|
||||
|
||||
# Target RPO (hours)
|
||||
target_rpo_hours = 4
|
||||
|
||||
# Alert on threshold violation
|
||||
alert_on_violation = true
|
||||
|
||||
# Minimum backups to maintain RTO
|
||||
min_backups_for_rto = 5
|
||||
|
||||
# PITR window target (hours)
|
||||
pitr_window_hours = 6
|
||||
```
|
||||
|
||||
## SLAs and Compliance
|
||||
|
||||
### Define SLA
|
||||
|
||||
```bash
|
||||
# Create SLA requirement
|
||||
dbbackup rto sla \
|
||||
--name production \
|
||||
--target-rto-minutes 30 \
|
||||
--target-rpo-hours 4 \
|
||||
--databases myapp,payments
|
||||
|
||||
# Verify compliance
|
||||
dbbackup rto sla --verify production
|
||||
|
||||
# Generate compliance report
|
||||
dbbackup rto sla --report production
|
||||
```
|
||||
|
||||
### Audit Trail
|
||||
|
||||
```bash
|
||||
# Show RTO/RPO audit history
|
||||
dbbackup rto audit
|
||||
|
||||
# Output shows:
|
||||
# Date Metric Value Target Status
|
||||
# 2026-01-25 03:15:00 RTO 35m 60m PASS
|
||||
# 2026-01-25 03:15:00 RPO 22h 4h FAIL
|
||||
# 2026-01-24 03:00:00 RTO 35m 60m PASS
|
||||
# 2026-01-24 03:00:00 RPO 22h 4h FAIL
|
||||
```
|
||||
|
||||
## Reporting
|
||||
|
||||
### Generate Report
|
||||
|
||||
```bash
|
||||
# Markdown report
|
||||
dbbackup rto report --format markdown --output rto-report.md
|
||||
|
||||
# PDF for disaster recovery plan
|
||||
dbbackup rto report --format pdf --output drp.pdf
|
||||
|
||||
# HTML for dashboard
|
||||
dbbackup rto report --format html --output rto-metrics.html
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Define SLA targets** - Start with business requirements
|
||||
- Critical systems: RTO < 1 hour
|
||||
- Important systems: RTO < 4 hours
|
||||
- Standard systems: RTO < 24 hours
|
||||
|
||||
2. **Test RTO regularly** - DR drills validate estimates
|
||||
```bash
|
||||
dbbackup drill /mnt/backups --full-validation
|
||||
```
|
||||
|
||||
3. **Monitor trends** - Increasing RTO may indicate issues
|
||||
|
||||
4. **Optimize backups** - Faster backups = smaller RTO
|
||||
- Increase parallelism
|
||||
- Use faster storage
|
||||
- Optimize compression level
|
||||
|
||||
5. **Plan for PITR** - Critical systems should have PITR enabled
|
||||
```bash
|
||||
dbbackup pitr enable myapp /mnt/wal
|
||||
```
|
||||
|
||||
6. **Document assumptions** - RTO/RPO calculations depend on:
|
||||
- Available bandwidth
|
||||
- Target hardware
|
||||
- Parallelism settings
|
||||
- Database size changes
|
||||
|
||||
7. **Regular audit** - Monthly SLA compliance review
|
||||
```bash
|
||||
dbbackup rto sla --verify production
|
||||
```
|
||||
112
email_infra_team.txt
Normal file
112
email_infra_team.txt
Normal file
@ -0,0 +1,112 @@
|
||||
Betreff: PostgreSQL Restore Fehler - "out of shared memory" auf RST Server
|
||||
|
||||
Hallo Infra-Team,
|
||||
|
||||
wir haben auf dem RST PostgreSQL Server (PostgreSQL 17.4) wiederholt Restore-Fehler mit "out of shared memory" Meldungen.
|
||||
|
||||
═══════════════════════════════════════════════════════════
|
||||
ANALYSE (Stand: 20.01.2026)
|
||||
═══════════════════════════════════════════════════════════
|
||||
|
||||
Server-Specs:
|
||||
• RAM: 31 GB (aktuell 19.6 GB belegt = 63.9%)
|
||||
• PostgreSQL nutzt nur ~118 MB für eigene Prozesse
|
||||
• Swap: 4 GB (6.4% genutzt)
|
||||
|
||||
Lock-Konfiguration:
|
||||
• max_locks_per_transaction: 4096 ✓ (bereits korrekt)
|
||||
• max_connections: 100
|
||||
• Lock Capacity: 409.600 ✓ (ausreichend)
|
||||
|
||||
═══════════════════════════════════════════════════════════
|
||||
PROBLEM-IDENTIFIKATION
|
||||
═══════════════════════════════════════════════════════════
|
||||
|
||||
1. MEMORY CONSUMER (nicht-PostgreSQL):
|
||||
• Nessus Agent: ~173 MB
|
||||
• Elastic Agent: ~300 MB (mehrere Komponenten)
|
||||
• Icinga: ~24 MB
|
||||
• Weitere Monitoring: ~100+ MB
|
||||
|
||||
2. WORK_MEM ZU NIEDRIG:
|
||||
• Aktuell: 64 MB
|
||||
• 4 Datenbanken nutzen Temp-Files (Indikator für zu wenig work_mem):
|
||||
- prodkc: 201 MB temp files
|
||||
- keycloak: 45 MB temp files
|
||||
- d7030: 6 MB temp files
|
||||
- pgbench_db: 2 MB temp files
|
||||
|
||||
═══════════════════════════════════════════════════════════
|
||||
EMPFOHLENE MASSNAHMEN
|
||||
═══════════════════════════════════════════════════════════
|
||||
|
||||
VARIANTE A - Temporär für große Restores:
|
||||
-------------------------------------------
|
||||
1. Monitoring-Agents stoppen (frei: ~500 MB):
|
||||
sudo systemctl stop nessus-agent
|
||||
sudo systemctl stop elastic-agent
|
||||
|
||||
2. work_mem erhöhen:
|
||||
sudo -u postgres psql -c "ALTER SYSTEM SET work_mem = '256MB';"
|
||||
sudo systemctl restart postgresql
|
||||
|
||||
3. Restore durchführen
|
||||
|
||||
4. Agents wieder starten:
|
||||
sudo systemctl start nessus-agent
|
||||
sudo systemctl start elastic-agent
|
||||
|
||||
|
||||
VARIANTE B - Permanente Lösung:
|
||||
-------------------------------------------
|
||||
1. work_mem auf 256 MB erhöhen (statt 64 MB)
|
||||
2. maintenance_work_mem optional auf 4 GB erhöhen (statt 2 GB)
|
||||
3. Falls möglich: Monitoring auf dedizierten Server verschieben
|
||||
|
||||
SQL-Befehle:
|
||||
ALTER SYSTEM SET work_mem = '256MB';
|
||||
ALTER SYSTEM SET maintenance_work_mem = '4GB';
|
||||
-- Anschließend PostgreSQL restart
|
||||
|
||||
|
||||
VARIANTE C - Falls keine Config-Änderung möglich:
|
||||
-------------------------------------------
|
||||
• Restore mit --profile=conservative durchführen (reduziert Memory-Druck)
|
||||
dbbackup restore cluster backup.tar.gz --profile=conservative --confirm
|
||||
|
||||
• Oder TUI-Modus nutzen (verwendet automatisch conservative profile):
|
||||
dbbackup interactive
|
||||
|
||||
• Monitoring während Restore-Fenster deaktivieren
|
||||
|
||||
═══════════════════════════════════════════════════════════
|
||||
DETAIL-REPORT
|
||||
═══════════════════════════════════════════════════════════
|
||||
|
||||
Vollständiger Diagnose-Report liegt bei bzw. kann jederzeit mit
|
||||
diesem Script generiert werden:
|
||||
|
||||
/path/to/diagnose_postgres_memory.sh
|
||||
|
||||
Das Script analysiert:
|
||||
• System Memory Usage
|
||||
• PostgreSQL Konfiguration
|
||||
• Lock Usage
|
||||
• Temp File Usage
|
||||
• Blocking Queries
|
||||
• Shared Memory Segments
|
||||
|
||||
═══════════════════════════════════════════════════════════
|
||||
|
||||
Bevorzugt wäre Variante B (permanente work_mem Erhöhung), damit künftige
|
||||
große Restores ohne manuelle Eingriffe durchlaufen.
|
||||
|
||||
Bitte um Rückmeldung, welche Variante ihr umsetzt bzw. ob ihr weitere
|
||||
Infos benötigt.
|
||||
|
||||
Danke & Grüße
|
||||
[Dein Name]
|
||||
|
||||
---
|
||||
Anhang: diagnose_postgres_memory.sh (falls nicht vorhanden)
|
||||
Error Log: /a01/dba/tmp/dbbackup-restore-debug-20260119-221730.json
|
||||
140
fix_postgres_locks.sh
Executable file
140
fix_postgres_locks.sh
Executable file
@ -0,0 +1,140 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Fix PostgreSQL Lock Table Exhaustion
|
||||
# Increases max_locks_per_transaction to handle large database restores
|
||||
#
|
||||
|
||||
set -e
|
||||
|
||||
echo "════════════════════════════════════════════════════════════"
|
||||
echo " PostgreSQL Lock Configuration Fix"
|
||||
echo "════════════════════════════════════════════════════════════"
|
||||
echo
|
||||
|
||||
# Check if running as postgres user or with sudo
|
||||
if [ "$EUID" -ne 0 ] && [ "$(whoami)" != "postgres" ]; then
|
||||
echo "⚠️ This script should be run as:"
|
||||
echo " sudo $0"
|
||||
echo " or as the postgres user"
|
||||
echo
|
||||
read -p "Continue anyway? (y/N) " -n 1 -r
|
||||
echo
|
||||
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Detect PostgreSQL version and config
|
||||
PSQL=$(command -v psql || echo "")
|
||||
if [ -z "$PSQL" ]; then
|
||||
echo "❌ psql not found in PATH"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "📊 Current PostgreSQL Configuration:"
|
||||
echo "────────────────────────────────────────────────────────────"
|
||||
sudo -u postgres psql -c "SHOW max_locks_per_transaction;" 2>/dev/null || psql -c "SHOW max_locks_per_transaction;" || echo "Unable to query current value"
|
||||
sudo -u postgres psql -c "SHOW max_connections;" 2>/dev/null || psql -c "SHOW max_connections;" || echo "Unable to query current value"
|
||||
sudo -u postgres psql -c "SHOW work_mem;" 2>/dev/null || psql -c "SHOW work_mem;" || echo "Unable to query current value"
|
||||
sudo -u postgres psql -c "SHOW maintenance_work_mem;" 2>/dev/null || psql -c "SHOW maintenance_work_mem;" || echo "Unable to query current value"
|
||||
echo
|
||||
|
||||
# Recommended values
|
||||
RECOMMENDED_LOCKS=4096
|
||||
RECOMMENDED_WORK_MEM="256MB"
|
||||
RECOMMENDED_MAINTENANCE_WORK_MEM="4GB"
|
||||
|
||||
echo "🔧 Applying Fixes:"
|
||||
echo "────────────────────────────────────────────────────────────"
|
||||
echo "1. Setting max_locks_per_transaction = $RECOMMENDED_LOCKS"
|
||||
echo "2. Setting work_mem = $RECOMMENDED_WORK_MEM (improves query performance)"
|
||||
echo "3. Setting maintenance_work_mem = $RECOMMENDED_MAINTENANCE_WORK_MEM (speeds up restore/vacuum)"
|
||||
echo
|
||||
|
||||
# Apply the settings
|
||||
SUCCESS=0
|
||||
|
||||
# Fix 1: max_locks_per_transaction
|
||||
if sudo -u postgres psql -c "ALTER SYSTEM SET max_locks_per_transaction = $RECOMMENDED_LOCKS;" 2>/dev/null; then
|
||||
echo "✅ max_locks_per_transaction updated successfully"
|
||||
SUCCESS=$((SUCCESS + 1))
|
||||
elif psql -c "ALTER SYSTEM SET max_locks_per_transaction = $RECOMMENDED_LOCKS;" 2>/dev/null; then
|
||||
echo "✅ max_locks_per_transaction updated successfully"
|
||||
SUCCESS=$((SUCCESS + 1))
|
||||
else
|
||||
echo "❌ Failed to update max_locks_per_transaction"
|
||||
fi
|
||||
|
||||
# Fix 2: work_mem
|
||||
if sudo -u postgres psql -c "ALTER SYSTEM SET work_mem = '$RECOMMENDED_WORK_MEM';" 2>/dev/null; then
|
||||
echo "✅ work_mem updated successfully"
|
||||
SUCCESS=$((SUCCESS + 1))
|
||||
elif psql -c "ALTER SYSTEM SET work_mem = '$RECOMMENDED_WORK_MEM';" 2>/dev/null; then
|
||||
echo "✅ work_mem updated successfully"
|
||||
SUCCESS=$((SUCCESS + 1))
|
||||
else
|
||||
echo "❌ Failed to update work_mem"
|
||||
fi
|
||||
|
||||
# Fix 3: maintenance_work_mem
|
||||
if sudo -u postgres psql -c "ALTER SYSTEM SET maintenance_work_mem = '$RECOMMENDED_MAINTENANCE_WORK_MEM';" 2>/dev/null; then
|
||||
echo "✅ maintenance_work_mem updated successfully"
|
||||
SUCCESS=$((SUCCESS + 1))
|
||||
elif psql -c "ALTER SYSTEM SET maintenance_work_mem = '$RECOMMENDED_MAINTENANCE_WORK_MEM';" 2>/dev/null; then
|
||||
echo "✅ maintenance_work_mem updated successfully"
|
||||
SUCCESS=$((SUCCESS + 1))
|
||||
else
|
||||
echo "❌ Failed to update maintenance_work_mem"
|
||||
fi
|
||||
|
||||
if [ $SUCCESS -eq 0 ]; then
|
||||
echo
|
||||
echo "❌ All configuration updates failed"
|
||||
echo
|
||||
echo "Manual steps:"
|
||||
echo "1. Connect to PostgreSQL as superuser:"
|
||||
echo " sudo -u postgres psql"
|
||||
echo
|
||||
echo "2. Run these commands:"
|
||||
echo " ALTER SYSTEM SET max_locks_per_transaction = $RECOMMENDED_LOCKS;"
|
||||
echo " ALTER SYSTEM SET work_mem = '$RECOMMENDED_WORK_MEM';"
|
||||
echo " ALTER SYSTEM SET maintenance_work_mem = '$RECOMMENDED_MAINTENANCE_WORK_MEM';"
|
||||
echo
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo
|
||||
echo "✅ Applied $SUCCESS out of 3 configuration changes"
|
||||
|
||||
echo
|
||||
echo "⚠️ IMPORTANT: PostgreSQL restart required!"
|
||||
echo "────────────────────────────────────────────────────────────"
|
||||
echo
|
||||
echo "Restart PostgreSQL using one of these commands:"
|
||||
echo
|
||||
echo " • systemd: sudo systemctl restart postgresql"
|
||||
echo " • pg_ctl: sudo -u postgres pg_ctl restart -D /var/lib/postgresql/data"
|
||||
echo " • service: sudo service postgresql restart"
|
||||
echo
|
||||
echo "📊 Expected capacity after restart:"
|
||||
echo "────────────────────────────────────────────────────────────"
|
||||
echo " Lock capacity: max_locks_per_transaction × (max_connections + max_prepared)"
|
||||
echo " = $RECOMMENDED_LOCKS × (connections + prepared)"
|
||||
echo
|
||||
echo " Work memory: $RECOMMENDED_WORK_MEM per query operation"
|
||||
echo " Maintenance: $RECOMMENDED_MAINTENANCE_WORK_MEM for restore/vacuum/index"
|
||||
echo
|
||||
echo "After restarting, verify with:"
|
||||
echo " psql -c 'SHOW max_locks_per_transaction;'"
|
||||
echo " psql -c 'SHOW work_mem;'"
|
||||
echo " psql -c 'SHOW maintenance_work_mem;'"
|
||||
echo
|
||||
echo "💡 Benefits:"
|
||||
echo " ✓ Prevents 'out of shared memory' errors during restore"
|
||||
echo " ✓ Reduces temp file usage (better performance)"
|
||||
echo " ✓ Faster restore, vacuum, and index operations"
|
||||
echo
|
||||
echo "🔍 For comprehensive diagnostics, run:"
|
||||
echo " ./diagnose_postgres_memory.sh"
|
||||
echo
|
||||
echo "════════════════════════════════════════════════════════════"
|
||||
22
go.mod
22
go.mod
@ -13,25 +13,19 @@ require (
|
||||
github.com/aws/aws-sdk-go-v2/credentials v1.19.2
|
||||
github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.20.12
|
||||
github.com/aws/aws-sdk-go-v2/service/s3 v1.92.1
|
||||
github.com/cenkalti/backoff/v4 v4.3.0
|
||||
github.com/charmbracelet/bubbles v0.21.0
|
||||
github.com/charmbracelet/bubbletea v1.3.10
|
||||
github.com/charmbracelet/lipgloss v1.1.0
|
||||
github.com/dustin/go-humanize v1.0.1
|
||||
github.com/fatih/color v1.18.0
|
||||
github.com/go-sql-driver/mysql v1.9.3
|
||||
github.com/hashicorp/go-multierror v1.1.1
|
||||
github.com/jackc/pgx/v5 v5.7.6
|
||||
github.com/klauspost/pgzip v1.2.6
|
||||
github.com/schollz/progressbar/v3 v3.19.0
|
||||
github.com/mattn/go-sqlite3 v1.14.32
|
||||
github.com/shirou/gopsutil/v3 v3.24.5
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
github.com/spf13/afero v1.15.0
|
||||
github.com/spf13/cobra v1.10.1
|
||||
github.com/spf13/pflag v1.0.9
|
||||
golang.org/x/crypto v0.43.0
|
||||
google.golang.org/api v0.256.0
|
||||
modernc.org/sqlite v1.44.3
|
||||
)
|
||||
|
||||
require (
|
||||
@ -63,6 +57,7 @@ require (
|
||||
github.com/aws/aws-sdk-go-v2/service/sts v1.41.2 // indirect
|
||||
github.com/aws/smithy-go v1.23.2 // indirect
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
|
||||
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc // indirect
|
||||
github.com/charmbracelet/x/ansi v0.10.1 // indirect
|
||||
@ -72,6 +67,7 @@ require (
|
||||
github.com/envoyproxy/go-control-plane/envoy v1.32.4 // indirect
|
||||
github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
|
||||
github.com/fatih/color v1.18.0 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/go-jose/go-jose/v4 v4.1.2 // indirect
|
||||
github.com/go-logr/logr v1.4.3 // indirect
|
||||
@ -82,11 +78,11 @@ require (
|
||||
github.com/googleapis/enterprise-certificate-proxy v0.3.7 // indirect
|
||||
github.com/googleapis/gax-go/v2 v2.15.0 // indirect
|
||||
github.com/hashicorp/errwrap v1.0.0 // indirect
|
||||
github.com/hashicorp/go-multierror v1.1.1 // indirect
|
||||
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
||||
github.com/jackc/pgpassfile v1.0.0 // indirect
|
||||
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
|
||||
github.com/jackc/puddle/v2 v2.2.2 // indirect
|
||||
github.com/klauspost/compress v1.18.3 // indirect
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
|
||||
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||
@ -97,11 +93,11 @@ require (
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
|
||||
github.com/muesli/cancelreader v0.2.2 // indirect
|
||||
github.com/muesli/termenv v0.16.0 // indirect
|
||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
github.com/rivo/uniseg v0.4.7 // indirect
|
||||
github.com/schollz/progressbar/v3 v3.19.0 // indirect
|
||||
github.com/spf13/afero v1.15.0 // indirect
|
||||
github.com/spiffe/go-spiffe/v2 v2.5.0 // indirect
|
||||
github.com/tklauser/go-sysconf v0.3.12 // indirect
|
||||
github.com/tklauser/numcpus v0.6.1 // indirect
|
||||
@ -117,10 +113,9 @@ require (
|
||||
go.opentelemetry.io/otel/sdk v1.37.0 // indirect
|
||||
go.opentelemetry.io/otel/sdk/metric v1.37.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.37.0 // indirect
|
||||
golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect
|
||||
golang.org/x/net v0.46.0 // indirect
|
||||
golang.org/x/oauth2 v0.33.0 // indirect
|
||||
golang.org/x/sync v0.19.0 // indirect
|
||||
golang.org/x/sync v0.18.0 // indirect
|
||||
golang.org/x/sys v0.38.0 // indirect
|
||||
golang.org/x/term v0.36.0 // indirect
|
||||
golang.org/x/text v0.30.0 // indirect
|
||||
@ -130,7 +125,4 @@ require (
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101 // indirect
|
||||
google.golang.org/grpc v1.76.0 // indirect
|
||||
google.golang.org/protobuf v1.36.10 // indirect
|
||||
modernc.org/libc v1.67.6 // indirect
|
||||
modernc.org/mathutil v1.7.1 // indirect
|
||||
modernc.org/memory v1.11.0 // indirect
|
||||
)
|
||||
|
||||
56
go.sum
56
go.sum
@ -102,8 +102,6 @@ github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd h1:vy0G
|
||||
github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd/go.mod h1:xe0nKWGd3eJgtqZRaN9RjMtK7xUYchjzPr7q6kcvCCs=
|
||||
github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQaGIAQ=
|
||||
github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg=
|
||||
github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM=
|
||||
github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY=
|
||||
github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 h1:aQ3y1lwWyqYPiWZThqv1aFbZMiM9vblcSArJRf2Irls=
|
||||
github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
|
||||
@ -147,8 +145,6 @@ github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||
github.com/google/martian/v3 v3.3.3 h1:DIhPTQrbPkgs2yJYdXU/eNACCG5DVQjySNRNlflZ9Fc=
|
||||
github.com/google/martian/v3 v3.3.3/go.mod h1:iEPrYcgCF7jA9OtScMFQyAlZZ4YXTKEtJ1E6RWzmBA0=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||
github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0=
|
||||
github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
@ -161,8 +157,6 @@ github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/U
|
||||
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
|
||||
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
|
||||
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
|
||||
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
|
||||
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
|
||||
@ -173,10 +167,6 @@ github.com/jackc/pgx/v5 v5.7.6 h1:rWQc5FwZSPX58r1OQmkuaNicxdmExaEz5A2DO2hUuTk=
|
||||
github.com/jackc/pgx/v5 v5.7.6/go.mod h1:aruU7o91Tc2q2cFp5h4uP3f6ztExVpyVv88Xl/8Vl8M=
|
||||
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
|
||||
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
|
||||
github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw=
|
||||
github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||
github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU=
|
||||
github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
|
||||
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
|
||||
@ -192,6 +182,8 @@ github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2J
|
||||
github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
|
||||
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
|
||||
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=
|
||||
github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
|
||||
@ -200,8 +192,6 @@ github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELU
|
||||
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
|
||||
github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
|
||||
github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
|
||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ=
|
||||
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU=
|
||||
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo=
|
||||
@ -211,8 +201,6 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
||||
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||
@ -268,16 +256,14 @@ go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mx
|
||||
go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0=
|
||||
golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04=
|
||||
golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0=
|
||||
golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 h1:mgKeJMpvi0yx/sU5GsxQ7p6s2wtOnGAHZWCHUM4KGzY=
|
||||
golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546/go.mod h1:j/pmGrbnkbPtQfxEe5D0VQhZC6qKbfKifgD0oM7sR70=
|
||||
golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA=
|
||||
golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w=
|
||||
golang.org/x/exp v0.0.0-20220909182711-5c715a9e8561 h1:MDc5xs78ZrZr3HMQugiXOAkSZtfTpbJLDr/lwfgO53E=
|
||||
golang.org/x/exp v0.0.0-20220909182711-5c715a9e8561/go.mod h1:cyybsKvd6eL0RnXn6p/Grxp8F5bW7iYuBgsNCOHpMYE=
|
||||
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
|
||||
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
|
||||
golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo=
|
||||
golang.org/x/oauth2 v0.33.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
|
||||
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
|
||||
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
|
||||
golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I=
|
||||
golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
|
||||
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
@ -294,8 +280,6 @@ golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
|
||||
golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
|
||||
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
|
||||
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
|
||||
golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ=
|
||||
golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
|
||||
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
|
||||
@ -315,31 +299,3 @@ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis=
|
||||
modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0=
|
||||
modernc.org/ccgo/v4 v4.30.1 h1:4r4U1J6Fhj98NKfSjnPUN7Ze2c6MnAdL0hWw6+LrJpc=
|
||||
modernc.org/ccgo/v4 v4.30.1/go.mod h1:bIOeI1JL54Utlxn+LwrFyjCx2n2RDiYEaJVSrgdrRfM=
|
||||
modernc.org/fileutil v1.3.40 h1:ZGMswMNc9JOCrcrakF1HrvmergNLAmxOPjizirpfqBA=
|
||||
modernc.org/fileutil v1.3.40/go.mod h1:HxmghZSZVAz/LXcMNwZPA/DRrQZEVP9VX0V4LQGQFOc=
|
||||
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||
modernc.org/gc/v3 v3.1.1 h1:k8T3gkXWY9sEiytKhcgyiZ2L0DTyCQ/nvX+LoCljoRE=
|
||||
modernc.org/gc/v3 v3.1.1/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||
modernc.org/libc v1.67.6 h1:eVOQvpModVLKOdT+LvBPjdQqfrZq+pC39BygcT+E7OI=
|
||||
modernc.org/libc v1.67.6/go.mod h1:JAhxUVlolfYDErnwiqaLvUqc8nfb2r6S6slAgZOnaiE=
|
||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||
modernc.org/sqlite v1.44.3 h1:+39JvV/HWMcYslAwRxHb8067w+2zowvFOUrOWIy9PjY=
|
||||
modernc.org/sqlite v1.44.3/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA=
|
||||
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||
|
||||
@ -1,220 +0,0 @@
|
||||
# DBBackup Prometheus Alerting Rules
|
||||
# Deploy these to your Prometheus server or use Grafana Alerting
|
||||
#
|
||||
# Usage with Prometheus:
|
||||
# Add to prometheus.yml:
|
||||
# rule_files:
|
||||
# - /path/to/alerting-rules.yaml
|
||||
#
|
||||
# Usage with Grafana Alerting:
|
||||
# Import these as Grafana alert rules via the UI or provisioning
|
||||
|
||||
groups:
|
||||
- name: dbbackup_alerts
|
||||
interval: 1m
|
||||
rules:
|
||||
# Critical: No backup in 24 hours
|
||||
- alert: DBBackupRPOCritical
|
||||
expr: dbbackup_rpo_seconds > 86400
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "No backup for {{ $labels.database }} in 24+ hours"
|
||||
description: |
|
||||
Database {{ $labels.database }} on {{ $labels.server }} has not been
|
||||
backed up in {{ $value | humanizeDuration }}. This exceeds the 24-hour
|
||||
RPO threshold. Immediate investigation required.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#rpo-critical"
|
||||
|
||||
# Warning: No backup in 12 hours
|
||||
- alert: DBBackupRPOWarning
|
||||
expr: dbbackup_rpo_seconds > 43200 and dbbackup_rpo_seconds <= 86400
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "No backup for {{ $labels.database }} in 12+ hours"
|
||||
description: |
|
||||
Database {{ $labels.database }} on {{ $labels.server }} has not been
|
||||
backed up in {{ $value | humanizeDuration }}. Check backup schedule.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#rpo-warning"
|
||||
|
||||
# Critical: Backup failures detected
|
||||
- alert: DBBackupFailure
|
||||
expr: increase(dbbackup_backup_total{status="failure"}[1h]) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Backup failure detected for {{ $labels.database }}"
|
||||
description: |
|
||||
One or more backup attempts failed for {{ $labels.database }} on
|
||||
{{ $labels.server }} in the last hour. Check logs for details.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#backup-failure"
|
||||
|
||||
# Warning: Backup not verified
|
||||
- alert: DBBackupNotVerified
|
||||
expr: dbbackup_backup_verified == 0
|
||||
for: 24h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Backup for {{ $labels.database }} not verified"
|
||||
description: |
|
||||
The latest backup for {{ $labels.database }} on {{ $labels.server }}
|
||||
has not been verified. Consider running verification to ensure
|
||||
backup integrity.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#verification"
|
||||
|
||||
# Warning: Dedup ratio dropping
|
||||
- alert: DBBackupDedupRatioLow
|
||||
expr: dbbackup_dedup_ratio < 0.1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Low deduplication ratio on {{ $labels.server }}"
|
||||
description: |
|
||||
Deduplication ratio on {{ $labels.server }} is {{ $value | printf "%.1f%%" }}.
|
||||
This may indicate changes in data patterns or dedup configuration issues.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#dedup-low"
|
||||
|
||||
# Warning: Dedup disk usage growing rapidly
|
||||
- alert: DBBackupDedupDiskGrowth
|
||||
expr: |
|
||||
predict_linear(dbbackup_dedup_disk_usage_bytes[7d], 30*24*3600) >
|
||||
(dbbackup_dedup_disk_usage_bytes * 2)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Rapid dedup storage growth on {{ $labels.server }}"
|
||||
description: |
|
||||
Dedup storage on {{ $labels.server }} is growing rapidly.
|
||||
At current rate, usage will double in 30 days.
|
||||
Current usage: {{ $value | humanize1024 }}B
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#storage-growth"
|
||||
|
||||
# PITR: Archive lag high
|
||||
- alert: DBBackupPITRArchiveLag
|
||||
expr: dbbackup_pitr_archive_lag_seconds > 600
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PITR archive lag high for {{ $labels.database }}"
|
||||
description: |
|
||||
WAL/binlog archiving for {{ $labels.database }} on {{ $labels.server }}
|
||||
is {{ $value | humanizeDuration }} behind. This reduces the PITR
|
||||
recovery point. Check archive process and disk space.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-archive-lag"
|
||||
|
||||
# PITR: Archive lag critical
|
||||
- alert: DBBackupPITRArchiveLagCritical
|
||||
expr: dbbackup_pitr_archive_lag_seconds > 1800
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR archive severely behind for {{ $labels.database }}"
|
||||
description: |
|
||||
WAL/binlog archiving for {{ $labels.database }} is {{ $value | humanizeDuration }}
|
||||
behind. Point-in-time recovery capability is at risk. Immediate action required.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-archive-critical"
|
||||
|
||||
# PITR: Chain broken (gaps detected)
|
||||
- alert: DBBackupPITRChainBroken
|
||||
expr: dbbackup_pitr_chain_valid == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR chain broken for {{ $labels.database }}"
|
||||
description: |
|
||||
The WAL/binlog chain for {{ $labels.database }} on {{ $labels.server }}
|
||||
has gaps. Point-in-time recovery to arbitrary points is NOT possible.
|
||||
A new base backup is required to restore PITR capability.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-chain-broken"
|
||||
|
||||
# PITR: Gaps in chain
|
||||
- alert: DBBackupPITRGapsDetected
|
||||
expr: dbbackup_pitr_gap_count > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PITR chain has {{ $value }} gaps for {{ $labels.database }}"
|
||||
description: |
|
||||
{{ $value }} gaps detected in WAL/binlog chain for {{ $labels.database }}.
|
||||
Recovery to points within gaps will fail. Consider taking a new base backup.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-gaps"
|
||||
|
||||
# PITR: Unexpectedly disabled
|
||||
- alert: DBBackupPITRDisabled
|
||||
expr: |
|
||||
dbbackup_pitr_enabled == 0
|
||||
and on(database) dbbackup_pitr_archive_count > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PITR unexpectedly disabled for {{ $labels.database }}"
|
||||
description: |
|
||||
PITR was previously enabled for {{ $labels.database }} (has archived logs)
|
||||
but is now disabled. This may indicate a configuration issue or
|
||||
database restart without PITR settings.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#pitr-disabled"
|
||||
|
||||
# Backup type: No full backups recently
|
||||
- alert: DBBackupNoRecentFullBackup
|
||||
expr: |
|
||||
time() - dbbackup_last_success_timestamp{backup_type="full"} > 604800
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "No full backup in 7+ days for {{ $labels.database }}"
|
||||
description: |
|
||||
Database {{ $labels.database }} has not had a full backup in over 7 days.
|
||||
Incremental backups depend on a valid full backup base.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#no-full-backup"
|
||||
|
||||
# Info: Exporter not responding
|
||||
- alert: DBBackupExporterDown
|
||||
expr: up{job="dbbackup"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "DBBackup exporter is down on {{ $labels.instance }}"
|
||||
description: |
|
||||
The DBBackup Prometheus exporter on {{ $labels.instance }} is not
|
||||
responding. Metrics collection is affected.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#exporter-down"
|
||||
|
||||
# Info: Metrics stale (scrape timestamp old)
|
||||
- alert: DBBackupMetricsStale
|
||||
expr: time() - dbbackup_scrape_timestamp > 600
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "DBBackup metrics are stale on {{ $labels.server }}"
|
||||
description: |
|
||||
Metrics for {{ $labels.server }} haven't been updated in
|
||||
{{ $value | humanizeDuration }}. The exporter may be having issues.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#metrics-stale"
|
||||
|
||||
# Critical: No successful backups ever
|
||||
- alert: DBBackupNeverSucceeded
|
||||
expr: dbbackup_backup_total{status="success"} == 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "No successful backups for {{ $labels.database }}"
|
||||
description: |
|
||||
Database {{ $labels.database }} on {{ $labels.server }} has never
|
||||
had a successful backup. This requires immediate attention.
|
||||
runbook_url: "https://github.com/your-org/dbbackup/wiki/Runbooks#never-succeeded"
|
||||
@ -15,33 +15,18 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Comprehensive monitoring dashboard for DBBackup - tracks backup status, RPO, deduplication, and verification across all database servers.",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 200,
|
||||
"panels": [],
|
||||
"title": "Backup Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Shows SUCCESS if RPO is under 7 days, FAILED otherwise. Green = healthy backup schedule.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -82,9 +67,9 @@
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 5,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
@ -109,7 +94,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_rpo_seconds{server=~\"$server\"} < bool 604800",
|
||||
"expr": "dbbackup_rpo_seconds{instance=~\"$instance\"} < bool 604800",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
@ -123,7 +108,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Time elapsed since the last successful backup. Green < 12h, Yellow < 24h, Red > 24h.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -153,9 +137,9 @@
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 5,
|
||||
"x": 5,
|
||||
"y": 1
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
@ -180,7 +164,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_rpo_seconds{server=~\"$server\"}",
|
||||
"expr": "dbbackup_rpo_seconds{instance=~\"$instance\"}",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
@ -194,89 +178,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Whether the most recent backup was verified successfully. 1 = verified and valid.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "orange",
|
||||
"index": 1,
|
||||
"text": "NOT VERIFIED"
|
||||
},
|
||||
"1": {
|
||||
"color": "green",
|
||||
"index": 0,
|
||||
"text": "VERIFIED"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "orange",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 5,
|
||||
"x": 10,
|
||||
"y": 1
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_backup_verified{server=~\"$server\"}",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Verification Status",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Total count of successful backup completions.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -297,9 +198,9 @@
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 4,
|
||||
"x": 15,
|
||||
"y": 1
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
@ -324,7 +225,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_backup_total{server=~\"$server\", status=\"success\"}",
|
||||
"expr": "dbbackup_backup_total{instance=~\"$instance\", status=\"success\"}",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
@ -338,7 +239,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Total count of failed backup attempts. Any value > 0 warrants investigation.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -363,9 +263,9 @@
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 5,
|
||||
"x": 19,
|
||||
"y": 1
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 0
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
@ -390,7 +290,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_backup_total{server=~\"$server\", status=\"failure\"}",
|
||||
"expr": "dbbackup_backup_total{instance=~\"$instance\", status=\"failure\"}",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
@ -404,7 +304,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Recovery Point Objective over time. Shows how long since the last successful backup. Red line at 24h threshold.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -463,7 +362,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
"y": 4
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
@ -485,8 +384,8 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_rpo_seconds{server=~\"$server\"}",
|
||||
"legendFormat": "{{server}} - {{database}}",
|
||||
"expr": "dbbackup_rpo_seconds{instance=~\"$instance\"}",
|
||||
"legendFormat": "{{instance}} - {{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
@ -499,7 +398,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Size of each backup over time. Useful for capacity planning and detecting unexpected growth.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -554,7 +452,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 5
|
||||
"y": 4
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
@ -576,8 +474,8 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_last_backup_size_bytes{server=~\"$server\"}",
|
||||
"legendFormat": "{{server}} - {{database}}",
|
||||
"expr": "dbbackup_last_backup_size_bytes{instance=~\"$instance\"}",
|
||||
"legendFormat": "{{instance}} - {{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
@ -590,7 +488,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "How long each backup takes. Monitor for trends that may indicate database growth or performance issues.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -645,7 +542,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 13
|
||||
"y": 12
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
@ -667,8 +564,8 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_last_backup_duration_seconds{server=~\"$server\"}",
|
||||
"legendFormat": "{{server}} - {{database}}",
|
||||
"expr": "dbbackup_last_backup_duration_seconds{instance=~\"$instance\"}",
|
||||
"legendFormat": "{{instance}} - {{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
@ -681,7 +578,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Summary table showing current status of all databases with color-coded RPO and backup sizes.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -798,7 +694,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 13
|
||||
"y": 12
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
@ -821,7 +717,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_rpo_seconds{server=~\"$server\"}",
|
||||
"expr": "dbbackup_rpo_seconds{instance=~\"$instance\"}",
|
||||
"format": "table",
|
||||
"hide": false,
|
||||
"instant": true,
|
||||
@ -835,7 +731,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_last_backup_size_bytes{server=~\"$server\"}",
|
||||
"expr": "dbbackup_last_backup_size_bytes{instance=~\"$instance\"}",
|
||||
"format": "table",
|
||||
"hide": false,
|
||||
"instant": true,
|
||||
@ -896,7 +792,7 @@
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
"y": 30
|
||||
},
|
||||
"id": 100,
|
||||
"panels": [],
|
||||
@ -908,7 +804,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Overall deduplication efficiency (0-1). Higher values mean more duplicate data eliminated. 0.5 = 50% space savings.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -932,7 +827,7 @@
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 22
|
||||
"y": 31
|
||||
},
|
||||
"id": 101,
|
||||
"options": {
|
||||
@ -955,7 +850,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_ratio{server=~\"$server\"}",
|
||||
"expr": "dbbackup_dedup_ratio{instance=~\"$instance\"}",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
@ -969,7 +864,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Total bytes saved by deduplication across all backups.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -993,7 +887,7 @@
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 22
|
||||
"y": 31
|
||||
},
|
||||
"id": 102,
|
||||
"options": {
|
||||
@ -1016,7 +910,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_space_saved_bytes{server=~\"$server\"}",
|
||||
"expr": "dbbackup_dedup_space_saved_bytes{instance=~\"$instance\"}",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
@ -1030,7 +924,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Actual disk usage of the chunk store after deduplication.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -1054,7 +947,7 @@
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 22
|
||||
"y": 31
|
||||
},
|
||||
"id": 103,
|
||||
"options": {
|
||||
@ -1077,7 +970,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_disk_usage_bytes{server=~\"$server\"}",
|
||||
"expr": "dbbackup_dedup_disk_usage_bytes{instance=~\"$instance\"}",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
@ -1091,7 +984,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Total number of unique content-addressed chunks in the dedup store.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -1115,7 +1007,7 @@
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 22
|
||||
"y": 31
|
||||
},
|
||||
"id": 104,
|
||||
"options": {
|
||||
@ -1138,7 +1030,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_chunks_total{server=~\"$server\"}",
|
||||
"expr": "dbbackup_dedup_chunks_total{instance=~\"$instance\"}",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
@ -1152,190 +1044,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Compression ratio achieved (0-1). Higher = better compression of chunk data.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "orange",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 4,
|
||||
"x": 0,
|
||||
"y": 27
|
||||
},
|
||||
"id": 107,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_compression_ratio{server=~\"$server\"}",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Compression Ratio",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Timestamp of the oldest chunk - useful for monitoring retention policy.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "semi-dark-blue",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "dateTimeFromNow"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 4,
|
||||
"x": 4,
|
||||
"y": 27
|
||||
},
|
||||
"id": 108,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_oldest_chunk_timestamp{server=~\"$server\"} * 1000",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Oldest Chunk",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Timestamp of the newest chunk - confirms dedup is working on recent backups.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "semi-dark-green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "dateTimeFromNow"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 4,
|
||||
"x": 8,
|
||||
"y": 27
|
||||
},
|
||||
"id": 109,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_newest_chunk_timestamp{server=~\"$server\"} * 1000",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Newest Chunk",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Per-database deduplication efficiency over time. Compare databases to identify which benefit most from dedup.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -1391,7 +1099,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
"y": 36
|
||||
},
|
||||
"id": 105,
|
||||
"options": {
|
||||
@ -1414,7 +1122,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_database_ratio{server=~\"$server\"}",
|
||||
"expr": "dbbackup_dedup_database_ratio{instance=~\"$instance\"}",
|
||||
"legendFormat": "{{database}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
@ -1428,7 +1136,6 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"description": "Storage trends: compare space saved by dedup vs actual disk usage over time.",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -1484,7 +1191,7 @@
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 32
|
||||
"y": 36
|
||||
},
|
||||
"id": 106,
|
||||
"options": {
|
||||
@ -1507,7 +1214,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_space_saved_bytes{server=~\"$server\"}",
|
||||
"expr": "dbbackup_dedup_space_saved_bytes{instance=~\"$instance\"}",
|
||||
"legendFormat": "Space Saved",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
@ -1518,7 +1225,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "dbbackup_dedup_disk_usage_bytes{server=~\"$server\"}",
|
||||
"expr": "dbbackup_dedup_disk_usage_bytes{instance=~\"$instance\"}",
|
||||
"legendFormat": "Disk Usage",
|
||||
"range": true,
|
||||
"refId": "B"
|
||||
@ -1534,8 +1241,7 @@
|
||||
"dbbackup",
|
||||
"backup",
|
||||
"database",
|
||||
"dedup",
|
||||
"monitoring"
|
||||
"dedup"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
@ -1549,15 +1255,15 @@
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"definition": "label_values(dbbackup_rpo_seconds, server)",
|
||||
"definition": "label_values(dbbackup_rpo_seconds, instance)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Server",
|
||||
"label": "Instance",
|
||||
"multi": true,
|
||||
"name": "server",
|
||||
"name": "instance",
|
||||
"options": [],
|
||||
"query": {
|
||||
"query": "label_values(dbbackup_rpo_seconds, server)",
|
||||
"query": "label_values(dbbackup_rpo_seconds, instance)",
|
||||
"refId": "StandardVariableQuery"
|
||||
},
|
||||
"refresh": 2,
|
||||
|
||||
@ -201,12 +201,12 @@ func buildAuthMismatchMessage(osUser, dbUser string, method AuthMethod) string {
|
||||
msg.WriteString("\n[WARN] Authentication Mismatch Detected\n")
|
||||
msg.WriteString(strings.Repeat("=", 60) + "\n\n")
|
||||
|
||||
msg.WriteString(" PostgreSQL is using '" + string(method) + "' authentication\n")
|
||||
msg.WriteString(" OS user '" + osUser + "' cannot authenticate as DB user '" + dbUser + "'\n\n")
|
||||
msg.WriteString(fmt.Sprintf(" PostgreSQL is using '%s' authentication\n", method))
|
||||
msg.WriteString(fmt.Sprintf(" OS user '%s' cannot authenticate as DB user '%s'\n\n", osUser, dbUser))
|
||||
|
||||
msg.WriteString("[TIP] Solutions (choose one):\n\n")
|
||||
|
||||
msg.WriteString(" 1. Run as matching user:\n")
|
||||
msg.WriteString(fmt.Sprintf(" 1. Run as matching user:\n"))
|
||||
msg.WriteString(fmt.Sprintf(" sudo -u %s %s\n\n", dbUser, getCommandLine()))
|
||||
|
||||
msg.WriteString(" 2. Configure ~/.pgpass file (recommended):\n")
|
||||
@ -214,11 +214,11 @@ func buildAuthMismatchMessage(osUser, dbUser string, method AuthMethod) string {
|
||||
msg.WriteString(" chmod 0600 ~/.pgpass\n\n")
|
||||
|
||||
msg.WriteString(" 3. Set PGPASSWORD environment variable:\n")
|
||||
msg.WriteString(" export PGPASSWORD=your_password\n")
|
||||
msg.WriteString(" " + getCommandLine() + "\n\n")
|
||||
msg.WriteString(fmt.Sprintf(" export PGPASSWORD=your_password\n"))
|
||||
msg.WriteString(fmt.Sprintf(" %s\n\n", getCommandLine()))
|
||||
|
||||
msg.WriteString(" 4. Provide password via flag:\n")
|
||||
msg.WriteString(" " + getCommandLine() + " --password your_password\n\n")
|
||||
msg.WriteString(fmt.Sprintf(" %s --password your_password\n\n", getCommandLine()))
|
||||
|
||||
msg.WriteString("[NOTE] Note: For production use, ~/.pgpass or PGPASSWORD are recommended\n")
|
||||
msg.WriteString(" to avoid exposing passwords in command history.\n\n")
|
||||
|
||||
@ -20,7 +20,6 @@ import (
|
||||
"dbbackup/internal/cloud"
|
||||
"dbbackup/internal/config"
|
||||
"dbbackup/internal/database"
|
||||
"dbbackup/internal/fs"
|
||||
"dbbackup/internal/logger"
|
||||
"dbbackup/internal/metadata"
|
||||
"dbbackup/internal/metrics"
|
||||
@ -105,6 +104,13 @@ func (e *Engine) SetDatabaseProgressCallback(cb DatabaseProgressCallback) {
|
||||
e.dbProgressCallback = cb
|
||||
}
|
||||
|
||||
// reportProgress reports progress to the callback if set
|
||||
func (e *Engine) reportProgress(current, total int64, description string) {
|
||||
if e.progressCallback != nil {
|
||||
e.progressCallback(current, total, description)
|
||||
}
|
||||
}
|
||||
|
||||
// reportDatabaseProgress reports database count progress to the callback if set
|
||||
func (e *Engine) reportDatabaseProgress(done, total int, dbName string) {
|
||||
if e.dbProgressCallback != nil {
|
||||
@ -707,7 +713,6 @@ func (e *Engine) monitorCommandProgress(stderr io.ReadCloser, tracker *progress.
|
||||
}
|
||||
|
||||
// executeMySQLWithProgressAndCompression handles MySQL backup with compression and progress
|
||||
// Uses in-process pgzip for parallel compression (2-4x faster on multi-core systems)
|
||||
func (e *Engine) executeMySQLWithProgressAndCompression(ctx context.Context, cmdArgs []string, outputFile string, tracker *progress.OperationTracker) error {
|
||||
// Create mysqldump command
|
||||
dumpCmd := exec.CommandContext(ctx, cmdArgs[0], cmdArgs[1:]...)
|
||||
@ -716,6 +721,9 @@ func (e *Engine) executeMySQLWithProgressAndCompression(ctx context.Context, cmd
|
||||
dumpCmd.Env = append(dumpCmd.Env, "MYSQL_PWD="+e.cfg.Password)
|
||||
}
|
||||
|
||||
// Create gzip command
|
||||
gzipCmd := exec.CommandContext(ctx, "gzip", fmt.Sprintf("-%d", e.cfg.CompressionLevel))
|
||||
|
||||
// Create output file
|
||||
outFile, err := os.Create(outputFile)
|
||||
if err != nil {
|
||||
@ -723,19 +731,15 @@ func (e *Engine) executeMySQLWithProgressAndCompression(ctx context.Context, cmd
|
||||
}
|
||||
defer outFile.Close()
|
||||
|
||||
// Create parallel gzip writer using pgzip
|
||||
gzWriter, err := fs.NewParallelGzipWriter(outFile, e.cfg.CompressionLevel)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create gzip writer: %w", err)
|
||||
}
|
||||
defer gzWriter.Close()
|
||||
|
||||
// Set up pipeline: mysqldump stdout -> pgzip writer -> file
|
||||
// Set up pipeline: mysqldump | gzip > outputfile
|
||||
pipe, err := dumpCmd.StdoutPipe()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create pipe: %w", err)
|
||||
}
|
||||
|
||||
gzipCmd.Stdin = pipe
|
||||
gzipCmd.Stdout = outFile
|
||||
|
||||
// Get stderr for progress monitoring
|
||||
stderr, err := dumpCmd.StderrPipe()
|
||||
if err != nil {
|
||||
@ -749,17 +753,15 @@ func (e *Engine) executeMySQLWithProgressAndCompression(ctx context.Context, cmd
|
||||
e.monitorCommandProgress(stderr, tracker)
|
||||
}()
|
||||
|
||||
// Start mysqldump
|
||||
if err := dumpCmd.Start(); err != nil {
|
||||
return fmt.Errorf("failed to start mysqldump: %w", err)
|
||||
// Start both commands
|
||||
if err := gzipCmd.Start(); err != nil {
|
||||
return fmt.Errorf("failed to start gzip: %w", err)
|
||||
}
|
||||
|
||||
// Copy mysqldump output through pgzip in a goroutine
|
||||
copyDone := make(chan error, 1)
|
||||
go func() {
|
||||
_, err := io.Copy(gzWriter, pipe)
|
||||
copyDone <- err
|
||||
}()
|
||||
if err := dumpCmd.Start(); err != nil {
|
||||
gzipCmd.Process.Kill()
|
||||
return fmt.Errorf("failed to start mysqldump: %w", err)
|
||||
}
|
||||
|
||||
// Wait for mysqldump with context handling
|
||||
dumpDone := make(chan error, 1)
|
||||
@ -774,6 +776,7 @@ func (e *Engine) executeMySQLWithProgressAndCompression(ctx context.Context, cmd
|
||||
case <-ctx.Done():
|
||||
e.log.Warn("Backup cancelled - killing mysqldump")
|
||||
dumpCmd.Process.Kill()
|
||||
gzipCmd.Process.Kill()
|
||||
<-dumpDone
|
||||
return ctx.Err()
|
||||
}
|
||||
@ -781,14 +784,10 @@ func (e *Engine) executeMySQLWithProgressAndCompression(ctx context.Context, cmd
|
||||
// Wait for stderr reader
|
||||
<-stderrDone
|
||||
|
||||
// Wait for copy to complete
|
||||
if copyErr := <-copyDone; copyErr != nil {
|
||||
return fmt.Errorf("compression failed: %w", copyErr)
|
||||
}
|
||||
|
||||
// Close gzip writer to flush all data
|
||||
if err := gzWriter.Close(); err != nil {
|
||||
return fmt.Errorf("failed to close gzip writer: %w", err)
|
||||
// Close pipe and wait for gzip
|
||||
pipe.Close()
|
||||
if err := gzipCmd.Wait(); err != nil {
|
||||
return fmt.Errorf("gzip failed: %w", err)
|
||||
}
|
||||
|
||||
if dumpErr != nil {
|
||||
@ -799,7 +798,6 @@ func (e *Engine) executeMySQLWithProgressAndCompression(ctx context.Context, cmd
|
||||
}
|
||||
|
||||
// executeMySQLWithCompression handles MySQL backup with compression
|
||||
// Uses in-process pgzip for parallel compression (2-4x faster on multi-core systems)
|
||||
func (e *Engine) executeMySQLWithCompression(ctx context.Context, cmdArgs []string, outputFile string) error {
|
||||
// Create mysqldump command
|
||||
dumpCmd := exec.CommandContext(ctx, cmdArgs[0], cmdArgs[1:]...)
|
||||
@ -808,6 +806,9 @@ func (e *Engine) executeMySQLWithCompression(ctx context.Context, cmdArgs []stri
|
||||
dumpCmd.Env = append(dumpCmd.Env, "MYSQL_PWD="+e.cfg.Password)
|
||||
}
|
||||
|
||||
// Create gzip command
|
||||
gzipCmd := exec.CommandContext(ctx, "gzip", fmt.Sprintf("-%d", e.cfg.CompressionLevel))
|
||||
|
||||
// Create output file
|
||||
outFile, err := os.Create(outputFile)
|
||||
if err != nil {
|
||||
@ -815,31 +816,25 @@ func (e *Engine) executeMySQLWithCompression(ctx context.Context, cmdArgs []stri
|
||||
}
|
||||
defer outFile.Close()
|
||||
|
||||
// Create parallel gzip writer using pgzip
|
||||
gzWriter, err := fs.NewParallelGzipWriter(outFile, e.cfg.CompressionLevel)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create gzip writer: %w", err)
|
||||
}
|
||||
defer gzWriter.Close()
|
||||
|
||||
// Set up pipeline: mysqldump stdout -> pgzip writer -> file
|
||||
pipe, err := dumpCmd.StdoutPipe()
|
||||
// Set up pipeline: mysqldump | gzip > outputfile
|
||||
stdin, err := dumpCmd.StdoutPipe()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create pipe: %w", err)
|
||||
}
|
||||
gzipCmd.Stdin = stdin
|
||||
gzipCmd.Stdout = outFile
|
||||
|
||||
// Start gzip first
|
||||
if err := gzipCmd.Start(); err != nil {
|
||||
return fmt.Errorf("failed to start gzip: %w", err)
|
||||
}
|
||||
|
||||
// Start mysqldump
|
||||
if err := dumpCmd.Start(); err != nil {
|
||||
gzipCmd.Process.Kill()
|
||||
return fmt.Errorf("failed to start mysqldump: %w", err)
|
||||
}
|
||||
|
||||
// Copy mysqldump output through pgzip in a goroutine
|
||||
copyDone := make(chan error, 1)
|
||||
go func() {
|
||||
_, err := io.Copy(gzWriter, pipe)
|
||||
copyDone <- err
|
||||
}()
|
||||
|
||||
// Wait for mysqldump with context handling
|
||||
dumpDone := make(chan error, 1)
|
||||
go func() {
|
||||
@ -853,18 +848,15 @@ func (e *Engine) executeMySQLWithCompression(ctx context.Context, cmdArgs []stri
|
||||
case <-ctx.Done():
|
||||
e.log.Warn("Backup cancelled - killing mysqldump")
|
||||
dumpCmd.Process.Kill()
|
||||
gzipCmd.Process.Kill()
|
||||
<-dumpDone
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
// Wait for copy to complete
|
||||
if copyErr := <-copyDone; copyErr != nil {
|
||||
return fmt.Errorf("compression failed: %w", copyErr)
|
||||
}
|
||||
|
||||
// Close gzip writer to flush all data
|
||||
if err := gzWriter.Close(); err != nil {
|
||||
return fmt.Errorf("failed to close gzip writer: %w", err)
|
||||
// Close pipe and wait for gzip
|
||||
stdin.Close()
|
||||
if err := gzipCmd.Wait(); err != nil {
|
||||
return fmt.Errorf("gzip failed: %w", err)
|
||||
}
|
||||
|
||||
if dumpErr != nil {
|
||||
@ -960,74 +952,125 @@ func (e *Engine) backupGlobals(ctx context.Context, tempDir string) error {
|
||||
cmd.Env = append(cmd.Env, "PGPASSWORD="+e.cfg.Password)
|
||||
}
|
||||
|
||||
// Use Start/Wait pattern for proper Ctrl+C handling
|
||||
stdout, err := cmd.StdoutPipe()
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create stdout pipe: %w", err)
|
||||
}
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
return fmt.Errorf("failed to start pg_dumpall: %w", err)
|
||||
}
|
||||
|
||||
// Read output in goroutine
|
||||
var output []byte
|
||||
var readErr error
|
||||
readDone := make(chan struct{})
|
||||
go func() {
|
||||
defer close(readDone)
|
||||
output, readErr = io.ReadAll(stdout)
|
||||
}()
|
||||
|
||||
// Wait for command with proper context handling
|
||||
cmdDone := make(chan error, 1)
|
||||
go func() {
|
||||
cmdDone <- cmd.Wait()
|
||||
}()
|
||||
|
||||
var cmdErr error
|
||||
select {
|
||||
case cmdErr = <-cmdDone:
|
||||
// Command completed normally
|
||||
case <-ctx.Done():
|
||||
e.log.Warn("Globals backup cancelled - killing pg_dumpall")
|
||||
cmd.Process.Kill()
|
||||
<-cmdDone
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
<-readDone
|
||||
|
||||
if cmdErr != nil {
|
||||
return fmt.Errorf("pg_dumpall failed: %w", cmdErr)
|
||||
}
|
||||
if readErr != nil {
|
||||
return fmt.Errorf("failed to read pg_dumpall output: %w", readErr)
|
||||
return fmt.Errorf("pg_dumpall failed: %w", err)
|
||||
}
|
||||
|
||||
return os.WriteFile(globalsFile, output, 0644)
|
||||
}
|
||||
|
||||
// createArchive creates a compressed tar archive using parallel gzip compression
|
||||
// Uses in-process pgzip for 2-4x faster compression on multi-core systems
|
||||
// createArchive creates a compressed tar archive
|
||||
func (e *Engine) createArchive(ctx context.Context, sourceDir, outputFile string) error {
|
||||
e.log.Debug("Creating archive with parallel compression",
|
||||
"source", sourceDir,
|
||||
"output", outputFile,
|
||||
"compression", e.cfg.CompressionLevel)
|
||||
// Use pigz for faster parallel compression if available, otherwise use standard gzip
|
||||
compressCmd := "tar"
|
||||
compressArgs := []string{"-czf", outputFile, "-C", sourceDir, "."}
|
||||
|
||||
// Use in-process parallel compression with pgzip
|
||||
err := fs.CreateTarGzParallel(ctx, sourceDir, outputFile, e.cfg.CompressionLevel, func(progress fs.CreateProgress) {
|
||||
// Optional: log progress for large archives
|
||||
if progress.FilesCount%100 == 0 && progress.FilesCount > 0 {
|
||||
e.log.Debug("Archive progress", "files", progress.FilesCount, "bytes", progress.BytesWritten)
|
||||
// Check if pigz is available for faster parallel compression
|
||||
if _, err := exec.LookPath("pigz"); err == nil {
|
||||
// Use pigz with number of cores for parallel compression
|
||||
compressArgs = []string{"-cf", "-", "-C", sourceDir, "."}
|
||||
cmd := exec.CommandContext(ctx, "tar", compressArgs...)
|
||||
|
||||
// Create output file
|
||||
outFile, err := os.Create(outputFile)
|
||||
if err != nil {
|
||||
// Fallback to regular tar
|
||||
goto regularTar
|
||||
}
|
||||
})
|
||||
defer outFile.Close()
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("parallel archive creation failed: %w", err)
|
||||
// Pipe to pigz for parallel compression
|
||||
pigzCmd := exec.CommandContext(ctx, "pigz", "-p", strconv.Itoa(e.cfg.Jobs))
|
||||
|
||||
tarOut, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
outFile.Close()
|
||||
// Fallback to regular tar
|
||||
goto regularTar
|
||||
}
|
||||
pigzCmd.Stdin = tarOut
|
||||
pigzCmd.Stdout = outFile
|
||||
|
||||
// Start both commands
|
||||
if err := pigzCmd.Start(); err != nil {
|
||||
outFile.Close()
|
||||
goto regularTar
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
pigzCmd.Process.Kill()
|
||||
outFile.Close()
|
||||
goto regularTar
|
||||
}
|
||||
|
||||
// Wait for tar with proper context handling
|
||||
tarDone := make(chan error, 1)
|
||||
go func() {
|
||||
tarDone <- cmd.Wait()
|
||||
}()
|
||||
|
||||
var tarErr error
|
||||
select {
|
||||
case tarErr = <-tarDone:
|
||||
// tar completed
|
||||
case <-ctx.Done():
|
||||
e.log.Warn("Archive creation cancelled - killing processes")
|
||||
cmd.Process.Kill()
|
||||
pigzCmd.Process.Kill()
|
||||
<-tarDone
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
if tarErr != nil {
|
||||
pigzCmd.Process.Kill()
|
||||
return fmt.Errorf("tar failed: %w", tarErr)
|
||||
}
|
||||
|
||||
// Wait for pigz with proper context handling
|
||||
pigzDone := make(chan error, 1)
|
||||
go func() {
|
||||
pigzDone <- pigzCmd.Wait()
|
||||
}()
|
||||
|
||||
var pigzErr error
|
||||
select {
|
||||
case pigzErr = <-pigzDone:
|
||||
case <-ctx.Done():
|
||||
pigzCmd.Process.Kill()
|
||||
<-pigzDone
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
if pigzErr != nil {
|
||||
return fmt.Errorf("pigz compression failed: %w", pigzErr)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
regularTar:
|
||||
// Standard tar with gzip (fallback)
|
||||
cmd := exec.CommandContext(ctx, compressCmd, compressArgs...)
|
||||
|
||||
// Stream stderr to avoid memory issues
|
||||
// Use io.Copy to ensure goroutine completes when pipe closes
|
||||
stderr, err := cmd.StderrPipe()
|
||||
if err == nil {
|
||||
go func() {
|
||||
scanner := bufio.NewScanner(stderr)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if line != "" {
|
||||
e.log.Debug("Archive creation", "output", line)
|
||||
}
|
||||
}
|
||||
// Scanner will exit when stderr pipe closes after cmd.Wait()
|
||||
}()
|
||||
}
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
return fmt.Errorf("tar failed: %w", err)
|
||||
}
|
||||
// cmd.Run() calls Wait() which closes stderr pipe, terminating the goroutine
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@ -2,13 +2,12 @@ package backup
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/klauspost/pgzip"
|
||||
)
|
||||
|
||||
// extractTarGz extracts a tar.gz archive to the specified directory
|
||||
@ -21,8 +20,8 @@ func (e *PostgresIncrementalEngine) extractTarGz(ctx context.Context, archivePat
|
||||
}
|
||||
defer archiveFile.Close()
|
||||
|
||||
// Create parallel gzip reader for faster decompression
|
||||
gzReader, err := pgzip.NewReader(archiveFile)
|
||||
// Create gzip reader
|
||||
gzReader, err := gzip.NewReader(archiveFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create gzip reader: %w", err)
|
||||
}
|
||||
|
||||
@ -2,6 +2,7 @@ package backup
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
@ -12,8 +13,6 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/klauspost/pgzip"
|
||||
|
||||
"dbbackup/internal/logger"
|
||||
"dbbackup/internal/metadata"
|
||||
)
|
||||
@ -368,15 +367,15 @@ func (e *MySQLIncrementalEngine) CalculateFileChecksum(path string) (string, err
|
||||
|
||||
// createTarGz creates a tar.gz archive with the specified changed files
|
||||
func (e *MySQLIncrementalEngine) createTarGz(ctx context.Context, outputFile string, changedFiles []ChangedFile, config *IncrementalBackupConfig) error {
|
||||
// Create output file
|
||||
// Import needed for tar/gzip
|
||||
outFile, err := os.Create(outputFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create output file: %w", err)
|
||||
}
|
||||
defer outFile.Close()
|
||||
|
||||
// Create parallel gzip writer for faster compression
|
||||
gzWriter, err := pgzip.NewWriterLevel(outFile, config.CompressionLevel)
|
||||
// Create gzip writer
|
||||
gzWriter, err := gzip.NewWriterLevel(outFile, config.CompressionLevel)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create gzip writer: %w", err)
|
||||
}
|
||||
@ -461,8 +460,8 @@ func (e *MySQLIncrementalEngine) extractTarGz(ctx context.Context, archivePath,
|
||||
}
|
||||
defer archiveFile.Close()
|
||||
|
||||
// Create parallel gzip reader for faster decompression
|
||||
gzReader, err := pgzip.NewReader(archiveFile)
|
||||
// Create gzip reader
|
||||
gzReader, err := gzip.NewReader(archiveFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create gzip reader: %w", err)
|
||||
}
|
||||
|
||||
@ -2,12 +2,11 @@ package backup
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/klauspost/pgzip"
|
||||
)
|
||||
|
||||
// createTarGz creates a tar.gz archive with the specified changed files
|
||||
@ -19,8 +18,8 @@ func (e *PostgresIncrementalEngine) createTarGz(ctx context.Context, outputFile
|
||||
}
|
||||
defer outFile.Close()
|
||||
|
||||
// Create parallel gzip writer for faster compression
|
||||
gzWriter, err := pgzip.NewWriterLevel(outFile, config.CompressionLevel)
|
||||
// Create gzip writer
|
||||
gzWriter, err := gzip.NewWriterLevel(outFile, config.CompressionLevel)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create gzip writer: %w", err)
|
||||
}
|
||||
|
||||
@ -150,14 +150,12 @@ type Catalog interface {
|
||||
|
||||
// SyncResult contains results from a catalog sync operation
|
||||
type SyncResult struct {
|
||||
Added int `json:"added"`
|
||||
Updated int `json:"updated"`
|
||||
Removed int `json:"removed"`
|
||||
Skipped int `json:"skipped"` // Files without metadata (legacy backups)
|
||||
Errors int `json:"errors"`
|
||||
Duration float64 `json:"duration_seconds"`
|
||||
Details []string `json:"details,omitempty"`
|
||||
LegacyWarning string `json:"legacy_warning,omitempty"` // Warning about legacy files
|
||||
Added int `json:"added"`
|
||||
Updated int `json:"updated"`
|
||||
Removed int `json:"removed"`
|
||||
Errors int `json:"errors"`
|
||||
Duration float64 `json:"duration_seconds"`
|
||||
Details []string `json:"details,omitempty"`
|
||||
}
|
||||
|
||||
// FormatSize formats bytes as human-readable string
|
||||
|
||||
@ -11,7 +11,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
_ "modernc.org/sqlite" // Pure Go SQLite driver (no CGO required)
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
// SQLiteCatalog implements Catalog interface with SQLite storage
|
||||
@ -28,7 +28,7 @@ func NewSQLiteCatalog(dbPath string) (*SQLiteCatalog, error) {
|
||||
return nil, fmt.Errorf("failed to create catalog directory: %w", err)
|
||||
}
|
||||
|
||||
db, err := sql.Open("sqlite", dbPath+"?_journal_mode=WAL&_foreign_keys=ON")
|
||||
db, err := sql.Open("sqlite3", dbPath+"?_journal_mode=WAL&_foreign_keys=ON")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open catalog database: %w", err)
|
||||
}
|
||||
@ -464,8 +464,8 @@ func (c *SQLiteCatalog) Stats(ctx context.Context) (*Stats, error) {
|
||||
MAX(created_at),
|
||||
COALESCE(AVG(duration), 0),
|
||||
CAST(COALESCE(AVG(size_bytes), 0) AS INTEGER),
|
||||
COALESCE(SUM(CASE WHEN verified_at IS NOT NULL THEN 1 ELSE 0 END), 0),
|
||||
COALESCE(SUM(CASE WHEN drill_tested_at IS NOT NULL THEN 1 ELSE 0 END), 0)
|
||||
SUM(CASE WHEN verified_at IS NOT NULL THEN 1 ELSE 0 END),
|
||||
SUM(CASE WHEN drill_tested_at IS NOT NULL THEN 1 ELSE 0 END)
|
||||
FROM backups WHERE status != 'deleted'
|
||||
`)
|
||||
|
||||
@ -548,8 +548,8 @@ func (c *SQLiteCatalog) StatsByDatabase(ctx context.Context, database string) (*
|
||||
MAX(created_at),
|
||||
COALESCE(AVG(duration), 0),
|
||||
COALESCE(AVG(size_bytes), 0),
|
||||
COALESCE(SUM(CASE WHEN verified_at IS NOT NULL THEN 1 ELSE 0 END), 0),
|
||||
COALESCE(SUM(CASE WHEN drill_tested_at IS NOT NULL THEN 1 ELSE 0 END), 0)
|
||||
SUM(CASE WHEN verified_at IS NOT NULL THEN 1 ELSE 0 END),
|
||||
SUM(CASE WHEN drill_tested_at IS NOT NULL THEN 1 ELSE 0 END)
|
||||
FROM backups WHERE database = ? AND status != 'deleted'
|
||||
`, database)
|
||||
|
||||
|
||||
@ -30,33 +30,6 @@ func (c *SQLiteCatalog) SyncFromDirectory(ctx context.Context, dir string) (*Syn
|
||||
subMatches, _ := filepath.Glob(subPattern)
|
||||
matches = append(matches, subMatches...)
|
||||
|
||||
// Count legacy backups (files without metadata)
|
||||
legacySkipped := 0
|
||||
legacyPatterns := []string{
|
||||
filepath.Join(dir, "*.sql"),
|
||||
filepath.Join(dir, "*.sql.gz"),
|
||||
filepath.Join(dir, "*.sql.lz4"),
|
||||
filepath.Join(dir, "*.sql.zst"),
|
||||
filepath.Join(dir, "*.dump"),
|
||||
filepath.Join(dir, "*.dump.gz"),
|
||||
filepath.Join(dir, "*", "*.sql"),
|
||||
filepath.Join(dir, "*", "*.sql.gz"),
|
||||
}
|
||||
metaSet := make(map[string]bool)
|
||||
for _, m := range matches {
|
||||
// Store the backup file path (without .meta.json)
|
||||
metaSet[strings.TrimSuffix(m, ".meta.json")] = true
|
||||
}
|
||||
for _, pat := range legacyPatterns {
|
||||
legacyMatches, _ := filepath.Glob(pat)
|
||||
for _, lm := range legacyMatches {
|
||||
// Skip if this file has metadata
|
||||
if !metaSet[lm] {
|
||||
legacySkipped++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, metaPath := range matches {
|
||||
// Derive backup file path from metadata path
|
||||
backupPath := strings.TrimSuffix(metaPath, ".meta.json")
|
||||
@ -124,17 +97,6 @@ func (c *SQLiteCatalog) SyncFromDirectory(ctx context.Context, dir string) (*Syn
|
||||
}
|
||||
}
|
||||
|
||||
// Set legacy backup warning if applicable
|
||||
result.Skipped = legacySkipped
|
||||
if legacySkipped > 0 {
|
||||
result.LegacyWarning = fmt.Sprintf(
|
||||
"%d backup file(s) found without .meta.json metadata. "+
|
||||
"These are likely legacy backups created by raw mysqldump/pg_dump. "+
|
||||
"Only backups created by 'dbbackup backup' (with metadata) can be imported. "+
|
||||
"To track legacy backups, re-create them using 'dbbackup backup' command.",
|
||||
legacySkipped)
|
||||
}
|
||||
|
||||
result.Duration = time.Since(start).Seconds()
|
||||
return result, nil
|
||||
}
|
||||
|
||||
@ -1,181 +0,0 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// lockRecommendation represents a normalized recommendation for locks
|
||||
type lockRecommendation int
|
||||
|
||||
const (
|
||||
recIncrease lockRecommendation = iota
|
||||
recSingleThreadedOrIncrease
|
||||
recSingleThreaded
|
||||
)
|
||||
|
||||
// determineLockRecommendation contains the pure logic (easy to unit-test).
|
||||
func determineLockRecommendation(locks, conns, prepared int64) (status CheckStatus, rec lockRecommendation) {
|
||||
// follow same thresholds as legacy script
|
||||
switch {
|
||||
case locks < 2048:
|
||||
return StatusFailed, recIncrease
|
||||
case locks < 8192:
|
||||
return StatusWarning, recIncrease
|
||||
case locks < 65536:
|
||||
return StatusWarning, recSingleThreadedOrIncrease
|
||||
default:
|
||||
return StatusPassed, recSingleThreaded
|
||||
}
|
||||
}
|
||||
|
||||
var nonDigits = regexp.MustCompile(`[^0-9]+`)
|
||||
|
||||
// parseNumeric strips non-digits and parses up to 10 characters (like the shell helper)
|
||||
func parseNumeric(s string) (int64, error) {
|
||||
if s == "" {
|
||||
return 0, fmt.Errorf("empty string")
|
||||
}
|
||||
s = nonDigits.ReplaceAllString(s, "")
|
||||
if len(s) > 10 {
|
||||
s = s[:10]
|
||||
}
|
||||
v, err := strconv.ParseInt(s, 10, 64)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("parse error: %w", err)
|
||||
}
|
||||
return v, nil
|
||||
}
|
||||
|
||||
// execPsql runs psql with the supplied arguments and returns stdout (trimmed).
|
||||
// It attempts to avoid leaking passwords in error messages.
|
||||
func execPsql(ctx context.Context, args []string, env []string, useSudo bool) (string, error) {
|
||||
var cmd *exec.Cmd
|
||||
if useSudo {
|
||||
// sudo -u postgres psql --no-psqlrc -t -A -c "..."
|
||||
all := append([]string{"-u", "postgres", "--"}, "psql")
|
||||
all = append(all, args...)
|
||||
cmd = exec.CommandContext(ctx, "sudo", all...)
|
||||
} else {
|
||||
cmd = exec.CommandContext(ctx, "psql", args...)
|
||||
}
|
||||
cmd.Env = append(os.Environ(), env...)
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
// prefer a concise error
|
||||
return "", fmt.Errorf("psql failed: %w", err)
|
||||
}
|
||||
return strings.TrimSpace(string(out)), nil
|
||||
}
|
||||
|
||||
// checkPostgresLocks probes PostgreSQL (via psql) and returns a PreflightCheck.
|
||||
// It intentionally does not require a live internal/database.Database; it uses
|
||||
// the configured connection parameters or falls back to local sudo when possible.
|
||||
func (p *PreflightChecker) checkPostgresLocks(ctx context.Context) PreflightCheck {
|
||||
check := PreflightCheck{Name: "PostgreSQL lock configuration"}
|
||||
|
||||
if !p.cfg.IsPostgreSQL() {
|
||||
check.Status = StatusSkipped
|
||||
check.Message = "Skipped (not a PostgreSQL configuration)"
|
||||
return check
|
||||
}
|
||||
|
||||
// Build common psql args
|
||||
psqlArgs := []string{"--no-psqlrc", "-t", "-A", "-c"}
|
||||
queryLocks := "SHOW max_locks_per_transaction;"
|
||||
queryConns := "SHOW max_connections;"
|
||||
queryPrepared := "SHOW max_prepared_transactions;"
|
||||
|
||||
// Build connection flags
|
||||
if p.cfg.Host != "" {
|
||||
psqlArgs = append(psqlArgs, "-h", p.cfg.Host)
|
||||
}
|
||||
psqlArgs = append(psqlArgs, "-p", fmt.Sprint(p.cfg.Port))
|
||||
if p.cfg.User != "" {
|
||||
psqlArgs = append(psqlArgs, "-U", p.cfg.User)
|
||||
}
|
||||
// Use database if provided (helps some setups)
|
||||
if p.cfg.Database != "" {
|
||||
psqlArgs = append(psqlArgs, "-d", p.cfg.Database)
|
||||
}
|
||||
|
||||
// Env: prefer PGPASSWORD if configured
|
||||
env := []string{}
|
||||
if p.cfg.Password != "" {
|
||||
env = append(env, "PGPASSWORD="+p.cfg.Password)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// helper to run a single SHOW query and parse numeric result
|
||||
runShow := func(q string) (int64, error) {
|
||||
args := append(psqlArgs, q)
|
||||
out, err := execPsql(ctx, args, env, false)
|
||||
if err != nil {
|
||||
// If local host and no explicit auth, try sudo -u postgres
|
||||
if (p.cfg.Host == "" || p.cfg.Host == "localhost" || p.cfg.Host == "127.0.0.1") && p.cfg.Password == "" {
|
||||
out, err = execPsql(ctx, append(psqlArgs, q), env, true)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
} else {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
v, err := parseNumeric(out)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("non-numeric response from psql: %q", out)
|
||||
}
|
||||
return v, nil
|
||||
}
|
||||
|
||||
locks, err := runShow(queryLocks)
|
||||
if err != nil {
|
||||
check.Status = StatusFailed
|
||||
check.Message = "Could not read max_locks_per_transaction"
|
||||
check.Details = err.Error()
|
||||
return check
|
||||
}
|
||||
|
||||
conns, err := runShow(queryConns)
|
||||
if err != nil {
|
||||
check.Status = StatusFailed
|
||||
check.Message = "Could not read max_connections"
|
||||
check.Details = err.Error()
|
||||
return check
|
||||
}
|
||||
|
||||
prepared, _ := runShow(queryPrepared) // optional; treat errors as zero
|
||||
|
||||
// Compute capacity
|
||||
capacity := locks * (conns + prepared)
|
||||
|
||||
status, rec := determineLockRecommendation(locks, conns, prepared)
|
||||
check.Status = status
|
||||
check.Message = fmt.Sprintf("locks=%d connections=%d prepared=%d capacity=%d", locks, conns, prepared, capacity)
|
||||
|
||||
// Human-friendly details + actionable remediation
|
||||
detailLines := []string{fmt.Sprintf("max_locks_per_transaction: %d", locks), fmt.Sprintf("max_connections: %d", conns), fmt.Sprintf("max_prepared_transactions: %d", prepared), fmt.Sprintf("Total lock capacity: %d", capacity)}
|
||||
|
||||
switch rec {
|
||||
case recIncrease:
|
||||
detailLines = append(detailLines, "RECOMMENDATION: Increase to at least 65536 and run restore single-threaded")
|
||||
detailLines = append(detailLines, " sudo -u postgres psql -c \"ALTER SYSTEM SET max_locks_per_transaction = 65536;\" && sudo systemctl restart postgresql")
|
||||
check.Details = strings.Join(detailLines, "\n")
|
||||
case recSingleThreadedOrIncrease:
|
||||
detailLines = append(detailLines, "RECOMMENDATION: Use single-threaded restore (--jobs 1 --parallel-dbs 1) or increase locks to 65536 and still prefer single-threaded")
|
||||
check.Details = strings.Join(detailLines, "\n")
|
||||
case recSingleThreaded:
|
||||
detailLines = append(detailLines, "RECOMMENDATION: Single-threaded restore is safest for very large DBs")
|
||||
check.Details = strings.Join(detailLines, "\n")
|
||||
}
|
||||
|
||||
return check
|
||||
}
|
||||
@ -1,55 +0,0 @@
|
||||
package checks
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestDetermineLockRecommendation(t *testing.T) {
|
||||
tests := []struct {
|
||||
locks int64
|
||||
conns int64
|
||||
prepared int64
|
||||
exStatus CheckStatus
|
||||
exRec lockRecommendation
|
||||
}{
|
||||
{locks: 1024, conns: 100, prepared: 0, exStatus: StatusFailed, exRec: recIncrease},
|
||||
{locks: 4096, conns: 200, prepared: 0, exStatus: StatusWarning, exRec: recIncrease},
|
||||
{locks: 16384, conns: 200, prepared: 0, exStatus: StatusWarning, exRec: recSingleThreadedOrIncrease},
|
||||
{locks: 65536, conns: 200, prepared: 0, exStatus: StatusPassed, exRec: recSingleThreaded},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
st, rec := determineLockRecommendation(tc.locks, tc.conns, tc.prepared)
|
||||
if st != tc.exStatus {
|
||||
t.Fatalf("locks=%d: status = %v, want %v", tc.locks, st, tc.exStatus)
|
||||
}
|
||||
if rec != tc.exRec {
|
||||
t.Fatalf("locks=%d: rec = %v, want %v", tc.locks, rec, tc.exRec)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNumeric(t *testing.T) {
|
||||
cases := map[string]int64{
|
||||
"4096": 4096,
|
||||
" 4096\n": 4096,
|
||||
"4096 (default)": 4096,
|
||||
"unknown": 0, // should error
|
||||
}
|
||||
|
||||
for in, want := range cases {
|
||||
v, err := parseNumeric(in)
|
||||
if want == 0 {
|
||||
if err == nil {
|
||||
t.Fatalf("expected error parsing %q", in)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("parseNumeric(%q) error: %v", in, err)
|
||||
}
|
||||
if v != want {
|
||||
t.Fatalf("parseNumeric(%q) = %d, want %d", in, v, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -120,17 +120,6 @@ func (p *PreflightChecker) RunAllChecks(ctx context.Context, dbName string) (*Pr
|
||||
result.FailureCount++
|
||||
}
|
||||
|
||||
// Postgres lock configuration check (provides explicit restore guidance)
|
||||
locksCheck := p.checkPostgresLocks(ctx)
|
||||
result.Checks = append(result.Checks, locksCheck)
|
||||
if locksCheck.Status == StatusFailed {
|
||||
result.AllPassed = false
|
||||
result.FailureCount++
|
||||
} else if locksCheck.Status == StatusWarning {
|
||||
result.HasWarnings = true
|
||||
result.WarningCount++
|
||||
}
|
||||
|
||||
// Extract database info if connection succeeded
|
||||
if dbCheck.Status == StatusPassed && p.db != nil {
|
||||
version, _ := p.db.GetVersion(ctx)
|
||||
|
||||
@ -162,12 +162,7 @@ func (a *AzureBackend) uploadSimple(ctx context.Context, file *os.File, blobName
|
||||
blockBlobClient := a.client.ServiceClient().NewContainerClient(a.containerName).NewBlockBlobClient(blobName)
|
||||
|
||||
// Wrap reader with progress tracking
|
||||
var reader io.Reader = NewProgressReader(file, fileSize, progress)
|
||||
|
||||
// Apply bandwidth throttling if configured
|
||||
if a.config.BandwidthLimit > 0 {
|
||||
reader = NewThrottledReader(ctx, reader, a.config.BandwidthLimit)
|
||||
}
|
||||
reader := NewProgressReader(file, fileSize, progress)
|
||||
|
||||
// Calculate MD5 hash for integrity
|
||||
hash := sha256.New()
|
||||
@ -209,13 +204,6 @@ func (a *AzureBackend) uploadBlocks(ctx context.Context, file *os.File, blobName
|
||||
hash := sha256.New()
|
||||
var totalUploaded int64
|
||||
|
||||
// Calculate throttle delay per byte if bandwidth limited
|
||||
var throttleDelay time.Duration
|
||||
if a.config.BandwidthLimit > 0 {
|
||||
// Calculate nanoseconds per byte
|
||||
throttleDelay = time.Duration(float64(time.Second) / float64(a.config.BandwidthLimit) * float64(blockSize))
|
||||
}
|
||||
|
||||
for i := int64(0); i < numBlocks; i++ {
|
||||
blockID := base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("block-%08d", i)))
|
||||
blockIDs = append(blockIDs, blockID)
|
||||
@ -237,15 +225,6 @@ func (a *AzureBackend) uploadBlocks(ctx context.Context, file *os.File, blobName
|
||||
// Update hash
|
||||
hash.Write(blockData)
|
||||
|
||||
// Apply throttling between blocks if configured
|
||||
if a.config.BandwidthLimit > 0 && i > 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-time.After(throttleDelay):
|
||||
}
|
||||
}
|
||||
|
||||
// Upload block
|
||||
reader := bytes.NewReader(blockData)
|
||||
_, err = blockBlobClient.StageBlock(ctx, blockID, streaming.NopCloser(reader), nil)
|
||||
|
||||
@ -121,12 +121,7 @@ func (g *GCSBackend) Upload(ctx context.Context, localPath, remotePath string, p
|
||||
|
||||
// Wrap reader with progress tracking and hash calculation
|
||||
hash := sha256.New()
|
||||
var reader io.Reader = NewProgressReader(io.TeeReader(file, hash), fileSize, progress)
|
||||
|
||||
// Apply bandwidth throttling if configured
|
||||
if g.config.BandwidthLimit > 0 {
|
||||
reader = NewThrottledReader(ctx, reader, g.config.BandwidthLimit)
|
||||
}
|
||||
reader := NewProgressReader(io.TeeReader(file, hash), fileSize, progress)
|
||||
|
||||
// Upload with progress tracking
|
||||
_, err = io.Copy(writer, reader)
|
||||
|
||||
@ -46,19 +46,18 @@ type ProgressCallback func(bytesTransferred, totalBytes int64)
|
||||
|
||||
// Config contains common configuration for cloud backends
|
||||
type Config struct {
|
||||
Provider string // "s3", "minio", "azure", "gcs", "b2"
|
||||
Bucket string // Bucket or container name
|
||||
Region string // Region (for S3)
|
||||
Endpoint string // Custom endpoint (for MinIO, S3-compatible)
|
||||
AccessKey string // Access key or account ID
|
||||
SecretKey string // Secret key or access token
|
||||
UseSSL bool // Use SSL/TLS (default: true)
|
||||
PathStyle bool // Use path-style addressing (for MinIO)
|
||||
Prefix string // Prefix for all operations (e.g., "backups/")
|
||||
Timeout int // Timeout in seconds (default: 300)
|
||||
MaxRetries int // Maximum retry attempts (default: 3)
|
||||
Concurrency int // Upload/download concurrency (default: 5)
|
||||
BandwidthLimit int64 // Maximum upload/download bandwidth in bytes/sec (0 = unlimited)
|
||||
Provider string // "s3", "minio", "azure", "gcs", "b2"
|
||||
Bucket string // Bucket or container name
|
||||
Region string // Region (for S3)
|
||||
Endpoint string // Custom endpoint (for MinIO, S3-compatible)
|
||||
AccessKey string // Access key or account ID
|
||||
SecretKey string // Secret key or access token
|
||||
UseSSL bool // Use SSL/TLS (default: true)
|
||||
PathStyle bool // Use path-style addressing (for MinIO)
|
||||
Prefix string // Prefix for all operations (e.g., "backups/")
|
||||
Timeout int // Timeout in seconds (default: 300)
|
||||
MaxRetries int // Maximum retry attempts (default: 3)
|
||||
Concurrency int // Upload/download concurrency (default: 5)
|
||||
}
|
||||
|
||||
// NewBackend creates a new cloud storage backend based on the provider
|
||||
|
||||
@ -183,10 +183,9 @@ func IsRetryableError(err error) bool {
|
||||
}
|
||||
|
||||
// Network errors are typically retryable
|
||||
// Note: netErr.Temporary() is deprecated since Go 1.18 - most "temporary" errors are timeouts
|
||||
var netErr net.Error
|
||||
if ok := isNetError(err, &netErr); ok {
|
||||
return netErr.Timeout()
|
||||
return netErr.Timeout() || netErr.Temporary()
|
||||
}
|
||||
|
||||
errStr := strings.ToLower(err.Error())
|
||||
|
||||
@ -138,11 +138,6 @@ func (s *S3Backend) uploadSimple(ctx context.Context, file *os.File, key string,
|
||||
reader = NewProgressReader(file, fileSize, progress)
|
||||
}
|
||||
|
||||
// Apply bandwidth throttling if configured
|
||||
if s.config.BandwidthLimit > 0 {
|
||||
reader = NewThrottledReader(ctx, reader, s.config.BandwidthLimit)
|
||||
}
|
||||
|
||||
// Upload to S3
|
||||
_, err := s.client.PutObject(ctx, &s3.PutObjectInput{
|
||||
Bucket: aws.String(s.bucket),
|
||||
@ -168,21 +163,13 @@ func (s *S3Backend) uploadMultipart(ctx context.Context, file *os.File, key stri
|
||||
return fmt.Errorf("failed to reset file position: %w", err)
|
||||
}
|
||||
|
||||
// Calculate concurrency based on bandwidth limit
|
||||
// If limited, reduce concurrency to make throttling more effective
|
||||
concurrency := 10
|
||||
if s.config.BandwidthLimit > 0 {
|
||||
// With bandwidth limiting, use fewer concurrent parts
|
||||
concurrency = 3
|
||||
}
|
||||
|
||||
// Create uploader with custom options
|
||||
uploader := manager.NewUploader(s.client, func(u *manager.Uploader) {
|
||||
// Part size: 10MB
|
||||
u.PartSize = 10 * 1024 * 1024
|
||||
|
||||
// Adjust concurrency
|
||||
u.Concurrency = concurrency
|
||||
// Upload up to 10 parts concurrently
|
||||
u.Concurrency = 10
|
||||
|
||||
// Leave parts on failure for debugging
|
||||
u.LeavePartsOnError = false
|
||||
@ -194,11 +181,6 @@ func (s *S3Backend) uploadMultipart(ctx context.Context, file *os.File, key stri
|
||||
reader = NewProgressReader(file, fileSize, progress)
|
||||
}
|
||||
|
||||
// Apply bandwidth throttling if configured
|
||||
if s.config.BandwidthLimit > 0 {
|
||||
reader = NewThrottledReader(ctx, reader, s.config.BandwidthLimit)
|
||||
}
|
||||
|
||||
// Upload with multipart
|
||||
_, err := uploader.Upload(ctx, &s3.PutObjectInput{
|
||||
Bucket: aws.String(s.bucket),
|
||||
|
||||
@ -1,251 +0,0 @@
|
||||
// Package cloud provides throttled readers for bandwidth limiting during cloud uploads/downloads
|
||||
package cloud
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ThrottledReader wraps an io.Reader and limits the read rate to a maximum bytes per second.
|
||||
// This is useful for cloud uploads where you don't want to saturate the network.
|
||||
type ThrottledReader struct {
|
||||
reader io.Reader
|
||||
bytesPerSec int64 // Maximum bytes per second (0 = unlimited)
|
||||
bytesRead int64 // Bytes read in current window
|
||||
windowStart time.Time // Start of current measurement window
|
||||
windowSize time.Duration // Size of the measurement window
|
||||
mu sync.Mutex // Protects bytesRead and windowStart
|
||||
ctx context.Context
|
||||
}
|
||||
|
||||
// NewThrottledReader creates a new bandwidth-limited reader.
|
||||
// bytesPerSec is the maximum transfer rate in bytes per second.
|
||||
// Set to 0 for unlimited bandwidth.
|
||||
func NewThrottledReader(ctx context.Context, reader io.Reader, bytesPerSec int64) *ThrottledReader {
|
||||
return &ThrottledReader{
|
||||
reader: reader,
|
||||
bytesPerSec: bytesPerSec,
|
||||
windowStart: time.Now(),
|
||||
windowSize: 100 * time.Millisecond, // Measure in 100ms windows for smooth throttling
|
||||
ctx: ctx,
|
||||
}
|
||||
}
|
||||
|
||||
// Read implements io.Reader with bandwidth throttling
|
||||
func (t *ThrottledReader) Read(p []byte) (int, error) {
|
||||
// No throttling if unlimited
|
||||
if t.bytesPerSec <= 0 {
|
||||
return t.reader.Read(p)
|
||||
}
|
||||
|
||||
t.mu.Lock()
|
||||
|
||||
// Calculate how many bytes we're allowed in this window
|
||||
now := time.Now()
|
||||
elapsed := now.Sub(t.windowStart)
|
||||
|
||||
// If we've passed the window, reset
|
||||
if elapsed >= t.windowSize {
|
||||
t.bytesRead = 0
|
||||
t.windowStart = now
|
||||
elapsed = 0
|
||||
}
|
||||
|
||||
// Calculate bytes allowed per window
|
||||
bytesPerWindow := int64(float64(t.bytesPerSec) * t.windowSize.Seconds())
|
||||
|
||||
// How many bytes can we still read in this window?
|
||||
remaining := bytesPerWindow - t.bytesRead
|
||||
if remaining <= 0 {
|
||||
// We've exhausted our quota for this window - wait for next window
|
||||
sleepDuration := t.windowSize - elapsed
|
||||
t.mu.Unlock()
|
||||
|
||||
select {
|
||||
case <-t.ctx.Done():
|
||||
return 0, t.ctx.Err()
|
||||
case <-time.After(sleepDuration):
|
||||
}
|
||||
|
||||
// Retry after sleeping
|
||||
return t.Read(p)
|
||||
}
|
||||
|
||||
// Limit read size to remaining quota
|
||||
maxRead := len(p)
|
||||
if int64(maxRead) > remaining {
|
||||
maxRead = int(remaining)
|
||||
}
|
||||
t.mu.Unlock()
|
||||
|
||||
// Perform the actual read
|
||||
n, err := t.reader.Read(p[:maxRead])
|
||||
|
||||
// Track bytes read
|
||||
t.mu.Lock()
|
||||
t.bytesRead += int64(n)
|
||||
t.mu.Unlock()
|
||||
|
||||
return n, err
|
||||
}
|
||||
|
||||
// ThrottledWriter wraps an io.Writer and limits the write rate.
|
||||
type ThrottledWriter struct {
|
||||
writer io.Writer
|
||||
bytesPerSec int64
|
||||
bytesWritten int64
|
||||
windowStart time.Time
|
||||
windowSize time.Duration
|
||||
mu sync.Mutex
|
||||
ctx context.Context
|
||||
}
|
||||
|
||||
// NewThrottledWriter creates a new bandwidth-limited writer.
|
||||
func NewThrottledWriter(ctx context.Context, writer io.Writer, bytesPerSec int64) *ThrottledWriter {
|
||||
return &ThrottledWriter{
|
||||
writer: writer,
|
||||
bytesPerSec: bytesPerSec,
|
||||
windowStart: time.Now(),
|
||||
windowSize: 100 * time.Millisecond,
|
||||
ctx: ctx,
|
||||
}
|
||||
}
|
||||
|
||||
// Write implements io.Writer with bandwidth throttling
|
||||
func (t *ThrottledWriter) Write(p []byte) (int, error) {
|
||||
if t.bytesPerSec <= 0 {
|
||||
return t.writer.Write(p)
|
||||
}
|
||||
|
||||
totalWritten := 0
|
||||
for totalWritten < len(p) {
|
||||
t.mu.Lock()
|
||||
|
||||
now := time.Now()
|
||||
elapsed := now.Sub(t.windowStart)
|
||||
|
||||
if elapsed >= t.windowSize {
|
||||
t.bytesWritten = 0
|
||||
t.windowStart = now
|
||||
elapsed = 0
|
||||
}
|
||||
|
||||
bytesPerWindow := int64(float64(t.bytesPerSec) * t.windowSize.Seconds())
|
||||
remaining := bytesPerWindow - t.bytesWritten
|
||||
|
||||
if remaining <= 0 {
|
||||
sleepDuration := t.windowSize - elapsed
|
||||
t.mu.Unlock()
|
||||
|
||||
select {
|
||||
case <-t.ctx.Done():
|
||||
return totalWritten, t.ctx.Err()
|
||||
case <-time.After(sleepDuration):
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Calculate how much to write
|
||||
toWrite := len(p) - totalWritten
|
||||
if int64(toWrite) > remaining {
|
||||
toWrite = int(remaining)
|
||||
}
|
||||
t.mu.Unlock()
|
||||
|
||||
// Write chunk
|
||||
n, err := t.writer.Write(p[totalWritten : totalWritten+toWrite])
|
||||
totalWritten += n
|
||||
|
||||
t.mu.Lock()
|
||||
t.bytesWritten += int64(n)
|
||||
t.mu.Unlock()
|
||||
|
||||
if err != nil {
|
||||
return totalWritten, err
|
||||
}
|
||||
}
|
||||
|
||||
return totalWritten, nil
|
||||
}
|
||||
|
||||
// ParseBandwidth parses a human-readable bandwidth string into bytes per second.
|
||||
// Supports: "10MB/s", "10MiB/s", "100KB/s", "1GB/s", "10Mbps", "100Kbps"
|
||||
// Returns 0 for empty or "unlimited"
|
||||
func ParseBandwidth(s string) (int64, error) {
|
||||
if s == "" || s == "0" || s == "unlimited" {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// Normalize input
|
||||
s = strings.TrimSpace(s)
|
||||
s = strings.ToLower(s)
|
||||
s = strings.TrimSuffix(s, "/s")
|
||||
s = strings.TrimSuffix(s, "ps") // For mbps/kbps
|
||||
|
||||
// Parse unit
|
||||
var multiplier int64 = 1
|
||||
var value float64
|
||||
|
||||
switch {
|
||||
case strings.HasSuffix(s, "gib"):
|
||||
multiplier = 1024 * 1024 * 1024
|
||||
s = strings.TrimSuffix(s, "gib")
|
||||
case strings.HasSuffix(s, "gb"):
|
||||
multiplier = 1000 * 1000 * 1000
|
||||
s = strings.TrimSuffix(s, "gb")
|
||||
case strings.HasSuffix(s, "mib"):
|
||||
multiplier = 1024 * 1024
|
||||
s = strings.TrimSuffix(s, "mib")
|
||||
case strings.HasSuffix(s, "mb"):
|
||||
multiplier = 1000 * 1000
|
||||
s = strings.TrimSuffix(s, "mb")
|
||||
case strings.HasSuffix(s, "kib"):
|
||||
multiplier = 1024
|
||||
s = strings.TrimSuffix(s, "kib")
|
||||
case strings.HasSuffix(s, "kb"):
|
||||
multiplier = 1000
|
||||
s = strings.TrimSuffix(s, "kb")
|
||||
case strings.HasSuffix(s, "b"):
|
||||
multiplier = 1
|
||||
s = strings.TrimSuffix(s, "b")
|
||||
default:
|
||||
// Assume MB if no unit
|
||||
multiplier = 1000 * 1000
|
||||
}
|
||||
|
||||
// Parse numeric value
|
||||
_, err := fmt.Sscanf(s, "%f", &value)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("invalid bandwidth value: %s", s)
|
||||
}
|
||||
|
||||
return int64(value * float64(multiplier)), nil
|
||||
}
|
||||
|
||||
// FormatBandwidth returns a human-readable bandwidth string
|
||||
func FormatBandwidth(bytesPerSec int64) string {
|
||||
if bytesPerSec <= 0 {
|
||||
return "unlimited"
|
||||
}
|
||||
|
||||
const (
|
||||
KB = 1000
|
||||
MB = 1000 * KB
|
||||
GB = 1000 * MB
|
||||
)
|
||||
|
||||
switch {
|
||||
case bytesPerSec >= GB:
|
||||
return fmt.Sprintf("%.1f GB/s", float64(bytesPerSec)/float64(GB))
|
||||
case bytesPerSec >= MB:
|
||||
return fmt.Sprintf("%.1f MB/s", float64(bytesPerSec)/float64(MB))
|
||||
case bytesPerSec >= KB:
|
||||
return fmt.Sprintf("%.1f KB/s", float64(bytesPerSec)/float64(KB))
|
||||
default:
|
||||
return fmt.Sprintf("%d B/s", bytesPerSec)
|
||||
}
|
||||
}
|
||||
@ -1,175 +0,0 @@
|
||||
package cloud
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"io"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestParseBandwidth(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
expected int64
|
||||
wantErr bool
|
||||
}{
|
||||
// Empty/unlimited
|
||||
{"", 0, false},
|
||||
{"0", 0, false},
|
||||
{"unlimited", 0, false},
|
||||
|
||||
// Megabytes per second (SI)
|
||||
{"10MB/s", 10 * 1000 * 1000, false},
|
||||
{"10mb/s", 10 * 1000 * 1000, false},
|
||||
{"10MB", 10 * 1000 * 1000, false},
|
||||
{"100MB/s", 100 * 1000 * 1000, false},
|
||||
|
||||
// Mebibytes per second (binary)
|
||||
{"10MiB/s", 10 * 1024 * 1024, false},
|
||||
{"10mib/s", 10 * 1024 * 1024, false},
|
||||
|
||||
// Kilobytes
|
||||
{"500KB/s", 500 * 1000, false},
|
||||
{"500KiB/s", 500 * 1024, false},
|
||||
|
||||
// Gigabytes
|
||||
{"1GB/s", 1000 * 1000 * 1000, false},
|
||||
{"1GiB/s", 1024 * 1024 * 1024, false},
|
||||
|
||||
// Megabits per second
|
||||
{"100Mbps", 100 * 1000 * 1000, false},
|
||||
|
||||
// Plain bytes
|
||||
{"1000B/s", 1000, false},
|
||||
|
||||
// No unit (assumes MB)
|
||||
{"50", 50 * 1000 * 1000, false},
|
||||
|
||||
// Decimal values
|
||||
{"1.5MB/s", 1500000, false},
|
||||
{"0.5GB/s", 500 * 1000 * 1000, false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.input, func(t *testing.T) {
|
||||
got, err := ParseBandwidth(tt.input)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("ParseBandwidth(%q) error = %v, wantErr %v", tt.input, err, tt.wantErr)
|
||||
return
|
||||
}
|
||||
if got != tt.expected {
|
||||
t.Errorf("ParseBandwidth(%q) = %d, want %d", tt.input, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatBandwidth(t *testing.T) {
|
||||
tests := []struct {
|
||||
input int64
|
||||
expected string
|
||||
}{
|
||||
{0, "unlimited"},
|
||||
{500, "500 B/s"},
|
||||
{1500, "1.5 KB/s"},
|
||||
{10 * 1000 * 1000, "10.0 MB/s"},
|
||||
{1000 * 1000 * 1000, "1.0 GB/s"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.expected, func(t *testing.T) {
|
||||
got := FormatBandwidth(tt.input)
|
||||
if got != tt.expected {
|
||||
t.Errorf("FormatBandwidth(%d) = %q, want %q", tt.input, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestThrottledReader_Unlimited(t *testing.T) {
|
||||
data := []byte("hello world")
|
||||
reader := bytes.NewReader(data)
|
||||
ctx := context.Background()
|
||||
|
||||
throttled := NewThrottledReader(ctx, reader, 0) // 0 = unlimited
|
||||
|
||||
result, err := io.ReadAll(throttled)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !bytes.Equal(result, data) {
|
||||
t.Errorf("got %q, want %q", result, data)
|
||||
}
|
||||
}
|
||||
|
||||
func TestThrottledReader_Limited(t *testing.T) {
|
||||
// Create 1KB of data
|
||||
data := make([]byte, 1024)
|
||||
for i := range data {
|
||||
data[i] = byte(i % 256)
|
||||
}
|
||||
|
||||
reader := bytes.NewReader(data)
|
||||
ctx := context.Background()
|
||||
|
||||
// Limit to 512 bytes/second - should take ~2 seconds
|
||||
throttled := NewThrottledReader(ctx, reader, 512)
|
||||
|
||||
start := time.Now()
|
||||
result, err := io.ReadAll(throttled)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !bytes.Equal(result, data) {
|
||||
t.Errorf("data mismatch: got %d bytes, want %d bytes", len(result), len(data))
|
||||
}
|
||||
|
||||
// Should take at least 1.5 seconds (allowing some margin)
|
||||
if elapsed < 1500*time.Millisecond {
|
||||
t.Errorf("read completed too fast: %v (expected ~2s for 1KB at 512B/s)", elapsed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestThrottledReader_CancelContext(t *testing.T) {
|
||||
data := make([]byte, 10*1024) // 10KB
|
||||
reader := bytes.NewReader(data)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
// Very slow rate
|
||||
throttled := NewThrottledReader(ctx, reader, 100)
|
||||
|
||||
// Cancel after 100ms
|
||||
go func() {
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
cancel()
|
||||
}()
|
||||
|
||||
_, err := io.ReadAll(throttled)
|
||||
if err != context.Canceled {
|
||||
t.Errorf("expected context.Canceled, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestThrottledWriter_Unlimited(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
var buf bytes.Buffer
|
||||
|
||||
throttled := NewThrottledWriter(ctx, &buf, 0) // 0 = unlimited
|
||||
|
||||
data := []byte("hello world")
|
||||
n, err := throttled.Write(data)
|
||||
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if n != len(data) {
|
||||
t.Errorf("wrote %d bytes, want %d", n, len(data))
|
||||
}
|
||||
if !bytes.Equal(buf.Bytes(), data) {
|
||||
t.Errorf("got %q, want %q", buf.Bytes(), data)
|
||||
}
|
||||
}
|
||||
@ -17,16 +17,12 @@ type Config struct {
|
||||
BuildTime string
|
||||
GitCommit string
|
||||
|
||||
// Config file path (--config flag)
|
||||
ConfigPath string
|
||||
|
||||
// Database connection
|
||||
Host string
|
||||
Port int
|
||||
User string
|
||||
Database string
|
||||
Password string
|
||||
Socket string // Unix socket path for MySQL/MariaDB
|
||||
DatabaseType string // "postgres" or "mysql"
|
||||
SSLMode string
|
||||
Insecure bool
|
||||
@ -41,10 +37,8 @@ type Config struct {
|
||||
CPUWorkloadType string // "cpu-intensive", "io-intensive", "balanced"
|
||||
|
||||
// Resource profile for backup/restore operations
|
||||
ResourceProfile string // "conservative", "balanced", "performance", "max-performance", "turbo"
|
||||
ResourceProfile string // "conservative", "balanced", "performance", "max-performance"
|
||||
LargeDBMode bool // Enable large database mode (reduces parallelism, increases max_locks)
|
||||
BufferedIO bool // Use 32KB buffered I/O for faster extraction (turbo profile)
|
||||
ParallelExtract bool // Enable parallel file extraction where possible (turbo profile)
|
||||
|
||||
// CPU detection
|
||||
CPUDetector *cpu.Detector
|
||||
@ -439,7 +433,7 @@ func (c *Config) ApplyResourceProfile(profileName string) error {
|
||||
return &ConfigError{
|
||||
Field: "resource_profile",
|
||||
Value: profileName,
|
||||
Message: "unknown profile. Valid profiles: conservative, balanced, performance, max-performance, turbo",
|
||||
Message: "unknown profile. Valid profiles: conservative, balanced, performance, max-performance",
|
||||
}
|
||||
}
|
||||
|
||||
@ -462,10 +456,6 @@ func (c *Config) ApplyResourceProfile(profileName string) error {
|
||||
c.Jobs = profile.Jobs
|
||||
c.DumpJobs = profile.DumpJobs
|
||||
|
||||
// Apply turbo mode optimizations
|
||||
c.BufferedIO = profile.BufferedIO
|
||||
c.ParallelExtract = profile.ParallelExtract
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -619,6 +609,37 @@ func getDefaultBackupDir() string {
|
||||
return filepath.Join(os.TempDir(), "db_backups")
|
||||
}
|
||||
|
||||
// CPU-related helper functions
|
||||
func getDefaultJobs(cpuInfo *cpu.CPUInfo) int {
|
||||
if cpuInfo == nil {
|
||||
return 1
|
||||
}
|
||||
// Default to logical cores for restore operations
|
||||
jobs := cpuInfo.LogicalCores
|
||||
if jobs < 1 {
|
||||
jobs = 1
|
||||
}
|
||||
if jobs > 16 {
|
||||
jobs = 16 // Safety limit
|
||||
}
|
||||
return jobs
|
||||
}
|
||||
|
||||
func getDefaultDumpJobs(cpuInfo *cpu.CPUInfo) int {
|
||||
if cpuInfo == nil {
|
||||
return 1
|
||||
}
|
||||
// Use physical cores for dump operations (CPU intensive)
|
||||
jobs := cpuInfo.PhysicalCores
|
||||
if jobs < 1 {
|
||||
jobs = 1
|
||||
}
|
||||
if jobs > 8 {
|
||||
jobs = 8 // Conservative limit for dumps
|
||||
}
|
||||
return jobs
|
||||
}
|
||||
|
||||
func getDefaultMaxCores(cpuInfo *cpu.CPUInfo) int {
|
||||
if cpuInfo == nil {
|
||||
return 16
|
||||
|
||||
@ -1,260 +0,0 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNew(t *testing.T) {
|
||||
cfg := New()
|
||||
if cfg == nil {
|
||||
t.Fatal("expected non-nil config")
|
||||
}
|
||||
|
||||
// Check defaults
|
||||
if cfg.Host == "" {
|
||||
t.Error("expected non-empty host")
|
||||
}
|
||||
if cfg.Port == 0 {
|
||||
t.Error("expected non-zero port")
|
||||
}
|
||||
if cfg.User == "" {
|
||||
t.Error("expected non-empty user")
|
||||
}
|
||||
if cfg.DatabaseType != "postgres" && cfg.DatabaseType != "mysql" {
|
||||
t.Errorf("expected valid database type, got %q", cfg.DatabaseType)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsPostgreSQL(t *testing.T) {
|
||||
tests := []struct {
|
||||
dbType string
|
||||
expected bool
|
||||
}{
|
||||
{"postgres", true},
|
||||
{"mysql", false},
|
||||
{"mariadb", false},
|
||||
{"", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.dbType, func(t *testing.T) {
|
||||
cfg := &Config{DatabaseType: tt.dbType}
|
||||
if got := cfg.IsPostgreSQL(); got != tt.expected {
|
||||
t.Errorf("IsPostgreSQL() = %v, want %v", got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsMySQL(t *testing.T) {
|
||||
tests := []struct {
|
||||
dbType string
|
||||
expected bool
|
||||
}{
|
||||
{"mysql", true},
|
||||
{"mariadb", true},
|
||||
{"postgres", false},
|
||||
{"", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.dbType, func(t *testing.T) {
|
||||
cfg := &Config{DatabaseType: tt.dbType}
|
||||
if got := cfg.IsMySQL(); got != tt.expected {
|
||||
t.Errorf("IsMySQL() = %v, want %v", got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSetDatabaseType(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
expected string
|
||||
shouldError bool
|
||||
}{
|
||||
{"postgres", "postgres", false},
|
||||
{"postgresql", "postgres", false},
|
||||
{"POSTGRES", "postgres", false},
|
||||
{"mysql", "mysql", false},
|
||||
{"MYSQL", "mysql", false},
|
||||
{"mariadb", "mariadb", false},
|
||||
{"invalid", "", true},
|
||||
{"", "", true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.input, func(t *testing.T) {
|
||||
cfg := &Config{Port: 0}
|
||||
err := cfg.SetDatabaseType(tt.input)
|
||||
|
||||
if tt.shouldError {
|
||||
if err == nil {
|
||||
t.Error("expected error, got nil")
|
||||
}
|
||||
} else {
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %v", err)
|
||||
}
|
||||
if cfg.DatabaseType != tt.expected {
|
||||
t.Errorf("DatabaseType = %q, want %q", cfg.DatabaseType, tt.expected)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSetDatabaseTypePortDefaults(t *testing.T) {
|
||||
cfg := &Config{Port: 0}
|
||||
_ = cfg.SetDatabaseType("postgres")
|
||||
if cfg.Port != 5432 {
|
||||
t.Errorf("expected PostgreSQL default port 5432, got %d", cfg.Port)
|
||||
}
|
||||
|
||||
cfg = &Config{Port: 0}
|
||||
_ = cfg.SetDatabaseType("mysql")
|
||||
if cfg.Port != 3306 {
|
||||
t.Errorf("expected MySQL default port 3306, got %d", cfg.Port)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetEnvString(t *testing.T) {
|
||||
os.Setenv("TEST_CONFIG_VAR", "test_value")
|
||||
defer os.Unsetenv("TEST_CONFIG_VAR")
|
||||
|
||||
if got := getEnvString("TEST_CONFIG_VAR", "default"); got != "test_value" {
|
||||
t.Errorf("getEnvString() = %q, want %q", got, "test_value")
|
||||
}
|
||||
|
||||
if got := getEnvString("NONEXISTENT_VAR", "default"); got != "default" {
|
||||
t.Errorf("getEnvString() = %q, want %q", got, "default")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetEnvInt(t *testing.T) {
|
||||
os.Setenv("TEST_INT_VAR", "42")
|
||||
defer os.Unsetenv("TEST_INT_VAR")
|
||||
|
||||
if got := getEnvInt("TEST_INT_VAR", 0); got != 42 {
|
||||
t.Errorf("getEnvInt() = %d, want %d", got, 42)
|
||||
}
|
||||
|
||||
os.Setenv("TEST_INT_VAR", "invalid")
|
||||
if got := getEnvInt("TEST_INT_VAR", 10); got != 10 {
|
||||
t.Errorf("getEnvInt() with invalid = %d, want %d", got, 10)
|
||||
}
|
||||
|
||||
if got := getEnvInt("NONEXISTENT_INT_VAR", 99); got != 99 {
|
||||
t.Errorf("getEnvInt() nonexistent = %d, want %d", got, 99)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetEnvBool(t *testing.T) {
|
||||
tests := []struct {
|
||||
envValue string
|
||||
expected bool
|
||||
}{
|
||||
{"true", true},
|
||||
{"TRUE", true},
|
||||
{"1", true},
|
||||
{"false", false},
|
||||
{"FALSE", false},
|
||||
{"0", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.envValue, func(t *testing.T) {
|
||||
os.Setenv("TEST_BOOL_VAR", tt.envValue)
|
||||
defer os.Unsetenv("TEST_BOOL_VAR")
|
||||
|
||||
if got := getEnvBool("TEST_BOOL_VAR", false); got != tt.expected {
|
||||
t.Errorf("getEnvBool(%q) = %v, want %v", tt.envValue, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanonicalDatabaseType(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
expected string
|
||||
ok bool
|
||||
}{
|
||||
{"postgres", "postgres", true},
|
||||
{"postgresql", "postgres", true},
|
||||
{"pg", "postgres", true},
|
||||
{"POSTGRES", "postgres", true},
|
||||
{"mysql", "mysql", true},
|
||||
{"MYSQL", "mysql", true},
|
||||
{"mariadb", "mariadb", true},
|
||||
{"maria", "mariadb", true},
|
||||
{"invalid", "", false},
|
||||
{"", "", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.input, func(t *testing.T) {
|
||||
got, ok := canonicalDatabaseType(tt.input)
|
||||
if ok != tt.ok {
|
||||
t.Errorf("canonicalDatabaseType(%q) ok = %v, want %v", tt.input, ok, tt.ok)
|
||||
}
|
||||
if got != tt.expected {
|
||||
t.Errorf("canonicalDatabaseType(%q) = %q, want %q", tt.input, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDisplayDatabaseType(t *testing.T) {
|
||||
tests := []struct {
|
||||
dbType string
|
||||
expected string
|
||||
}{
|
||||
{"postgres", "PostgreSQL"},
|
||||
{"mysql", "MySQL"},
|
||||
{"mariadb", "MariaDB"},
|
||||
{"unknown", "unknown"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.dbType, func(t *testing.T) {
|
||||
cfg := &Config{DatabaseType: tt.dbType}
|
||||
if got := cfg.DisplayDatabaseType(); got != tt.expected {
|
||||
t.Errorf("DisplayDatabaseType() = %q, want %q", got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfigError(t *testing.T) {
|
||||
err := &ConfigError{
|
||||
Field: "port",
|
||||
Value: "invalid",
|
||||
Message: "must be a valid port number",
|
||||
}
|
||||
|
||||
errStr := err.Error()
|
||||
if errStr == "" {
|
||||
t.Error("expected non-empty error string")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetCurrentOSUser(t *testing.T) {
|
||||
user := GetCurrentOSUser()
|
||||
if user == "" {
|
||||
t.Error("expected non-empty user")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultPortFor(t *testing.T) {
|
||||
if port := defaultPortFor("postgres"); port != 5432 {
|
||||
t.Errorf("defaultPortFor(postgres) = %d, want 5432", port)
|
||||
}
|
||||
if port := defaultPortFor("mysql"); port != 3306 {
|
||||
t.Errorf("defaultPortFor(mysql) = %d, want 3306", port)
|
||||
}
|
||||
if port := defaultPortFor("unknown"); port != 5432 {
|
||||
t.Errorf("defaultPortFor(unknown) = %d, want 5432 (default)", port)
|
||||
}
|
||||
}
|
||||
@ -42,11 +42,8 @@ type LocalConfig struct {
|
||||
|
||||
// LoadLocalConfig loads configuration from .dbbackup.conf in current directory
|
||||
func LoadLocalConfig() (*LocalConfig, error) {
|
||||
return LoadLocalConfigFromPath(filepath.Join(".", ConfigFileName))
|
||||
}
|
||||
configPath := filepath.Join(".", ConfigFileName)
|
||||
|
||||
// LoadLocalConfigFromPath loads configuration from a specific path
|
||||
func LoadLocalConfigFromPath(configPath string) (*LocalConfig, error) {
|
||||
data, err := os.ReadFile(configPath)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user