feat: Add content-defined chunking deduplication
- Gear hash CDC with 92%+ overlap on shifted data - SHA-256 content-addressed chunk storage - AES-256-GCM per-chunk encryption (optional) - Gzip compression (default enabled) - SQLite index for fast lookups - JSON manifests with SHA-256 verification Commands: dedup backup/restore/list/stats/delete/gc Resistance is futile.
This commit is contained in:
579
cmd/dedup.go
Normal file
579
cmd/dedup.go
Normal file
@@ -0,0 +1,579 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"dbbackup/internal/dedup"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
var dedupCmd = &cobra.Command{
|
||||
Use: "dedup",
|
||||
Short: "Deduplicated backup operations",
|
||||
Long: `Content-defined chunking deduplication for space-efficient backups.
|
||||
|
||||
Similar to restic/borgbackup but with native database dump support.
|
||||
|
||||
Features:
|
||||
- Content-defined chunking (CDC) with Buzhash rolling hash
|
||||
- SHA-256 content-addressed storage
|
||||
- AES-256-GCM encryption (optional)
|
||||
- Gzip compression (optional)
|
||||
- SQLite index for fast lookups
|
||||
|
||||
Storage Structure:
|
||||
<dedup-dir>/
|
||||
chunks/ # Content-addressed chunk files
|
||||
ab/cdef... # Sharded by first 2 chars of hash
|
||||
manifests/ # JSON manifest per backup
|
||||
chunks.db # SQLite index`,
|
||||
}
|
||||
|
||||
var dedupBackupCmd = &cobra.Command{
|
||||
Use: "backup <file>",
|
||||
Short: "Create a deduplicated backup of a file",
|
||||
Long: `Chunk a file using content-defined chunking and store deduplicated chunks.
|
||||
|
||||
Example:
|
||||
dbbackup dedup backup /path/to/database.dump
|
||||
dbbackup dedup backup mydb.sql --compress --encrypt`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: runDedupBackup,
|
||||
}
|
||||
|
||||
var dedupRestoreCmd = &cobra.Command{
|
||||
Use: "restore <manifest-id> <output-file>",
|
||||
Short: "Restore a backup from its manifest",
|
||||
Long: `Reconstruct a file from its deduplicated chunks.
|
||||
|
||||
Example:
|
||||
dbbackup dedup restore 2026-01-07_120000_mydb /tmp/restored.dump
|
||||
dbbackup dedup list # to see available manifests`,
|
||||
Args: cobra.ExactArgs(2),
|
||||
RunE: runDedupRestore,
|
||||
}
|
||||
|
||||
var dedupListCmd = &cobra.Command{
|
||||
Use: "list",
|
||||
Short: "List all deduplicated backups",
|
||||
RunE: runDedupList,
|
||||
}
|
||||
|
||||
var dedupStatsCmd = &cobra.Command{
|
||||
Use: "stats",
|
||||
Short: "Show deduplication statistics",
|
||||
RunE: runDedupStats,
|
||||
}
|
||||
|
||||
var dedupGCCmd = &cobra.Command{
|
||||
Use: "gc",
|
||||
Short: "Garbage collect unreferenced chunks",
|
||||
Long: `Remove chunks that are no longer referenced by any manifest.
|
||||
|
||||
Run after deleting old backups to reclaim space.`,
|
||||
RunE: runDedupGC,
|
||||
}
|
||||
|
||||
var dedupDeleteCmd = &cobra.Command{
|
||||
Use: "delete <manifest-id>",
|
||||
Short: "Delete a backup manifest (chunks cleaned by gc)",
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: runDedupDelete,
|
||||
}
|
||||
|
||||
// Flags
|
||||
var (
|
||||
dedupDir string
|
||||
dedupCompress bool
|
||||
dedupEncrypt bool
|
||||
dedupKey string
|
||||
dedupName string
|
||||
dedupDBType string
|
||||
dedupDBName string
|
||||
dedupDBHost string
|
||||
)
|
||||
|
||||
func init() {
|
||||
rootCmd.AddCommand(dedupCmd)
|
||||
dedupCmd.AddCommand(dedupBackupCmd)
|
||||
dedupCmd.AddCommand(dedupRestoreCmd)
|
||||
dedupCmd.AddCommand(dedupListCmd)
|
||||
dedupCmd.AddCommand(dedupStatsCmd)
|
||||
dedupCmd.AddCommand(dedupGCCmd)
|
||||
dedupCmd.AddCommand(dedupDeleteCmd)
|
||||
|
||||
// Global dedup flags
|
||||
dedupCmd.PersistentFlags().StringVar(&dedupDir, "dedup-dir", "", "Dedup storage directory (default: $BACKUP_DIR/dedup)")
|
||||
dedupCmd.PersistentFlags().BoolVar(&dedupCompress, "compress", true, "Compress chunks with gzip")
|
||||
dedupCmd.PersistentFlags().BoolVar(&dedupEncrypt, "encrypt", false, "Encrypt chunks with AES-256-GCM")
|
||||
dedupCmd.PersistentFlags().StringVar(&dedupKey, "key", "", "Encryption key (hex) or use DBBACKUP_DEDUP_KEY env")
|
||||
|
||||
// Backup-specific flags
|
||||
dedupBackupCmd.Flags().StringVar(&dedupName, "name", "", "Optional backup name")
|
||||
dedupBackupCmd.Flags().StringVar(&dedupDBType, "db-type", "", "Database type (postgres/mysql)")
|
||||
dedupBackupCmd.Flags().StringVar(&dedupDBName, "db-name", "", "Database name")
|
||||
dedupBackupCmd.Flags().StringVar(&dedupDBHost, "db-host", "", "Database host")
|
||||
}
|
||||
|
||||
func getDedupDir() string {
|
||||
if dedupDir != "" {
|
||||
return dedupDir
|
||||
}
|
||||
if cfg != nil && cfg.BackupDir != "" {
|
||||
return filepath.Join(cfg.BackupDir, "dedup")
|
||||
}
|
||||
return filepath.Join(os.Getenv("HOME"), "db_backups", "dedup")
|
||||
}
|
||||
|
||||
func getEncryptionKey() string {
|
||||
if dedupKey != "" {
|
||||
return dedupKey
|
||||
}
|
||||
return os.Getenv("DBBACKUP_DEDUP_KEY")
|
||||
}
|
||||
|
||||
func runDedupBackup(cmd *cobra.Command, args []string) error {
|
||||
inputPath := args[0]
|
||||
|
||||
// Open input file
|
||||
file, err := os.Open(inputPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open input file: %w", err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
info, err := file.Stat()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to stat input file: %w", err)
|
||||
}
|
||||
|
||||
// Setup dedup storage
|
||||
basePath := getDedupDir()
|
||||
encKey := ""
|
||||
if dedupEncrypt {
|
||||
encKey = getEncryptionKey()
|
||||
if encKey == "" {
|
||||
return fmt.Errorf("encryption enabled but no key provided (use --key or DBBACKUP_DEDUP_KEY)")
|
||||
}
|
||||
}
|
||||
|
||||
store, err := dedup.NewChunkStore(dedup.StoreConfig{
|
||||
BasePath: basePath,
|
||||
Compress: dedupCompress,
|
||||
EncryptionKey: encKey,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open chunk store: %w", err)
|
||||
}
|
||||
|
||||
manifestStore, err := dedup.NewManifestStore(basePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open manifest store: %w", err)
|
||||
}
|
||||
|
||||
index, err := dedup.NewChunkIndex(basePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open chunk index: %w", err)
|
||||
}
|
||||
defer index.Close()
|
||||
|
||||
// Generate manifest ID
|
||||
now := time.Now()
|
||||
manifestID := now.Format("2006-01-02_150405")
|
||||
if dedupDBName != "" {
|
||||
manifestID += "_" + dedupDBName
|
||||
} else {
|
||||
base := filepath.Base(inputPath)
|
||||
ext := filepath.Ext(base)
|
||||
manifestID += "_" + strings.TrimSuffix(base, ext)
|
||||
}
|
||||
|
||||
fmt.Printf("Creating deduplicated backup: %s\n", manifestID)
|
||||
fmt.Printf("Input: %s (%s)\n", inputPath, formatBytes(info.Size()))
|
||||
fmt.Printf("Store: %s\n", basePath)
|
||||
|
||||
// Hash the entire file for verification
|
||||
file.Seek(0, 0)
|
||||
h := sha256.New()
|
||||
io.Copy(h, file)
|
||||
fileHash := hex.EncodeToString(h.Sum(nil))
|
||||
file.Seek(0, 0)
|
||||
|
||||
// Chunk the file
|
||||
chunker := dedup.NewChunker(file, dedup.DefaultChunkerConfig())
|
||||
var chunks []dedup.ChunkRef
|
||||
var totalSize, storedSize int64
|
||||
var chunkCount, newChunks int
|
||||
|
||||
startTime := time.Now()
|
||||
|
||||
for {
|
||||
chunk, err := chunker.Next()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("chunking failed: %w", err)
|
||||
}
|
||||
|
||||
chunkCount++
|
||||
totalSize += int64(chunk.Length)
|
||||
|
||||
// Store chunk (deduplication happens here)
|
||||
isNew, err := store.Put(chunk)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to store chunk: %w", err)
|
||||
}
|
||||
|
||||
if isNew {
|
||||
newChunks++
|
||||
storedSize += int64(chunk.Length)
|
||||
// Record in index
|
||||
index.AddChunk(chunk.Hash, chunk.Length, chunk.Length)
|
||||
}
|
||||
|
||||
chunks = append(chunks, dedup.ChunkRef{
|
||||
Hash: chunk.Hash,
|
||||
Offset: chunk.Offset,
|
||||
Length: chunk.Length,
|
||||
})
|
||||
|
||||
// Progress
|
||||
if chunkCount%1000 == 0 {
|
||||
fmt.Printf("\r Processed %d chunks, %d new...", chunkCount, newChunks)
|
||||
}
|
||||
}
|
||||
|
||||
duration := time.Since(startTime)
|
||||
|
||||
// Calculate dedup ratio
|
||||
dedupRatio := 0.0
|
||||
if totalSize > 0 {
|
||||
dedupRatio = 1.0 - float64(storedSize)/float64(totalSize)
|
||||
}
|
||||
|
||||
// Create manifest
|
||||
manifest := &dedup.Manifest{
|
||||
ID: manifestID,
|
||||
Name: dedupName,
|
||||
CreatedAt: now,
|
||||
DatabaseType: dedupDBType,
|
||||
DatabaseName: dedupDBName,
|
||||
DatabaseHost: dedupDBHost,
|
||||
Chunks: chunks,
|
||||
OriginalSize: totalSize,
|
||||
StoredSize: storedSize,
|
||||
ChunkCount: chunkCount,
|
||||
NewChunks: newChunks,
|
||||
DedupRatio: dedupRatio,
|
||||
Encrypted: dedupEncrypt,
|
||||
Compressed: dedupCompress,
|
||||
SHA256: fileHash,
|
||||
}
|
||||
|
||||
if err := manifestStore.Save(manifest); err != nil {
|
||||
return fmt.Errorf("failed to save manifest: %w", err)
|
||||
}
|
||||
|
||||
if err := index.AddManifest(manifest); err != nil {
|
||||
log.Warn("Failed to index manifest", "error", err)
|
||||
}
|
||||
|
||||
fmt.Printf("\r \r")
|
||||
fmt.Printf("\nBackup complete!\n")
|
||||
fmt.Printf(" Manifest: %s\n", manifestID)
|
||||
fmt.Printf(" Chunks: %d total, %d new\n", chunkCount, newChunks)
|
||||
fmt.Printf(" Original: %s\n", formatBytes(totalSize))
|
||||
fmt.Printf(" Stored: %s (new data)\n", formatBytes(storedSize))
|
||||
fmt.Printf(" Dedup ratio: %.1f%%\n", dedupRatio*100)
|
||||
fmt.Printf(" Duration: %s\n", duration.Round(time.Millisecond))
|
||||
fmt.Printf(" Throughput: %s/s\n", formatBytes(int64(float64(totalSize)/duration.Seconds())))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runDedupRestore(cmd *cobra.Command, args []string) error {
|
||||
manifestID := args[0]
|
||||
outputPath := args[1]
|
||||
|
||||
basePath := getDedupDir()
|
||||
encKey := ""
|
||||
if dedupEncrypt {
|
||||
encKey = getEncryptionKey()
|
||||
}
|
||||
|
||||
store, err := dedup.NewChunkStore(dedup.StoreConfig{
|
||||
BasePath: basePath,
|
||||
Compress: dedupCompress,
|
||||
EncryptionKey: encKey,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open chunk store: %w", err)
|
||||
}
|
||||
|
||||
manifestStore, err := dedup.NewManifestStore(basePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open manifest store: %w", err)
|
||||
}
|
||||
|
||||
manifest, err := manifestStore.Load(manifestID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load manifest: %w", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Restoring backup: %s\n", manifestID)
|
||||
fmt.Printf(" Created: %s\n", manifest.CreatedAt.Format(time.RFC3339))
|
||||
fmt.Printf(" Size: %s\n", formatBytes(manifest.OriginalSize))
|
||||
fmt.Printf(" Chunks: %d\n", manifest.ChunkCount)
|
||||
|
||||
// Create output file
|
||||
outFile, err := os.Create(outputPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create output file: %w", err)
|
||||
}
|
||||
defer outFile.Close()
|
||||
|
||||
h := sha256.New()
|
||||
writer := io.MultiWriter(outFile, h)
|
||||
|
||||
startTime := time.Now()
|
||||
|
||||
for i, ref := range manifest.Chunks {
|
||||
chunk, err := store.Get(ref.Hash)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read chunk %d (%s): %w", i, ref.Hash[:8], err)
|
||||
}
|
||||
|
||||
if _, err := writer.Write(chunk.Data); err != nil {
|
||||
return fmt.Errorf("failed to write chunk %d: %w", i, err)
|
||||
}
|
||||
|
||||
if (i+1)%1000 == 0 {
|
||||
fmt.Printf("\r Restored %d/%d chunks...", i+1, manifest.ChunkCount)
|
||||
}
|
||||
}
|
||||
|
||||
duration := time.Since(startTime)
|
||||
restoredHash := hex.EncodeToString(h.Sum(nil))
|
||||
|
||||
fmt.Printf("\r \r")
|
||||
fmt.Printf("\nRestore complete!\n")
|
||||
fmt.Printf(" Output: %s\n", outputPath)
|
||||
fmt.Printf(" Duration: %s\n", duration.Round(time.Millisecond))
|
||||
|
||||
// Verify hash
|
||||
if manifest.SHA256 != "" {
|
||||
if restoredHash == manifest.SHA256 {
|
||||
fmt.Printf(" Verification: ✓ SHA-256 matches\n")
|
||||
} else {
|
||||
fmt.Printf(" Verification: ✗ SHA-256 MISMATCH!\n")
|
||||
fmt.Printf(" Expected: %s\n", manifest.SHA256)
|
||||
fmt.Printf(" Got: %s\n", restoredHash)
|
||||
return fmt.Errorf("integrity verification failed")
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runDedupList(cmd *cobra.Command, args []string) error {
|
||||
basePath := getDedupDir()
|
||||
|
||||
manifestStore, err := dedup.NewManifestStore(basePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open manifest store: %w", err)
|
||||
}
|
||||
|
||||
manifests, err := manifestStore.ListAll()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to list manifests: %w", err)
|
||||
}
|
||||
|
||||
if len(manifests) == 0 {
|
||||
fmt.Println("No deduplicated backups found.")
|
||||
fmt.Printf("Store: %s\n", basePath)
|
||||
return nil
|
||||
}
|
||||
|
||||
fmt.Printf("Deduplicated Backups (%s)\n\n", basePath)
|
||||
fmt.Printf("%-30s %-12s %-10s %-10s %s\n", "ID", "SIZE", "DEDUP", "CHUNKS", "CREATED")
|
||||
fmt.Println(strings.Repeat("-", 80))
|
||||
|
||||
for _, m := range manifests {
|
||||
fmt.Printf("%-30s %-12s %-10.1f%% %-10d %s\n",
|
||||
truncateStr(m.ID, 30),
|
||||
formatBytes(m.OriginalSize),
|
||||
m.DedupRatio*100,
|
||||
m.ChunkCount,
|
||||
m.CreatedAt.Format("2006-01-02 15:04"),
|
||||
)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runDedupStats(cmd *cobra.Command, args []string) error {
|
||||
basePath := getDedupDir()
|
||||
|
||||
index, err := dedup.NewChunkIndex(basePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open chunk index: %w", err)
|
||||
}
|
||||
defer index.Close()
|
||||
|
||||
stats, err := index.Stats()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get stats: %w", err)
|
||||
}
|
||||
|
||||
store, err := dedup.NewChunkStore(dedup.StoreConfig{BasePath: basePath})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open chunk store: %w", err)
|
||||
}
|
||||
|
||||
storeStats, err := store.Stats()
|
||||
if err != nil {
|
||||
log.Warn("Failed to get store stats", "error", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Deduplication Statistics\n")
|
||||
fmt.Printf("========================\n\n")
|
||||
fmt.Printf("Store: %s\n", basePath)
|
||||
fmt.Printf("Manifests: %d\n", stats.TotalManifests)
|
||||
fmt.Printf("Unique chunks: %d\n", stats.TotalChunks)
|
||||
fmt.Printf("Total raw size: %s\n", formatBytes(stats.TotalSizeRaw))
|
||||
fmt.Printf("Stored size: %s\n", formatBytes(stats.TotalSizeStored))
|
||||
fmt.Printf("Dedup ratio: %.1f%%\n", stats.DedupRatio*100)
|
||||
fmt.Printf("Space saved: %s\n", formatBytes(stats.TotalSizeRaw-stats.TotalSizeStored))
|
||||
|
||||
if storeStats != nil {
|
||||
fmt.Printf("Disk usage: %s\n", formatBytes(storeStats.TotalSize))
|
||||
fmt.Printf("Directories: %d\n", storeStats.Directories)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runDedupGC(cmd *cobra.Command, args []string) error {
|
||||
basePath := getDedupDir()
|
||||
|
||||
index, err := dedup.NewChunkIndex(basePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open chunk index: %w", err)
|
||||
}
|
||||
defer index.Close()
|
||||
|
||||
store, err := dedup.NewChunkStore(dedup.StoreConfig{
|
||||
BasePath: basePath,
|
||||
Compress: dedupCompress,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open chunk store: %w", err)
|
||||
}
|
||||
|
||||
// Find orphaned chunks
|
||||
orphans, err := index.ListOrphanedChunks()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to find orphaned chunks: %w", err)
|
||||
}
|
||||
|
||||
if len(orphans) == 0 {
|
||||
fmt.Println("No orphaned chunks to clean up.")
|
||||
return nil
|
||||
}
|
||||
|
||||
fmt.Printf("Found %d orphaned chunks\n", len(orphans))
|
||||
|
||||
var freed int64
|
||||
for _, hash := range orphans {
|
||||
if meta, _ := index.GetChunk(hash); meta != nil {
|
||||
freed += meta.SizeStored
|
||||
}
|
||||
if err := store.Delete(hash); err != nil {
|
||||
log.Warn("Failed to delete chunk", "hash", hash[:8], "error", err)
|
||||
continue
|
||||
}
|
||||
if err := index.RemoveChunk(hash); err != nil {
|
||||
log.Warn("Failed to remove chunk from index", "hash", hash[:8], "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("Deleted %d chunks, freed %s\n", len(orphans), formatBytes(freed))
|
||||
|
||||
// Vacuum the index
|
||||
if err := index.Vacuum(); err != nil {
|
||||
log.Warn("Failed to vacuum index", "error", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runDedupDelete(cmd *cobra.Command, args []string) error {
|
||||
manifestID := args[0]
|
||||
basePath := getDedupDir()
|
||||
|
||||
manifestStore, err := dedup.NewManifestStore(basePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open manifest store: %w", err)
|
||||
}
|
||||
|
||||
index, err := dedup.NewChunkIndex(basePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open chunk index: %w", err)
|
||||
}
|
||||
defer index.Close()
|
||||
|
||||
// Load manifest to decrement chunk refs
|
||||
manifest, err := manifestStore.Load(manifestID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load manifest: %w", err)
|
||||
}
|
||||
|
||||
// Decrement reference counts
|
||||
for _, ref := range manifest.Chunks {
|
||||
index.DecrementRef(ref.Hash)
|
||||
}
|
||||
|
||||
// Delete manifest
|
||||
if err := manifestStore.Delete(manifestID); err != nil {
|
||||
return fmt.Errorf("failed to delete manifest: %w", err)
|
||||
}
|
||||
|
||||
if err := index.RemoveManifest(manifestID); err != nil {
|
||||
log.Warn("Failed to remove manifest from index", "error", err)
|
||||
}
|
||||
|
||||
fmt.Printf("Deleted backup: %s\n", manifestID)
|
||||
fmt.Println("Run 'dbbackup dedup gc' to reclaim space from unreferenced chunks.")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
func formatBytes(b int64) string {
|
||||
const unit = 1024
|
||||
if b < unit {
|
||||
return fmt.Sprintf("%d B", b)
|
||||
}
|
||||
div, exp := int64(unit), 0
|
||||
for n := b / unit; n >= unit; n /= unit {
|
||||
div *= unit
|
||||
exp++
|
||||
}
|
||||
return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp])
|
||||
}
|
||||
|
||||
func truncateStr(s string, max int) string {
|
||||
if len(s) <= max {
|
||||
return s
|
||||
}
|
||||
return s[:max-3] + "..."
|
||||
}
|
||||
Reference in New Issue
Block a user