feat: Add content-defined chunking deduplication
All checks were successful
CI/CD / Test (push) Successful in 1m17s
CI/CD / Lint (push) Successful in 1m23s
CI/CD / Build & Release (push) Successful in 3m12s

- Gear hash CDC with 92%+ overlap on shifted data
- SHA-256 content-addressed chunk storage
- AES-256-GCM per-chunk encryption (optional)
- Gzip compression (default enabled)
- SQLite index for fast lookups
- JSON manifests with SHA-256 verification

Commands: dedup backup/restore/list/stats/delete/gc

Resistance is futile.
This commit is contained in:
2026-01-07 15:02:41 +01:00
parent b99b05fedb
commit c519f08ef2
9 changed files with 2002 additions and 12 deletions

188
internal/dedup/manifest.go Normal file
View File

@@ -0,0 +1,188 @@
package dedup
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"time"
)
// Manifest describes a single backup as a list of chunks
type Manifest struct {
// ID is the unique identifier (typically timestamp-based)
ID string `json:"id"`
// Name is an optional human-readable name
Name string `json:"name,omitempty"`
// CreatedAt is when this backup was created
CreatedAt time.Time `json:"created_at"`
// Database information
DatabaseType string `json:"database_type"` // postgres, mysql
DatabaseName string `json:"database_name"`
DatabaseHost string `json:"database_host"`
// Chunks is the ordered list of chunk hashes
// The file is reconstructed by concatenating chunks in order
Chunks []ChunkRef `json:"chunks"`
// Stats about the backup
OriginalSize int64 `json:"original_size"` // Size before deduplication
StoredSize int64 `json:"stored_size"` // Size after dedup (new chunks only)
ChunkCount int `json:"chunk_count"` // Total chunks
NewChunks int `json:"new_chunks"` // Chunks that weren't deduplicated
DedupRatio float64 `json:"dedup_ratio"` // 1.0 = no dedup, 0.0 = 100% dedup
// Encryption and compression settings used
Encrypted bool `json:"encrypted"`
Compressed bool `json:"compressed"`
// Verification
SHA256 string `json:"sha256"` // Hash of reconstructed file
VerifiedAt time.Time `json:"verified_at,omitempty"`
}
// ChunkRef references a chunk in the manifest
type ChunkRef struct {
Hash string `json:"h"` // SHA-256 hash (64 chars)
Offset int64 `json:"o"` // Offset in original file
Length int `json:"l"` // Chunk length
}
// ManifestStore manages backup manifests
type ManifestStore struct {
basePath string
}
// NewManifestStore creates a new manifest store
func NewManifestStore(basePath string) (*ManifestStore, error) {
manifestDir := filepath.Join(basePath, "manifests")
if err := os.MkdirAll(manifestDir, 0700); err != nil {
return nil, fmt.Errorf("failed to create manifest directory: %w", err)
}
return &ManifestStore{basePath: basePath}, nil
}
// manifestPath returns the path for a manifest ID
func (s *ManifestStore) manifestPath(id string) string {
return filepath.Join(s.basePath, "manifests", id+".manifest.json")
}
// Save writes a manifest to disk
func (s *ManifestStore) Save(m *Manifest) error {
path := s.manifestPath(m.ID)
data, err := json.MarshalIndent(m, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal manifest: %w", err)
}
// Atomic write
tmpPath := path + ".tmp"
if err := os.WriteFile(tmpPath, data, 0600); err != nil {
return fmt.Errorf("failed to write manifest: %w", err)
}
if err := os.Rename(tmpPath, path); err != nil {
os.Remove(tmpPath)
return fmt.Errorf("failed to commit manifest: %w", err)
}
return nil
}
// Load reads a manifest from disk
func (s *ManifestStore) Load(id string) (*Manifest, error) {
path := s.manifestPath(id)
data, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("failed to read manifest %s: %w", id, err)
}
var m Manifest
if err := json.Unmarshal(data, &m); err != nil {
return nil, fmt.Errorf("failed to parse manifest %s: %w", id, err)
}
return &m, nil
}
// Delete removes a manifest
func (s *ManifestStore) Delete(id string) error {
path := s.manifestPath(id)
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("failed to delete manifest %s: %w", id, err)
}
return nil
}
// List returns all manifest IDs
func (s *ManifestStore) List() ([]string, error) {
manifestDir := filepath.Join(s.basePath, "manifests")
entries, err := os.ReadDir(manifestDir)
if err != nil {
return nil, fmt.Errorf("failed to list manifests: %w", err)
}
var ids []string
for _, e := range entries {
if e.IsDir() {
continue
}
name := e.Name()
if len(name) > 14 && name[len(name)-14:] == ".manifest.json" {
ids = append(ids, name[:len(name)-14])
}
}
return ids, nil
}
// ListAll returns all manifests sorted by creation time (newest first)
func (s *ManifestStore) ListAll() ([]*Manifest, error) {
ids, err := s.List()
if err != nil {
return nil, err
}
var manifests []*Manifest
for _, id := range ids {
m, err := s.Load(id)
if err != nil {
continue // Skip corrupted manifests
}
manifests = append(manifests, m)
}
// Sort by creation time (newest first)
for i := 0; i < len(manifests)-1; i++ {
for j := i + 1; j < len(manifests); j++ {
if manifests[j].CreatedAt.After(manifests[i].CreatedAt) {
manifests[i], manifests[j] = manifests[j], manifests[i]
}
}
}
return manifests, nil
}
// GetChunkHashes returns all unique chunk hashes referenced by manifests
func (s *ManifestStore) GetChunkHashes() (map[string]int, error) {
manifests, err := s.ListAll()
if err != nil {
return nil, err
}
// Map hash -> reference count
refs := make(map[string]int)
for _, m := range manifests {
for _, c := range m.Chunks {
refs[c.Hash]++
}
}
return refs, nil
}