Files
dbbackup/internal/dedup/index.go
Alexander Renz c519f08ef2
All checks were successful
CI/CD / Test (push) Successful in 1m17s
CI/CD / Lint (push) Successful in 1m23s
CI/CD / Build & Release (push) Successful in 3m12s
feat: Add content-defined chunking deduplication
- Gear hash CDC with 92%+ overlap on shifted data
- SHA-256 content-addressed chunk storage
- AES-256-GCM per-chunk encryption (optional)
- Gzip compression (default enabled)
- SQLite index for fast lookups
- JSON manifests with SHA-256 verification

Commands: dedup backup/restore/list/stats/delete/gc

Resistance is futile.
2026-01-07 15:02:41 +01:00

240 lines
6.2 KiB
Go

package dedup
import (
"database/sql"
"fmt"
"path/filepath"
"time"
_ "github.com/mattn/go-sqlite3" // SQLite driver
)
// ChunkIndex provides fast chunk lookups using SQLite
type ChunkIndex struct {
db *sql.DB
}
// NewChunkIndex opens or creates a chunk index database
func NewChunkIndex(basePath string) (*ChunkIndex, error) {
dbPath := filepath.Join(basePath, "chunks.db")
db, err := sql.Open("sqlite3", dbPath+"?_journal_mode=WAL&_synchronous=NORMAL")
if err != nil {
return nil, fmt.Errorf("failed to open chunk index: %w", err)
}
idx := &ChunkIndex{db: db}
if err := idx.migrate(); err != nil {
db.Close()
return nil, err
}
return idx, nil
}
// migrate creates the schema if needed
func (idx *ChunkIndex) migrate() error {
schema := `
CREATE TABLE IF NOT EXISTS chunks (
hash TEXT PRIMARY KEY,
size_raw INTEGER NOT NULL,
size_stored INTEGER NOT NULL,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
last_accessed DATETIME,
ref_count INTEGER DEFAULT 1
);
CREATE TABLE IF NOT EXISTS manifests (
id TEXT PRIMARY KEY,
database_type TEXT,
database_name TEXT,
database_host TEXT,
created_at DATETIME,
original_size INTEGER,
stored_size INTEGER,
chunk_count INTEGER,
new_chunks INTEGER,
dedup_ratio REAL,
sha256 TEXT,
verified_at DATETIME
);
CREATE INDEX IF NOT EXISTS idx_chunks_created ON chunks(created_at);
CREATE INDEX IF NOT EXISTS idx_chunks_accessed ON chunks(last_accessed);
CREATE INDEX IF NOT EXISTS idx_manifests_created ON manifests(created_at);
CREATE INDEX IF NOT EXISTS idx_manifests_database ON manifests(database_name);
`
_, err := idx.db.Exec(schema)
return err
}
// Close closes the database
func (idx *ChunkIndex) Close() error {
return idx.db.Close()
}
// AddChunk records a chunk in the index
func (idx *ChunkIndex) AddChunk(hash string, sizeRaw, sizeStored int) error {
_, err := idx.db.Exec(`
INSERT INTO chunks (hash, size_raw, size_stored, created_at, last_accessed, ref_count)
VALUES (?, ?, ?, ?, ?, 1)
ON CONFLICT(hash) DO UPDATE SET
ref_count = ref_count + 1,
last_accessed = ?
`, hash, sizeRaw, sizeStored, time.Now(), time.Now(), time.Now())
return err
}
// HasChunk checks if a chunk exists in the index
func (idx *ChunkIndex) HasChunk(hash string) (bool, error) {
var count int
err := idx.db.QueryRow("SELECT COUNT(*) FROM chunks WHERE hash = ?", hash).Scan(&count)
return count > 0, err
}
// GetChunk retrieves chunk metadata
func (idx *ChunkIndex) GetChunk(hash string) (*ChunkMeta, error) {
var m ChunkMeta
err := idx.db.QueryRow(`
SELECT hash, size_raw, size_stored, created_at, ref_count
FROM chunks WHERE hash = ?
`, hash).Scan(&m.Hash, &m.SizeRaw, &m.SizeStored, &m.CreatedAt, &m.RefCount)
if err == sql.ErrNoRows {
return nil, nil
}
if err != nil {
return nil, err
}
return &m, nil
}
// ChunkMeta holds metadata about a chunk
type ChunkMeta struct {
Hash string
SizeRaw int64
SizeStored int64
CreatedAt time.Time
RefCount int
}
// DecrementRef decreases the reference count for a chunk
// Returns true if the chunk should be deleted (ref_count <= 0)
func (idx *ChunkIndex) DecrementRef(hash string) (shouldDelete bool, err error) {
result, err := idx.db.Exec(`
UPDATE chunks SET ref_count = ref_count - 1 WHERE hash = ?
`, hash)
if err != nil {
return false, err
}
affected, _ := result.RowsAffected()
if affected == 0 {
return false, nil
}
var refCount int
err = idx.db.QueryRow("SELECT ref_count FROM chunks WHERE hash = ?", hash).Scan(&refCount)
if err != nil {
return false, err
}
return refCount <= 0, nil
}
// RemoveChunk removes a chunk from the index
func (idx *ChunkIndex) RemoveChunk(hash string) error {
_, err := idx.db.Exec("DELETE FROM chunks WHERE hash = ?", hash)
return err
}
// AddManifest records a manifest in the index
func (idx *ChunkIndex) AddManifest(m *Manifest) error {
_, err := idx.db.Exec(`
INSERT OR REPLACE INTO manifests
(id, database_type, database_name, database_host, created_at,
original_size, stored_size, chunk_count, new_chunks, dedup_ratio, sha256)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`, m.ID, m.DatabaseType, m.DatabaseName, m.DatabaseHost, m.CreatedAt,
m.OriginalSize, m.StoredSize, m.ChunkCount, m.NewChunks, m.DedupRatio, m.SHA256)
return err
}
// RemoveManifest removes a manifest from the index
func (idx *ChunkIndex) RemoveManifest(id string) error {
_, err := idx.db.Exec("DELETE FROM manifests WHERE id = ?", id)
return err
}
// IndexStats holds statistics about the dedup index
type IndexStats struct {
TotalChunks int64
TotalManifests int64
TotalSizeRaw int64 // Uncompressed, undeduplicated
TotalSizeStored int64 // On-disk after dedup+compression
DedupRatio float64
OldestChunk time.Time
NewestChunk time.Time
}
// Stats returns statistics about the index
func (idx *ChunkIndex) Stats() (*IndexStats, error) {
stats := &IndexStats{}
var oldestStr, newestStr string
err := idx.db.QueryRow(`
SELECT
COUNT(*),
COALESCE(SUM(size_raw), 0),
COALESCE(SUM(size_stored), 0),
COALESCE(MIN(created_at), ''),
COALESCE(MAX(created_at), '')
FROM chunks
`).Scan(&stats.TotalChunks, &stats.TotalSizeRaw, &stats.TotalSizeStored,
&oldestStr, &newestStr)
if err != nil {
return nil, err
}
// Parse time strings
if oldestStr != "" {
stats.OldestChunk, _ = time.Parse("2006-01-02 15:04:05", oldestStr)
}
if newestStr != "" {
stats.NewestChunk, _ = time.Parse("2006-01-02 15:04:05", newestStr)
}
idx.db.QueryRow("SELECT COUNT(*) FROM manifests").Scan(&stats.TotalManifests)
if stats.TotalSizeRaw > 0 {
stats.DedupRatio = 1.0 - float64(stats.TotalSizeStored)/float64(stats.TotalSizeRaw)
}
return stats, nil
}
// ListOrphanedChunks returns chunks that have ref_count <= 0
func (idx *ChunkIndex) ListOrphanedChunks() ([]string, error) {
rows, err := idx.db.Query("SELECT hash FROM chunks WHERE ref_count <= 0")
if err != nil {
return nil, err
}
defer rows.Close()
var hashes []string
for rows.Next() {
var hash string
if err := rows.Scan(&hash); err != nil {
continue
}
hashes = append(hashes, hash)
}
return hashes, rows.Err()
}
// Vacuum cleans up the database
func (idx *ChunkIndex) Vacuum() error {
_, err := idx.db.Exec("VACUUM")
return err
}