package dedup
import (
"compress/gzip"
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"os"
"path/filepath"
"sync"
)
// ChunkStore manages content-addressed chunk storage
// Chunks are stored as: //.chunk[.gz][.enc]
type ChunkStore struct {
basePath string
compress bool
encryptionKey []byte // 32 bytes for AES-256
mu sync.RWMutex
existingChunks map[string]bool // Cache of known chunks
}
// StoreConfig holds configuration for the chunk store
type StoreConfig struct {
BasePath string
Compress bool // Enable gzip compression
EncryptionKey string // Optional: hex-encoded 32-byte key for AES-256-GCM
}
// NewChunkStore creates a new chunk store
func NewChunkStore(config StoreConfig) (*ChunkStore, error) {
store := &ChunkStore{
basePath: config.BasePath,
compress: config.Compress,
existingChunks: make(map[string]bool),
}
// Parse encryption key if provided
if config.EncryptionKey != "" {
key, err := hex.DecodeString(config.EncryptionKey)
if err != nil {
return nil, fmt.Errorf("invalid encryption key: %w", err)
}
if len(key) != 32 {
return nil, fmt.Errorf("encryption key must be 32 bytes (got %d)", len(key))
}
store.encryptionKey = key
}
// Create base directory structure
if err := os.MkdirAll(config.BasePath, 0700); err != nil {
return nil, fmt.Errorf("failed to create chunk store: %w", err)
}
// Create chunks and manifests directories
for _, dir := range []string{"chunks", "manifests"} {
if err := os.MkdirAll(filepath.Join(config.BasePath, dir), 0700); err != nil {
return nil, fmt.Errorf("failed to create %s directory: %w", dir, err)
}
}
return store, nil
}
// chunkPath returns the filesystem path for a chunk hash
// Uses 2-character prefix for directory sharding (256 subdirs)
func (s *ChunkStore) chunkPath(hash string) string {
if len(hash) < 2 {
return filepath.Join(s.basePath, "chunks", "xx", hash+s.chunkExt())
}
prefix := hash[:2]
return filepath.Join(s.basePath, "chunks", prefix, hash+s.chunkExt())
}
// chunkExt returns the file extension based on compression/encryption settings
func (s *ChunkStore) chunkExt() string {
ext := ".chunk"
if s.compress {
ext += ".gz"
}
if s.encryptionKey != nil {
ext += ".enc"
}
return ext
}
// Has checks if a chunk exists in the store
func (s *ChunkStore) Has(hash string) bool {
s.mu.RLock()
if exists, ok := s.existingChunks[hash]; ok {
s.mu.RUnlock()
return exists
}
s.mu.RUnlock()
// Check filesystem
path := s.chunkPath(hash)
_, err := os.Stat(path)
exists := err == nil
s.mu.Lock()
s.existingChunks[hash] = exists
s.mu.Unlock()
return exists
}
// Put stores a chunk, returning true if it was new (not deduplicated)
func (s *ChunkStore) Put(chunk *Chunk) (isNew bool, err error) {
// Check if already exists (deduplication!)
if s.Has(chunk.Hash) {
return false, nil
}
path := s.chunkPath(chunk.Hash)
// Create prefix directory
if err := os.MkdirAll(filepath.Dir(path), 0700); err != nil {
return false, fmt.Errorf("failed to create chunk directory: %w", err)
}
// Prepare data
data := chunk.Data
// Compress if enabled
if s.compress {
data, err = s.compressData(data)
if err != nil {
return false, fmt.Errorf("compression failed: %w", err)
}
}
// Encrypt if enabled
if s.encryptionKey != nil {
data, err = s.encryptData(data)
if err != nil {
return false, fmt.Errorf("encryption failed: %w", err)
}
}
// Write atomically (write to temp, then rename)
tmpPath := path + ".tmp"
if err := os.WriteFile(tmpPath, data, 0600); err != nil {
return false, fmt.Errorf("failed to write chunk: %w", err)
}
if err := os.Rename(tmpPath, path); err != nil {
os.Remove(tmpPath)
return false, fmt.Errorf("failed to commit chunk: %w", err)
}
// Update cache
s.mu.Lock()
s.existingChunks[chunk.Hash] = true
s.mu.Unlock()
return true, nil
}
// Get retrieves a chunk by hash
func (s *ChunkStore) Get(hash string) (*Chunk, error) {
path := s.chunkPath(hash)
data, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("failed to read chunk %s: %w", hash, err)
}
// Decrypt if encrypted
if s.encryptionKey != nil {
data, err = s.decryptData(data)
if err != nil {
return nil, fmt.Errorf("decryption failed: %w", err)
}
}
// Decompress if compressed
if s.compress {
data, err = s.decompressData(data)
if err != nil {
return nil, fmt.Errorf("decompression failed: %w", err)
}
}
// Verify hash
h := sha256.Sum256(data)
actualHash := hex.EncodeToString(h[:])
if actualHash != hash {
return nil, fmt.Errorf("chunk hash mismatch: expected %s, got %s", hash, actualHash)
}
return &Chunk{
Hash: hash,
Data: data,
Length: len(data),
}, nil
}
// Delete removes a chunk from the store
func (s *ChunkStore) Delete(hash string) error {
path := s.chunkPath(hash)
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("failed to delete chunk %s: %w", hash, err)
}
s.mu.Lock()
delete(s.existingChunks, hash)
s.mu.Unlock()
return nil
}
// Stats returns storage statistics
type StoreStats struct {
TotalChunks int64
TotalSize int64 // Bytes on disk (after compression/encryption)
UniqueSize int64 // Bytes of unique data
Directories int
}
// Stats returns statistics about the chunk store
func (s *ChunkStore) Stats() (*StoreStats, error) {
stats := &StoreStats{}
chunksDir := filepath.Join(s.basePath, "chunks")
err := filepath.Walk(chunksDir, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
if info.IsDir() {
stats.Directories++
return nil
}
stats.TotalChunks++
stats.TotalSize += info.Size()
return nil
})
return stats, err
}
// LoadIndex loads the existing chunk hashes into memory
func (s *ChunkStore) LoadIndex() error {
s.mu.Lock()
defer s.mu.Unlock()
s.existingChunks = make(map[string]bool)
chunksDir := filepath.Join(s.basePath, "chunks")
return filepath.Walk(chunksDir, func(path string, info os.FileInfo, err error) error {
if err != nil || info.IsDir() {
return err
}
// Extract hash from filename
base := filepath.Base(path)
hash := base
// Remove extensions
for _, ext := range []string{".enc", ".gz", ".chunk"} {
if len(hash) > len(ext) && hash[len(hash)-len(ext):] == ext {
hash = hash[:len(hash)-len(ext)]
}
}
if len(hash) == 64 { // SHA-256 hex length
s.existingChunks[hash] = true
}
return nil
})
}
// compressData compresses data using gzip
func (s *ChunkStore) compressData(data []byte) ([]byte, error) {
var buf []byte
w, err := gzip.NewWriterLevel((*bytesBuffer)(&buf), gzip.BestCompression)
if err != nil {
return nil, err
}
if _, err := w.Write(data); err != nil {
return nil, err
}
if err := w.Close(); err != nil {
return nil, err
}
return buf, nil
}
// bytesBuffer is a simple io.Writer that appends to a byte slice
type bytesBuffer []byte
func (b *bytesBuffer) Write(p []byte) (int, error) {
*b = append(*b, p...)
return len(p), nil
}
// decompressData decompresses gzip data
func (s *ChunkStore) decompressData(data []byte) ([]byte, error) {
r, err := gzip.NewReader(&bytesReader{data: data})
if err != nil {
return nil, err
}
defer r.Close()
return io.ReadAll(r)
}
// bytesReader is a simple io.Reader from a byte slice
type bytesReader struct {
data []byte
pos int
}
func (r *bytesReader) Read(p []byte) (int, error) {
if r.pos >= len(r.data) {
return 0, io.EOF
}
n := copy(p, r.data[r.pos:])
r.pos += n
return n, nil
}
// encryptData encrypts data using AES-256-GCM
func (s *ChunkStore) encryptData(plaintext []byte) ([]byte, error) {
block, err := aes.NewCipher(s.encryptionKey)
if err != nil {
return nil, err
}
gcm, err := cipher.NewGCM(block)
if err != nil {
return nil, err
}
nonce := make([]byte, gcm.NonceSize())
if _, err := rand.Read(nonce); err != nil {
return nil, err
}
// Prepend nonce to ciphertext
return gcm.Seal(nonce, nonce, plaintext, nil), nil
}
// decryptData decrypts AES-256-GCM encrypted data
func (s *ChunkStore) decryptData(ciphertext []byte) ([]byte, error) {
block, err := aes.NewCipher(s.encryptionKey)
if err != nil {
return nil, err
}
gcm, err := cipher.NewGCM(block)
if err != nil {
return nil, err
}
if len(ciphertext) < gcm.NonceSize() {
return nil, fmt.Errorf("ciphertext too short")
}
nonce := ciphertext[:gcm.NonceSize()]
ciphertext = ciphertext[gcm.NonceSize():]
return gcm.Open(nil, nonce, ciphertext, nil)
}