🔥 Tremora del Terra: ultimate hmac-file-server fix – final push before the drop 💾🔐

2025-07-18 12:05:49 +00:00
parent e0751bb7d6
commit a61e9c40e1
4 changed files with 1074 additions and 0 deletions
--- a/cmd/server/queue_resilience.go
+++ b/cmd/server/queue_resilience.go
@ -0,0 +1,560 @@
+// queue_resilience.go - Enhanced queue resilience implementation
+
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"time"
+)
+
+// RobustQueue is a bounded in-memory queue with optional priority lanes, an
+// in-memory overflow ("spillover") slice, backpressure on Enqueue and a
+// circuit breaker.
+//
+// Create instances with NewRobustQueue. A RobustQueue contains mutexes and a
+// WaitGroup and therefore must not be copied after first use.
+type RobustQueue struct {
+	// Counters accessed with sync/atomic. They are deliberately the first
+	// fields of the struct: 64-bit atomic operations need 64-bit alignment,
+	// and on 32-bit platforms (386, ARM, 32-bit MIPS) only the first word of
+	// an allocated struct is guaranteed to have it.
+	length    int64 // items accepted by Enqueue and not yet handed out
+	processed int64 // reported by GetStats
+	failed    int64 // reported by GetStats
+
+	// Core queue components.
+	items chan QueueItem // single lane used when PriorityLevels <= 1
+
+	// Overflow storage. spillover and spilloverActive should only be
+	// accessed while holding spilloverMutex.
+	spillover       []QueueItem
+	spilloverActive bool
+	spilloverMutex  sync.RWMutex
+
+	// Configuration supplied to NewRobustQueue.
+	config *QueueResilienceConfig
+
+	// Circuit breaker consulted by Enqueue.
+	circuitBreaker *CircuitBreaker
+
+	// Priority lanes; only allocated when config.PriorityLevels > 1.
+	highPriority   chan QueueItem
+	mediumPriority chan QueueItem
+	lowPriority    chan QueueItem
+
+	// Worker management. workerHealth is guarded by healthMutex.
+	workers      []*QueueWorker
+	workerHealth map[int]*WorkerHealth
+	healthMutex  sync.RWMutex
+
+	// Lifecycle: cancel stops the background goroutines tracked by wg.
+	ctx    context.Context
+	cancel context.CancelFunc
+	wg     sync.WaitGroup
+}
+
+// QueueItem is a single unit of work stored in a RobustQueue.
+type QueueItem struct {
+	// ID identifies the item; this file never inspects it.
+	ID          string
+	// Data is the opaque payload carried by the item.
+	Data        interface{}
+	// Priority selects the lane when priority levels are enabled:
+	// 3 is high, 2 is medium, any other value is low.
+	Priority    int
+	// EnqueueTime is overwritten with time.Now() when the item is enqueued
+	// and is used by priority aging to measure how long the item has waited.
+	EnqueueTime time.Time
+	// Retries, MaxRetries, Timeout and Context are not read by the code in
+	// this file — presumably consumed by the workers; verify against callers.
+	Retries     int
+	MaxRetries  int
+	Timeout     time.Duration
+	Context     context.Context
+}
+
+// QueueResilienceConfig holds the tunables for a RobustQueue.
+// NewRobustQueue keeps the pointer it is given, so the values are read at
+// the time each queue operation runs.
+type QueueResilienceConfig struct {
+	// Basic settings
+	// Enabled is not referenced in this file — presumably checked by the
+	// caller before building the queue; verify.
+	Enabled                   bool
+	// QueueSize is the capacity of the main channel; each priority lane gets
+	// QueueSize/3. It is also the denominator of the backpressure load figure.
+	QueueSize                 int
+	// SpilloverEnabled lets an enqueue that timed out fall back to the
+	// overflow slice instead of failing.
+	SpilloverEnabled          bool
+	// SpilloverMaxSize is the maximum number of items held in the overflow slice.
+	SpilloverMaxSize          int64
+	
+	// Timeout settings
+	// QueueOperationTimeout bounds a single Enqueue call.
+	QueueOperationTimeout     time.Duration
+	// QueueDrainTimeout is not referenced in this file — TODO confirm where it is used.
+	QueueDrainTimeout         time.Duration
+	// WorkerHealthCheckInterval is the period of the health-check ticker; a
+	// worker not seen for more than twice this interval is marked "failed".
+	WorkerHealthCheckInterval time.Duration
+	
+	// Circuit breaker settings
+	// CircuitBreakerEnabled is the on/off switch for circuit-breaker
+	// protection. NOTE(review): confirm every call site honours it.
+	CircuitBreakerEnabled     bool
+	// CircuitBreakerThreshold and CircuitBreakerTimeout are passed straight
+	// to NewCircuitBreaker.
+	CircuitBreakerThreshold   int
+	CircuitBreakerTimeout     time.Duration
+	
+	// Priority settings
+	// PriorityLevels > 1 switches the queue to three priority lanes.
+	PriorityLevels           int
+	// PriorityAgingEnabled starts the background aging goroutine.
+	PriorityAgingEnabled     bool
+	// PriorityAgingThreshold is how long an item may wait before it is
+	// promoted one lane; the aging ticker fires every half threshold.
+	PriorityAgingThreshold   time.Duration
+	
+	// Backpressure settings
+	// BackpressureThreshold is the load fraction (length/QueueSize) above
+	// which Enqueue delays the caller.
+	BackpressureThreshold    float64
+	// EmergencyModeThreshold is not referenced in this file — TODO confirm usage.
+	EmergencyModeThreshold   float64
+}
+
+// CircuitBreaker implements the circuit breaker pattern for queue operations.
+// State changes are driven by CanExecute, RecordSuccess and RecordFailure.
+type CircuitBreaker struct {
+	// failures counts failures recorded since the last success; accessed
+	// with sync/atomic.
+	failures    int64
+	// lastFailure is the time of the most recent RecordFailure call.
+	lastFailure time.Time
+	state       int32 // 0=closed, 1=open, 2=half-open
+	// threshold is the failure count at which the circuit opens.
+	threshold   int
+	// timeout is how long after lastFailure an open circuit may move to half-open.
+	timeout     time.Duration
+	// mutex serialises the Record* methods against CanExecute.
+	mutex       sync.RWMutex
+}
+
+// WorkerHealth tracks the health of one worker. Entries live in
+// RobustQueue.workerHealth and are evaluated by checkWorkerHealth.
+type WorkerHealth struct {
+	// ID is the worker identifier used in log messages.
+	ID              int
+	// LastSeen is compared against the health-check interval to detect
+	// unresponsive workers.
+	LastSeen        time.Time
+	// ProcessedCount and ErrorCount feed the error-rate check
+	// (ErrorCount > ProcessedCount/2).
+	ProcessedCount  int64
+	ErrorCount      int64
+	// AverageTime is not read in this file — presumably maintained by the
+	// worker loop; verify.
+	AverageTime     time.Duration
+	Status          string // "healthy", "slow", "failed"
+}
+
+// QueueWorker represents a queue worker.
+// NOTE(review): nothing in this file constructs or runs a QueueWorker; the
+// field descriptions below are inferred from their types — confirm against
+// the code that starts workers.
+type QueueWorker struct {
+	// ID is the worker's identifier.
+	ID       int
+	// queue is the RobustQueue this worker belongs to.
+	queue    *RobustQueue
+	// health is the worker's health record.
+	health   *WorkerHealth
+	// ctx and cancel presumably control the worker's lifetime.
+	ctx      context.Context
+	cancel   context.CancelFunc
+}
+
+// NewRobustQueue builds a RobustQueue from config and starts its background
+// goroutines (health monitoring, priority aging, spillover management).
+// Call Shutdown to stop them.
+//
+// config must be non-nil; the queue keeps the pointer and reads it on every
+// operation. A negative QueueSize is treated as zero instead of panicking,
+// and when priority lanes are enabled each lane gets a capacity of at least
+// one so that a small QueueSize does not produce unbuffered lanes on which
+// every Enqueue would block until its timeout.
+func NewRobustQueue(config *QueueResilienceConfig) *RobustQueue {
+	ctx, cancel := context.WithCancel(context.Background())
+
+	// make panics on a negative channel capacity.
+	size := config.QueueSize
+	if size < 0 {
+		size = 0
+	}
+
+	queue := &RobustQueue{
+		items:          make(chan QueueItem, size),
+		config:         config,
+		circuitBreaker: NewCircuitBreaker(config.CircuitBreakerThreshold, config.CircuitBreakerTimeout),
+		workerHealth:   make(map[int]*WorkerHealth),
+		ctx:            ctx,
+		cancel:         cancel,
+	}
+
+	// Initialize priority lanes if enabled; the capacity is split three ways.
+	if config.PriorityLevels > 1 {
+		laneSize := size / 3
+		if laneSize < 1 {
+			laneSize = 1
+		}
+		queue.highPriority = make(chan QueueItem, laneSize)
+		queue.mediumPriority = make(chan QueueItem, laneSize)
+		queue.lowPriority = make(chan QueueItem, laneSize)
+	}
+
+	// Start background routines.
+	queue.startHealthMonitoring()
+	queue.startPriorityAging()
+	queue.startSpilloverManager()
+
+	return queue
+}
+
+// Enqueue adds item to the queue, waiting at most
+// config.QueueOperationTimeout (backpressure delay included).
+//
+// The circuit breaker is consulted and updated only when
+// config.CircuitBreakerEnabled is set. When the queue load
+// (length/QueueSize) exceeds config.BackpressureThreshold the caller is
+// delayed by load seconds before the enqueue is attempted.
+//
+// It returns an error when the breaker is open, when the operation times out
+// or the queue is shut down, or when the spillover is full.
+func (q *RobustQueue) Enqueue(item QueueItem) error {
+	useBreaker := q.config.CircuitBreakerEnabled
+	if useBreaker && !q.circuitBreaker.CanExecute() {
+		return errors.New("circuit breaker is open")
+	}
+
+	// One deadline covers both the backpressure delay and the enqueue itself.
+	ctx, cancel := context.WithTimeout(q.ctx, q.config.QueueOperationTimeout)
+	defer cancel()
+
+	// Backpressure. Skipped when QueueSize is not positive: the load ratio
+	// would be a division by zero (Inf/NaN) and the resulting delay undefined.
+	if size := q.config.QueueSize; size > 0 {
+		currentLoad := float64(atomic.LoadInt64(&q.length)) / float64(size)
+		if currentLoad > q.config.BackpressureThreshold {
+			timer := time.NewTimer(time.Duration(currentLoad * float64(time.Second)))
+			select {
+			case <-timer.C:
+			case <-ctx.Done():
+				// Release the timer instead of leaving it pending.
+				timer.Stop()
+				return ctx.Err()
+			}
+		}
+	}
+
+	// Try to enqueue with priority support.
+	if err := q.enqueueWithPriority(ctx, item); err != nil {
+		if useBreaker {
+			q.circuitBreaker.RecordFailure()
+		}
+		return err
+	}
+
+	if useBreaker {
+		q.circuitBreaker.RecordSuccess()
+	}
+	atomic.AddInt64(&q.length, 1)
+	return nil
+}
+
+// enqueueWithPriority handles priority-based enqueueing
+func (q *RobustQueue) enqueueWithPriority(ctx context.Context, item QueueItem) error {
+	// Set enqueue time
+	item.EnqueueTime = time.Now()
+	
+	// Choose appropriate queue based on priority
+	var targetQueue chan QueueItem
+	if q.config.PriorityLevels > 1 {
+		switch item.Priority {
+		case 3:
+			targetQueue = q.highPriority
+		case 2:
+			targetQueue = q.mediumPriority
+		default:
+			targetQueue = q.lowPriority
+		}
+	} else {
+		targetQueue = q.items
+	}
+	
+	// Try to enqueue
+	select {
+	case targetQueue <- item:
+		return nil
+	case <-ctx.Done():
+		// If primary queue is full, try spillover
+		if q.config.SpilloverEnabled {
+			return q.spilloverEnqueue(item)
+		}
+		return ctx.Err()
+	}
+}
+
+// spilloverEnqueue appends item to the in-memory overflow slice and marks the
+// spillover as active. It fails once the slice already holds
+// config.SpilloverMaxSize items.
+func (q *RobustQueue) spilloverEnqueue(item QueueItem) error {
+	q.spilloverMutex.Lock()
+	defer q.spilloverMutex.Unlock()
+
+	if used := int64(len(q.spillover)); used >= q.config.SpilloverMaxSize {
+		return errors.New("spillover queue is full")
+	}
+
+	q.spillover, q.spilloverActive = append(q.spillover, item), true
+	return nil
+}
+
+// Dequeue removes and returns the next item, waiting at most timeout.
+//
+// With priority levels enabled the priority lanes are tried first. If nothing
+// arrives before the deadline (or the queue is shut down) the overflow slice
+// is used as a last resort; an error is returned when that is empty too.
+//
+// Every successful return decrements the length counter, including items
+// taken from the spillover: Enqueue counts spilled items, so leaving them out
+// here would make the reported length (and the backpressure computed from
+// it) grow without bound.
+func (q *RobustQueue) Dequeue(timeout time.Duration) (*QueueItem, error) {
+	ctx, cancel := context.WithTimeout(q.ctx, timeout)
+	defer cancel()
+
+	// Try priority lanes first.
+	if q.config.PriorityLevels > 1 {
+		if item, err := q.dequeueWithPriority(ctx); err == nil {
+			atomic.AddInt64(&q.length, -1)
+			return item, nil
+		}
+	}
+
+	// Try the main lane.
+	select {
+	case item := <-q.items:
+		atomic.AddInt64(&q.length, -1)
+		return &item, nil
+	case <-ctx.Done():
+		// Last resort: the overflow slice.
+		item, err := q.spilloverDequeue()
+		if err != nil {
+			return nil, err
+		}
+		atomic.AddInt64(&q.length, -1)
+		return item, nil
+	}
+}
+
+// dequeueWithPriority returns the next item from the priority lanes,
+// preferring high over medium over low, or ctx.Err() once ctx is done.
+//
+// Items that are already queued are taken in strict priority order. If all
+// lanes are empty the call waits on every lane at once, so an item arriving
+// in the high or medium lane during the wait is delivered immediately rather
+// than being ignored until a low-priority item shows up or ctx expires.
+func (q *RobustQueue) dequeueWithPriority(ctx context.Context) (*QueueItem, error) {
+	// Fast path: strict priority order over what is already queued.
+	select {
+	case item := <-q.highPriority:
+		return &item, nil
+	default:
+	}
+
+	select {
+	case item := <-q.mediumPriority:
+		return &item, nil
+	default:
+	}
+
+	// Slow path: wait for whichever lane receives an item first.
+	select {
+	case item := <-q.highPriority:
+		return &item, nil
+	case item := <-q.mediumPriority:
+		return &item, nil
+	case item := <-q.lowPriority:
+		return &item, nil
+	case <-ctx.Done():
+		return nil, ctx.Err()
+	}
+}
+
+// spilloverDequeue pops the oldest item from the in-memory overflow slice.
+// It returns an error when the slice is empty and clears the spilloverActive
+// flag when the last item is removed.
+func (q *RobustQueue) spilloverDequeue() (*QueueItem, error) {
+	q.spilloverMutex.Lock()
+	defer q.spilloverMutex.Unlock()
+
+	if len(q.spillover) == 0 {
+		return nil, errors.New("no items available")
+	}
+
+	item := q.spillover[0]
+	// Clear the slot before reslicing; otherwise the backing array keeps the
+	// popped item (and its Data payload) reachable.
+	q.spillover[0] = QueueItem{}
+	q.spillover = q.spillover[1:]
+
+	if len(q.spillover) == 0 {
+		// Drop the drained backing array so it can be collected.
+		q.spillover = nil
+		q.spilloverActive = false
+	}
+
+	return &item, nil
+}
+
+// startHealthMonitoring launches a goroutine that calls checkWorkerHealth
+// every config.WorkerHealthCheckInterval until the queue context is
+// cancelled. The goroutine is tracked by q.wg.
+//
+// Monitoring is skipped when the interval is not positive: time.NewTicker
+// panics for such values, and a panic inside the goroutine would bring down
+// the whole process.
+func (q *RobustQueue) startHealthMonitoring() {
+	interval := q.config.WorkerHealthCheckInterval
+	if interval <= 0 {
+		return
+	}
+
+	q.wg.Add(1)
+	go func() {
+		defer q.wg.Done()
+		ticker := time.NewTicker(interval)
+		defer ticker.Stop()
+
+		for {
+			select {
+			case <-ticker.C:
+				q.checkWorkerHealth()
+			case <-q.ctx.Done():
+				return
+			}
+		}
+	}()
+}
+
+// checkWorkerHealth re-evaluates the Status of every registered worker:
+//
+//   - "failed"  when the worker has not been seen for more than twice the
+//     health-check interval,
+//   - "slow"    when its error count exceeds half of its processed count,
+//   - "healthy" otherwise.
+//
+// The write lock is taken because Status is modified; a read lock would let
+// this race with any concurrent reader of the health records.
+func (q *RobustQueue) checkWorkerHealth() {
+	q.healthMutex.Lock()
+	defer q.healthMutex.Unlock()
+
+	now := time.Now()
+	for _, health := range q.workerHealth {
+		if health == nil {
+			continue // tolerate placeholder entries
+		}
+
+		switch {
+		case now.Sub(health.LastSeen) > q.config.WorkerHealthCheckInterval*2:
+			health.Status = "failed"
+			log.Warnf("Worker %d is unresponsive", health.ID)
+		case health.ErrorCount > health.ProcessedCount/2:
+			health.Status = "slow"
+			log.Warnf("Worker %d has high error rate", health.ID)
+		default:
+			health.Status = "healthy"
+		}
+	}
+}
+
+// startPriorityAging launches a goroutine that promotes long-waiting items to
+// prevent starvation. It runs ageQueueItems every half
+// config.PriorityAgingThreshold until the queue context is cancelled and is
+// tracked by q.wg.
+//
+// Nothing is started when aging is disabled or when the resulting tick
+// interval is not positive (time.NewTicker panics for such values).
+func (q *RobustQueue) startPriorityAging() {
+	if !q.config.PriorityAgingEnabled {
+		return
+	}
+
+	interval := q.config.PriorityAgingThreshold / 2
+	if interval <= 0 {
+		return
+	}
+
+	q.wg.Add(1)
+	go func() {
+		defer q.wg.Done()
+		ticker := time.NewTicker(interval)
+		defer ticker.Stop()
+
+		for {
+			select {
+			case <-ticker.C:
+				q.ageQueueItems()
+			case <-q.ctx.Done():
+				return
+			}
+		}
+	}()
+}
+
+// ageQueueItems runs one aging pass over the priority lanes, using a single
+// timestamp for the whole pass. Medium is processed before low, so an item
+// promoted out of the low lane is not examined again in the same pass.
+func (q *RobustQueue) ageQueueItems() {
+	now := time.Now()
+
+	promotions := [...]struct{ from, to chan QueueItem }{
+		{from: q.mediumPriority, to: q.highPriority}, // medium -> high
+		{from: q.lowPriority, to: q.mediumPriority},  // low -> medium
+	}
+	for _, p := range promotions {
+		q.ageSpecificQueue(p.from, p.to, now)
+	}
+}
+
+// ageSpecificQueue ages items from source to target queue
+func (q *RobustQueue) ageSpecificQueue(source, target chan QueueItem, now time.Time) {
+	for {
+		select {
+		case item := <-source:
+			if now.Sub(item.EnqueueTime) > q.config.PriorityAgingThreshold {
+				// Age up the item
+				item.Priority++
+				select {
+				case target <- item:
+				default:
+					// Target queue is full, put it back
+					select {
+					case source <- item:
+					default:
+						// Both queues full, move to spillover
+						q.spilloverEnqueue(item)
+					}
+				}
+			} else {
+				// Put it back, not old enough yet
+				select {
+				case source <- item:
+				default:
+					q.spilloverEnqueue(item)
+				}
+			}
+		default:
+			return // No more items to age
+		}
+	}
+}
+
+// startSpilloverManager launches a goroutine, tracked by q.wg, that calls
+// manageSpillover every 30 seconds until the queue context is cancelled.
+func (q *RobustQueue) startSpilloverManager() {
+	q.wg.Add(1)
+
+	run := func() {
+		defer q.wg.Done()
+
+		tick := time.NewTicker(30 * time.Second)
+		defer tick.Stop()
+
+		stop := q.ctx.Done()
+		for {
+			select {
+			case <-stop:
+				return
+			case <-tick.C:
+				q.manageSpillover()
+			}
+		}
+	}
+	go run()
+}
+
+// manageSpillover moves up to ten items per call from the overflow slice
+// back into the lane matching their priority (or the main lane when priority
+// levels are disabled). Items whose lane is still full stay in the spillover
+// in their original order.
+//
+// The spilloverActive flag is checked only after the lock is held, since it
+// is written under the same lock elsewhere.
+func (q *RobustQueue) manageSpillover() {
+	const maxMovesPerPass = 10
+
+	q.spilloverMutex.Lock()
+	defer q.spilloverMutex.Unlock()
+
+	if !q.spilloverActive {
+		return
+	}
+
+	// Filter in place: items that could not be moved are compacted to the
+	// front of the same backing array in a single pass.
+	kept := q.spillover[:0]
+	moved := 0
+	for _, item := range q.spillover {
+		if moved < maxMovesPerPass {
+			var targetQueue chan QueueItem
+			if q.config.PriorityLevels > 1 {
+				switch item.Priority {
+				case 3:
+					targetQueue = q.highPriority
+				case 2:
+					targetQueue = q.mediumPriority
+				default:
+					targetQueue = q.lowPriority
+				}
+			} else {
+				targetQueue = q.items
+			}
+
+			select {
+			case targetQueue <- item:
+				moved++
+				continue
+			default:
+				// Lane still full, try again on a later pass.
+			}
+		}
+		kept = append(kept, item)
+	}
+
+	// Clear the vacated tail so moved items are not kept alive by the array.
+	for i := len(kept); i < len(q.spillover); i++ {
+		q.spillover[i] = QueueItem{}
+	}
+	q.spillover = kept
+
+	if len(q.spillover) == 0 {
+		q.spilloverActive = false
+	}
+
+	if moved > 0 {
+		log.Debugf("Moved %d items from spillover back to memory queues", moved)
+	}
+}
+
+// NewCircuitBreaker returns a closed circuit breaker that opens once the
+// recorded failure count reaches threshold and may be probed again after
+// timeout has elapsed since the last failure.
+func NewCircuitBreaker(threshold int, timeout time.Duration) *CircuitBreaker {
+	cb := new(CircuitBreaker)
+	cb.threshold, cb.timeout = threshold, timeout
+	return cb
+}
+
+// CanExecute reports whether an operation may proceed.
+//
+// A closed or half-open breaker always allows execution. An open breaker
+// allows it only when more than the configured timeout has passed since the
+// last failure and this call wins the transition to half-open.
+func (cb *CircuitBreaker) CanExecute() bool {
+	cb.mutex.RLock()
+	defer cb.mutex.RUnlock()
+
+	switch atomic.LoadInt32(&cb.state) {
+	case 1: // open
+		cooledDown := time.Since(cb.lastFailure) > cb.timeout
+		// Only the caller whose CAS succeeds moves the breaker to half-open.
+		return cooledDown && atomic.CompareAndSwapInt32(&cb.state, 1, 2)
+	default: // closed (0) or half-open (2)
+		return true
+	}
+}
+
+// RecordSuccess records a successful operation
+func (cb *CircuitBreaker) RecordSuccess() {
+	cb.mutex.Lock()
+	defer cb.mutex.Unlock()
+	
+	atomic.StoreInt64(&cb.failures, 0)
+	atomic.StoreInt32(&cb.state, 0) // Close circuit
+}
+
+// RecordFailure counts one failed operation, remembers when it happened and
+// opens the circuit once the failure count has reached the threshold.
+func (cb *CircuitBreaker) RecordFailure() {
+	cb.mutex.Lock()
+	defer cb.mutex.Unlock()
+
+	count := atomic.AddInt64(&cb.failures, 1)
+	cb.lastFailure = time.Now()
+
+	if count < int64(cb.threshold) {
+		return
+	}
+	atomic.StoreInt32(&cb.state, 1) // open
+}
+
+// GetStats returns a snapshot of the queue counters, the spillover state and
+// the circuit breaker state (0=closed, 1=open, 2=half-open).
+//
+// The spillover fields are read under spilloverMutex because they are
+// modified under that lock by the enqueue, dequeue and manager paths; the
+// remaining values are read atomically.
+func (q *RobustQueue) GetStats() map[string]interface{} {
+	q.spilloverMutex.RLock()
+	spilloverActive := q.spilloverActive
+	spilloverSize := len(q.spillover)
+	q.spilloverMutex.RUnlock()
+
+	return map[string]interface{}{
+		"length":           atomic.LoadInt64(&q.length),
+		"processed":        atomic.LoadInt64(&q.processed),
+		"failed":           atomic.LoadInt64(&q.failed),
+		"spillover_active": spilloverActive,
+		"spillover_size":   spilloverSize,
+		"circuit_state":    atomic.LoadInt32(&q.circuitBreaker.state),
+		"circuit_failures": atomic.LoadInt64(&q.circuitBreaker.failures),
+	}
+}
+
+// Shutdown cancels the queue context and waits up to timeout for the
+// background goroutines to exit. It returns an error if they have not
+// finished in time.
+func (q *RobustQueue) Shutdown(timeout time.Duration) error {
+	log.Info("Starting queue shutdown...")
+
+	// Signal every background routine to stop.
+	q.cancel()
+
+	// Turn the WaitGroup into something a select can wait on.
+	finished := make(chan struct{})
+	go func() {
+		defer close(finished)
+		q.wg.Wait()
+	}()
+
+	deadline := time.After(timeout)
+	select {
+	case <-deadline:
+		log.Warn("Queue shutdown timed out")
+		return errors.New("shutdown timeout")
+	case <-finished:
+		log.Info("Queue shutdown completed successfully")
+		return nil
+	}
+}