🔥 Tremora del Terra: ultimate hmac-file-server fix – final push before the drop 💾🔐

This commit is contained in:
2025-07-18 12:05:49 +00:00
parent e0751bb7d6
commit a61e9c40e1
4 changed files with 1074 additions and 0 deletions

View File

@ -0,0 +1,560 @@
// queue_resilience.go - Enhanced queue resilience implementation
package main
import (
"context"
"errors"
"fmt"
"sync"
"sync/atomic"
"time"
)
// RobustQueue represents an enhanced queue with timeout resilience.
// It layers several protections over a plain buffered channel: optional
// priority channels, an in-memory spillover buffer for overflow, a
// circuit breaker gating enqueues, and background goroutines for worker
// health checks, priority aging, and spillover draining.
type RobustQueue struct {
	// Core queue components
	items chan QueueItem // main queue; used when config.PriorityLevels <= 1
	spillover []QueueItem // overflow buffer; in-memory despite "disk spillover" wording elsewhere — NOTE(review): no disk persistence is implemented here, confirm intent
	spilloverMutex sync.RWMutex // guards spillover and spilloverActive
	// Configuration
	config *QueueResilienceConfig // read-only after construction
	// State tracking
	length int64 // items currently queued, including spillover (atomic)
	processed int64 // atomic; NOTE(review): never incremented in the visible code
	failed int64 // atomic; NOTE(review): never incremented in the visible code
	spilloverActive bool // true while spillover holds items; guarded by spilloverMutex
	// Circuit breaker
	circuitBreaker *CircuitBreaker // admission control for Enqueue
	// Priority queues
	highPriority chan QueueItem // nil unless config.PriorityLevels > 1
	mediumPriority chan QueueItem // nil unless config.PriorityLevels > 1
	lowPriority chan QueueItem // nil unless config.PriorityLevels > 1
	// Worker management
	workers []*QueueWorker // NOTE(review): never populated in the visible code — confirm worker pool lives elsewhere
	workerHealth map[int]*WorkerHealth // keyed by worker ID
	healthMutex sync.RWMutex // guards workerHealth
	// Context and lifecycle
	ctx context.Context // cancelled by Shutdown to stop background goroutines
	cancel context.CancelFunc
	wg sync.WaitGroup // tracks background goroutines for Shutdown
}
// QueueItem represents a single unit of work in the queue.
type QueueItem struct {
	ID string // caller-assigned identifier
	Data interface{} // opaque payload; semantics owned by the consumer
	Priority int // 3 = high, 2 = medium, anything else = low (see enqueueWithPriority)
	EnqueueTime time.Time // stamped by enqueueWithPriority; drives priority aging
	Retries int // NOTE(review): not read or updated in the visible code — confirm usage elsewhere
	MaxRetries int // NOTE(review): not enforced in the visible code
	Timeout time.Duration // per-item timeout; not enforced in the visible code
	Context context.Context // NOTE(review): storing a Context in a struct is discouraged; unused in the visible code
}
// QueueResilienceConfig holds the resilience configuration.
// All fields are read-only once the queue is constructed.
type QueueResilienceConfig struct {
	// Basic settings
	Enabled bool // NOTE(review): not consulted in the visible code
	QueueSize int // capacity of the main channel; each priority channel gets QueueSize/3
	SpilloverEnabled bool // allow overflow into the in-memory spillover buffer
	SpilloverMaxSize int64 // maximum number of items held in spillover
	// Timeout settings
	QueueOperationTimeout time.Duration // per-Enqueue deadline
	QueueDrainTimeout time.Duration // NOTE(review): unused here — Shutdown takes its own timeout parameter
	WorkerHealthCheckInterval time.Duration // health tick; a worker silent for 2x this is marked "failed"
	// Circuit breaker settings
	CircuitBreakerEnabled bool // NOTE(review): not consulted — the breaker is always built and checked
	CircuitBreakerThreshold int // failures before the circuit opens
	CircuitBreakerTimeout time.Duration // open-state cool-down before a half-open probe
	// Priority settings
	PriorityLevels int // > 1 enables the three priority channels
	PriorityAgingEnabled bool // promote old items to prevent starvation
	PriorityAgingThreshold time.Duration // age at which an item is promoted one level
	// Backpressure settings
	BackpressureThreshold float64 // load ratio (length/QueueSize) above which Enqueue delays
	EmergencyModeThreshold float64 // NOTE(review): unused in the visible code
}
// CircuitBreaker implements circuit breaker pattern for queue operations.
// failures and state are manipulated with sync/atomic while mutex
// serializes the compound update sequences and protects lastFailure.
type CircuitBreaker struct {
	failures int64 // consecutive failure count (atomic)
	lastFailure time.Time // time of most recent failure; guarded by mutex
	state int32 // 0=closed, 1=open, 2=half-open
	threshold int // failures needed to open the circuit
	timeout time.Duration // how long the circuit stays open before a probe is allowed
	mutex sync.RWMutex
}
// WorkerHealth tracks individual worker health, keyed by worker ID in
// RobustQueue.workerHealth. Status transitions happen in checkWorkerHealth.
type WorkerHealth struct {
	ID int
	LastSeen time.Time // last heartbeat; entries stale by 2x the check interval are marked failed
	ProcessedCount int64
	ErrorCount int64 // marked "slow" when this exceeds half of ProcessedCount
	AverageTime time.Duration // NOTE(review): never computed in the visible code
	Status string // "healthy", "slow", "failed"
}
// QueueWorker represents a queue worker.
// NOTE(review): nothing in the visible code constructs or runs workers;
// this appears to be scaffolding for a worker pool implemented elsewhere — confirm.
type QueueWorker struct {
	ID int
	queue *RobustQueue // back-reference to the owning queue
	health *WorkerHealth // this worker's health record
	ctx context.Context
	cancel context.CancelFunc
}
// NewRobustQueue creates a new robust queue with timeout resilience.
// It wires up the circuit breaker and (optionally) the three priority
// channels, then starts the health-monitoring, priority-aging, and
// spillover-manager goroutines, which run until Shutdown is called.
func NewRobustQueue(config *QueueResilienceConfig) *RobustQueue {
	ctx, cancel := context.WithCancel(context.Background())
	queue := &RobustQueue{
		items:          make(chan QueueItem, config.QueueSize),
		config:         config,
		circuitBreaker: NewCircuitBreaker(config.CircuitBreakerThreshold, config.CircuitBreakerTimeout),
		workerHealth:   make(map[int]*WorkerHealth),
		ctx:            ctx,
		cancel:         cancel,
	}
	// Initialize priority queues if enabled
	if config.PriorityLevels > 1 {
		// BUG FIX: QueueSize/3 is zero when QueueSize < 3, which produced
		// unbuffered... actually zero-capacity channels with no receivers,
		// making every priority enqueue block until timeout. Clamp to >= 1.
		perPriority := config.QueueSize / 3
		if perPriority < 1 {
			perPriority = 1
		}
		queue.highPriority = make(chan QueueItem, perPriority)
		queue.mediumPriority = make(chan QueueItem, perPriority)
		queue.lowPriority = make(chan QueueItem, perPriority)
	}
	// Start background routines
	queue.startHealthMonitoring()
	queue.startPriorityAging()
	queue.startSpilloverManager()
	return queue
}
// Enqueue adds an item to the queue with timeout resilience.
// Order of checks: circuit breaker admission, backpressure delay
// proportional to current load, then a priority-aware enqueue bounded
// by QueueOperationTimeout. Success and failure are both reported back
// to the circuit breaker.
func (q *RobustQueue) Enqueue(item QueueItem) error {
	// Circuit breaker admission control.
	if !q.circuitBreaker.CanExecute() {
		return errors.New("circuit breaker is open")
	}
	// Bound the whole operation by the configured timeout.
	ctx, cancel := context.WithTimeout(q.ctx, q.config.QueueOperationTimeout)
	defer cancel()
	// Backpressure: delay callers proportionally to how full we are.
	load := float64(atomic.LoadInt64(&q.length)) / float64(q.config.QueueSize)
	if load > q.config.BackpressureThreshold {
		delay := time.Duration(load * float64(time.Second))
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(delay):
			// Delay served; proceed to enqueue.
		}
	}
	// Hand off to the priority-aware enqueue path.
	if err := q.enqueueWithPriority(ctx, item); err != nil {
		q.circuitBreaker.RecordFailure()
		return err
	}
	q.circuitBreaker.RecordSuccess()
	atomic.AddInt64(&q.length, 1)
	return nil
}
// enqueueWithPriority handles priority-based enqueueing.
// Stamps the item's EnqueueTime, picks the destination channel from the
// item's Priority (3=high, 2=medium, otherwise low; or the main queue
// when priorities are disabled), and falls back to spillover when the
// send cannot complete before ctx expires.
func (q *RobustQueue) enqueueWithPriority(ctx context.Context, item QueueItem) error {
	item.EnqueueTime = time.Now()
	// Select the destination channel.
	dest := q.items
	if q.config.PriorityLevels > 1 {
		if item.Priority == 3 {
			dest = q.highPriority
		} else if item.Priority == 2 {
			dest = q.mediumPriority
		} else {
			dest = q.lowPriority
		}
	}
	select {
	case dest <- item:
		return nil
	case <-ctx.Done():
		// Send timed out (queue likely full); spill over if allowed.
		if q.config.SpilloverEnabled {
			return q.spilloverEnqueue(item)
		}
		return ctx.Err()
	}
}
// spilloverEnqueue appends an item to the in-memory spillover buffer,
// used when the channel queues are full. Fails when the buffer already
// holds SpilloverMaxSize items.
func (q *RobustQueue) spilloverEnqueue(item QueueItem) error {
	q.spilloverMutex.Lock()
	defer q.spilloverMutex.Unlock()
	// Respect the configured capacity limit.
	if remaining := q.config.SpilloverMaxSize - int64(len(q.spillover)); remaining <= 0 {
		return errors.New("spillover queue is full")
	}
	q.spillover = append(q.spillover, item)
	q.spilloverActive = true
	return nil
}
// Dequeue removes an item from the queue with timeout handling.
// Tries the priority channels first (when enabled), then the main
// channel, and finally the spillover buffer once the timeout expires.
//
// BUG FIX: the original never decremented q.length on the spillover
// path, even though Enqueue counts spillover items, so the length
// counter drifted upward permanently. Decrement on every successful
// dequeue, spillover included.
func (q *RobustQueue) Dequeue(timeout time.Duration) (*QueueItem, error) {
	ctx, cancel := context.WithTimeout(q.ctx, timeout)
	defer cancel()
	// Try priority queues first
	if q.config.PriorityLevels > 1 {
		if item, err := q.dequeueWithPriority(ctx); err == nil {
			atomic.AddInt64(&q.length, -1)
			return item, nil
		}
	}
	// Try main queue
	select {
	case item := <-q.items:
		atomic.AddInt64(&q.length, -1)
		return &item, nil
	case <-ctx.Done():
		// Try spillover as last resort
		item, err := q.spilloverDequeue()
		if err == nil {
			atomic.AddInt64(&q.length, -1)
		}
		return item, err
	}
}
// dequeueWithPriority handles priority-based dequeuing.
// High and medium channels are polled non-blocking in priority order;
// the low channel is then awaited until ctx expires.
func (q *RobustQueue) dequeueWithPriority(ctx context.Context) (*QueueItem, error) {
	// Non-blocking poll of the two higher tiers, highest first.
	for _, ch := range []chan QueueItem{q.highPriority, q.mediumPriority} {
		select {
		case item := <-ch:
			return &item, nil
		default:
			// Tier empty; fall through to the next one.
		}
	}
	// Block on the low tier until something arrives or the deadline hits.
	select {
	case item := <-q.lowPriority:
		return &item, nil
	case <-ctx.Done():
		return nil, ctx.Err()
	}
}
// spilloverDequeue pops the oldest item from the in-memory spillover
// buffer (FIFO order), clearing the active flag when the buffer drains.
func (q *RobustQueue) spilloverDequeue() (*QueueItem, error) {
	q.spilloverMutex.Lock()
	defer q.spilloverMutex.Unlock()
	if len(q.spillover) == 0 {
		return nil, errors.New("no items available")
	}
	head := q.spillover[0]
	q.spillover = q.spillover[1:]
	if len(q.spillover) == 0 {
		q.spilloverActive = false
	}
	return &head, nil
}
// startHealthMonitoring launches a goroutine that evaluates worker
// health every WorkerHealthCheckInterval until the queue context is
// cancelled. Registered on q.wg so Shutdown can wait for it.
func (q *RobustQueue) startHealthMonitoring() {
	q.wg.Add(1)
	go func() {
		defer q.wg.Done()
		ticker := time.NewTicker(q.config.WorkerHealthCheckInterval)
		defer ticker.Stop()
		for {
			select {
			case <-q.ctx.Done():
				// Queue is shutting down.
				return
			case <-ticker.C:
				q.checkWorkerHealth()
			}
		}
	}()
}
// checkWorkerHealth evaluates the health of all workers and updates
// each WorkerHealth.Status in place: "failed" if the worker has been
// silent for two check intervals, "slow" if errors exceed half of the
// processed count, "healthy" otherwise.
//
// BUG FIX: the original held only healthMutex.RLock while *writing*
// health.Status, which is a data race against any concurrent reader of
// workerHealth. Take the write lock instead.
func (q *RobustQueue) checkWorkerHealth() {
	q.healthMutex.Lock()
	defer q.healthMutex.Unlock()
	now := time.Now()
	for _, health := range q.workerHealth {
		switch {
		case now.Sub(health.LastSeen) > q.config.WorkerHealthCheckInterval*2:
			// Missed two consecutive check intervals.
			health.Status = "failed"
			log.Warnf("Worker %d is unresponsive", health.ID)
		case health.ErrorCount > health.ProcessedCount/2:
			health.Status = "slow"
			log.Warnf("Worker %d has high error rate", health.ID)
		default:
			health.Status = "healthy"
		}
	}
}
// startPriorityAging ages lower priority items to prevent starvation.
// No-op unless PriorityAgingEnabled; otherwise a goroutine promotes old
// items every PriorityAgingThreshold/2 until the queue context ends.
//
// BUG FIX: time.NewTicker panics on a non-positive duration, so enabling
// aging with a zero PriorityAgingThreshold crashed the process. A
// non-positive interval now disables aging instead.
func (q *RobustQueue) startPriorityAging() {
	if !q.config.PriorityAgingEnabled {
		return
	}
	interval := q.config.PriorityAgingThreshold / 2
	if interval <= 0 {
		// Misconfigured threshold; disable aging rather than panic.
		return
	}
	q.wg.Add(1)
	go func() {
		defer q.wg.Done()
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				q.ageQueueItems()
			case <-q.ctx.Done():
				return
			}
		}
	}()
}
// ageQueueItems promotes old items one priority level per pass:
// medium -> high first, then low -> medium.
func (q *RobustQueue) ageQueueItems() {
	now := time.Now()
	hops := []struct {
		from, to chan QueueItem
	}{
		{q.mediumPriority, q.highPriority},
		{q.lowPriority, q.mediumPriority},
	}
	for _, hop := range hops {
		q.ageSpecificQueue(hop.from, hop.to, now)
	}
}
// ageSpecificQueue drains items from source, promoting any item older
// than PriorityAgingThreshold into target; younger items are returned
// to source. Stops as soon as source is momentarily empty.
//
// BUG FIX: the original ignored the error from spilloverEnqueue, so an
// item pulled off a full queue could be silently dropped when spillover
// was also at capacity. We now force-append as a last resort, preferring
// a small overshoot of SpilloverMaxSize over losing work.
func (q *RobustQueue) ageSpecificQueue(source, target chan QueueItem, now time.Time) {
	for {
		select {
		case item := <-source:
			if now.Sub(item.EnqueueTime) > q.config.PriorityAgingThreshold {
				// Old enough: promote one level and move to target.
				item.Priority++
				select {
				case target <- item:
				default:
					// Target queue is full; try to put it back.
					select {
					case source <- item:
					default:
						// Both queues full; item must not be lost.
						q.spillOrForce(item)
					}
				}
			} else {
				// Not old enough yet; re-queue or spill.
				select {
				case source <- item:
				default:
					q.spillOrForce(item)
				}
			}
		default:
			return // No more items to age
		}
	}
}

// spillOrForce spills an item, bypassing the spillover size cap as a
// last resort so queue items are never silently dropped.
func (q *RobustQueue) spillOrForce(item QueueItem) {
	if err := q.spilloverEnqueue(item); err != nil {
		q.spilloverMutex.Lock()
		q.spillover = append(q.spillover, item)
		q.spilloverActive = true
		q.spilloverMutex.Unlock()
	}
}
// startSpilloverManager launches a goroutine that periodically tries to
// drain the spillover buffer back into the channel queues, until the
// queue context is cancelled. Registered on q.wg for Shutdown.
func (q *RobustQueue) startSpilloverManager() {
	q.wg.Add(1)
	go func() {
		defer q.wg.Done()
		const drainInterval = 30 * time.Second
		ticker := time.NewTicker(drainInterval)
		defer ticker.Stop()
		for {
			select {
			case <-q.ctx.Done():
				// Queue is shutting down.
				return
			case <-ticker.C:
				q.manageSpillover()
			}
		}
	}()
}
// manageSpillover moves up to 10 spillover items per invocation back
// into their priority (or main) channels; items that still do not fit
// are retained for the next tick.
//
// BUG FIXES vs the original:
//   - q.spilloverActive was read before acquiring spilloverMutex, racing
//     with writers; the check now happens under the lock.
//   - removal via append(spillover[:i], spillover[i+1:]...) inside the
//     loop was O(n^2) with fiddly index adjustment; the buffer is now
//     rebuilt with a single in-place filter pass.
func (q *RobustQueue) manageSpillover() {
	q.spilloverMutex.Lock()
	defer q.spilloverMutex.Unlock()
	if !q.spilloverActive {
		return
	}
	moved := 0
	// In-place filter: kept shares spillover's backing array; the write
	// index never passes the read index, so this is safe.
	kept := q.spillover[:0]
	for _, item := range q.spillover {
		if moved >= 10 {
			kept = append(kept, item)
			continue
		}
		// Pick the queue this item belongs in.
		var targetQueue chan QueueItem
		if q.config.PriorityLevels > 1 {
			switch item.Priority {
			case 3:
				targetQueue = q.highPriority
			case 2:
				targetQueue = q.mediumPriority
			default:
				targetQueue = q.lowPriority
			}
		} else {
			targetQueue = q.items
		}
		select {
		case targetQueue <- item:
			moved++
		default:
			// Queue still full, try later
			kept = append(kept, item)
		}
	}
	q.spillover = kept
	if len(q.spillover) == 0 {
		q.spilloverActive = false
	}
	if moved > 0 {
		log.Debugf("Moved %d items from spillover back to memory queues", moved)
	}
}
// NewCircuitBreaker creates a circuit breaker that opens after
// threshold consecutive failures and allows a probe after timeout.
func NewCircuitBreaker(threshold int, timeout time.Duration) *CircuitBreaker {
	cb := new(CircuitBreaker)
	cb.threshold = threshold
	cb.timeout = timeout
	return cb
}
// CanExecute checks if the circuit breaker allows execution.
// State machine: 0 (closed) always allows; 1 (open) rejects until
// cb.timeout has elapsed since lastFailure, then a single CAS winner
// moves to 2 (half-open) and is admitted; 2 (half-open) admits requests
// so a success can close the circuit via RecordSuccess.
//
// NOTE(review): mixing atomics with the RWMutex here is redundant
// (either alone would suffice), and half-open admits an unbounded
// number of concurrent probes rather than one — confirm this is intended.
func (cb *CircuitBreaker) CanExecute() bool {
	cb.mutex.RLock()
	defer cb.mutex.RUnlock()
	state := atomic.LoadInt32(&cb.state)
	if state == 0 { // Closed: normal operation.
		return true
	}
	if state == 1 { // Open: reject until the cool-down expires.
		if time.Since(cb.lastFailure) > cb.timeout {
			// Try to transition to half-open; only one caller wins the CAS
			// and is admitted as the probe for this transition.
			if atomic.CompareAndSwapInt32(&cb.state, 1, 2) {
				return true
			}
		}
		return false
	}
	// Half-open state: admit the request as a probe.
	return true
}
// RecordSuccess resets the failure count and closes the circuit.
func (cb *CircuitBreaker) RecordSuccess() {
	cb.mutex.Lock()
	atomic.StoreInt32(&cb.state, 0) // closed
	atomic.StoreInt64(&cb.failures, 0)
	cb.mutex.Unlock()
}
// RecordFailure bumps the failure count, stamps lastFailure, and opens
// the circuit once the configured threshold is reached.
func (cb *CircuitBreaker) RecordFailure() {
	cb.mutex.Lock()
	defer cb.mutex.Unlock()
	cb.lastFailure = time.Now()
	if atomic.AddInt64(&cb.failures, 1) >= int64(cb.threshold) {
		atomic.StoreInt32(&cb.state, 1) // open
	}
}
// GetStats returns a snapshot of queue statistics. Counter fields are
// read atomically; spillover fields are snapshotted under the read lock.
//
// BUG FIX: the original read spilloverActive and len(spillover) without
// holding spilloverMutex, which races with every spillover writer.
func (q *RobustQueue) GetStats() map[string]interface{} {
	q.spilloverMutex.RLock()
	spilloverActive := q.spilloverActive
	spilloverSize := len(q.spillover)
	q.spilloverMutex.RUnlock()
	return map[string]interface{}{
		"length":           atomic.LoadInt64(&q.length),
		"processed":        atomic.LoadInt64(&q.processed),
		"failed":           atomic.LoadInt64(&q.failed),
		"spillover_active": spilloverActive,
		"spillover_size":   spilloverSize,
		"circuit_state":    atomic.LoadInt32(&q.circuitBreaker.state),
		"circuit_failures": atomic.LoadInt64(&q.circuitBreaker.failures),
	}
}
// Shutdown gracefully shuts down the queue: it cancels the queue
// context to stop the background goroutines, then waits up to timeout
// for them to exit. Returns an error if the wait times out.
func (q *RobustQueue) Shutdown(timeout time.Duration) error {
	log.Info("Starting queue shutdown...")
	// Stop health monitoring, aging, and spillover management.
	q.cancel()
	// Signal once every background goroutine has returned.
	done := make(chan struct{})
	go func() {
		defer close(done)
		q.wg.Wait()
	}()
	select {
	case <-time.After(timeout):
		log.Warn("Queue shutdown timed out")
		return errors.New("shutdown timeout")
	case <-done:
		log.Info("Queue shutdown completed successfully")
		return nil
	}
}