v3.42.33: Add cenkalti/backoff for exponential backoff retry
- Exponential backoff retry for all cloud operations (S3, Azure, GCS)
- RetryConfig presets: Default (5x), Aggressive (10x), Quick (3x)
- Smart error classification: IsPermanentError, IsRetryableError
- Automatic file position reset on upload retry
- Retry logging with wait duration
- Multipart uploads use aggressive retry (more tolerance)
This commit is contained in:
257
internal/cloud/retry.go
Normal file
257
internal/cloud/retry.go
Normal file
@@ -0,0 +1,257 @@
|
||||
package cloud
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/cenkalti/backoff/v4"
|
||||
)
|
||||
|
||||
// RetryConfig holds the tuning knobs used when retrying a cloud operation
// with exponential backoff.
type RetryConfig struct {
	// MaxRetries is the maximum number of retries. A value of 0 means the
	// retry count is not limited.
	MaxRetries int

	// InitialInterval is the initial backoff interval.
	InitialInterval time.Duration

	// MaxInterval is the maximum backoff interval.
	MaxInterval time.Duration

	// MaxElapsedTime is the maximum total time spent on retries.
	MaxElapsedTime time.Duration

	// Multiplier is the backoff multiplier.
	Multiplier float64
}
|
||||
|
||||
// DefaultRetryConfig returns sensible defaults for cloud operations
|
||||
func DefaultRetryConfig() *RetryConfig {
|
||||
return &RetryConfig{
|
||||
MaxRetries: 5,
|
||||
InitialInterval: 500 * time.Millisecond,
|
||||
MaxInterval: 30 * time.Second,
|
||||
MaxElapsedTime: 5 * time.Minute,
|
||||
Multiplier: 2.0,
|
||||
}
|
||||
}
|
||||
|
||||
// AggressiveRetryConfig returns config for critical operations that need more retries
|
||||
func AggressiveRetryConfig() *RetryConfig {
|
||||
return &RetryConfig{
|
||||
MaxRetries: 10,
|
||||
InitialInterval: 1 * time.Second,
|
||||
MaxInterval: 60 * time.Second,
|
||||
MaxElapsedTime: 15 * time.Minute,
|
||||
Multiplier: 1.5,
|
||||
}
|
||||
}
|
||||
|
||||
// QuickRetryConfig returns config for operations that should fail fast
|
||||
func QuickRetryConfig() *RetryConfig {
|
||||
return &RetryConfig{
|
||||
MaxRetries: 3,
|
||||
InitialInterval: 100 * time.Millisecond,
|
||||
MaxInterval: 5 * time.Second,
|
||||
MaxElapsedTime: 30 * time.Second,
|
||||
Multiplier: 2.0,
|
||||
}
|
||||
}
|
||||
|
||||
// RetryOperation executes an operation with exponential backoff retry
|
||||
func RetryOperation(ctx context.Context, cfg *RetryConfig, operation func() error) error {
|
||||
if cfg == nil {
|
||||
cfg = DefaultRetryConfig()
|
||||
}
|
||||
|
||||
// Create exponential backoff
|
||||
expBackoff := backoff.NewExponentialBackOff()
|
||||
expBackoff.InitialInterval = cfg.InitialInterval
|
||||
expBackoff.MaxInterval = cfg.MaxInterval
|
||||
expBackoff.MaxElapsedTime = cfg.MaxElapsedTime
|
||||
expBackoff.Multiplier = cfg.Multiplier
|
||||
expBackoff.Reset()
|
||||
|
||||
// Wrap with max retries if specified
|
||||
var b backoff.BackOff = expBackoff
|
||||
if cfg.MaxRetries > 0 {
|
||||
b = backoff.WithMaxRetries(expBackoff, uint64(cfg.MaxRetries))
|
||||
}
|
||||
|
||||
// Add context support
|
||||
b = backoff.WithContext(b, ctx)
|
||||
|
||||
// Track attempts for logging
|
||||
attempt := 0
|
||||
|
||||
// Wrap operation to handle permanent vs retryable errors
|
||||
wrappedOp := func() error {
|
||||
attempt++
|
||||
err := operation()
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check if error is permanent (should not retry)
|
||||
if IsPermanentError(err) {
|
||||
return backoff.Permanent(err)
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
return backoff.Retry(wrappedOp, b)
|
||||
}
|
||||
|
||||
// RetryOperationWithNotify executes an operation with retry and calls notify on each retry
|
||||
func RetryOperationWithNotify(ctx context.Context, cfg *RetryConfig, operation func() error, notify func(err error, duration time.Duration)) error {
|
||||
if cfg == nil {
|
||||
cfg = DefaultRetryConfig()
|
||||
}
|
||||
|
||||
// Create exponential backoff
|
||||
expBackoff := backoff.NewExponentialBackOff()
|
||||
expBackoff.InitialInterval = cfg.InitialInterval
|
||||
expBackoff.MaxInterval = cfg.MaxInterval
|
||||
expBackoff.MaxElapsedTime = cfg.MaxElapsedTime
|
||||
expBackoff.Multiplier = cfg.Multiplier
|
||||
expBackoff.Reset()
|
||||
|
||||
// Wrap with max retries if specified
|
||||
var b backoff.BackOff = expBackoff
|
||||
if cfg.MaxRetries > 0 {
|
||||
b = backoff.WithMaxRetries(expBackoff, uint64(cfg.MaxRetries))
|
||||
}
|
||||
|
||||
// Add context support
|
||||
b = backoff.WithContext(b, ctx)
|
||||
|
||||
// Wrap operation to handle permanent vs retryable errors
|
||||
wrappedOp := func() error {
|
||||
err := operation()
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check if error is permanent (should not retry)
|
||||
if IsPermanentError(err) {
|
||||
return backoff.Permanent(err)
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
return backoff.RetryNotify(wrappedOp, b, notify)
|
||||
}
|
||||
|
||||
// IsPermanentError reports whether err should not be retried.
//
// The decision is a case-insensitive substring match of err.Error() against
// a fixed list of markers. A nil err is never permanent.
func IsPermanentError(err error) bool {
	if err == nil {
		return false
	}

	markers := [...]string{
		// Authentication / authorization failures.
		"access denied",
		"forbidden",
		"unauthorized",
		"invalid credentials",
		"invalid access key",
		"invalid secret",
		"permission denied",
		"access control",
		"policy",

		// Missing buckets, containers or keys.
		"no such bucket",
		"bucket not found",
		"container not found",
		"nosuchbucket",
		"nosuchkey",

		// Requests that are wrong as sent.
		"invalid argument",
		"malformed",
		"invalid request",
	}

	msg := strings.ToLower(err.Error())
	for i := range markers {
		if strings.Contains(msg, markers[i]) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// IsRetryableError reports whether err looks transient and worth retrying.
//
// Two checks are applied in order:
//
//  1. If a net.Error is found in err's Unwrap chain and it reports Timeout or
//     Temporary, err is retryable.
//  2. Otherwise the lower-cased err.Error() text is matched against a fixed
//     list of transient-failure substrings.
//
// A nil err is never retryable.
func IsRetryableError(err error) bool {
	if err == nil {
		return false
	}

	// A net.Error only settles the question when it is positive. Errors such
	// as "connection reset by peer" or "connection refused" arrive as
	// net.Error values that are neither Timeout nor Temporary, so a negative
	// answer must fall through to the text patterns below rather than end the
	// check.
	//
	// NOTE(review): net.Error.Temporary is deprecated; it is kept so errors
	// that only report Temporary stay retryable.
	var netErr net.Error
	if isNetError(err, &netErr) && (netErr.Timeout() || netErr.Temporary()) {
		return true
	}

	errStr := strings.ToLower(err.Error())

	// Transient errors - should retry.
	retryablePatterns := []string{
		"timeout",
		"connection reset",
		"connection refused",
		"connection closed",
		"eof",
		"broken pipe",
		"temporary failure",
		"service unavailable",
		"internal server error",
		"bad gateway",
		"gateway timeout",
		"too many requests",
		"rate limit",
		"throttl",
		"slowdown",
		"try again",
		"retry",
	}

	for _, pattern := range retryablePatterns {
		if strings.Contains(errStr, pattern) {
			return true
		}
	}

	return false
}

// isNetError walks err's Unwrap chain looking for the first error that
// implements net.Error. When one is found it is stored in *target and true is
// returned; otherwise *target is left untouched and false is returned.
func isNetError(err error, target *net.Error) bool {
	for err != nil {
		if ne, ok := err.(net.Error); ok {
			*target = ne
			return true
		}

		// Step one level down; stop when the error cannot be unwrapped.
		unwrapper, ok := err.(interface{ Unwrap() error })
		if !ok {
			return false
		}
		err = unwrapper.Unwrap()
	}
	return false
}
|
||||
|
||||
// WithRetry is a helper that wraps a function with default retry logic
|
||||
func WithRetry(ctx context.Context, operationName string, fn func() error) error {
|
||||
notify := func(err error, duration time.Duration) {
|
||||
// Log retry attempts (caller can provide their own logger if needed)
|
||||
fmt.Printf("[RETRY] %s failed, retrying in %v: %v\n", operationName, duration, err)
|
||||
}
|
||||
|
||||
return RetryOperationWithNotify(ctx, DefaultRetryConfig(), fn, notify)
|
||||
}
|
||||
|
||||
// WithRetryConfig is a helper that wraps a function with custom retry config
|
||||
func WithRetryConfig(ctx context.Context, cfg *RetryConfig, operationName string, fn func() error) error {
|
||||
notify := func(err error, duration time.Duration) {
|
||||
fmt.Printf("[RETRY] %s failed, retrying in %v: %v\n", operationName, duration, err)
|
||||
}
|
||||
|
||||
return RetryOperationWithNotify(ctx, cfg, fn, notify)
|
||||
}
|
||||
Reference in New Issue
Block a user