- Exponential backoff retry for all cloud operations (S3, Azure, GCS)
- RetryConfig presets: Default (5x), Aggressive (10x), Quick (3x)
- Smart error classification: IsPermanentError, IsRetryableError
- Automatic file position reset on upload retry
- Retry logging with wait duration
- Multipart uploads use aggressive retry (more tolerance)

package cloud

import (
	"context"
	"fmt"
	"net"
	"strings"
	"time"

	"github.com/cenkalti/backoff/v4"
)

// RetryConfig configures retry behavior
type RetryConfig struct {
	MaxRetries      int           // Maximum number of retries (0 = unlimited)
	InitialInterval time.Duration // Initial backoff interval
	MaxInterval     time.Duration // Maximum backoff interval
	MaxElapsedTime  time.Duration // Maximum total time for retries
	Multiplier      float64       // Backoff multiplier
}

// DefaultRetryConfig returns sensible defaults for cloud operations
func DefaultRetryConfig() *RetryConfig {
	return &RetryConfig{
		MaxRetries:      5,
		InitialInterval: 500 * time.Millisecond,
		MaxInterval:     30 * time.Second,
		MaxElapsedTime:  5 * time.Minute,
		Multiplier:      2.0,
	}
}

// AggressiveRetryConfig returns config for critical operations that need more retries
func AggressiveRetryConfig() *RetryConfig {
	return &RetryConfig{
		MaxRetries:      10,
		InitialInterval: 1 * time.Second,
		MaxInterval:     60 * time.Second,
		MaxElapsedTime:  15 * time.Minute,
		Multiplier:      1.5,
	}
}

// QuickRetryConfig returns config for operations that should fail fast
func QuickRetryConfig() *RetryConfig {
	return &RetryConfig{
		MaxRetries:      3,
		InitialInterval: 100 * time.Millisecond,
		MaxInterval:     5 * time.Second,
		MaxElapsedTime:  30 * time.Second,
		Multiplier:      2.0,
	}
}
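
// Example (illustrative sketch): picking a preset per call site. The names
// isMultipartUpload and doCloudCall are hypothetical caller-side symbols,
// not defined in this package.
//
//	cfg := DefaultRetryConfig() // ordinary single-shot operations
//	if isMultipartUpload {
//		cfg = AggressiveRetryConfig() // long uploads tolerate more retries
//	}
//	err := RetryOperation(ctx, cfg, doCloudCall)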

// RetryOperation executes an operation with exponential backoff retry
func RetryOperation(ctx context.Context, cfg *RetryConfig, operation func() error) error {
	if cfg == nil {
		cfg = DefaultRetryConfig()
	}

	// Create exponential backoff
	expBackoff := backoff.NewExponentialBackOff()
	expBackoff.InitialInterval = cfg.InitialInterval
	expBackoff.MaxInterval = cfg.MaxInterval
	expBackoff.MaxElapsedTime = cfg.MaxElapsedTime
	expBackoff.Multiplier = cfg.Multiplier
	expBackoff.Reset()

	// Wrap with max retries if specified
	var b backoff.BackOff = expBackoff
	if cfg.MaxRetries > 0 {
		b = backoff.WithMaxRetries(expBackoff, uint64(cfg.MaxRetries))
	}

	// Add context support
	b = backoff.WithContext(b, ctx)

	// Track attempts for logging
	attempt := 0

	// Wrap operation to handle permanent vs retryable errors
	wrappedOp := func() error {
		attempt++
		err := operation()
		if err == nil {
			return nil
		}

		// Check if error is permanent (should not retry)
		if IsPermanentError(err) {
			return backoff.Permanent(err)
		}

		return err
	}

	return backoff.Retry(wrappedOp, b)
}
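
// Example (illustrative sketch): a typical call. headBucket stands in for a
// real cloud SDK call and is hypothetical; any error that IsPermanentError
// matches stops the retry loop immediately instead of being retried.
//
//	err := RetryOperation(ctx, QuickRetryConfig(), func() error {
//		return headBucket(ctx, "my-bucket") // hypothetical helper
//	})
//	if err != nil {
//		return fmt.Errorf("bucket check failed after retries: %w", err)
//	}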

// RetryOperationWithNotify executes an operation with retry and calls notify on each retry
func RetryOperationWithNotify(ctx context.Context, cfg *RetryConfig, operation func() error, notify func(err error, duration time.Duration)) error {
	if cfg == nil {
		cfg = DefaultRetryConfig()
	}

	// Create exponential backoff
	expBackoff := backoff.NewExponentialBackOff()
	expBackoff.InitialInterval = cfg.InitialInterval
	expBackoff.MaxInterval = cfg.MaxInterval
	expBackoff.MaxElapsedTime = cfg.MaxElapsedTime
	expBackoff.Multiplier = cfg.Multiplier
	expBackoff.Reset()

	// Wrap with max retries if specified
	var b backoff.BackOff = expBackoff
	if cfg.MaxRetries > 0 {
		b = backoff.WithMaxRetries(expBackoff, uint64(cfg.MaxRetries))
	}

	// Add context support
	b = backoff.WithContext(b, ctx)

	// Wrap operation to handle permanent vs retryable errors
	wrappedOp := func() error {
		err := operation()
		if err == nil {
			return nil
		}

		// Check if error is permanent (should not retry)
		if IsPermanentError(err) {
			return backoff.Permanent(err)
		}

		return err
	}

	return backoff.RetryNotify(wrappedOp, b, notify)
}
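
// Example (illustrative sketch): the notify callback receives the failed
// attempt's error and the wait before the next attempt, which is how the
// "[RETRY] ..." log lines in the helpers below are produced. op is a
// hypothetical func() error.
//
//	err := RetryOperationWithNotify(ctx, DefaultRetryConfig(), op, func(err error, wait time.Duration) {
//		fmt.Printf("attempt failed (%v), next try in %v\n", err, wait)
//	})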

// IsPermanentError returns true if the error should not be retried
func IsPermanentError(err error) bool {
	if err == nil {
		return false
	}

	errStr := strings.ToLower(err.Error())

	// Authentication/authorization errors - don't retry
	permanentPatterns := []string{
		"access denied",
		"forbidden",
		"unauthorized",
		"invalid credentials",
		"invalid access key",
		"invalid secret",
		"no such bucket",
		"bucket not found",
		"container not found",
		"nosuchbucket",
		"nosuchkey",
		"invalid argument",
		"malformed",
		"invalid request",
		"permission denied",
		"access control",
		"policy",
	}

	for _, pattern := range permanentPatterns {
		if strings.Contains(errStr, pattern) {
			return true
		}
	}

	return false
}

// IsRetryableError returns true if the error is transient and should be retried
func IsRetryableError(err error) bool {
	if err == nil {
		return false
	}

	// Network errors are typically retryable
	var netErr net.Error
	if ok := isNetError(err, &netErr); ok {
		return netErr.Timeout() || netErr.Temporary()
	}

	errStr := strings.ToLower(err.Error())

	// Transient errors - should retry
	retryablePatterns := []string{
		"timeout",
		"connection reset",
		"connection refused",
		"connection closed",
		"eof",
		"broken pipe",
		"temporary failure",
		"service unavailable",
		"internal server error",
		"bad gateway",
		"gateway timeout",
		"too many requests",
		"rate limit",
		"throttl",
		"slowdown",
		"try again",
		"retry",
	}

	for _, pattern := range retryablePatterns {
		if strings.Contains(errStr, pattern) {
			return true
		}
	}

	return false
}
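
// Example (illustrative sketch): how the classifiers behave on typical
// S3-style messages (errors.New is the standard library constructor; the
// messages themselves are made up for illustration).
//
//	IsPermanentError(errors.New("AccessDenied: access denied"))    // true  -> do not retry
//	IsRetryableError(errors.New("503 SlowDown: please slow down")) // true  -> retry
//	IsRetryableError(errors.New("invalid credentials"))            // false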

// isNetError checks if err wraps a net.Error
func isNetError(err error, target *net.Error) bool {
	for err != nil {
		if ne, ok := err.(net.Error); ok {
			*target = ne
			return true
		}
		// Try to unwrap
		if unwrapper, ok := err.(interface{ Unwrap() error }); ok {
			err = unwrapper.Unwrap()
		} else {
			break
		}
	}
	return false
}

// WithRetry is a helper that wraps a function with default retry logic
func WithRetry(ctx context.Context, operationName string, fn func() error) error {
	notify := func(err error, duration time.Duration) {
		// Log retry attempts (caller can provide their own logger if needed)
		fmt.Printf("[RETRY] %s failed, retrying in %v: %v\n", operationName, duration, err)
	}

	return RetryOperationWithNotify(ctx, DefaultRetryConfig(), fn, notify)
}
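
// Example (illustrative sketch): the simplest entry point. getObject is a
// hypothetical closure over a cloud SDK download call.
//
//	if err := WithRetry(ctx, "download manifest", func() error {
//		return getObject(ctx, bucket, "manifest.json") // hypothetical helper
//	}); err != nil {
//		return err
//	}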

// WithRetryConfig is a helper that wraps a function with custom retry config
func WithRetryConfig(ctx context.Context, cfg *RetryConfig, operationName string, fn func() error) error {
	notify := func(err error, duration time.Duration) {
		fmt.Printf("[RETRY] %s failed, retrying in %v: %v\n", operationName, duration, err)
	}

	return RetryOperationWithNotify(ctx, cfg, fn, notify)
}
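
// Example (illustrative sketch): combining the aggressive preset with the
// "reset file position on retry" behaviour described in the summary at the
// top. part (an *os.File) and uploadPart are hypothetical names; the key
// point is that the closure rewinds the reader at the start of every attempt.
//
//	err := WithRetryConfig(ctx, AggressiveRetryConfig(), "multipart part upload", func() error {
//		if _, err := part.Seek(0, io.SeekStart); err != nil {
//			return err
//		}
//		return uploadPart(ctx, part) // hypothetical helper
//	})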