Compare commits

..

2 Commits

Author SHA1 Message Date
4ea3ec2cf8 fix(preflight): improve BLOB count detection and block restore when max_locks_per_transaction is critically low
All checks were successful
CI/CD / Test (push) Successful in 1m15s
CI/CD / Lint (push) Successful in 1m26s
CI/CD / Build & Release (push) Successful in 3m14s
- Add higher lock boost tiers for extreme BLOB counts (100K+, 200K+)
- Calculate actual lock requirement: (totalBLOBs / max_connections) * 1.5
- Block restore with CRITICAL error when BLOB count exceeds 2x safe limit
- Improve preflight summary with PASSED/FAILED status display
- Add clearer fix instructions for max_locks_per_transaction
2026-01-17 07:25:45 +01:00
9200024e50 fix(restore): add context validity checks to debug cancellation issues
All checks were successful
CI/CD / Test (push) Successful in 1m21s
CI/CD / Lint (push) Successful in 1m28s
CI/CD / Build & Release (push) Successful in 3m17s
Added explicit context checks at critical points:
1. After extraction completes - logs error if context was cancelled
2. Before database restore loop starts - catches premature cancellation

This helps diagnose issues where all database restores fail with
'context cancelled' even though extraction completed successfully.

The user reported this happening after 4h20m extraction - all 6 DBs
showed 'restore skipped (context cancelled)'. These checks will log
exactly when/where the context becomes invalid.
2026-01-16 19:36:52 +01:00
3 changed files with 97 additions and 6 deletions

View File

@@ -4,8 +4,8 @@ This directory contains pre-compiled binaries for the DB Backup Tool across mult
## Build Information
- **Version**: 3.42.50
- **Build Time**: 2026-01-16_15:09:21_UTC
- **Git Commit**: dd7c4da
- **Build Time**: 2026-01-16_18:37:32_UTC
- **Git Commit**: 9200024
## Recent Updates (v1.1.0)
- ✅ Fixed TUI progress display with line-by-line output

View File

@@ -910,6 +910,16 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
return fmt.Errorf("failed to extract archive: %w", err)
}
// Check context validity after extraction (debugging context cancellation issues)
if ctx.Err() != nil {
e.log.Error("Context cancelled after extraction - this should not happen",
"context_error", ctx.Err(),
"extraction_completed", true)
operation.Fail("Context cancelled unexpectedly")
return fmt.Errorf("context cancelled after extraction completed: %w", ctx.Err())
}
e.log.Info("Extraction completed, context still valid")
// Check if user has superuser privileges (required for ownership restoration)
e.progress.Update("Checking privileges...")
isSuperuser, err := e.checkSuperuser(ctx)
@@ -1108,6 +1118,18 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
var successCount, failCount int32
var mu sync.Mutex // Protect shared resources (progress, logger)
// CRITICAL: Check context before starting database restore loop
// This helps debug issues where context gets cancelled between extraction and restore
if ctx.Err() != nil {
e.log.Error("Context cancelled before database restore loop started",
"context_error", ctx.Err(),
"total_databases", totalDBs,
"parallelism", parallelism)
operation.Fail("Context cancelled before database restores could start")
return fmt.Errorf("context cancelled before database restore: %w", ctx.Err())
}
e.log.Info("Starting database restore loop", "databases", totalDBs, "parallelism", parallelism)
// Timing tracking for restore phase progress
restorePhaseStart := time.Now()
var completedDBTimes []time.Duration // Track duration for each completed DB restore

View File

@@ -212,7 +212,8 @@ func (e *Engine) checkPostgreSQL(ctx context.Context, result *PreflightResult) {
result.Warnings = append(result.Warnings,
fmt.Sprintf("max_locks_per_transaction=%d is low (recommend 256+). "+
"This setting requires PostgreSQL RESTART to change. "+
"BLOB-heavy databases may fail with 'out of shared memory' error.",
"BLOB-heavy databases may fail with 'out of shared memory' error. "+
"Fix: Edit postgresql.conf, set max_locks_per_transaction=2048, then restart PostgreSQL.",
result.PostgreSQL.MaxLocksPerTransaction))
}
@@ -324,14 +325,57 @@ func (e *Engine) calculateRecommendations(result *PreflightResult) {
if result.Archive.TotalBlobCount > 50000 {
lockBoost = 16384
}
if result.Archive.TotalBlobCount > 100000 {
lockBoost = 32768
}
if result.Archive.TotalBlobCount > 200000 {
lockBoost = 65536
}
// Cap at reasonable maximum
if lockBoost > 16384 {
lockBoost = 16384
// For extreme cases, calculate actual requirement
// Rule of thumb: ~1 lock per BLOB, divided by max_connections (default 100)
// Add 50% safety margin
maxConns := result.PostgreSQL.MaxConnections
if maxConns == 0 {
maxConns = 100 // default
}
calculatedLocks := (result.Archive.TotalBlobCount / maxConns) * 3 / 2 // 1.5x safety margin
if calculatedLocks > lockBoost {
lockBoost = calculatedLocks
}
result.Archive.RecommendedLockBoost = lockBoost
// CRITICAL: Check if current max_locks_per_transaction is dangerously low for this BLOB count
currentLocks := result.PostgreSQL.MaxLocksPerTransaction
if currentLocks > 0 && result.Archive.TotalBlobCount > 0 {
// Estimate max BLOBs we can handle: locks * max_connections
maxSafeBLOBs := currentLocks * maxConns
if result.Archive.TotalBlobCount > maxSafeBLOBs {
severity := "WARNING"
if result.Archive.TotalBlobCount > maxSafeBLOBs*2 {
severity = "CRITICAL"
result.CanProceed = false
}
e.log.Error(fmt.Sprintf("%s: max_locks_per_transaction too low for BLOB count", severity),
"current_max_locks", currentLocks,
"total_blobs", result.Archive.TotalBlobCount,
"max_safe_blobs", maxSafeBLOBs,
"recommended_max_locks", lockBoost)
result.Errors = append(result.Errors,
fmt.Sprintf("%s: Archive contains %s BLOBs but max_locks_per_transaction=%d can only safely handle ~%s. "+
"Increase max_locks_per_transaction to %d in postgresql.conf and RESTART PostgreSQL.",
severity,
humanize.Comma(int64(result.Archive.TotalBlobCount)),
currentLocks,
humanize.Comma(int64(maxSafeBLOBs)),
lockBoost))
}
}
// Log recommendation
e.log.Info("Calculated recommended lock boost",
"total_blobs", result.Archive.TotalBlobCount,
@@ -386,6 +430,14 @@ func (e *Engine) printPreflightSummary(result *PreflightResult) {
}
}
// Errors (blocking issues)
if len(result.Errors) > 0 {
fmt.Println("\n ✗ ERRORS (must fix before proceeding):")
for _, e := range result.Errors {
fmt.Printf(" • %s\n", e)
}
}
// Warnings
if len(result.Warnings) > 0 {
fmt.Println("\n ⚠ Warnings:")
@@ -394,6 +446,23 @@ func (e *Engine) printPreflightSummary(result *PreflightResult) {
}
}
// Final status
fmt.Println()
if !result.CanProceed {
fmt.Println(" ┌─────────────────────────────────────────────────────────┐")
fmt.Println(" │ ✗ PREFLIGHT FAILED - Cannot proceed with restore │")
fmt.Println(" │ Fix the errors above and try again. │")
fmt.Println(" └─────────────────────────────────────────────────────────┘")
} else if len(result.Warnings) > 0 {
fmt.Println(" ┌─────────────────────────────────────────────────────────┐")
fmt.Println(" │ ⚠ PREFLIGHT PASSED WITH WARNINGS - Proceed with care │")
fmt.Println(" └─────────────────────────────────────────────────────────┘")
} else {
fmt.Println(" ┌─────────────────────────────────────────────────────────┐")
fmt.Println(" │ ✓ PREFLIGHT PASSED - Ready to restore │")
fmt.Println(" └─────────────────────────────────────────────────────────┘")
}
fmt.Println(strings.Repeat("─", 60))
fmt.Println()
}