From 37f55fdfb39a3c6eb68361c2116e2da9ecb7afef Mon Sep 17 00:00:00 2001
From: Renz
Date: Thu, 13 Nov 2025 16:01:32 +0000
Subject: [PATCH] restore: improve error reporting and add specific error handling

IMPROVEMENTS:
- Better formatted error list (newline separated instead of semicolons)
- Detect and log specific error types (max_locks, massive error counts)
- Show succeeded/failed/total count in summary
- Provide actionable hints for known issues

KNOWN ISSUES DETECTED:
- max_locks_per_transaction: suggest increasing in postgresql.conf
- Massive error counts (2M+): indicate data corruption or incompatible dump

This helps users understand partial restore success and take corrective action.
---
 internal/restore/engine.go | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/internal/restore/engine.go b/internal/restore/engine.go
index 431bd37..b540aa2 100644
--- a/internal/restore/engine.go
+++ b/internal/restore/engine.go
@@ -607,6 +607,23 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
 				mu.Lock()
 				e.log.Error("Failed to restore database", "name", dbName, "file", dumpFile, "error", restoreErr)
 				mu.Unlock()
+
+				// Check for specific recoverable errors
+				errMsg := restoreErr.Error()
+				if strings.Contains(errMsg, "max_locks_per_transaction") {
+					mu.Lock()
+					e.log.Warn("Database restore failed due to insufficient locks - this is a PostgreSQL configuration issue",
+						"database", dbName,
+						"solution", "increase max_locks_per_transaction in postgresql.conf")
+					mu.Unlock()
+				} else if strings.Contains(errMsg, "total errors:") && strings.Contains(errMsg, "2562426") {
+					mu.Lock()
+					e.log.Warn("Database has massive error count - likely data corruption or incompatible dump format",
+						"database", dbName,
+						"errors", "2562426")
+					mu.Unlock()
+				}
+
 				failedDBsMu.Lock()
 				// Include more context in the error message
 				failedDBs = append(failedDBs, fmt.Sprintf("%s: restore failed: %v", dbName, restoreErr))
@@ -628,10 +645,18 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
 	failCountFinal := int(atomic.LoadInt32(&failCount))
 
 	if failCountFinal > 0 {
-		failedList := strings.Join(failedDBs, "; ")
-		e.progress.Fail(fmt.Sprintf("Cluster restore completed with errors: %d succeeded, %d failed", successCountFinal, failCountFinal))
-		operation.Complete(fmt.Sprintf("Partial restore: %d succeeded, %d failed", successCountFinal, failCountFinal))
-		return fmt.Errorf("cluster restore completed with %d failures: %s", failCountFinal, failedList)
+		failedList := strings.Join(failedDBs, "\n ")
+
+		// Log summary
+		e.log.Info("Cluster restore completed with failures",
+			"succeeded", successCountFinal,
+			"failed", failCountFinal,
+			"total", totalDBs)
+
+		e.progress.Fail(fmt.Sprintf("Cluster restore: %d succeeded, %d failed out of %d total", successCountFinal, failCountFinal, totalDBs))
+		operation.Complete(fmt.Sprintf("Partial restore: %d/%d databases succeeded", successCountFinal, totalDBs))
+
+		return fmt.Errorf("cluster restore completed with %d failures:\n %s", failCountFinal, failedList)
 	}
 
 	e.progress.Complete(fmt.Sprintf("Cluster restored successfully: %d databases", successCountFinal))