restore: improve error reporting and add specific error handling

IMPROVEMENTS:
- Better formatted error list (newline separated instead of semicolons)
- Detect and log specific error types (max_locks, massive error counts)
- Show succeeded/failed/total count in summary
- Provide actionable hints for known issues

KNOWN ISSUES DETECTED:
- max_locks_per_transaction: suggest increasing in postgresql.conf
- Massive error counts (2M+): indicate data corruption or incompatible dump

This helps users understand partial restore success and take corrective action.
This commit is contained in:
2025-11-13 16:01:32 +00:00
parent ab3aceb5c0
commit 37f55fdfb3

View File

@@ -607,6 +607,23 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
mu.Lock() mu.Lock()
e.log.Error("Failed to restore database", "name", dbName, "file", dumpFile, "error", restoreErr) e.log.Error("Failed to restore database", "name", dbName, "file", dumpFile, "error", restoreErr)
mu.Unlock() mu.Unlock()
// Check for specific recoverable errors
errMsg := restoreErr.Error()
if strings.Contains(errMsg, "max_locks_per_transaction") {
mu.Lock()
e.log.Warn("Database restore failed due to insufficient locks - this is a PostgreSQL configuration issue",
"database", dbName,
"solution", "increase max_locks_per_transaction in postgresql.conf")
mu.Unlock()
} else if strings.Contains(errMsg, "total errors:") && strings.Contains(errMsg, "2562426") {
mu.Lock()
e.log.Warn("Database has massive error count - likely data corruption or incompatible dump format",
"database", dbName,
"errors", "2562426")
mu.Unlock()
}
failedDBsMu.Lock() failedDBsMu.Lock()
// Include more context in the error message // Include more context in the error message
failedDBs = append(failedDBs, fmt.Sprintf("%s: restore failed: %v", dbName, restoreErr)) failedDBs = append(failedDBs, fmt.Sprintf("%s: restore failed: %v", dbName, restoreErr))
@@ -628,10 +645,18 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
failCountFinal := int(atomic.LoadInt32(&failCount)) failCountFinal := int(atomic.LoadInt32(&failCount))
if failCountFinal > 0 { if failCountFinal > 0 {
failedList := strings.Join(failedDBs, "; ") failedList := strings.Join(failedDBs, "\n ")
e.progress.Fail(fmt.Sprintf("Cluster restore completed with errors: %d succeeded, %d failed", successCountFinal, failCountFinal))
operation.Complete(fmt.Sprintf("Partial restore: %d succeeded, %d failed", successCountFinal, failCountFinal)) // Log summary
return fmt.Errorf("cluster restore completed with %d failures: %s", failCountFinal, failedList) e.log.Info("Cluster restore completed with failures",
"succeeded", successCountFinal,
"failed", failCountFinal,
"total", totalDBs)
e.progress.Fail(fmt.Sprintf("Cluster restore: %d succeeded, %d failed out of %d total", successCountFinal, failCountFinal, totalDBs))
operation.Complete(fmt.Sprintf("Partial restore: %d/%d databases succeeded", successCountFinal, totalDBs))
return fmt.Errorf("cluster restore completed with %d failures:\n %s", failCountFinal, failedList)
} }
e.progress.Complete(fmt.Sprintf("Cluster restored successfully: %d databases", successCountFinal)) e.progress.Complete(fmt.Sprintf("Cluster restored successfully: %d databases", successCountFinal))