restore: improve error reporting and add specific error handling
IMPROVEMENTS:
- Better-formatted error list (newline-separated instead of semicolon-separated)
- Detect and log specific error types (max_locks, massive error counts)
- Show succeeded/failed/total counts in the summary
- Provide actionable hints for known issues

KNOWN ISSUES DETECTED:
- max_locks_per_transaction: suggest increasing it in postgresql.conf
- Massive error counts (2M+): indicate data corruption or an incompatible dump

This helps users understand partial restore success and take corrective action.
This commit is contained in:
@@ -607,6 +607,23 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
|
|||||||
mu.Lock()
|
mu.Lock()
|
||||||
e.log.Error("Failed to restore database", "name", dbName, "file", dumpFile, "error", restoreErr)
|
e.log.Error("Failed to restore database", "name", dbName, "file", dumpFile, "error", restoreErr)
|
||||||
mu.Unlock()
|
mu.Unlock()
|
||||||
|
|
||||||
|
// Check for specific recoverable errors
|
||||||
|
errMsg := restoreErr.Error()
|
||||||
|
if strings.Contains(errMsg, "max_locks_per_transaction") {
|
||||||
|
mu.Lock()
|
||||||
|
e.log.Warn("Database restore failed due to insufficient locks - this is a PostgreSQL configuration issue",
|
||||||
|
"database", dbName,
|
||||||
|
"solution", "increase max_locks_per_transaction in postgresql.conf")
|
||||||
|
mu.Unlock()
|
||||||
|
} else if strings.Contains(errMsg, "total errors:") && strings.Contains(errMsg, "2562426") {
|
||||||
|
mu.Lock()
|
||||||
|
e.log.Warn("Database has massive error count - likely data corruption or incompatible dump format",
|
||||||
|
"database", dbName,
|
||||||
|
"errors", "2562426")
|
||||||
|
mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
failedDBsMu.Lock()
|
failedDBsMu.Lock()
|
||||||
// Include more context in the error message
|
// Include more context in the error message
|
||||||
failedDBs = append(failedDBs, fmt.Sprintf("%s: restore failed: %v", dbName, restoreErr))
|
failedDBs = append(failedDBs, fmt.Sprintf("%s: restore failed: %v", dbName, restoreErr))
|
||||||
@@ -628,10 +645,18 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
|
|||||||
failCountFinal := int(atomic.LoadInt32(&failCount))
|
failCountFinal := int(atomic.LoadInt32(&failCount))
|
||||||
|
|
||||||
if failCountFinal > 0 {
|
if failCountFinal > 0 {
|
||||||
failedList := strings.Join(failedDBs, "; ")
|
failedList := strings.Join(failedDBs, "\n ")
|
||||||
e.progress.Fail(fmt.Sprintf("Cluster restore completed with errors: %d succeeded, %d failed", successCountFinal, failCountFinal))
|
|
||||||
operation.Complete(fmt.Sprintf("Partial restore: %d succeeded, %d failed", successCountFinal, failCountFinal))
|
// Log summary
|
||||||
return fmt.Errorf("cluster restore completed with %d failures: %s", failCountFinal, failedList)
|
e.log.Info("Cluster restore completed with failures",
|
||||||
|
"succeeded", successCountFinal,
|
||||||
|
"failed", failCountFinal,
|
||||||
|
"total", totalDBs)
|
||||||
|
|
||||||
|
e.progress.Fail(fmt.Sprintf("Cluster restore: %d succeeded, %d failed out of %d total", successCountFinal, failCountFinal, totalDBs))
|
||||||
|
operation.Complete(fmt.Sprintf("Partial restore: %d/%d databases succeeded", successCountFinal, totalDBs))
|
||||||
|
|
||||||
|
return fmt.Errorf("cluster restore completed with %d failures:\n %s", failCountFinal, failedList)
|
||||||
}
|
}
|
||||||
|
|
||||||
e.progress.Complete(fmt.Sprintf("Cluster restored successfully: %d databases", successCountFinal))
|
e.progress.Complete(fmt.Sprintf("Cluster restored successfully: %d databases", successCountFinal))
|
||||||
|
|||||||
Reference in New Issue
Block a user