restore: enhance error capture with detailed stderr logging and verbose pg_restore

- Capture every stderr chunk from pg_restore/psql that contains "ERROR", "FATAL", or "error:"
- Include full error details in failure messages for better diagnostics
- Add --verbose flag to pg_restore for comprehensive error reporting
- Guard log calls with a mutex in parallel cluster restore so logging is thread-safe
- Help diagnose cluster restore failures with actual PostgreSQL error messages
This commit is contained in:
2025-11-13 12:47:40 +00:00
parent 71386828bb
commit b9b44dd989
2 changed files with 40 additions and 10 deletions

View File

@@ -287,13 +287,15 @@ func (e *Engine) executeRestoreCommand(ctx context.Context, cmdArgs []string) er
// Read stderr in chunks to log errors without loading all into memory
buf := make([]byte, 4096)
var lastError string
var allErrors []string
for {
n, err := stderr.Read(buf)
if n > 0 {
chunk := string(buf[:n])
// Only log errors/warnings, not all output
if strings.Contains(chunk, "ERROR") || strings.Contains(chunk, "FATAL") {
// Capture every chunk containing ERROR, FATAL, or "error:" for better diagnostics
if strings.Contains(chunk, "ERROR") || strings.Contains(chunk, "FATAL") || strings.Contains(chunk, "error:") {
lastError = chunk
allErrors = append(allErrors, strings.TrimSpace(chunk))
e.log.Warn("Restore stderr", "output", chunk)
}
}
@@ -303,7 +305,15 @@ func (e *Engine) executeRestoreCommand(ctx context.Context, cmdArgs []string) er
}
if err := cmd.Wait(); err != nil {
e.log.Error("Restore command failed", "error", err, "last_error", lastError)
// Include all captured errors in the return message for better diagnostics
errorDetails := lastError
if len(allErrors) > 0 {
errorDetails = strings.Join(allErrors, " | ")
}
e.log.Error("Restore command failed", "error", err, "stderr", errorDetails)
if errorDetails != "" {
return fmt.Errorf("restore failed: %w (stderr: %s)", err, errorDetails)
}
return fmt.Errorf("restore failed: %w", err)
}
@@ -342,13 +352,15 @@ func (e *Engine) executeRestoreWithDecompression(ctx context.Context, archivePat
// Read stderr in chunks to log errors without loading all into memory
buf := make([]byte, 4096)
var lastError string
var allErrors []string
for {
n, err := stderr.Read(buf)
if n > 0 {
chunk := string(buf[:n])
// Only log errors/warnings, not all output
if strings.Contains(chunk, "ERROR") || strings.Contains(chunk, "FATAL") {
// Capture every chunk containing ERROR, FATAL, or "error:" for better diagnostics
if strings.Contains(chunk, "ERROR") || strings.Contains(chunk, "FATAL") || strings.Contains(chunk, "error:") {
lastError = chunk
allErrors = append(allErrors, strings.TrimSpace(chunk))
e.log.Warn("Restore stderr", "output", chunk)
}
}
@@ -358,7 +370,15 @@ func (e *Engine) executeRestoreWithDecompression(ctx context.Context, archivePat
}
if err := cmd.Wait(); err != nil {
e.log.Error("Restore with decompression failed", "error", err, "last_error", lastError)
// Include all captured errors in the return message for better diagnostics
errorDetails := lastError
if len(allErrors) > 0 {
errorDetails = strings.Join(allErrors, " | ")
}
e.log.Error("Restore with decompression failed", "error", err, "stderr", errorDetails)
if errorDetails != "" {
return fmt.Errorf("restore failed: %w (stderr: %s)", err, errorDetails)
}
return fmt.Errorf("restore failed: %w", err)
}
@@ -572,17 +592,24 @@ func (e *Engine) RestoreCluster(ctx context.Context, archivePath string) error {
var restoreErr error
if isCompressedSQL {
e.log.Info("Detected compressed SQL format, using psql + gunzip", "file", dumpFile)
mu.Lock()
e.log.Info("Detected compressed SQL format, using psql + gunzip", "file", dumpFile, "database", dbName)
mu.Unlock()
restoreErr = e.restorePostgreSQLSQL(ctx, dumpFile, dbName, true)
} else {
e.log.Info("Detected custom dump format, using pg_restore", "file", dumpFile)
mu.Lock()
e.log.Info("Detected custom dump format, using pg_restore", "file", dumpFile, "database", dbName)
mu.Unlock()
restoreErr = e.restorePostgreSQLDumpWithOwnership(ctx, dumpFile, dbName, false, preserveOwnership)
}
if restoreErr != nil {
e.log.Error("Failed to restore database", "name", dbName, "error", restoreErr)
mu.Lock()
e.log.Error("Failed to restore database", "name", dbName, "file", dumpFile, "error", restoreErr)
mu.Unlock()
failedDBsMu.Lock()
failedDBs = append(failedDBs, fmt.Sprintf("%s: %v", dbName, restoreErr))
// Include more context in the error message
failedDBs = append(failedDBs, fmt.Sprintf("%s: restore failed: %v", dbName, restoreErr))
failedDBsMu.Unlock()
atomic.AddInt32(&failCount, 1)
return