diff --git a/cmd/satoru/backup_job.go b/cmd/satoru/backup_job.go index 29f57da..a19e5b1 100644 --- a/cmd/satoru/backup_job.go +++ b/cmd/satoru/backup_job.go @@ -24,6 +24,7 @@ import ( const ( defaultStagingRoot = "./backups" defaultResticRepo = "./repos/restic" + defaultSyncTimeout = 2 * time.Hour ) func (a *app) runBackupJob(ctx context.Context, job store.Job, site store.Site) (string, string) { @@ -410,18 +411,16 @@ func (a *app) runResticSyncJob(ctx context.Context, job store.Job, site store.Si _ = a.store.AddJobEvent(ctx, store.JobEvent{JobID: job.ID, Level: "error", Message: "restic sync failed: " + err.Error()}) return "failed", "restic sync failed: command build error" } - a.log.Debug("restic sync copy", zap.Int64("job_id", job.ID), zap.Int64("site_id", site.ID), zap.String("local_repo", repoPath), zap.String("b2_repo", b2Repo), zap.String("snapshot_id", snapshotID), zap.String("copy_mode", copyMode), zap.Strings("args", args)) + syncTimeout := resticSyncTimeout(a.log, job.ID, site.ID) + a.log.Debug("restic sync copy", zap.Int64("job_id", job.ID), zap.Int64("site_id", site.ID), zap.String("local_repo", repoPath), zap.String("b2_repo", b2Repo), zap.String("snapshot_id", snapshotID), zap.String("copy_mode", copyMode), zap.Duration("timeout", syncTimeout), zap.Strings("args", args)) - cmdCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) + cmdCtx, cancel := context.WithTimeout(ctx, syncTimeout) defer cancel() cmd := exec.CommandContext(cmdCtx, "restic", args...) 
// resticSyncErrorMessage builds the human-readable failure text for a restic
// sync command that returned runErr.
//
// output is the command's combined stdout/stderr; surrounding whitespace is
// trimmed before it is used. cmdCtx is the context the command ran under: if
// it has hit its deadline or been canceled, that reason is appended to the
// output (or returned alone when the output is empty) instead of runErr's
// text. timeout is used only to render the "sync timed out after ..." text.
//
// Otherwise the result is the output followed by runErr's text, unless the
// output already contains that text, in which case the output is returned
// as-is.
//
// A nil cmdCtx is treated as "not done" and a nil runErr as "no extra detail",
// so the helper never panics; when there is nothing at all to report it
// returns "unknown error".
func resticSyncErrorMessage(cmdCtx context.Context, runErr error, output []byte, timeout time.Duration) string {
	msg := strings.TrimSpace(string(output))

	// Guard against a nil context so a caller mistake cannot turn an error
	// report into a panic.
	var ctxErr error
	if cmdCtx != nil {
		ctxErr = cmdCtx.Err()
	}

	switch {
	case errors.Is(ctxErr, context.DeadlineExceeded):
		timeoutMsg := fmt.Sprintf("sync timed out after %s", timeout)
		if msg == "" {
			return timeoutMsg
		}
		return msg + "; " + timeoutMsg
	case errors.Is(ctxErr, context.Canceled):
		if msg == "" {
			return "sync canceled"
		}
		return msg + "; sync canceled"
	}

	// No context-level cause: fall back to the command error, if there is one.
	if runErr == nil {
		if msg == "" {
			return "unknown error"
		}
		return msg
	}

	detail := runErr.Error()
	if msg == "" {
		return detail
	}
	// Avoid repeating the error text when the command already printed it.
	if strings.Contains(msg, detail) {
		return msg
	}
	return msg + "; " + detail
}