From 8079433f581666f7c03c02f05ba46bfe2b3a8a0a Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Wed, 2 Apr 2025 12:23:25 +0200 Subject: syz-cluster: better handle SeriesProcessor restarts If Loop() was restarted between the moment we marked the session as started in the DB and the moment we actually started the workflow, there was no way to return to normal operation. That was the cause of the sporadic TestProcessor failures we've seen in the presubmit tests. Handle this case in the code by just continuing the unfinished calls. Closes #5776. --- syz-cluster/controller/processor.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'syz-cluster/controller/processor.go') diff --git a/syz-cluster/controller/processor.go b/syz-cluster/controller/processor.go index 809640f0e..5e96f04c7 100644 --- a/syz-cluster/controller/processor.go +++ b/syz-cluster/controller/processor.go @@ -154,11 +154,16 @@ func (sp *SeriesProcessor) handleSession(ctx context.Context, session *db.Sessio switch status { case workflow.StatusNotFound: log.Printf("scheduling a workflow for %q", session.ID) - if err := sp.sessionRepo.Start(ctx, session.ID); err != nil { + err := sp.sessionRepo.Start(ctx, session.ID) + if err == db.ErrSessionAlreadyStarted { + // It may happen if the service was restarted right between the moment we updated the DB + // and actually started the workflow. + log.Printf("session %q was already marked as started, but there's no actual workflow", session.ID) + } else if err != nil { app.Errorf("failed to mark session started: %v", err) break } - err := sp.workflows.Start(session.ID) + err = sp.workflows.Start(session.ID) if err != nil { app.Errorf("failed to start a workflow: %v", err) } -- cgit mrf-deployment