diff options
| author | Aleksandr Nogikh <nogikh@google.com> | 2025-04-02 12:23:25 +0200 |
|---|---|---|
| committer | Aleksandr Nogikh <nogikh@google.com> | 2025-04-02 14:41:22 +0000 |
| commit | 8079433f581666f7c03c02f05ba46bfe2b3a8a0a (patch) | |
| tree | 2565055f20ebf3440bf826f7249b71f8e4794d42 /syz-cluster/controller/processor.go | |
| parent | e347022420b455f456dd988455b327805a9d3a1c (diff) | |
syz-cluster: better handle SeriesProcessor restarts
If Loop() was restarted between the moment we marked the session
as started in the DB and the moment we actually started the workflow,
there was no way to get back to normal operation.
That was the cause of the sporadic TestProcessor failures we've seen in
the presubmit tests.
Handle this case in the code by still starting the workflow when the session is already marked as started.
Closes #5776.
Diffstat (limited to 'syz-cluster/controller/processor.go')
| -rw-r--r-- | syz-cluster/controller/processor.go | 9 |
1 files changed, 7 insertions, 2 deletions
diff --git a/syz-cluster/controller/processor.go b/syz-cluster/controller/processor.go index 809640f0e..5e96f04c7 100644 --- a/syz-cluster/controller/processor.go +++ b/syz-cluster/controller/processor.go @@ -154,11 +154,16 @@ func (sp *SeriesProcessor) handleSession(ctx context.Context, session *db.Sessio switch status { case workflow.StatusNotFound: log.Printf("scheduling a workflow for %q", session.ID) - if err := sp.sessionRepo.Start(ctx, session.ID); err != nil { + err := sp.sessionRepo.Start(ctx, session.ID) + if err == db.ErrSessionAlreadyStarted { + // It may happen if the service was restarted right between the moment we updated the DB + // and actually started the workflow. + log.Printf("session %q was already marked as started, but there's no actual workflow", session.ID) + } else if err != nil { app.Errorf("failed to mark session started: %v", err) break } - err := sp.workflows.Start(session.ID) + err = sp.workflows.Start(session.ID) if err != nil { app.Errorf("failed to start a workflow: %v", err) } |
