syz-cluster: better handle SeriesProcessor restarts

If Loop() was restarted between the moment we marked the session as started in the DB and the moment we actually started the workflow, there was no way back to normal operation. That was the cause of the sporadic TestProcessor failures we've seen in the presubmit tests. Handle this case in the code by just continuing the unfinished calls. Closes #5776.
author: Aleksandr Nogikh <nogikh@google.com> 2025-04-02 12:23:25 +0200
committer: Aleksandr Nogikh <nogikh@google.com> 2025-04-02 14:41:22 +0000
commit: 8079433f581666f7c03c02f05ba46bfe2b3a8a0a (patch)
tree: 2565055f20ebf3440bf826f7249b71f8e4794d42 /syz-cluster/controller/processor.go
parent: e347022420b455f456dd988455b327805a9d3a1c (diff)
1 files changed, 7 insertions, 2 deletions
diff --git a/syz-cluster/controller/processor.go b/syz-cluster/controller/processor.go
index 809640f0e..5e96f04c7 100644
--- a/syz-cluster/controller/processor.go
+++ b/syz-cluster/controller/processor.go
@@ -154,11 +154,16 @@ func (sp *SeriesProcessor) handleSession(ctx context.Context, session *db.Sessio
 		switch status {
 		case workflow.StatusNotFound:
 			log.Printf("scheduling a workflow for %q", session.ID)
-			if err := sp.sessionRepo.Start(ctx, session.ID); err != nil {
+			err := sp.sessionRepo.Start(ctx, session.ID)
+			if err == db.ErrSessionAlreadyStarted {
+				// It may happen if the service was restarted right between the moment we updated the DB
+				// and actually started the workflow.
+				log.Printf("session %q was already marked as started, but there's no actual workflow", session.ID)
+			} else if err != nil {
 				app.Errorf("failed to mark session started: %v", err)
 				break
 			}
-			err := sp.workflows.Start(session.ID)
+			err = sp.workflows.Start(session.ID)
 			if err != nil {
 				app.Errorf("failed to start a workflow: %v", err)
 			}
author	Aleksandr Nogikh <nogikh@google.com>	2025-04-02 12:23:25 +0200
committer	Aleksandr Nogikh <nogikh@google.com>	2025-04-02 14:41:22 +0000
commit	8079433f581666f7c03c02f05ba46bfe2b3a8a0a (patch)
tree	2565055f20ebf3440bf826f7249b71f8e4794d42 /syz-cluster/controller/processor.go
parent	e347022420b455f456dd988455b327805a9d3a1c (diff)