From 8079433f581666f7c03c02f05ba46bfe2b3a8a0a Mon Sep 17 00:00:00 2001
From: Aleksandr Nogikh <nogikh@google.com>
Date: Wed, 2 Apr 2025 12:23:25 +0200
Subject: syz-cluster: better handle SeriesProcessor restarts

If Loop() was restarted between the moment we marked the session
as started in the DB and the moment we actually started the workflow,
there was no way back to normal operation.

That was the cause of the sporadic TestProcessor failures we've seen in
the presubmit tests.

Handle this case in the code: if the session is already marked as started
but no workflow exists, log it and start the workflow anyway.

Closes #5776.
---
 syz-cluster/controller/processor.go | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'syz-cluster/controller/processor.go')

diff --git a/syz-cluster/controller/processor.go b/syz-cluster/controller/processor.go
index 809640f0e..5e96f04c7 100644
--- a/syz-cluster/controller/processor.go
+++ b/syz-cluster/controller/processor.go
@@ -154,11 +154,16 @@ func (sp *SeriesProcessor) handleSession(ctx context.Context, session *db.Sessio
 		switch status {
 		case workflow.StatusNotFound:
 			log.Printf("scheduling a workflow for %q", session.ID)
-			if err := sp.sessionRepo.Start(ctx, session.ID); err != nil {
+			err := sp.sessionRepo.Start(ctx, session.ID)
+			if err == db.ErrSessionAlreadyStarted {
+				// It may happen if the service was restarted right between the moment we updated the DB
+				// and actually started the workflow.
+				log.Printf("session %q was already marked as started, but there's no actual workflow", session.ID)
+			} else if err != nil {
 				app.Errorf("failed to mark session started: %v", err)
 				break
 			}
-			err := sp.workflows.Start(session.ID)
+			err = sp.workflows.Start(session.ID)
 			if err != nil {
 				app.Errorf("failed to start a workflow: %v", err)
 			}
-- 
cgit mrf-deployment