Fix pipeline cancellation status handling and step state synchronization (#6011)

Co-authored-by: pnkcaht <samzoovsk19@gmail.com>
Co-authored-by: qwerty287 <80460567+qwerty287@users.noreply.github.com>
Co-authored-by: Lauris B <lauris@nix.lv>
Author: 6543
Date: 2026-02-05 21:41:05 +01:00
Committed by: GitHub
Parent: 1af1ef562c
Commit: 8a8f9ad3aa
26 changed files with 956 additions and 387 deletions

View File

@@ -36,6 +36,7 @@ func (r *Runner) createLogger(_logger zerolog.Logger, uploads *sync.WaitGroup, w
Logger() Logger()
uploads.Add(1) uploads.Add(1)
defer uploads.Done()
var secrets []string var secrets []string
for _, secret := range workflow.Config.Secrets { for _, secret := range workflow.Config.Secrets {
@@ -50,8 +51,6 @@ func (r *Runner) createLogger(_logger zerolog.Logger, uploads *sync.WaitGroup, w
} }
logger.Debug().Msg("log stream copied, close ...") logger.Debug().Msg("log stream copied, close ...")
uploads.Done()
return nil return nil
} }
} }

View File

@@ -148,15 +148,16 @@ func (c *client) Next(ctx context.Context, filter rpc.Filter) (*rpc.Workflow, er
return w, nil return w, nil
} }
// Wait blocks until the workflow is complete. // Wait blocks until the workflow with the given ID is marked as completed or canceled by the server.
func (c *client) Wait(ctx context.Context, workflowID string) (err error) { func (c *client) Wait(ctx context.Context, workflowID string) (canceled bool, err error) {
retry := c.newBackOff() retry := c.newBackOff()
req := new(proto.WaitRequest) req := new(proto.WaitRequest)
req.Id = workflowID req.Id = workflowID
for { for {
_, err = c.client.Wait(ctx, req) resp, err := c.client.Wait(ctx, req)
if err == nil { if err == nil {
break // wait block was released normally as expected by server
return resp.GetCanceled(), nil
} }
switch status.Code(err) { switch status.Code(err) {
@@ -164,10 +165,10 @@ func (c *client) Wait(ctx context.Context, workflowID string) (err error) {
if ctx.Err() != nil { if ctx.Err() != nil {
// expected as context was canceled // expected as context was canceled
log.Debug().Err(err).Msgf("grpc error: wait(): context canceled") log.Debug().Err(err).Msgf("grpc error: wait(): context canceled")
return nil return false, nil
} }
log.Error().Err(err).Msgf("grpc error: wait(): code: %v", status.Code(err)) log.Error().Err(err).Msgf("grpc error: wait(): code: %v", status.Code(err))
return err return false, err
case case
codes.Aborted, codes.Aborted,
codes.DataLoss, codes.DataLoss,
@@ -178,16 +179,15 @@ func (c *client) Wait(ctx context.Context, workflowID string) (err error) {
log.Warn().Err(err).Msgf("grpc error: wait(): code: %v", status.Code(err)) log.Warn().Err(err).Msgf("grpc error: wait(): code: %v", status.Code(err))
default: default:
log.Error().Err(err).Msgf("grpc error: wait(): code: %v", status.Code(err)) log.Error().Err(err).Msgf("grpc error: wait(): code: %v", status.Code(err))
return err return false, err
} }
select { select {
case <-time.After(retry.NextBackOff()): case <-time.After(retry.NextBackOff()):
case <-ctx.Done(): case <-ctx.Done():
return ctx.Err() return false, ctx.Err()
} }
} }
return nil
} }
// Init signals the workflow is initialized. // Init signals the workflow is initialized.
@@ -199,6 +199,7 @@ func (c *client) Init(ctx context.Context, workflowID string, state rpc.Workflow
req.State.Started = state.Started req.State.Started = state.Started
req.State.Finished = state.Finished req.State.Finished = state.Finished
req.State.Error = state.Error req.State.Error = state.Error
req.State.Canceled = state.Canceled
for { for {
_, err = c.client.Init(ctx, req) _, err = c.client.Init(ctx, req)
if err == nil { if err == nil {
@@ -238,7 +239,7 @@ func (c *client) Init(ctx context.Context, workflowID string, state rpc.Workflow
return nil return nil
} }
// Done signals the workflow is complete. // Done lets the agent signal to the server that the workflow has stopped.
func (c *client) Done(ctx context.Context, workflowID string, state rpc.WorkflowState) (err error) { func (c *client) Done(ctx context.Context, workflowID string, state rpc.WorkflowState) (err error) {
retry := c.newBackOff() retry := c.newBackOff()
req := new(proto.DoneRequest) req := new(proto.DoneRequest)
@@ -247,6 +248,7 @@ func (c *client) Done(ctx context.Context, workflowID string, state rpc.Workflow
req.State.Started = state.Started req.State.Started = state.Started
req.State.Finished = state.Finished req.State.Finished = state.Finished
req.State.Error = state.Error req.State.Error = state.Error
req.State.Canceled = state.Canceled
for { for {
_, err = c.client.Done(ctx, req) _, err = c.client.Done(ctx, req)
if err == nil { if err == nil {
@@ -330,7 +332,7 @@ func (c *client) Extend(ctx context.Context, workflowID string) (err error) {
return nil return nil
} }
// Update updates the workflow state. // Update lets the agent update the step state on the server.
func (c *client) Update(ctx context.Context, workflowID string, state rpc.StepState) (err error) { func (c *client) Update(ctx context.Context, workflowID string, state rpc.StepState) (err error) {
retry := c.newBackOff() retry := c.newBackOff()
req := new(proto.UpdateRequest) req := new(proto.UpdateRequest)
@@ -342,6 +344,7 @@ func (c *client) Update(ctx context.Context, workflowID string, state rpc.StepSt
req.State.Exited = state.Exited req.State.Exited = state.Exited
req.State.ExitCode = int32(state.ExitCode) req.State.ExitCode = int32(state.ExitCode)
req.State.Error = state.Error req.State.Error = state.Error
req.State.Canceled = state.Canceled
for { for {
_, err = c.client.Update(ctx, req) _, err = c.client.Update(ctx, req)
if err == nil { if err == nil {

View File

@@ -20,7 +20,6 @@ import (
"errors" "errors"
"fmt" "fmt"
"sync" "sync"
"sync/atomic"
"time" "time"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
@@ -51,6 +50,7 @@ func NewRunner(workEngine rpc.Peer, f rpc.Filter, h string, state *State, backen
} }
} }
// Run executes a workflow using a backend, tracks its state and reports the state back to the server.
func (r *Runner) Run(runnerCtx, shutdownCtx context.Context) error { func (r *Runner) Run(runnerCtx, shutdownCtx context.Context) error {
log.Debug().Msg("request next execution") log.Debug().Msg("request next execution")
@@ -90,34 +90,32 @@ func (r *Runner) Run(runnerCtx, shutdownCtx context.Context) error {
// Workflow execution context. // Workflow execution context.
// This context is the SINGLE source of truth for cancellation. // This context is the SINGLE source of truth for cancellation.
workflowCtx, cancel := context.WithTimeout(ctxMeta, timeout) workflowCtx, _ := context.WithTimeout(ctxMeta, timeout) //nolint:govet
defer cancel() workflowCtx, cancelWorkflowCtx := context.WithCancelCause(workflowCtx)
defer cancelWorkflowCtx(nil)
// Handle SIGTERM (k8s, docker, system shutdown) // Add sigterm support for internal context.
// Required so the running workflow can be terminated via external signals.
workflowCtx = utils.WithContextSigtermCallback(workflowCtx, func() { workflowCtx = utils.WithContextSigtermCallback(workflowCtx, func() {
logger.Error().Msg("received sigterm termination signal") logger.Error().Msg("received sigterm termination signal")
cancel() // WithContextSigtermCallback would cancel the context too, but we want our own custom error
cancelWorkflowCtx(pipeline.ErrCancel)
}) })
// canceled indicates whether the workflow was canceled remotely (UI/API).
// Must be atomic because it is written from a goroutine and read later.
var canceled atomic.Bool
// Listen for remote cancel events (UI / API). // Listen for remote cancel events (UI / API).
// When canceled, we MUST cancel the workflow context // When canceled, we MUST cancel the workflow context
// so that pipeline execution and backend processes stop immediately. // so that workflow execution stops immediately.
go func() { go func() {
logger.Debug().Msg("listening for cancel signal") logger.Debug().Msg("start listening for server side cancel signal")
if err := r.client.Wait(workflowCtx, workflow.ID); err != nil { if canceled, err := r.client.Wait(workflowCtx, workflow.ID); err != nil {
logger.Warn().Err(err).Msg("cancel signal received from server") logger.Error().Err(err).Msg("server returned unexpected err while waiting for workflow to finish run")
cancelWorkflowCtx(err)
// Mark workflow as canceled (thread-safe)
canceled.Store(true)
// Propagate cancellation to pipeline + backend
cancel()
} else { } else {
if canceled {
logger.Debug().Err(err).Msg("server side cancel signal received")
cancelWorkflowCtx(pipeline.ErrCancel)
}
// Wait returned without error, meaning the workflow finished normally // Wait returned without error, meaning the workflow finished normally
logger.Debug().Msg("cancel listener exited normally") logger.Debug().Msg("cancel listener exited normally")
} }
@@ -143,9 +141,13 @@ func (r *Runner) Run(runnerCtx, shutdownCtx context.Context) error {
state := rpc.WorkflowState{ state := rpc.WorkflowState{
Started: time.Now().Unix(), Started: time.Now().Unix(),
} }
if err := r.client.Init(runnerCtx, workflow.ID, state); err != nil { if err := r.client.Init(runnerCtx, workflow.ID, state); err != nil {
logger.Error().Err(err).Msg("workflow initialization failed") logger.Error().Err(err).Msg("signaling workflow initialization to server failed")
// TODO: should we return here? // We have an error, maybe the server is currently unreachable or other server-side errors occurred.
// So let's clean up and end this not yet started workflow run.
cancelWorkflowCtx(err)
return err
} }
var uploads sync.WaitGroup var uploads sync.WaitGroup
@@ -167,19 +169,18 @@ func (r *Runner) Run(runnerCtx, shutdownCtx context.Context) error {
state.Finished = time.Now().Unix() state.Finished = time.Now().Unix()
// Normalize cancellation error
if errors.Is(err, pipeline.ErrCancel) || canceled.Load() {
canceled.Store(true)
err = pipeline.ErrCancel
}
if err != nil { if err != nil {
state.Error = err.Error() state.Error = err.Error()
if errors.Is(err, pipeline.ErrCancel) {
state.Canceled = true
// cleanup joined error messages
state.Error = pipeline.ErrCancel.Error()
}
} }
logger.Debug(). logger.Debug().
Str("error", state.Error). Str("error", state.Error).
Bool("canceled", canceled.Load()). Bool("canceled", state.Canceled).
Msg("workflow finished") Msg("workflow finished")
// Ensure all logs/traces are uploaded before finishing // Ensure all logs/traces are uploaded before finishing
@@ -195,6 +196,8 @@ func (r *Runner) Run(runnerCtx, shutdownCtx context.Context) error {
if err := r.client.Done(doneCtx, workflow.ID, state); err != nil { if err := r.client.Done(doneCtx, workflow.ID, state); err != nil {
logger.Error().Err(err).Msg("failed to update workflow status") logger.Error().Err(err).Msg("failed to update workflow status")
} else {
logger.Debug().Msg("signaling workflow stopped done")
} }
return nil return nil
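
The core of this change is that a single cancel-cause context replaces the old cancel func plus atomic flag. Below is a trimmed sketch of the listener wiring only, assuming the timeout and SIGTERM handling shown in the diff are set up elsewhere; watchCancel is a hypothetical helper, not the actual Runner code:

```go
package cancelsketch

import (
	"context"

	"go.woodpecker-ci.org/woodpecker/v3/pipeline"
	"go.woodpecker-ci.org/woodpecker/v3/pipeline/rpc"
)

// watchCancel wires the background cancel listener: one cancel-cause context
// is the single source of truth, and the Wait() result decides which cause
// is recorded.
func watchCancel(parentCtx context.Context, peer rpc.Peer, workflowID string) (context.Context, context.CancelCauseFunc) {
	ctx, cancel := context.WithCancelCause(parentCtx)

	go func() {
		canceled, err := peer.Wait(ctx, workflowID)
		switch {
		case err != nil:
			// Server unreachable or returned an unexpected error.
			cancel(err)
		case canceled:
			// Cancel requested remotely via UI/API.
			cancel(pipeline.ErrCancel)
		default:
			// Wait unblocked because the workflow finished normally;
			// nothing to cancel.
		}
	}()

	return ctx, cancel
}
```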

View File

@@ -16,6 +16,7 @@ package agent
import ( import (
"context" "context"
"errors"
"runtime" "runtime"
"strconv" "strconv"
"sync" "sync"
@@ -30,6 +31,7 @@ import (
func (r *Runner) createTracer(ctxMeta context.Context, uploads *sync.WaitGroup, logger zerolog.Logger, workflow *rpc.Workflow) pipeline.TraceFunc { func (r *Runner) createTracer(ctxMeta context.Context, uploads *sync.WaitGroup, logger zerolog.Logger, workflow *rpc.Workflow) pipeline.TraceFunc {
return func(state *pipeline.State) error { return func(state *pipeline.State) error {
uploads.Add(1) uploads.Add(1)
defer uploads.Done()
stepLogger := logger.With(). stepLogger := logger.With().
Str("image", state.Pipeline.Step.Image). Str("image", state.Pipeline.Step.Image).
@@ -43,12 +45,15 @@ func (r *Runner) createTracer(ctxMeta context.Context, uploads *sync.WaitGroup,
StepUUID: state.Pipeline.Step.UUID, StepUUID: state.Pipeline.Step.UUID,
Exited: state.Process.Exited, Exited: state.Process.Exited,
ExitCode: state.Process.ExitCode, ExitCode: state.Process.ExitCode,
Started: time.Now().Unix(), // TODO: do not do this Started: state.Process.Started,
Finished: time.Now().Unix(), Canceled: errors.Is(state.Process.Error, pipeline.ErrCancel),
} }
if state.Process.Error != nil { if state.Process.Error != nil {
stepState.Error = state.Process.Error.Error() stepState.Error = state.Process.Error.Error()
} }
if state.Process.Exited {
stepState.Finished = time.Now().Unix()
}
defer func() { defer func() {
stepLogger.Debug().Msg("update step status") stepLogger.Debug().Msg("update step status")
@@ -60,7 +65,6 @@ func (r *Runner) createTracer(ctxMeta context.Context, uploads *sync.WaitGroup,
} }
stepLogger.Debug().Msg("update step status complete") stepLogger.Debug().Msg("update step status complete")
uploads.Done()
}() }()
if state.Process.Exited { if state.Process.Exited {
return nil return nil
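
The new Canceled flag is derived with errors.Is, which walks wrapped and joined error chains; that is why the runner above can detect ErrCancel even inside a joined multi-step error. A small illustration:

```go
package cancelcheck

import (
	"errors"
	"fmt"

	"go.woodpecker-ci.org/woodpecker/v3/pipeline"
)

// isCanceled reports whether err carries ErrCancel anywhere in its chain.
func isCanceled(err error) bool {
	return errors.Is(err, pipeline.ErrCancel)
}

func examples() (bool, bool) {
	wrapped := fmt.Errorf("step failed: %w", pipeline.ErrCancel)
	joined := errors.Join(errors.New("unrelated step error"), pipeline.ErrCancel)
	return isCanceled(wrapped), isCanceled(joined) // true, true
}
```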

View File

@@ -250,29 +250,11 @@ func (e *docker) StartStep(ctx context.Context, step *backend.Step, taskUUID str
return e.client.ContainerStart(ctx, containerName, container.StartOptions{}) return e.client.ContainerStart(ctx, containerName, container.StartOptions{})
} }
// WaitStep waits for a step container to exit.
//
// When the context is canceled, the container is immediately killed to prevent
// orphaned containers from continuing to run after agent shutdown.
func (e *docker) WaitStep(ctx context.Context, step *backend.Step, taskUUID string) (*backend.State, error) { func (e *docker) WaitStep(ctx context.Context, step *backend.Step, taskUUID string) (*backend.State, error) {
log := log.Logger.With(). log := log.Logger.With().Str("taskUUID", taskUUID).Str("stepUUID", step.UUID).Logger()
Str("taskUUID", taskUUID).
Str("stepUUID", step.UUID).
Logger()
log.Trace().Msgf("wait for step %s", step.Name) log.Trace().Msgf("wait for step %s", step.Name)
containerName := toContainerName(step) containerName := toContainerName(step)
done := make(chan struct{})
// Ensure container is killed if context is canceled (SIGTERM / pipeline cancel)
go func() {
select {
case <-ctx.Done():
_ = e.client.ContainerKill(context.Background(), containerName, "9") //nolint:contextcheck
case <-done:
}
}()
wait, errC := e.client.ContainerWait(ctx, containerName, "") wait, errC := e.client.ContainerWait(ctx, containerName, "")
select { select {
@@ -282,9 +264,6 @@ func (e *docker) WaitStep(ctx context.Context, step *backend.Step, taskUUID stri
log.Trace().Msgf("ContainerWait returned with err: %v", err) log.Trace().Msgf("ContainerWait returned with err: %v", err)
} }
// Stop cancellation watcher
close(done)
info, err := e.client.ContainerInspect(ctx, containerName) info, err := e.client.ContainerInspect(ctx, containerName)
if err != nil { if err != nil {
return nil, err return nil, err

View File

@@ -171,9 +171,18 @@ func (e *local) StartStep(ctx context.Context, step *types.Step, taskUUID string
} }
} }
func (e *local) WaitStep(_ context.Context, step *types.Step, taskUUID string) (*types.State, error) { func (e *local) WaitStep(ctx context.Context, step *types.Step, taskUUID string) (*types.State, error) {
log.Trace().Str("taskUUID", taskUUID).Msgf("wait for step %s", step.Name) log.Trace().Str("taskUUID", taskUUID).Msgf("wait for step %s", step.Name)
stepState := &types.State{
Exited: true,
}
if err := ctx.Err(); err != nil {
stepState.Error = err
return stepState, nil
}
state, err := e.getStepState(taskUUID, step.UUID) state, err := e.getStepState(taskUUID, step.UUID)
if err != nil { if err != nil {
return nil, err return nil, err
@@ -183,10 +192,6 @@ func (e *local) WaitStep(_ context.Context, step *types.Step, taskUUID string) (
return nil, errors.New("exec: step command not set up") return nil, errors.New("exec: step command not set up")
} }
stepState := &types.State{
Exited: true,
}
// normally we use cmd.Wait() to wait for *exec.Cmd, but cmd.StdoutPipe() tells us not // normally we use cmd.Wait() to wait for *exec.Cmd, but cmd.StdoutPipe() tells us not
// as Wait() would close the io pipe even if not all logs where read and send back // as Wait() would close the io pipe even if not all logs where read and send back
// so we have to do use the underlying functions // so we have to do use the underlying functions
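
The local backend now checks the step context before touching the process state. A reduced sketch of just that early-exit path (the helper name and boolean return are illustrative, not the real WaitStep signature):

```go
package localsketch

import (
	"context"

	"go.woodpecker-ci.org/woodpecker/v3/pipeline/backend/types"
)

// waitStepEarlyCancel isolates the new early-exit path: if the step context
// is already done, report an exited step carrying the context error instead
// of inspecting the process state.
func waitStepEarlyCancel(ctx context.Context) (*types.State, bool) {
	stepState := &types.State{Exited: true}
	if err := ctx.Err(); err != nil {
		stepState.Error = err
		return stepState, true // handled: skip the normal wait path
	}
	return stepState, false
}
```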

View File

@@ -16,11 +16,14 @@ package types
// State defines a container state. // State defines a container state.
type State struct { type State struct {
// Unix start time
Started int64 `json:"started"`
// Container exit code // Container exit code
ExitCode int `json:"exit_code"` ExitCode int `json:"exit_code"`
// Container exited, true or false // Container exited, true or false
Exited bool `json:"exited"` Exited bool `json:"exited"`
// Container is oom killed, true or false // Container is oom killed, true or false
// TODO (6024): well known errors as string enum into ./errors.go
OOMKilled bool `json:"oom_killed"` OOMKilled bool `json:"oom_killed"`
// Container error // Container error
Error error Error error

View File

@@ -47,18 +47,23 @@ type (
} }
// Current process state. // Current process state.
Process *backend.State Process backend.State
} }
) )
// Runtime is a configuration runtime. // Runtime represents a workflow state executed by a specific backend.
// Each workflow gets its own state configuration at runtime.
type Runtime struct { type Runtime struct {
err error err error
spec *backend.Config spec *backend.Config
engine backend.Backend engine backend.Backend
started int64 started int64
ctx context.Context // The context a workflow is being executed with.
// All normal (non cleanup) operations must use this.
// Cleanup operations should use the runnerCtx passed to Run()
ctx context.Context
tracer Tracer tracer Tracer
logger Logger logger Logger
@@ -122,7 +127,7 @@ func (r *Runtime) Run(runnerCtx context.Context) error {
state := new(State) state := new(State)
state.Pipeline.Step = stepErr.Step state.Pipeline.Step = stepErr.Step
state.Pipeline.Error = stepErr.Err state.Pipeline.Error = stepErr.Err
state.Process = &backend.State{ state.Process = backend.State{
Error: stepErr.Err, Error: stepErr.Err,
Exited: true, Exited: true,
ExitCode: 1, ExitCode: 1,
@@ -143,7 +148,7 @@ func (r *Runtime) Run(runnerCtx context.Context) error {
select { select {
case <-r.ctx.Done(): case <-r.ctx.Done():
return ErrCancel return ErrCancel
case err := <-r.execAll(stage.Steps): case err := <-r.execAll(runnerCtx, stage.Steps):
if err != nil { if err != nil {
r.err = err r.err = err
} }
@@ -154,28 +159,30 @@ func (r *Runtime) Run(runnerCtx context.Context) error {
} }
// Updates the current status of a step. // Updates the current status of a step.
// If processState is nil, we assume the step did not start.
// If the step did not start and err exists, it's a step start issue and the step is done.
func (r *Runtime) traceStep(processState *backend.State, err error, step *backend.Step) error { func (r *Runtime) traceStep(processState *backend.State, err error, step *backend.Step) error {
if r.tracer == nil { if r.tracer == nil {
// no tracer nothing to trace :) // no tracer nothing to trace :)
return nil return nil
} }
if processState == nil {
processState = new(backend.State)
if err != nil {
processState.Error = err
processState.Exited = true
processState.OOMKilled = false
processState.ExitCode = 126 // command invoked cannot be executed.
}
}
state := new(State) state := new(State)
state.Pipeline.Started = r.started state.Pipeline.Started = r.started
state.Pipeline.Step = step state.Pipeline.Step = step
state.Process = processState // empty
state.Pipeline.Error = r.err state.Pipeline.Error = r.err
// We have an error while starting the step
if processState == nil && err != nil {
state.Process = backend.State{
Error: err,
Exited: true,
OOMKilled: false,
}
} else if processState != nil {
state.Process = *processState
}
if traceErr := r.tracer.Trace(state); traceErr != nil { if traceErr := r.tracer.Trace(state); traceErr != nil {
return traceErr return traceErr
} }
@@ -183,7 +190,7 @@ func (r *Runtime) traceStep(processState *backend.State, err error, step *backen
} }
// Executes a set of parallel steps. // Executes a set of parallel steps.
func (r *Runtime) execAll(steps []*backend.Step) <-chan error { func (r *Runtime) execAll(runnerCtx context.Context, steps []*backend.Step) <-chan error {
var g errgroup.Group var g errgroup.Group
done := make(chan error) done := make(chan error)
logger := r.MakeLogger() logger := r.MakeLogger()
@@ -226,12 +233,17 @@ func (r *Runtime) execAll(steps []*backend.Step) <-chan error {
Str("step", step.Name). Str("step", step.Name).
Msg("executing") Msg("executing")
processState, err := r.exec(step) processState, err := r.exec(runnerCtx, step)
logger.Debug(). logger.Debug().
Str("step", step.Name). Str("step", step.Name).
Msg("complete") Msg("complete")
// normalize context cancel error
if errors.Is(err, context.Canceled) {
err = ErrCancel
}
// Return the error after tracing it. // Return the error after tracing it.
err = r.traceStep(processState, err, step) err = r.traceStep(processState, err, step)
if err != nil && step.Failure == metadata.FailureIgnore { if err != nil && step.Failure == metadata.FailureIgnore {
@@ -245,18 +257,21 @@ func (r *Runtime) execAll(steps []*backend.Step) <-chan error {
done <- g.Wait() done <- g.Wait()
close(done) close(done)
}() }()
return done return done
} }
// Executes the step and returns the state and error. // Executes the step and returns the state and error.
func (r *Runtime) exec(step *backend.Step) (*backend.State, error) { func (r *Runtime) exec(runnerCtx context.Context, step *backend.Step) (*backend.State, error) {
if err := r.engine.StartStep(r.ctx, step, r.taskUUID); err != nil { if err := r.engine.StartStep(r.ctx, step, r.taskUUID); err != nil { //nolint:contextcheck
return nil, err return nil, err
} }
startTime := time.Now().Unix()
logger := r.MakeLogger()
var wg sync.WaitGroup var wg sync.WaitGroup
if r.logger != nil { if r.logger != nil {
rc, err := r.engine.TailStep(r.ctx, step, r.taskUUID) rc, err := r.engine.TailStep(r.ctx, step, r.taskUUID) //nolint:contextcheck
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -264,7 +279,6 @@ func (r *Runtime) exec(step *backend.Step) (*backend.State, error) {
wg.Add(1) wg.Add(1)
go func() { go func() {
defer wg.Done() defer wg.Done()
logger := r.MakeLogger()
if err := r.logger(step, rc); err != nil { if err := r.logger(step, rc); err != nil {
logger.Error().Err(err).Msg("process logging failed") logger.Error().Err(err).Msg("process logging failed")
@@ -281,16 +295,27 @@ func (r *Runtime) exec(step *backend.Step) (*backend.State, error) {
// We wait until all data was logged. (Needed for some backends like local as WaitStep kills the log stream) // We wait until all data was logged. (Needed for some backends like local as WaitStep kills the log stream)
wg.Wait() wg.Wait()
waitState, err := r.engine.WaitStep(r.ctx, step, r.taskUUID) waitState, err := r.engine.WaitStep(r.ctx, step, r.taskUUID) //nolint:contextcheck
if err != nil { if err != nil {
if errors.Is(err, context.Canceled) { if errors.Is(err, context.Canceled) {
return waitState, ErrCancel waitState.Error = ErrCancel
} else {
return nil, err
} }
}
// It is important to use the runnerCtx here because,
// in case the workflow was canceled, the docker daemon still needs to stop and remove the container.
if err := r.engine.DestroyStep(runnerCtx, step, r.taskUUID); err != nil {
return nil, err return nil, err
} }
if err := r.engine.DestroyStep(r.ctx, step, r.taskUUID); err != nil { // we update with our start time here
return nil, err waitState.Started = startTime
// we handle cancel case
if ctxErr := r.ctx.Err(); ctxErr != nil && errors.Is(ctxErr, context.Canceled) {
waitState.Error = ErrCancel
} }
if waitState.OOMKilled { if waitState.OOMKilled {
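
The exec() change above splits responsibilities between two contexts: the workflow context drives normal operations and may already be canceled, while the runner context stays alive for cleanup. A hedged sketch of that pattern, assuming the backend interface methods keep the signatures shown in this diff:

```go
package execsketch

import (
	"context"
	"errors"

	"go.woodpecker-ci.org/woodpecker/v3/pipeline"
	backend "go.woodpecker-ci.org/woodpecker/v3/pipeline/backend/types"
)

// waitAndDestroy condenses the pattern: wait with the workflow context, but
// always destroy with the longer-lived runner context so a canceled workflow
// still gets its step cleaned up.
func waitAndDestroy(workflowCtx, runnerCtx context.Context, engine backend.Backend, step *backend.Step, taskUUID string) (*backend.State, error) {
	waitState, err := engine.WaitStep(workflowCtx, step, taskUUID)
	if err != nil {
		if !errors.Is(err, context.Canceled) {
			return nil, err
		}
		if waitState == nil {
			// Defensive: the sketch does not assume the backend returns a
			// usable state alongside a context.Canceled error.
			waitState = &backend.State{Exited: true}
		}
		waitState.Error = pipeline.ErrCancel
	}

	// Cleanup must not use workflowCtx: after a cancel it is already done
	// and the backend would never get to remove the step's resources.
	if err := engine.DestroyStep(runnerCtx, step, taskUUID); err != nil {
		return nil, err
	}

	if errors.Is(workflowCtx.Err(), context.Canceled) {
		waitState.Error = pipeline.ErrCancel
	}
	return waitState, nil
}
```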

View File

@@ -23,26 +23,15 @@ import (
const shutdownTimeout = time.Second * 5 const shutdownTimeout = time.Second * 5
var ( var (
shutdownCtx context.Context shutdownCtx context.Context
shutdownCtxCancel context.CancelFunc shutdownCtxLock sync.Mutex
shutdownCtxLock sync.Mutex
) )
func GetShutdownCtx() context.Context { func GetShutdownCtx() context.Context {
shutdownCtxLock.Lock() shutdownCtxLock.Lock()
defer shutdownCtxLock.Unlock() defer shutdownCtxLock.Unlock()
if shutdownCtx == nil { if shutdownCtx == nil {
shutdownCtx, shutdownCtxCancel = context.WithTimeout(context.Background(), shutdownTimeout) shutdownCtx, _ = context.WithTimeout(context.Background(), shutdownTimeout) //nolint:govet
} }
return shutdownCtx return shutdownCtx
} }
func CancelShutdown() {
shutdownCtxLock.Lock()
defer shutdownCtxLock.Unlock()
if shutdownCtxCancel == nil {
// we create an canceled context
shutdownCtx, shutdownCtxCancel = context.WithCancel(context.Background()) //nolint:forbidigo
}
shutdownCtxCancel()
}

View File

@@ -623,20 +623,29 @@ func (_c *MockPeer_Version_Call) RunAndReturn(run func(c context.Context) (*rpc.
} }
// Wait provides a mock function for the type MockPeer // Wait provides a mock function for the type MockPeer
func (_mock *MockPeer) Wait(c context.Context, workflowID string) error { func (_mock *MockPeer) Wait(c context.Context, workflowID string) (bool, error) {
ret := _mock.Called(c, workflowID) ret := _mock.Called(c, workflowID)
if len(ret) == 0 { if len(ret) == 0 {
panic("no return value specified for Wait") panic("no return value specified for Wait")
} }
var r0 error var r0 bool
if returnFunc, ok := ret.Get(0).(func(context.Context, string) error); ok { var r1 error
if returnFunc, ok := ret.Get(0).(func(context.Context, string) (bool, error)); ok {
return returnFunc(c, workflowID)
}
if returnFunc, ok := ret.Get(0).(func(context.Context, string) bool); ok {
r0 = returnFunc(c, workflowID) r0 = returnFunc(c, workflowID)
} else { } else {
r0 = ret.Error(0) r0 = ret.Get(0).(bool)
} }
return r0 if returnFunc, ok := ret.Get(1).(func(context.Context, string) error); ok {
r1 = returnFunc(c, workflowID)
} else {
r1 = ret.Error(1)
}
return r0, r1
} }
// MockPeer_Wait_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Wait' // MockPeer_Wait_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Wait'
@@ -669,12 +678,12 @@ func (_c *MockPeer_Wait_Call) Run(run func(c context.Context, workflowID string)
return _c return _c
} }
func (_c *MockPeer_Wait_Call) Return(err error) *MockPeer_Wait_Call { func (_c *MockPeer_Wait_Call) Return(canceled bool, err error) *MockPeer_Wait_Call {
_c.Call.Return(err) _c.Call.Return(canceled, err)
return _c return _c
} }
func (_c *MockPeer_Wait_Call) RunAndReturn(run func(c context.Context, workflowID string) error) *MockPeer_Wait_Call { func (_c *MockPeer_Wait_Call) RunAndReturn(run func(c context.Context, workflowID string) (bool, error)) *MockPeer_Wait_Call {
_c.Call.Return(run) _c.Call.Return(run)
return _c return _c
} }
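
A possible test using the regenerated mock to simulate a server-side cancel; the NewMockPeer constructor and the mocks package name follow the usual mockery conventions and are assumed here:

```go
package mocks

import (
	"context"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"
)

// TestWaitReportsCancel simulates a server-side cancel with the regenerated
// mock: Wait returns canceled=true with no error.
func TestWaitReportsCancel(t *testing.T) {
	peer := NewMockPeer(t)
	peer.On("Wait", mock.Anything, "42").Return(true, nil)

	canceled, err := peer.Wait(context.Background(), "42")
	assert.NoError(t, err)
	assert.True(t, canceled)
}
```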

View File

@@ -15,89 +15,291 @@
package rpc package rpc
import ( import "context"
"context"
backend "go.woodpecker-ci.org/woodpecker/v3/pipeline/backend/types" // Peer defines the bidirectional communication interface between Woodpecker agents and servers.
) //
// # Architecture and Implementations
type ( //
// Filter defines filters for fetching items from the queue. // The Peer interface is implemented differently on each side of the communication:
Filter struct { //
Labels map[string]string `json:"labels"` // - Agent side: Implemented by agent/rpc/client_grpc.go's client struct, which wraps
} // a gRPC client connection to make RPC calls to the server.
//
// StepState defines the step state. // - Server side: Implemented by server/rpc/rpc.go's RPC struct, which contains the
StepState struct { // business logic and is wrapped by server/rpc/server.go's WoodpeckerServer struct
StepUUID string `json:"step_uuid"` // to handle incoming gRPC requests.
Started int64 `json:"started"` //
Finished int64 `json:"finished"` // # Thread Safety and Concurrency
Exited bool `json:"exited"` //
ExitCode int `json:"exit_code"` // - Implementations must be safe for concurrent calls across different workflows
Error string `json:"error"` // - The same Peer instance may be called concurrently from multiple goroutines
} // - Each workflow is identified by a unique workflowID string
// - Implementations must properly isolate workflow state using workflowID
// WorkflowState defines the workflow state. //
WorkflowState struct { // # Error Handling Conventions
Started int64 `json:"started"` //
Finished int64 `json:"finished"` // - Methods return errors for communication failures, validation errors, or server-side issues
Error string `json:"error"` // - Errors should not be used for business logic
} // - Network/transport errors should be retried by the caller when appropriate
// - Nil error indicates successful operation
// Workflow defines the workflow execution details. // - Context cancellation should return nil or context.Canceled, not a custom error
Workflow struct { // - Business logic errors (e.g., workflow not found) return specific error types
ID string `json:"id"` //
Config *backend.Config `json:"config"` // # Intended Execution Flow
Timeout int64 `json:"timeout"` //
} // 1. Agent Lifecycle:
// - Version() checks compatibility with server
Version struct { // - RegisterAgent() announces agent availability
GrpcVersion int32 `json:"grpc_version,omitempty"` // - ReportHealth() periodically confirms agent is alive
ServerVersion string `json:"server_version,omitempty"` // - UnregisterAgent() gracefully disconnects agent
} //
// 2. Workflow Execution (may happen concurrently for multiple workflows):
// AgentInfo represents all the metadata that should be known about an agent. // - Next() blocks until server assigns a workflow
AgentInfo struct { // - Init() signals workflow execution has started
Version string `json:"version"` // - Wait() (in background goroutine) monitors for cancellation signals
Platform string `json:"platform"` // - Update() reports step state changes as workflow progresses
Backend string `json:"backend"` // - EnqueueLog() streams log output from steps
Capacity int `json:"capacity"` // - Extend() extends workflow timeout if needed so queue does not reschedule it as retry
CustomLabels map[string]string `json:"custom_labels"` // - Done() signals workflow has completed
} //
) // 3. Cancellation Flow:
// - Server can cancel workflow by releasing Wait() with canceled=true
// Peer defines a peer-to-peer connection. // - Agent detects cancellation from Wait() return value
// - Agent stops workflow execution and calls Done() with canceled state
type Peer interface { type Peer interface {
// Version returns the server- & grpc-version // Version returns the server and gRPC protocol version information.
//
// This is typically called once during agent initialization to verify
// compatibility between agent and server versions.
//
// Returns:
// - Version with server version string and gRPC protocol version number
// - Error if communication fails or server is unreachable
Version(c context.Context) (*Version, error) Version(c context.Context) (*Version, error)
// Next returns the next workflow in the queue // Next blocks until the server provides the next workflow to execute from the queue.
//
// This is the primary work-polling mechanism. Agents call this repeatedly in a loop,
// and it blocks until either:
// 1. A workflow matching the filter becomes available
// 2. The context is canceled (agent shutdown, network timeout, etc.)
//
// The filter allows agents to specify capabilities via labels (e.g., platform,
// backend type) so the server only assigns compatible workflows.
//
// Context Handling:
// - This is a long-polling operation that may block for extended periods
// - Implementations MUST check context regularly (not just at entry)
// - When context is canceled, must return nil workflow and nil error
// - Server may send keep-alive signals or periodically return nil to allow reconnection
//
// Returns:
// - Workflow object with ID, Config, and Workflow.Timeout if work is available
// - nil, nil if context is canceled or no work available (retry expected)
// - nil, error if a non-retryable error occurs
Next(c context.Context, f Filter) (*Workflow, error) Next(c context.Context, f Filter) (*Workflow, error)
// Wait blocks until the workflow is complete // Wait blocks until the workflow with the given ID completes or is canceled by the server.
Wait(c context.Context, workflowID string) error //
// This is used by agents to monitor for server-side cancellation signals. Typically
// called in a background goroutine immediately after Init(), running concurrently
// with workflow execution.
//
// The method serves two purposes:
// 1. Signals when server wants to cancel workflow (canceled=true)
// 2. Unblocks when workflow completes normally on agent (canceled=false)
//
// Context Handling:
// - This is a long-running blocking operation for the workflow duration
// - Context cancellation indicates shutdown, not workflow cancellation
// - When context is canceled, should return (false, nil) or (false, ctx.Err())
// - Must not confuse context cancellation with workflow cancellation signal
//
// Cancellation Flow:
// - Server releases Wait() with canceled=true → agent should stop workflow
// - Agent completes workflow normally → Done() is called → server releases Wait() with canceled=false
// - Agent context canceled → Wait() returns immediately, workflow may continue on agent
//
// Returns:
// - canceled=true, err=nil: Server initiated cancellation, agent should stop workflow
// - canceled=false, err=nil: Workflow completed normally (Wait unblocked by Done call)
// - canceled=false, err!=nil: Communication error, agent should retry or handle error
Wait(c context.Context, workflowID string) (canceled bool, err error)
// Init signals the workflow is initialized // Init signals to the server that the workflow has been initialized and execution has started.
//
// This is called once per workflow immediately after the agent accepts it from Next()
// and before starting step execution. It allows the server to track workflow start time
// and update workflow status to "running".
//
// The WorkflowState should have:
// - Started: Unix timestamp when execution began
// - Finished: 0 (not finished yet)
// - Error: empty string (no error yet)
// - Canceled: false (not canceled yet)
//
// Returns:
// - nil on success
// - error if communication fails or server rejects the state
Init(c context.Context, workflowID string, state WorkflowState) error Init(c context.Context, workflowID string, state WorkflowState) error
// Done signals the workflow is complete // Done signals to the server that the workflow has completed execution.
//
// This is called once per workflow after all steps have finished (or workflow was canceled).
// It provides the final workflow state including completion time, any errors, and
// cancellation status.
//
// The WorkflowState should have:
// - Started: Unix timestamp when execution began (same as Init)
// - Finished: Unix timestamp when execution completed
// - Error: Error message if workflow failed, empty if successful
// - Canceled: true if workflow was canceled, false otherwise
//
// After Done() is called:
// - Server updates final workflow status in database
// - Server releases any Wait() calls for this workflow
// - Server removes workflow from active queue
// - Server notifies forge of workflow completion
//
// Context Handling:
// - MUST attempt to complete even if workflow context is canceled
// - Often called with a shutdown/cleanup context rather than workflow context
// - Critical for proper cleanup - should retry on transient failures
//
// Returns:
// - nil on success
// - error if communication fails or server rejects the state
Done(c context.Context, workflowID string, state WorkflowState) error Done(c context.Context, workflowID string, state WorkflowState) error
// Extend extends the workflow deadline // Extend extends the timeout for the workflow with the given ID in the task queue.
//
// Agents must call Extend() regularly (e.g., every constant.TaskTimeout / 3) to signal
// that the workflow is still actively executing and prevent premature timeout.
//
// If agents don't call Extend periodically, the workflow will be rescheduled to a new
// agent after the timeout period expires (specified in constant.TaskTimeout).
//
// This acts as a heartbeat mechanism to detect stuck workflow executions. If an agent
// dies or becomes unresponsive, the server will eventually timeout the workflow and
// reassign it.
//
// IMPORTANT: Don't confuse this with Workflow.Timeout returned by Next() - they serve
// different purposes!
//
// Returns:
// - nil on success (timeout was extended)
// - error if communication fails or workflow is not found
Extend(c context.Context, workflowID string) error Extend(c context.Context, workflowID string) error
// Update updates the step state // Update reports step state changes to the server as the workflow progresses.
//
// This is called multiple times per step:
// 1. When step starts (Exited=false, Finished=0)
// 2. When step completes (Exited=true, Finished and ExitCode set)
// 3. Potentially on progress updates if step has long-running operations
//
// The server uses these updates to:
// - Track step execution progress
// - Update UI with real-time status
// - Store step results in database
// - Calculate workflow completion
//
// Context Handling:
// - Failures should be logged but not block workflow execution
//
// Returns:
// - nil on success
// - error if communication fails or server rejects the state
Update(c context.Context, workflowID string, state StepState) error Update(c context.Context, workflowID string, state StepState) error
// EnqueueLog queues the step log entry for delayed sending // EnqueueLog queues a log entry for delayed batch sending to the server.
//
// Log entries are produced continuously during step execution and need to be
// transmitted efficiently. This method adds logs to an internal queue that
// batches and sends them periodically to reduce network overhead.
//
// The implementation should:
// - Queue the log entry in a memory buffer
// - Batch multiple entries together
// - Send batches periodically (e.g., every second) or when buffer fills
// - Handle backpressure if server is slow or network is congested
//
// Unlike other methods, EnqueueLog:
// - Does NOT take a context parameter (fire-and-forget)
// - Does NOT return an error (never blocks the caller)
// - Does NOT guarantee immediate transmission
//
// Thread Safety:
// - MUST be safe to call concurrently from multiple goroutines
// - May be called concurrently from different steps/workflows
// - Internal queue must be properly synchronized
EnqueueLog(logEntry *LogEntry) EnqueueLog(logEntry *LogEntry)
// RegisterAgent register our agent to the server // RegisterAgent announces this agent to the server and returns an agent ID.
//
// This is called once during agent startup to:
// - Create an agent record in the server database
// - Obtain a unique agent ID for subsequent requests
// - Declare agent capabilities (platform, backend, capacity, labels)
// - Enable server-side agent tracking and monitoring
//
// The AgentInfo should specify:
// - Version: Agent version string (e.g., "v2.0.0")
// - Platform: OS/architecture (e.g., "linux/amd64")
// - Backend: Execution backend (e.g., "docker", "kubernetes")
// - Capacity: Maximum concurrent workflows (e.g., 2)
// - CustomLabels: Additional key-value labels for filtering
//
// Context Handling:
// - Context cancellation indicates agent is aborting startup
// - Should not retry indefinitely - fail fast on persistent errors
//
// Returns:
// - agentID: Unique identifier for this agent (use in subsequent calls)
// - error: If registration fails
RegisterAgent(ctx context.Context, info AgentInfo) (int64, error) RegisterAgent(ctx context.Context, info AgentInfo) (int64, error)
// UnregisterAgent unregister our agent from the server // UnregisterAgent removes this agent from the server's registry.
//
// This is called during graceful agent shutdown to:
// - Mark agent as offline in server database
// - Allow server to stop assigning workflows to this agent
// - Clean up any agent-specific server resources
// - Provide clean shutdown signal to monitoring systems
//
// After UnregisterAgent:
// - Agent should stop calling Next() for new work
// - Agent should complete any in-progress workflows
// - Agent may call Done() to finish existing workflows
// - Agent should close network connections
//
// Context Handling:
// - MUST attempt to complete even during forced shutdown
// - Often called with a shutdown context (limited time)
// - Failure is logged but should not prevent agent exit
//
// Returns:
// - nil on success
// - error if communication fails
UnregisterAgent(ctx context.Context) error UnregisterAgent(ctx context.Context) error
// ReportHealth reports health status of the agent to the server // ReportHealth sends a periodic health status update to the server.
//
// This is called regularly (e.g., every 30 seconds) during agent operation to:
// - Prove agent is still alive and responsive
// - Allow server to detect dead or stuck agents
// - Update agent's "last seen" timestamp in database
// - Provide application-level keepalive beyond network keep-alive signals
//
// Health reporting helps the server:
// - Mark unresponsive agents as offline
// - Redistribute work from dead agents
// - Display accurate agent status in UI
// - Trigger alerts for infrastructure issues
//
// Returns:
// - nil on success
// - error if communication fails
ReportHealth(c context.Context) error ReportHealth(c context.Context) error
} }
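
To tie the documented lifecycle together, here is a compressed sketch of one workflow round-trip from the agent's perspective; retries, log streaming and the Extend() heartbeat are omitted, and executeSteps is a hypothetical stand-in for the pipeline runtime:

```go
package peersketch

import (
	"context"
	"errors"
	"time"

	"go.woodpecker-ci.org/woodpecker/v3/pipeline"
	"go.woodpecker-ci.org/woodpecker/v3/pipeline/rpc"
)

func runOneWorkflow(ctx context.Context, peer rpc.Peer, filter rpc.Filter) error {
	// 1. Block until the server assigns a workflow matching our filter.
	wf, err := peer.Next(ctx, filter)
	if err != nil || wf == nil {
		return err
	}

	// 2. Tell the server execution has started.
	state := rpc.WorkflowState{Started: time.Now().Unix()}
	if err := peer.Init(ctx, wf.ID, state); err != nil {
		return err
	}

	// 3. Watch for a server-side cancel while the steps run.
	wfCtx, cancel := context.WithCancelCause(ctx)
	defer cancel(nil)
	go func() {
		if canceled, err := peer.Wait(wfCtx, wf.ID); err == nil && canceled {
			cancel(pipeline.ErrCancel)
		}
	}()

	// 4. Run the steps, reporting per-step progress via peer.Update(...).
	runErr := executeSteps(wfCtx, peer, wf)

	// 5. Report the final state; Done() also releases the server's Wait().
	state.Finished = time.Now().Unix()
	if runErr != nil {
		state.Error = runErr.Error()
		state.Canceled = errors.Is(runErr, pipeline.ErrCancel)
	}
	return peer.Done(ctx, wf.ID, state)
}

// executeSteps is defined only so the sketch compiles; the real work happens
// in the pipeline runtime.
func executeSteps(ctx context.Context, _ rpc.Peer, _ *rpc.Workflow) error {
	return ctx.Err()
}
```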

View File

@@ -16,4 +16,4 @@ package proto
// Version is the version of the woodpecker.proto file, // Version is the version of the woodpecker.proto file,
// IMPORTANT: increased by 1 each time it get changed. // IMPORTANT: increased by 1 each time it get changed.
const Version int32 = 14 const Version int32 = 15

View File

@@ -44,6 +44,7 @@ type StepState struct {
Exited bool `protobuf:"varint,4,opt,name=exited,proto3" json:"exited,omitempty"` Exited bool `protobuf:"varint,4,opt,name=exited,proto3" json:"exited,omitempty"`
ExitCode int32 `protobuf:"varint,5,opt,name=exit_code,json=exitCode,proto3" json:"exit_code,omitempty"` ExitCode int32 `protobuf:"varint,5,opt,name=exit_code,json=exitCode,proto3" json:"exit_code,omitempty"`
Error string `protobuf:"bytes,6,opt,name=error,proto3" json:"error,omitempty"` Error string `protobuf:"bytes,6,opt,name=error,proto3" json:"error,omitempty"`
Canceled bool `protobuf:"varint,7,opt,name=canceled,proto3" json:"canceled,omitempty"`
unknownFields protoimpl.UnknownFields unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache sizeCache protoimpl.SizeCache
} }
@@ -120,11 +121,19 @@ func (x *StepState) GetError() string {
return "" return ""
} }
func (x *StepState) GetCanceled() bool {
if x != nil {
return x.Canceled
}
return false
}
type WorkflowState struct { type WorkflowState struct {
state protoimpl.MessageState `protogen:"open.v1"` state protoimpl.MessageState `protogen:"open.v1"`
Started int64 `protobuf:"varint,4,opt,name=started,proto3" json:"started,omitempty"` Started int64 `protobuf:"varint,1,opt,name=started,proto3" json:"started,omitempty"`
Finished int64 `protobuf:"varint,5,opt,name=finished,proto3" json:"finished,omitempty"` Finished int64 `protobuf:"varint,2,opt,name=finished,proto3" json:"finished,omitempty"`
Error string `protobuf:"bytes,6,opt,name=error,proto3" json:"error,omitempty"` Error string `protobuf:"bytes,3,opt,name=error,proto3" json:"error,omitempty"`
Canceled bool `protobuf:"varint,4,opt,name=canceled,proto3" json:"canceled,omitempty"`
unknownFields protoimpl.UnknownFields unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache sizeCache protoimpl.SizeCache
} }
@@ -180,6 +189,13 @@ func (x *WorkflowState) GetError() string {
return "" return ""
} }
func (x *WorkflowState) GetCanceled() bool {
if x != nil {
return x.Canceled
}
return false
}
type LogEntry struct { type LogEntry struct {
state protoimpl.MessageState `protogen:"open.v1"` state protoimpl.MessageState `protogen:"open.v1"`
StepUuid string `protobuf:"bytes,1,opt,name=step_uuid,json=stepUuid,proto3" json:"step_uuid,omitempty"` StepUuid string `protobuf:"bytes,1,opt,name=step_uuid,json=stepUuid,proto3" json:"step_uuid,omitempty"`
@@ -1032,6 +1048,50 @@ func (x *RegisterAgentResponse) GetAgentId() int64 {
return 0 return 0
} }
type WaitResponse struct {
state protoimpl.MessageState `protogen:"open.v1"`
Canceled bool `protobuf:"varint,1,opt,name=canceled,proto3" json:"canceled,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
func (x *WaitResponse) Reset() {
*x = WaitResponse{}
mi := &file_woodpecker_proto_msgTypes[19]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
func (x *WaitResponse) String() string {
return protoimpl.X.MessageStringOf(x)
}
func (*WaitResponse) ProtoMessage() {}
func (x *WaitResponse) ProtoReflect() protoreflect.Message {
mi := &file_woodpecker_proto_msgTypes[19]
if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Deprecated: Use WaitResponse.ProtoReflect.Descriptor instead.
func (*WaitResponse) Descriptor() ([]byte, []int) {
return file_woodpecker_proto_rawDescGZIP(), []int{19}
}
func (x *WaitResponse) GetCanceled() bool {
if x != nil {
return x.Canceled
}
return false
}
type AuthRequest struct { type AuthRequest struct {
state protoimpl.MessageState `protogen:"open.v1"` state protoimpl.MessageState `protogen:"open.v1"`
AgentToken string `protobuf:"bytes,1,opt,name=agent_token,json=agentToken,proto3" json:"agent_token,omitempty"` AgentToken string `protobuf:"bytes,1,opt,name=agent_token,json=agentToken,proto3" json:"agent_token,omitempty"`
@@ -1042,7 +1102,7 @@ type AuthRequest struct {
func (x *AuthRequest) Reset() { func (x *AuthRequest) Reset() {
*x = AuthRequest{} *x = AuthRequest{}
mi := &file_woodpecker_proto_msgTypes[19] mi := &file_woodpecker_proto_msgTypes[20]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi) ms.StoreMessageInfo(mi)
} }
@@ -1054,7 +1114,7 @@ func (x *AuthRequest) String() string {
func (*AuthRequest) ProtoMessage() {} func (*AuthRequest) ProtoMessage() {}
func (x *AuthRequest) ProtoReflect() protoreflect.Message { func (x *AuthRequest) ProtoReflect() protoreflect.Message {
mi := &file_woodpecker_proto_msgTypes[19] mi := &file_woodpecker_proto_msgTypes[20]
if x != nil { if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil { if ms.LoadMessageInfo() == nil {
@@ -1067,7 +1127,7 @@ func (x *AuthRequest) ProtoReflect() protoreflect.Message {
// Deprecated: Use AuthRequest.ProtoReflect.Descriptor instead. // Deprecated: Use AuthRequest.ProtoReflect.Descriptor instead.
func (*AuthRequest) Descriptor() ([]byte, []int) { func (*AuthRequest) Descriptor() ([]byte, []int) {
return file_woodpecker_proto_rawDescGZIP(), []int{19} return file_woodpecker_proto_rawDescGZIP(), []int{20}
} }
func (x *AuthRequest) GetAgentToken() string { func (x *AuthRequest) GetAgentToken() string {
@@ -1095,7 +1155,7 @@ type AuthResponse struct {
func (x *AuthResponse) Reset() { func (x *AuthResponse) Reset() {
*x = AuthResponse{} *x = AuthResponse{}
mi := &file_woodpecker_proto_msgTypes[20] mi := &file_woodpecker_proto_msgTypes[21]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi) ms.StoreMessageInfo(mi)
} }
@@ -1107,7 +1167,7 @@ func (x *AuthResponse) String() string {
func (*AuthResponse) ProtoMessage() {} func (*AuthResponse) ProtoMessage() {}
func (x *AuthResponse) ProtoReflect() protoreflect.Message { func (x *AuthResponse) ProtoReflect() protoreflect.Message {
mi := &file_woodpecker_proto_msgTypes[20] mi := &file_woodpecker_proto_msgTypes[21]
if x != nil { if x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil { if ms.LoadMessageInfo() == nil {
@@ -1120,7 +1180,7 @@ func (x *AuthResponse) ProtoReflect() protoreflect.Message {
// Deprecated: Use AuthResponse.ProtoReflect.Descriptor instead. // Deprecated: Use AuthResponse.ProtoReflect.Descriptor instead.
func (*AuthResponse) Descriptor() ([]byte, []int) { func (*AuthResponse) Descriptor() ([]byte, []int) {
return file_woodpecker_proto_rawDescGZIP(), []int{20} return file_woodpecker_proto_rawDescGZIP(), []int{21}
} }
func (x *AuthResponse) GetStatus() string { func (x *AuthResponse) GetStatus() string {
@@ -1148,18 +1208,20 @@ var File_woodpecker_proto protoreflect.FileDescriptor
const file_woodpecker_proto_rawDesc = "" + const file_woodpecker_proto_rawDesc = "" +
"\n" + "\n" +
"\x10woodpecker.proto\x12\x05proto\"\xa9\x01\n" + "\x10woodpecker.proto\x12\x05proto\"\xc5\x01\n" +
"\tStepState\x12\x1b\n" + "\tStepState\x12\x1b\n" +
"\tstep_uuid\x18\x01 \x01(\tR\bstepUuid\x12\x18\n" + "\tstep_uuid\x18\x01 \x01(\tR\bstepUuid\x12\x18\n" +
"\astarted\x18\x02 \x01(\x03R\astarted\x12\x1a\n" + "\astarted\x18\x02 \x01(\x03R\astarted\x12\x1a\n" +
"\bfinished\x18\x03 \x01(\x03R\bfinished\x12\x16\n" + "\bfinished\x18\x03 \x01(\x03R\bfinished\x12\x16\n" +
"\x06exited\x18\x04 \x01(\bR\x06exited\x12\x1b\n" + "\x06exited\x18\x04 \x01(\bR\x06exited\x12\x1b\n" +
"\texit_code\x18\x05 \x01(\x05R\bexitCode\x12\x14\n" + "\texit_code\x18\x05 \x01(\x05R\bexitCode\x12\x14\n" +
"\x05error\x18\x06 \x01(\tR\x05error\"[\n" + "\x05error\x18\x06 \x01(\tR\x05error\x12\x1a\n" +
"\bcanceled\x18\a \x01(\bR\bcanceled\"w\n" +
"\rWorkflowState\x12\x18\n" + "\rWorkflowState\x12\x18\n" +
"\astarted\x18\x04 \x01(\x03R\astarted\x12\x1a\n" + "\astarted\x18\x01 \x01(\x03R\astarted\x12\x1a\n" +
"\bfinished\x18\x05 \x01(\x03R\bfinished\x12\x14\n" + "\bfinished\x18\x02 \x01(\x03R\bfinished\x12\x14\n" +
"\x05error\x18\x06 \x01(\tR\x05error\"w\n" + "\x05error\x18\x03 \x01(\tR\x05error\x12\x1a\n" +
"\bcanceled\x18\x04 \x01(\bR\bcanceled\"w\n" +
"\bLogEntry\x12\x1b\n" + "\bLogEntry\x12\x1b\n" +
"\tstep_uuid\x18\x01 \x01(\tR\bstepUuid\x12\x12\n" + "\tstep_uuid\x18\x01 \x01(\tR\bstepUuid\x12\x12\n" +
"\x04time\x18\x02 \x01(\x03R\x04time\x12\x12\n" + "\x04time\x18\x02 \x01(\x03R\x04time\x12\x12\n" +
@@ -1215,7 +1277,9 @@ const file_woodpecker_proto_rawDesc = "" +
"\fNextResponse\x12+\n" + "\fNextResponse\x12+\n" +
"\bworkflow\x18\x01 \x01(\v2\x0f.proto.WorkflowR\bworkflow\"2\n" + "\bworkflow\x18\x01 \x01(\v2\x0f.proto.WorkflowR\bworkflow\"2\n" +
"\x15RegisterAgentResponse\x12\x19\n" + "\x15RegisterAgentResponse\x12\x19\n" +
"\bagent_id\x18\x01 \x01(\x03R\aagentId\"I\n" + "\bagent_id\x18\x01 \x01(\x03R\aagentId\"*\n" +
"\fWaitResponse\x12\x1a\n" +
"\bcanceled\x18\x01 \x01(\bR\bcanceled\"I\n" +
"\vAuthRequest\x12\x1f\n" + "\vAuthRequest\x12\x1f\n" +
"\vagent_token\x18\x01 \x01(\tR\n" + "\vagent_token\x18\x01 \x01(\tR\n" +
"agentToken\x12\x19\n" + "agentToken\x12\x19\n" +
@@ -1223,13 +1287,13 @@ const file_woodpecker_proto_rawDesc = "" +
"\fAuthResponse\x12\x16\n" + "\fAuthResponse\x12\x16\n" +
"\x06status\x18\x01 \x01(\tR\x06status\x12\x19\n" + "\x06status\x18\x01 \x01(\tR\x06status\x12\x19\n" +
"\bagent_id\x18\x02 \x01(\x03R\aagentId\x12!\n" + "\bagent_id\x18\x02 \x01(\x03R\aagentId\x12!\n" +
"\faccess_token\x18\x03 \x01(\tR\vaccessToken2\xbb\x04\n" + "\faccess_token\x18\x03 \x01(\tR\vaccessToken2\xc2\x04\n" +
"\n" + "\n" +
"Woodpecker\x121\n" + "Woodpecker\x121\n" +
"\aVersion\x12\f.proto.Empty\x1a\x16.proto.VersionResponse\"\x00\x121\n" + "\aVersion\x12\f.proto.Empty\x1a\x16.proto.VersionResponse\"\x00\x121\n" +
"\x04Next\x12\x12.proto.NextRequest\x1a\x13.proto.NextResponse\"\x00\x12*\n" + "\x04Next\x12\x12.proto.NextRequest\x1a\x13.proto.NextResponse\"\x00\x12*\n" +
"\x04Init\x12\x12.proto.InitRequest\x1a\f.proto.Empty\"\x00\x12*\n" + "\x04Init\x12\x12.proto.InitRequest\x1a\f.proto.Empty\"\x00\x121\n" +
"\x04Wait\x12\x12.proto.WaitRequest\x1a\f.proto.Empty\"\x00\x12*\n" + "\x04Wait\x12\x12.proto.WaitRequest\x1a\x13.proto.WaitResponse\"\x00\x12*\n" +
"\x04Done\x12\x12.proto.DoneRequest\x1a\f.proto.Empty\"\x00\x12.\n" + "\x04Done\x12\x12.proto.DoneRequest\x1a\f.proto.Empty\"\x00\x12.\n" +
"\x06Extend\x12\x14.proto.ExtendRequest\x1a\f.proto.Empty\"\x00\x12.\n" + "\x06Extend\x12\x14.proto.ExtendRequest\x1a\f.proto.Empty\"\x00\x12.\n" +
"\x06Update\x12\x14.proto.UpdateRequest\x1a\f.proto.Empty\"\x00\x12(\n" + "\x06Update\x12\x14.proto.UpdateRequest\x1a\f.proto.Empty\"\x00\x12(\n" +
@@ -1252,7 +1316,7 @@ func file_woodpecker_proto_rawDescGZIP() []byte {
return file_woodpecker_proto_rawDescData return file_woodpecker_proto_rawDescData
} }
var file_woodpecker_proto_msgTypes = make([]protoimpl.MessageInfo, 23) var file_woodpecker_proto_msgTypes = make([]protoimpl.MessageInfo, 24)
var file_woodpecker_proto_goTypes = []any{ var file_woodpecker_proto_goTypes = []any{
(*StepState)(nil), // 0: proto.StepState (*StepState)(nil), // 0: proto.StepState
(*WorkflowState)(nil), // 1: proto.WorkflowState (*WorkflowState)(nil), // 1: proto.WorkflowState
@@ -1273,19 +1337,20 @@ var file_woodpecker_proto_goTypes = []any{
(*VersionResponse)(nil), // 16: proto.VersionResponse (*VersionResponse)(nil), // 16: proto.VersionResponse
(*NextResponse)(nil), // 17: proto.NextResponse (*NextResponse)(nil), // 17: proto.NextResponse
(*RegisterAgentResponse)(nil), // 18: proto.RegisterAgentResponse (*RegisterAgentResponse)(nil), // 18: proto.RegisterAgentResponse
(*AuthRequest)(nil), // 19: proto.AuthRequest (*WaitResponse)(nil), // 19: proto.WaitResponse
(*AuthResponse)(nil), // 20: proto.AuthResponse (*AuthRequest)(nil), // 20: proto.AuthRequest
nil, // 21: proto.Filter.LabelsEntry (*AuthResponse)(nil), // 21: proto.AuthResponse
nil, // 22: proto.AgentInfo.CustomLabelsEntry nil, // 22: proto.Filter.LabelsEntry
nil, // 23: proto.AgentInfo.CustomLabelsEntry
} }
var file_woodpecker_proto_depIdxs = []int32{ var file_woodpecker_proto_depIdxs = []int32{
21, // 0: proto.Filter.labels:type_name -> proto.Filter.LabelsEntry 22, // 0: proto.Filter.labels:type_name -> proto.Filter.LabelsEntry
3, // 1: proto.NextRequest.filter:type_name -> proto.Filter 3, // 1: proto.NextRequest.filter:type_name -> proto.Filter
1, // 2: proto.InitRequest.state:type_name -> proto.WorkflowState 1, // 2: proto.InitRequest.state:type_name -> proto.WorkflowState
1, // 3: proto.DoneRequest.state:type_name -> proto.WorkflowState 1, // 3: proto.DoneRequest.state:type_name -> proto.WorkflowState
0, // 4: proto.UpdateRequest.state:type_name -> proto.StepState 0, // 4: proto.UpdateRequest.state:type_name -> proto.StepState
2, // 5: proto.LogRequest.logEntries:type_name -> proto.LogEntry 2, // 5: proto.LogRequest.logEntries:type_name -> proto.LogEntry
22, // 6: proto.AgentInfo.customLabels:type_name -> proto.AgentInfo.CustomLabelsEntry 23, // 6: proto.AgentInfo.customLabels:type_name -> proto.AgentInfo.CustomLabelsEntry
14, // 7: proto.RegisterAgentRequest.info:type_name -> proto.AgentInfo 14, // 7: proto.RegisterAgentRequest.info:type_name -> proto.AgentInfo
4, // 8: proto.NextResponse.workflow:type_name -> proto.Workflow 4, // 8: proto.NextResponse.workflow:type_name -> proto.Workflow
12, // 9: proto.Woodpecker.Version:input_type -> proto.Empty 12, // 9: proto.Woodpecker.Version:input_type -> proto.Empty
@@ -1299,11 +1364,11 @@ var file_woodpecker_proto_depIdxs = []int32{
15, // 17: proto.Woodpecker.RegisterAgent:input_type -> proto.RegisterAgentRequest 15, // 17: proto.Woodpecker.RegisterAgent:input_type -> proto.RegisterAgentRequest
12, // 18: proto.Woodpecker.UnregisterAgent:input_type -> proto.Empty 12, // 18: proto.Woodpecker.UnregisterAgent:input_type -> proto.Empty
13, // 19: proto.Woodpecker.ReportHealth:input_type -> proto.ReportHealthRequest 13, // 19: proto.Woodpecker.ReportHealth:input_type -> proto.ReportHealthRequest
19, // 20: proto.WoodpeckerAuth.Auth:input_type -> proto.AuthRequest 20, // 20: proto.WoodpeckerAuth.Auth:input_type -> proto.AuthRequest
16, // 21: proto.Woodpecker.Version:output_type -> proto.VersionResponse 16, // 21: proto.Woodpecker.Version:output_type -> proto.VersionResponse
17, // 22: proto.Woodpecker.Next:output_type -> proto.NextResponse 17, // 22: proto.Woodpecker.Next:output_type -> proto.NextResponse
12, // 23: proto.Woodpecker.Init:output_type -> proto.Empty 12, // 23: proto.Woodpecker.Init:output_type -> proto.Empty
12, // 24: proto.Woodpecker.Wait:output_type -> proto.Empty 19, // 24: proto.Woodpecker.Wait:output_type -> proto.WaitResponse
12, // 25: proto.Woodpecker.Done:output_type -> proto.Empty 12, // 25: proto.Woodpecker.Done:output_type -> proto.Empty
12, // 26: proto.Woodpecker.Extend:output_type -> proto.Empty 12, // 26: proto.Woodpecker.Extend:output_type -> proto.Empty
12, // 27: proto.Woodpecker.Update:output_type -> proto.Empty 12, // 27: proto.Woodpecker.Update:output_type -> proto.Empty
@@ -1311,7 +1376,7 @@ var file_woodpecker_proto_depIdxs = []int32{
18, // 29: proto.Woodpecker.RegisterAgent:output_type -> proto.RegisterAgentResponse 18, // 29: proto.Woodpecker.RegisterAgent:output_type -> proto.RegisterAgentResponse
12, // 30: proto.Woodpecker.UnregisterAgent:output_type -> proto.Empty 12, // 30: proto.Woodpecker.UnregisterAgent:output_type -> proto.Empty
12, // 31: proto.Woodpecker.ReportHealth:output_type -> proto.Empty 12, // 31: proto.Woodpecker.ReportHealth:output_type -> proto.Empty
20, // 32: proto.WoodpeckerAuth.Auth:output_type -> proto.AuthResponse 21, // 32: proto.WoodpeckerAuth.Auth:output_type -> proto.AuthResponse
21, // [21:33] is the sub-list for method output_type 21, // [21:33] is the sub-list for method output_type
9, // [9:21] is the sub-list for method input_type 9, // [9:21] is the sub-list for method input_type
9, // [9:9] is the sub-list for extension type_name 9, // [9:9] is the sub-list for extension type_name
@@ -1330,7 +1395,7 @@ func file_woodpecker_proto_init() {
GoPackagePath: reflect.TypeOf(x{}).PkgPath(), GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
RawDescriptor: unsafe.Slice(unsafe.StringData(file_woodpecker_proto_rawDesc), len(file_woodpecker_proto_rawDesc)), RawDescriptor: unsafe.Slice(unsafe.StringData(file_woodpecker_proto_rawDesc), len(file_woodpecker_proto_rawDesc)),
NumEnums: 0, NumEnums: 0,
NumMessages: 23, NumMessages: 24,
NumExtensions: 0, NumExtensions: 0,
NumServices: 2, NumServices: 2,
}, },

View File

@@ -27,7 +27,7 @@ service Woodpecker {
rpc Version (Empty) returns (VersionResponse) {} rpc Version (Empty) returns (VersionResponse) {}
rpc Next (NextRequest) returns (NextResponse) {} rpc Next (NextRequest) returns (NextResponse) {}
rpc Init (InitRequest) returns (Empty) {} rpc Init (InitRequest) returns (Empty) {}
rpc Wait (WaitRequest) returns (Empty) {} rpc Wait (WaitRequest) returns (WaitResponse) {}
rpc Done (DoneRequest) returns (Empty) {} rpc Done (DoneRequest) returns (Empty) {}
rpc Extend (ExtendRequest) returns (Empty) {} rpc Extend (ExtendRequest) returns (Empty) {}
rpc Update (UpdateRequest) returns (Empty) {} rpc Update (UpdateRequest) returns (Empty) {}
@@ -48,12 +48,14 @@ message StepState {
bool exited = 4; bool exited = 4;
int32 exit_code = 5; int32 exit_code = 5;
string error = 6; string error = 6;
bool canceled = 7;
} }
message WorkflowState { message WorkflowState {
int64 started = 4; int64 started = 1;
int64 finished = 5; int64 finished = 2;
string error = 6; string error = 3;
bool canceled = 4;
} }
message LogEntry { message LogEntry {
@@ -145,6 +147,10 @@ message RegisterAgentResponse {
int64 agent_id = 1; int64 agent_id = 1;
} }
message WaitResponse {
bool canceled = 1;
};
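For orientation only (not part of this diff): a minimal sketch of how an agent-side caller could consume the new cancellation flag through the updated client Wait signature shown earlier; handleCancel is a hypothetical helper.

	canceled, err := peer.Wait(ctx, workflowID)
	if err != nil {
		return err
	}
	if canceled {
		// the server released the wait because the workflow was canceled
		handleCancel(workflowID) // hypothetical helper, not part of this change
	}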
// Woodpecker auth service is a simple service to authenticate agents and acquire a token // Woodpecker auth service is a simple service to authenticate agents and acquire a token
service WoodpeckerAuth { service WoodpeckerAuth {

View File

@@ -56,7 +56,7 @@ type WoodpeckerClient interface {
Version(ctx context.Context, in *Empty, opts ...grpc.CallOption) (*VersionResponse, error) Version(ctx context.Context, in *Empty, opts ...grpc.CallOption) (*VersionResponse, error)
Next(ctx context.Context, in *NextRequest, opts ...grpc.CallOption) (*NextResponse, error) Next(ctx context.Context, in *NextRequest, opts ...grpc.CallOption) (*NextResponse, error)
Init(ctx context.Context, in *InitRequest, opts ...grpc.CallOption) (*Empty, error) Init(ctx context.Context, in *InitRequest, opts ...grpc.CallOption) (*Empty, error)
Wait(ctx context.Context, in *WaitRequest, opts ...grpc.CallOption) (*Empty, error) Wait(ctx context.Context, in *WaitRequest, opts ...grpc.CallOption) (*WaitResponse, error)
Done(ctx context.Context, in *DoneRequest, opts ...grpc.CallOption) (*Empty, error) Done(ctx context.Context, in *DoneRequest, opts ...grpc.CallOption) (*Empty, error)
Extend(ctx context.Context, in *ExtendRequest, opts ...grpc.CallOption) (*Empty, error) Extend(ctx context.Context, in *ExtendRequest, opts ...grpc.CallOption) (*Empty, error)
Update(ctx context.Context, in *UpdateRequest, opts ...grpc.CallOption) (*Empty, error) Update(ctx context.Context, in *UpdateRequest, opts ...grpc.CallOption) (*Empty, error)
@@ -104,9 +104,9 @@ func (c *woodpeckerClient) Init(ctx context.Context, in *InitRequest, opts ...gr
return out, nil return out, nil
} }
func (c *woodpeckerClient) Wait(ctx context.Context, in *WaitRequest, opts ...grpc.CallOption) (*Empty, error) { func (c *woodpeckerClient) Wait(ctx context.Context, in *WaitRequest, opts ...grpc.CallOption) (*WaitResponse, error) {
cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...)
out := new(Empty) out := new(WaitResponse)
err := c.cc.Invoke(ctx, Woodpecker_Wait_FullMethodName, in, out, cOpts...) err := c.cc.Invoke(ctx, Woodpecker_Wait_FullMethodName, in, out, cOpts...)
if err != nil { if err != nil {
return nil, err return nil, err
@@ -193,7 +193,7 @@ type WoodpeckerServer interface {
Version(context.Context, *Empty) (*VersionResponse, error) Version(context.Context, *Empty) (*VersionResponse, error)
Next(context.Context, *NextRequest) (*NextResponse, error) Next(context.Context, *NextRequest) (*NextResponse, error)
Init(context.Context, *InitRequest) (*Empty, error) Init(context.Context, *InitRequest) (*Empty, error)
Wait(context.Context, *WaitRequest) (*Empty, error) Wait(context.Context, *WaitRequest) (*WaitResponse, error)
Done(context.Context, *DoneRequest) (*Empty, error) Done(context.Context, *DoneRequest) (*Empty, error)
Extend(context.Context, *ExtendRequest) (*Empty, error) Extend(context.Context, *ExtendRequest) (*Empty, error)
Update(context.Context, *UpdateRequest) (*Empty, error) Update(context.Context, *UpdateRequest) (*Empty, error)
@@ -220,7 +220,7 @@ func (UnimplementedWoodpeckerServer) Next(context.Context, *NextRequest) (*NextR
func (UnimplementedWoodpeckerServer) Init(context.Context, *InitRequest) (*Empty, error) { func (UnimplementedWoodpeckerServer) Init(context.Context, *InitRequest) (*Empty, error) {
return nil, status.Error(codes.Unimplemented, "method Init not implemented") return nil, status.Error(codes.Unimplemented, "method Init not implemented")
} }
func (UnimplementedWoodpeckerServer) Wait(context.Context, *WaitRequest) (*Empty, error) { func (UnimplementedWoodpeckerServer) Wait(context.Context, *WaitRequest) (*WaitResponse, error) {
return nil, status.Error(codes.Unimplemented, "method Wait not implemented") return nil, status.Error(codes.Unimplemented, "method Wait not implemented")
} }
func (UnimplementedWoodpeckerServer) Done(context.Context, *DoneRequest) (*Empty, error) { func (UnimplementedWoodpeckerServer) Done(context.Context, *DoneRequest) (*Empty, error) {

rpc/types.go Normal file
View File

@@ -0,0 +1,66 @@
// Copyright 2025 Woodpecker Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rpc
import (
backend "go.woodpecker-ci.org/woodpecker/v3/pipeline/backend/types"
)
type (
// Filter defines filters for fetching items from the queue.
Filter struct {
Labels map[string]string `json:"labels"`
}
// StepState defines the step state.
StepState struct {
StepUUID string `json:"step_uuid"`
Started int64 `json:"started"`
Finished int64 `json:"finished"`
Exited bool `json:"exited"`
ExitCode int `json:"exit_code"`
Error string `json:"error"`
Canceled bool `json:"canceled"`
}
// WorkflowState defines the workflow state.
WorkflowState struct {
Started int64 `json:"started"`
Finished int64 `json:"finished"`
Error string `json:"error"`
Canceled bool `json:"canceled"`
}
// Workflow defines the workflow execution details.
Workflow struct {
ID string `json:"id"`
Config *backend.Config `json:"config"`
Timeout int64 `json:"timeout"`
}
Version struct {
GrpcVersion int32 `json:"grpc_version,omitempty"`
ServerVersion string `json:"server_version,omitempty"`
}
// AgentInfo represents all the metadata that should be known about an agent.
AgentInfo struct {
Version string `json:"version"`
Platform string `json:"platform"`
Backend string `json:"backend"`
Capacity int `json:"capacity"`
CustomLabels map[string]string `json:"custom_labels"`
}
)
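Illustrative sketch only (not part of the new file): an agent reporting a killed step could populate the new Canceled flag like this; peer, ctx, workflowID and step are assumed to exist in the calling code.

	state := rpc.StepState{
		StepUUID: step.UUID,
		Started:  step.Started,
		Finished: time.Now().Unix(),
		Exited:   true,
		ExitCode: 137, // conventional "killed" exit code
		Canceled: true,
	}
	// Update carries the StepState to the server
	if err := peer.Update(ctx, workflowID, state); err != nil {
		log.Error().Err(err).Msg("could not report canceled step")
	}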

View File

@@ -40,24 +40,13 @@ func Cancel(ctx context.Context, _forge forge.Forge, store store.Store, repo *mo
} }
// First cancel/evict workflows in the queue in one go // First cancel/evict workflows in the queue in one go
var ( var workflowsToCancel []string
workflowsToCancel []string for _, w := range workflows {
workflowsToEvict []string if w.State == model.StatusRunning || w.State == model.StatusPending {
) workflowsToCancel = append(workflowsToCancel, fmt.Sprint(w.ID))
for _, workflow := range workflows {
if workflow.State == model.StatusRunning {
workflowsToCancel = append(workflowsToCancel, fmt.Sprint(workflow.ID))
}
if workflow.State == model.StatusPending {
workflowsToEvict = append(workflowsToEvict, fmt.Sprint(workflow.ID))
} }
} }
if len(workflowsToEvict) != 0 {
if err := server.Config.Services.Queue.ErrorAtOnce(ctx, workflowsToEvict, queue.ErrCancel); err != nil {
log.Error().Err(err).Msgf("queue: evict_at_once: %v", workflowsToEvict)
}
}
if len(workflowsToCancel) != 0 { if len(workflowsToCancel) != 0 {
if err := server.Config.Services.Queue.ErrorAtOnce(ctx, workflowsToCancel, queue.ErrCancel); err != nil { if err := server.Config.Services.Queue.ErrorAtOnce(ctx, workflowsToCancel, queue.ErrCancel); err != nil {
log.Error().Err(err).Msgf("queue: evict_at_once: %v", workflowsToCancel) log.Error().Err(err).Msgf("queue: evict_at_once: %v", workflowsToCancel)

View File

@@ -16,28 +16,76 @@
package pipeline package pipeline
import ( import (
"go.woodpecker-ci.org/woodpecker/v3/pipeline" "fmt"
"time"
"github.com/rs/zerolog/log"
"go.woodpecker-ci.org/woodpecker/v3/rpc" "go.woodpecker-ci.org/woodpecker/v3/rpc"
"go.woodpecker-ci.org/woodpecker/v3/server/model" "go.woodpecker-ci.org/woodpecker/v3/server/model"
"go.woodpecker-ci.org/woodpecker/v3/server/store" "go.woodpecker-ci.org/woodpecker/v3/server/store"
) )
// UpdateStepStatus updates the step status based on agent reports via RPC.
func UpdateStepStatus(store store.Store, step *model.Step, state rpc.StepState) error { func UpdateStepStatus(store store.Store, step *model.Step, state rpc.StepState) error {
if state.Exited { log.Debug().Str("StepUUID", step.UUID).Msgf("Update step %#v state %#v", *step, state)
step.Finished = state.Finished
step.ExitCode = state.ExitCode switch step.State {
step.Error = state.Error case model.StatusPending:
step.State = model.StatusSuccess // Transition from pending to running when started
if state.ExitCode != 0 || state.Error != "" { if state.Finished == 0 {
step.State = model.StatusFailure step.State = model.StatusRunning
} }
if state.ExitCode == pipeline.ExitCodeKilled {
step.State = model.StatusKilled
}
} else if step.Finished == 0 {
step.Started = state.Started step.Started = state.Started
step.State = model.StatusRunning if step.Started == 0 {
step.Started = time.Now().Unix()
}
// Handle direct transition to finished if a step setup error happened
if state.Exited || state.Error != "" {
step.Finished = state.Finished
if step.Finished == 0 {
step.Finished = time.Now().Unix()
}
step.ExitCode = state.ExitCode
step.Error = state.Error
if state.ExitCode == 0 && state.Error == "" {
step.State = model.StatusSuccess
} else {
step.State = model.StatusFailure
}
}
case model.StatusRunning:
// Already running, check if it finished
if state.Exited || state.Error != "" {
step.Finished = state.Finished
if step.Finished == 0 {
step.Finished = time.Now().Unix()
}
step.ExitCode = state.ExitCode
step.Error = state.Error
if state.ExitCode == 0 && state.Error == "" {
step.State = model.StatusSuccess
} else {
step.State = model.StatusFailure
}
}
default:
return fmt.Errorf("step has state %s and does not expect rpc state updates", step.State)
} }
// Handle cancellation across both cases
if state.Canceled && step.State != model.StatusKilled {
step.State = model.StatusKilled
if step.Finished == 0 {
step.Finished = time.Now().Unix()
}
}
return store.StepUpdate(step) return store.StepUpdate(step)
} }
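A minimal usage sketch mirroring the test cases further down (illustrative only; store is an assumed store.Store):

	// a pending step that reports a start time but no finish time becomes running
	step := &model.Step{State: model.StatusPending}
	if err := UpdateStepStatus(store, step, rpc.StepState{Started: 42}); err != nil {
		return err
	}
	// step.State == model.StatusRunning, step.Started == 42

	// a later canceled report forces the killed state and sets a finish time
	if err := UpdateStepStatus(store, step, rpc.StepState{Canceled: true}); err != nil {
		return err
	}
	// step.State == model.StatusKilled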

View File

@@ -1,5 +1,4 @@
// Copyright 2022 Woodpecker Authors // Copyright 2022 Woodpecker Authors
// Copyright 2019 mhmxs
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
@@ -34,138 +33,224 @@ func mockStoreStep(t *testing.T) store.Store {
return s return s
} }
func TestUpdateStepStatusNotExited(t *testing.T) { func TestUpdateStepStatus(t *testing.T) {
t.Parallel()
// step in db before update
step := &model.Step{}
// advertised step status
state := rpc.StepState{
Started: int64(42),
Exited: false,
// Dummy data
Finished: int64(1),
ExitCode: pipeline.ExitCodeKilled,
Error: "not an error",
}
err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.EqualValues(t, model.StatusRunning, step.State)
assert.EqualValues(t, 42, step.Started)
assert.EqualValues(t, 0, step.Finished)
assert.EqualValues(t, 0, step.ExitCode)
assert.EqualValues(t, "", step.Error)
}
func TestUpdateStepStatusNotExitedButStopped(t *testing.T) {
t.Parallel() t.Parallel()
// step in db before update t.Run("Pending", func(t *testing.T) {
step := &model.Step{Started: 42, Finished: 64, State: model.StatusKilled} t.Parallel()
// advertised step status t.Run("TransitionToRunning", func(t *testing.T) {
state := rpc.StepState{ t.Parallel()
Exited: false,
// Dummy data
Finished: int64(1),
ExitCode: pipeline.ExitCodeKilled,
Error: "not an error",
}
err := UpdateStepStatus(mockStoreStep(t), step, state) t.Run("WithStartTime", func(t *testing.T) {
assert.NoError(t, err) t.Parallel()
assert.EqualValues(t, model.StatusKilled, step.State) step := &model.Step{State: model.StatusPending}
assert.EqualValues(t, 42, step.Started) state := rpc.StepState{Started: 42, Finished: 0}
assert.EqualValues(t, 64, step.Finished)
assert.EqualValues(t, 0, step.ExitCode)
assert.EqualValues(t, "", step.Error)
}
func TestUpdateStepStatusExited(t *testing.T) { err := UpdateStepStatus(mockStoreStep(t), step, state)
t.Parallel()
// step in db before update assert.NoError(t, err)
step := &model.Step{Started: 42} assert.Equal(t, model.StatusRunning, step.State)
assert.Equal(t, int64(42), step.Started)
assert.Equal(t, int64(0), step.Finished)
})
// advertised step status t.Run("WithoutStartTime", func(t *testing.T) {
state := rpc.StepState{ t.Parallel()
Started: int64(42), step := &model.Step{State: model.StatusPending}
Exited: true, state := rpc.StepState{Started: 0, Finished: 0}
Finished: int64(34),
ExitCode: pipeline.ExitCodeKilled,
Error: "an error",
}
err := UpdateStepStatus(mockStoreStep(t), step, state) err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.EqualValues(t, model.StatusKilled, step.State)
assert.EqualValues(t, 42, step.Started)
assert.EqualValues(t, 34, step.Finished)
assert.EqualValues(t, pipeline.ExitCodeKilled, step.ExitCode)
assert.EqualValues(t, "an error", step.Error)
}
func TestUpdateStepStatusExitedButNot137(t *testing.T) { assert.NoError(t, err)
t.Parallel() assert.Equal(t, model.StatusRunning, step.State)
assert.Greater(t, step.Started, int64(0))
})
})
// step in db before update t.Run("DirectToSuccess", func(t *testing.T) {
step := &model.Step{Started: 42} t.Parallel()
// advertised step status t.Run("WithFinishTime", func(t *testing.T) {
state := rpc.StepState{ t.Parallel()
Started: int64(42), step := &model.Step{State: model.StatusPending}
Exited: true, state := rpc.StepState{Started: 42, Exited: true, Finished: 100, ExitCode: 0, Error: ""}
Finished: int64(34),
Error: "an error",
}
err := UpdateStepStatus(mockStoreStep(t), step, state) err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.EqualValues(t, model.StatusFailure, step.State)
assert.EqualValues(t, 42, step.Started)
assert.EqualValues(t, 34, step.Finished)
assert.EqualValues(t, 0, step.ExitCode)
assert.EqualValues(t, "an error", step.Error)
}
func TestUpdateStepStatusExitedWithCode(t *testing.T) { assert.NoError(t, err)
t.Parallel() assert.Equal(t, model.StatusSuccess, step.State)
assert.Equal(t, int64(42), step.Started)
assert.Equal(t, int64(100), step.Finished)
})
// advertised step status t.Run("WithoutFinishTime", func(t *testing.T) {
state := rpc.StepState{ t.Parallel()
Started: int64(42), step := &model.Step{State: model.StatusPending}
Exited: true, state := rpc.StepState{Started: 42, Exited: true, Finished: 0, ExitCode: 0, Error: ""}
Finished: int64(34),
ExitCode: 1,
Error: "an error",
}
step := &model.Step{}
err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.Equal(t, model.StatusFailure, step.State) err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.Equal(t, 1, step.ExitCode)
assert.NoError(t, err)
assert.Equal(t, model.StatusSuccess, step.State)
assert.Greater(t, step.Finished, int64(0))
})
})
t.Run("DirectToFailure", func(t *testing.T) {
t.Parallel()
t.Run("WithExitCode", func(t *testing.T) {
t.Parallel()
step := &model.Step{State: model.StatusPending}
state := rpc.StepState{Started: 42, Exited: true, Finished: 34, ExitCode: 1, Error: "an error"}
err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.Equal(t, model.StatusFailure, step.State)
assert.Equal(t, 1, step.ExitCode)
assert.Equal(t, "an error", step.Error)
})
})
})
t.Run("Running", func(t *testing.T) {
t.Parallel()
t.Run("ToSuccess", func(t *testing.T) {
t.Parallel()
t.Run("WithFinishTime", func(t *testing.T) {
t.Parallel()
step := &model.Step{State: model.StatusRunning, Started: 42}
state := rpc.StepState{Exited: true, Finished: 100, ExitCode: 0, Error: ""}
err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.Equal(t, model.StatusSuccess, step.State)
assert.Equal(t, int64(100), step.Finished)
})
t.Run("WithoutFinishTime", func(t *testing.T) {
t.Parallel()
step := &model.Step{State: model.StatusRunning, Started: 42}
state := rpc.StepState{Exited: true, Finished: 0, ExitCode: 0, Error: ""}
err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.Equal(t, model.StatusSuccess, step.State)
assert.Greater(t, step.Finished, int64(0))
})
})
t.Run("ToFailure", func(t *testing.T) {
t.Parallel()
t.Run("WithExitCode137", func(t *testing.T) {
t.Parallel()
step := &model.Step{State: model.StatusRunning, Started: 42}
state := rpc.StepState{Exited: true, Finished: 34, ExitCode: pipeline.ExitCodeKilled, Error: "an error"}
err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.Equal(t, model.StatusFailure, step.State)
assert.Equal(t, int64(34), step.Finished)
assert.Equal(t, pipeline.ExitCodeKilled, step.ExitCode)
})
t.Run("WithError", func(t *testing.T) {
t.Parallel()
step := &model.Step{State: model.StatusRunning, Started: 42}
state := rpc.StepState{Exited: true, Finished: 34, ExitCode: 0, Error: "an error"}
err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.Equal(t, model.StatusFailure, step.State)
assert.Equal(t, "an error", step.Error)
})
})
t.Run("StillRunning", func(t *testing.T) {
t.Parallel()
step := &model.Step{State: model.StatusRunning, Started: 42}
state := rpc.StepState{Exited: false, Finished: 0}
err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.Equal(t, model.StatusRunning, step.State)
assert.Equal(t, int64(0), step.Finished)
})
})
t.Run("Canceled", func(t *testing.T) {
t.Parallel()
t.Run("WithoutFinishTime", func(t *testing.T) {
t.Parallel()
step := &model.Step{State: model.StatusRunning, Started: 42}
state := rpc.StepState{Canceled: true}
err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.Equal(t, model.StatusKilled, step.State)
assert.Greater(t, step.Finished, int64(0))
})
t.Run("WithExitedAndFinishTime", func(t *testing.T) {
t.Parallel()
step := &model.Step{State: model.StatusRunning, Started: 42}
state := rpc.StepState{Canceled: true, Exited: true, Finished: 100, ExitCode: 1, Error: "canceled"}
err := UpdateStepStatus(mockStoreStep(t), step, state)
assert.NoError(t, err)
assert.Equal(t, model.StatusKilled, step.State)
assert.Equal(t, int64(100), step.Finished)
assert.Equal(t, 1, step.ExitCode)
assert.Equal(t, "canceled", step.Error)
})
})
t.Run("TerminalState", func(t *testing.T) {
t.Parallel()
step := &model.Step{State: model.StatusKilled, Started: 42, Finished: 64}
state := rpc.StepState{Exited: false}
err := UpdateStepStatus(mocks.NewMockStore(t), step, state)
assert.Error(t, err)
assert.Contains(t, err.Error(), "does not expect rpc state updates")
assert.Equal(t, model.StatusKilled, step.State)
})
} }
func TestUpdateStepToStatusSkipped(t *testing.T) { func TestUpdateStepToStatusSkipped(t *testing.T) {
t.Parallel() t.Parallel()
step, _ := UpdateStepToStatusSkipped(mockStoreStep(t), model.Step{}, int64(1)) t.Run("NotStarted", func(t *testing.T) {
t.Parallel()
assert.Equal(t, model.StatusSkipped, step.State) step, err := UpdateStepToStatusSkipped(mockStoreStep(t), model.Step{}, int64(1))
assert.EqualValues(t, 0, step.Finished)
} assert.NoError(t, err)
assert.Equal(t, model.StatusSkipped, step.State)
func TestUpdateStepToStatusSkippedButStarted(t *testing.T) { assert.Equal(t, int64(0), step.Finished)
t.Parallel() })
step := &model.Step{ t.Run("AlreadyStarted", func(t *testing.T) {
Started: int64(42), t.Parallel()
}
step, err := UpdateStepToStatusSkipped(mockStoreStep(t), model.Step{Started: 42}, int64(100))
step, _ = UpdateStepToStatusSkipped(mockStoreStep(t), *step, int64(1))
assert.NoError(t, err)
assert.Equal(t, model.StatusSuccess, step.State) assert.Equal(t, model.StatusSuccess, step.State)
assert.EqualValues(t, 1, step.Finished) assert.Equal(t, int64(100), step.Finished)
})
} }

View File

@@ -42,5 +42,8 @@ func UpdateWorkflowStatusToDone(store store.Store, workflow model.Workflow, stat
if workflow.Error != "" { if workflow.Error != "" {
workflow.State = model.StatusFailure workflow.State = model.StatusFailure
} }
if state.Canceled {
workflow.State = model.StatusKilled
}
return &workflow, store.WorkflowUpdate(&workflow) return &workflow, store.WorkflowUpdate(&workflow)
} }
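Illustrative only: with the new flag, a canceled workflow ends up killed even if no error string was set (store and workflow are assumed to be loaded by the caller):

	wf, err := UpdateWorkflowStatusToDone(store, *workflow, rpc.WorkflowState{
		Finished: time.Now().Unix(),
		Canceled: true,
	})
	if err != nil {
		return err
	}
	// wf.State == model.StatusKilled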

View File

@@ -157,6 +157,7 @@ func (q *fifo) finished(ids []string, exitStatus model.StatusValue, err error) e
} }
// Wait waits until the item is done executing. // Wait waits until the item is done executing.
// Also signals via the error ErrCancel if the workflow got canceled.
func (q *fifo) Wait(ctx context.Context, taskID string) error { func (q *fifo) Wait(ctx context.Context, taskID string) error {
q.Lock() q.Lock()
state := q.running[taskID] state := q.running[taskID]

View File

@@ -17,11 +17,14 @@ package queue
import ( import (
"context" "context"
"errors"
"fmt"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
"go.woodpecker-ci.org/woodpecker/v3/server/model" "go.woodpecker-ci.org/woodpecker/v3/server/model"
"go.woodpecker-ci.org/woodpecker/v3/server/store" "go.woodpecker-ci.org/woodpecker/v3/server/store"
"go.woodpecker-ci.org/woodpecker/v3/server/store/types"
) )
// WithTaskStore returns a queue that is backed by the TaskStore. This // WithTaskStore returns a queue that is backed by the TaskStore. This
@@ -77,7 +80,14 @@ func (q *persistentQueue) Error(c context.Context, id string, err error) error {
if err := q.Queue.Error(c, id, err); err != nil { if err := q.Queue.Error(c, id, err); err != nil {
return err return err
} }
return q.store.TaskDelete(id)
if deleteErr := q.store.TaskDelete(id); deleteErr != nil {
if !errors.Is(deleteErr, types.RecordNotExist) {
return deleteErr
}
log.Debug().Msgf("task %s already removed from store", id)
}
return nil
} }
// ErrorAtOnce signals multiple tasks are done and complete with an error. // ErrorAtOnce signals multiple tasks are done and complete with an error.
@@ -86,10 +96,16 @@ func (q *persistentQueue) ErrorAtOnce(c context.Context, ids []string, err error
if err := q.Queue.ErrorAtOnce(c, ids, err); err != nil { if err := q.Queue.ErrorAtOnce(c, ids, err); err != nil {
return err return err
} }
var errs []error
for _, id := range ids { for _, id := range ids {
if err := q.store.TaskDelete(id); err != nil { if deleteErr := q.store.TaskDelete(id); deleteErr != nil && !errors.Is(deleteErr, types.RecordNotExist) {
return err errs = append(errs, fmt.Errorf("task id [%s]: %w", id, deleteErr))
} }
} }
if len(errs) != 0 {
return fmt.Errorf("failed to delete tasks from persistent store: %w", errors.Join(errs...))
}
return nil return nil
} }
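For context (illustrative only): the cancellation path that feeds these deletes uses the same call shape as Cancel further up; ids and ctx are assumed.

	// cancel several queued workflows at once; tasks that were already removed
	// from the store no longer fail the whole call
	if err := q.ErrorAtOnce(ctx, ids, queue.ErrCancel); err != nil {
		log.Error().Err(err).Msgf("queue: evict_at_once: %v", ids)
	}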

View File

@@ -101,6 +101,7 @@ type Queue interface {
ErrorAtOnce(c context.Context, ids []string, err error) error ErrorAtOnce(c context.Context, ids []string, err error) error
// Wait waits until the task is complete. // Wait waits until the task is complete.
// Also signals via the error ErrCancel if the workflow got canceled.
Wait(c context.Context, id string) error Wait(c context.Context, id string) error
// Info returns internal queue information. // Info returns internal queue information.

View File

@@ -12,6 +12,38 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
// Package grpc provides a gRPC server implementation with JWT-based authentication.
//
// # Authentication Flow
//
// Uses a two-token approach:
//
// 1. Agent Token (long-lived): Configured via WOODPECKER_AGENT_SECRET, used only for initial Auth() call
// 2. JWT Access Token (short-lived, 1 hour): Obtained from Auth(), included in metadata["token"] for all service calls
//
// # Interceptor Architecture
//
// Authorizer interceptors validate JWT tokens on every request:
// 1. Extract JWT from metadata["token"]
// 2. Verify signature and expiration
// 3. Extract and add agent_id to metadata for downstream handlers
//
// Auth endpoint (/proto.WoodpeckerAuth/Auth) bypasses validation to allow initial authentication.
//
// # Usage
//
// // Server setup
// jwtManager := NewJWTManager(c.String("grpc-secret"))
// authorizer := NewAuthorizer(jwtManager)
// grpcServer := grpc.NewServer(
// grpc.StreamInterceptor(authorizer.StreamInterceptor),
// grpc.UnaryInterceptor(authorizer.UnaryInterceptor),
// )
//
// // Client usage
// resp, _ := authClient.Auth(ctx, &proto.AuthRequest{AgentToken: "secret", AgentId: -1})
// ctx = metadata.AppendToOutgoingContext(ctx, "token", resp.AccessToken)
// workflow, _ := woodpeckerClient.Next(ctx, &proto.NextRequest{...})
package grpc package grpc
import ( import (
@@ -24,6 +56,7 @@ import (
"google.golang.org/grpc/status" "google.golang.org/grpc/status"
) )
// StreamContextWrapper wraps gRPC ServerStream to allow context modification.
type StreamContextWrapper interface { type StreamContextWrapper interface {
grpc.ServerStream grpc.ServerStream
SetContext(context.Context) SetContext(context.Context)
@@ -50,14 +83,17 @@ func newStreamContextWrapper(inner grpc.ServerStream) StreamContextWrapper {
} }
} }
// Authorizer validates JWT tokens and enriches context with agent information.
type Authorizer struct { type Authorizer struct {
jwtManager *JWTManager jwtManager *JWTManager
} }
// NewAuthorizer creates a new JWT authorizer.
func NewAuthorizer(jwtManager *JWTManager) *Authorizer { func NewAuthorizer(jwtManager *JWTManager) *Authorizer {
return &Authorizer{jwtManager: jwtManager} return &Authorizer{jwtManager: jwtManager}
} }
// StreamInterceptor validates JWT tokens for streaming gRPC calls.
func (a *Authorizer) StreamInterceptor(srv any, stream grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { func (a *Authorizer) StreamInterceptor(srv any, stream grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error {
_stream := newStreamContextWrapper(stream) _stream := newStreamContextWrapper(stream)
@@ -71,7 +107,8 @@ func (a *Authorizer) StreamInterceptor(srv any, stream grpc.ServerStream, info *
return handler(srv, _stream) return handler(srv, _stream)
} }
func (a *Authorizer) UnaryInterceptor(ctx context.Context, req any, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp any, err error) { // UnaryInterceptor validates JWT tokens for unary gRPC calls.
func (a *Authorizer) UnaryInterceptor(ctx context.Context, req any, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (any, error) {
newCtx, err := a.authorize(ctx, info.FullMethod) newCtx, err := a.authorize(ctx, info.FullMethod)
if err != nil { if err != nil {
return nil, err return nil, err
@@ -79,6 +116,8 @@ func (a *Authorizer) UnaryInterceptor(ctx context.Context, req any, info *grpc.U
return handler(newCtx, req) return handler(newCtx, req)
} }
// authorize validates JWT and enriches context with agent_id metadata.
// Bypasses validation for /proto.WoodpeckerAuth/Auth endpoint.
func (a *Authorizer) authorize(ctx context.Context, fullMethod string) (context.Context, error) { func (a *Authorizer) authorize(ctx context.Context, fullMethod string) (context.Context, error) {
// bypass auth for token endpoint // bypass auth for token endpoint
if fullMethod == "/proto.WoodpeckerAuth/Auth" { if fullMethod == "/proto.WoodpeckerAuth/Auth" {

View File

@@ -53,6 +53,7 @@ type RPC struct {
} }
// Next blocks until it provides the next workflow to execute. // Next blocks until it provides the next workflow to execute.
// TODO (6038): Server does not release waiting agents on graceful shutdown.
func (s *RPC) Next(c context.Context, agentFilter rpc.Filter) (*rpc.Workflow, error) { func (s *RPC) Next(c context.Context, agentFilter rpc.Filter) (*rpc.Workflow, error) {
if hostname, err := s.getHostnameFromContext(c); err == nil { if hostname, err := s.getHostnameFromContext(c); err == nil {
log.Debug().Msgf("agent connected: %s: polling", hostname) log.Debug().Msgf("agent connected: %s: polling", hostname)
@@ -100,18 +101,29 @@ func (s *RPC) Next(c context.Context, agentFilter rpc.Filter) (*rpc.Workflow, er
} }
} }
// Wait blocks until the workflow with the given ID is done. // Wait blocks until the workflow with the given ID is completed or canceled.
func (s *RPC) Wait(c context.Context, workflowID string) error { // Used to let agents wait for cancel signals from the server side.
func (s *RPC) Wait(c context.Context, workflowID string) (canceled bool, err error) {
agent, err := s.getAgentFromContext(c) agent, err := s.getAgentFromContext(c)
if err != nil { if err != nil {
return err return false, err
} }
if err := s.checkAgentPermissionByWorkflow(c, agent, workflowID, nil, nil); err != nil { if err := s.checkAgentPermissionByWorkflow(c, agent, workflowID, nil, nil); err != nil {
return err return false, err
} }
return s.queue.Wait(c, workflowID) if err := s.queue.Wait(c, workflowID); err != nil {
if errors.Is(err, queue.ErrCancel) {
// we explicitly sent a cancel signal
return true, nil
}
// unknown error happened
return false, err
}
// workflow finished and no issues appeared
return false, nil
} }
// Extend extends the lease for the workflow with the given ID. // Extend extends the lease for the workflow with the given ID.
@@ -133,7 +145,7 @@ func (s *RPC) Extend(c context.Context, workflowID string) error {
return s.queue.Extend(c, agent.ID, workflowID) return s.queue.Extend(c, agent.ID, workflowID)
} }
// Update updates the state of a step. // Update lets the agent update the step state on the server.
func (s *RPC) Update(c context.Context, strWorkflowID string, state rpc.StepState) error { func (s *RPC) Update(c context.Context, strWorkflowID string, state rpc.StepState) error {
workflowID, err := strconv.ParseInt(strWorkflowID, 10, 64) workflowID, err := strconv.ParseInt(strWorkflowID, 10, 64)
if err != nil { if err != nil {
@@ -213,7 +225,7 @@ func (s *RPC) Update(c context.Context, strWorkflowID string, state rpc.StepStat
return nil return nil
} }
// Init implements the rpc.Init function. // Init signals the workflow is initialized.
func (s *RPC) Init(c context.Context, strWorkflowID string, state rpc.WorkflowState) error { func (s *RPC) Init(c context.Context, strWorkflowID string, state rpc.WorkflowState) error {
workflowID, err := strconv.ParseInt(strWorkflowID, 10, 64) workflowID, err := strconv.ParseInt(strWorkflowID, 10, 64)
if err != nil { if err != nil {
@@ -286,7 +298,7 @@ func (s *RPC) Init(c context.Context, strWorkflowID string, state rpc.WorkflowSt
return s.updateAgentLastWork(agent) return s.updateAgentLastWork(agent)
} }
// Done marks the workflow with the given ID as done. // Done marks the workflow with the given ID as stopped.
func (s *RPC) Done(c context.Context, strWorkflowID string, state rpc.WorkflowState) error { func (s *RPC) Done(c context.Context, strWorkflowID string, state rpc.WorkflowState) error {
workflowID, err := strconv.ParseInt(strWorkflowID, 10, 64) workflowID, err := strconv.ParseInt(strWorkflowID, 10, 64)
if err != nil { if err != nil {
@@ -331,20 +343,23 @@ func (s *RPC) Done(c context.Context, strWorkflowID string, state rpc.WorkflowSt
Str("pipeline_id", fmt.Sprint(currentPipeline.ID)). Str("pipeline_id", fmt.Sprint(currentPipeline.ID)).
Str("workflow_id", strWorkflowID).Logger() Str("workflow_id", strWorkflowID).Logger()
logger.Trace().Msgf("gRPC Done with state: %#v", state) logger.Debug().Msgf("workflow state in store: %#v", workflow)
logger.Debug().Msgf("gRPC Done with state: %#v", state)
if workflow, err = pipeline.UpdateWorkflowStatusToDone(s.store, *workflow, state); err != nil { if workflow, err = pipeline.UpdateWorkflowStatusToDone(s.store, *workflow, state); err != nil {
logger.Error().Err(err).Msgf("pipeline.UpdateWorkflowStatusToDone: cannot update workflow state: %s", err) logger.Error().Err(err).Msgf("pipeline.UpdateWorkflowStatusToDone: cannot update workflow state: %s", err)
} }
var queueErr error if !state.Canceled {
if workflow.Failing() { var queueErr error
queueErr = s.queue.Error(c, strWorkflowID, fmt.Errorf("workflow finished with error %s", state.Error)) if workflow.Failing() {
} else { queueErr = s.queue.Error(c, strWorkflowID, fmt.Errorf("workflow finished with error %s", state.Error))
queueErr = s.queue.Done(c, strWorkflowID, workflow.State) } else {
} queueErr = s.queue.Done(c, strWorkflowID, workflow.State)
if queueErr != nil { }
logger.Error().Err(queueErr).Msg("queue.Done: cannot ack workflow") if queueErr != nil {
logger.Error().Err(queueErr).Msg("queue.Done: cannot ack workflow")
}
} }
currentPipeline.Workflows, err = s.store.WorkflowGetTree(currentPipeline) currentPipeline.Workflows, err = s.store.WorkflowGetTree(currentPipeline)

View File

@@ -59,6 +59,7 @@ func NewWoodpeckerServer(queue queue.Queue, logger logging.Log, pubsub *pubsub.P
return &WoodpeckerServer{peer: peer} return &WoodpeckerServer{peer: peer}
} }
// Version returns the server and gRPC versions.
func (s *WoodpeckerServer) Version(_ context.Context, _ *proto.Empty) (*proto.VersionResponse, error) { func (s *WoodpeckerServer) Version(_ context.Context, _ *proto.Empty) (*proto.VersionResponse, error) {
return &proto.VersionResponse{ return &proto.VersionResponse{
GrpcVersion: proto.Version, GrpcVersion: proto.Version,
@@ -66,6 +67,7 @@ func (s *WoodpeckerServer) Version(_ context.Context, _ *proto.Empty) (*proto.Ve
}, nil }, nil
} }
// Next blocks until it provides the next workflow to execute from the queue.
func (s *WoodpeckerServer) Next(c context.Context, req *proto.NextRequest) (*proto.NextResponse, error) { func (s *WoodpeckerServer) Next(c context.Context, req *proto.NextRequest) (*proto.NextResponse, error) {
filter := rpc.Filter{ filter := rpc.Filter{
Labels: req.GetFilter().GetLabels(), Labels: req.GetFilter().GetLabels(),
@@ -85,6 +87,7 @@ func (s *WoodpeckerServer) Next(c context.Context, req *proto.NextRequest) (*pro
return res, err return res, err
} }
// Init lets the agent signal to the server that the workflow is initialized.
func (s *WoodpeckerServer) Init(c context.Context, req *proto.InitRequest) (*proto.Empty, error) { func (s *WoodpeckerServer) Init(c context.Context, req *proto.InitRequest) (*proto.Empty, error) {
state := rpc.WorkflowState{ state := rpc.WorkflowState{
Started: req.GetState().GetStarted(), Started: req.GetState().GetStarted(),
@@ -96,6 +99,7 @@ func (s *WoodpeckerServer) Init(c context.Context, req *proto.InitRequest) (*pro
return res, err return res, err
} }
// Update lets the agent update the step state on the server.
func (s *WoodpeckerServer) Update(c context.Context, req *proto.UpdateRequest) (*proto.Empty, error) { func (s *WoodpeckerServer) Update(c context.Context, req *proto.UpdateRequest) (*proto.Empty, error) {
state := rpc.StepState{ state := rpc.StepState{
StepUUID: req.GetState().GetStepUuid(), StepUUID: req.GetState().GetStepUuid(),
@@ -104,29 +108,36 @@ func (s *WoodpeckerServer) Update(c context.Context, req *proto.UpdateRequest) (
Exited: req.GetState().GetExited(), Exited: req.GetState().GetExited(),
Error: req.GetState().GetError(), Error: req.GetState().GetError(),
ExitCode: int(req.GetState().GetExitCode()), ExitCode: int(req.GetState().GetExitCode()),
Canceled: req.GetState().GetCanceled(),
} }
res := new(proto.Empty) res := new(proto.Empty)
err := s.peer.Update(c, req.GetId(), state) err := s.peer.Update(c, req.GetId(), state)
return res, err return res, err
} }
// Done lets the agent signal to the server that the workflow has stopped.
func (s *WoodpeckerServer) Done(c context.Context, req *proto.DoneRequest) (*proto.Empty, error) { func (s *WoodpeckerServer) Done(c context.Context, req *proto.DoneRequest) (*proto.Empty, error) {
state := rpc.WorkflowState{ state := rpc.WorkflowState{
Started: req.GetState().GetStarted(), Started: req.GetState().GetStarted(),
Finished: req.GetState().GetFinished(), Finished: req.GetState().GetFinished(),
Error: req.GetState().GetError(), Error: req.GetState().GetError(),
Canceled: req.GetState().GetCanceled(),
} }
res := new(proto.Empty) res := new(proto.Empty)
err := s.peer.Done(c, req.GetId(), state) err := s.peer.Done(c, req.GetId(), state)
return res, err return res, err
} }
func (s *WoodpeckerServer) Wait(c context.Context, req *proto.WaitRequest) (*proto.Empty, error) { // Wait blocks until the workflow is complete.
res := new(proto.Empty) // Also reports via the response whether the workflow got canceled.
err := s.peer.Wait(c, req.GetId()) func (s *WoodpeckerServer) Wait(c context.Context, req *proto.WaitRequest) (*proto.WaitResponse, error) {
res := new(proto.WaitResponse)
canceled, err := s.peer.Wait(c, req.GetId())
res.Canceled = canceled
return res, err return res, err
} }
// Extend extends the workflow deadline.
func (s *WoodpeckerServer) Extend(c context.Context, req *proto.ExtendRequest) (*proto.Empty, error) { func (s *WoodpeckerServer) Extend(c context.Context, req *proto.ExtendRequest) (*proto.Empty, error) {
res := new(proto.Empty) res := new(proto.Empty)
err := s.peer.Extend(c, req.GetId()) err := s.peer.Extend(c, req.GetId())
@@ -170,6 +181,7 @@ func (s *WoodpeckerServer) Log(c context.Context, req *proto.LogRequest) (*proto
return res, err return res, err
} }
// RegisterAgent registers the agent with the server.
func (s *WoodpeckerServer) RegisterAgent(c context.Context, req *proto.RegisterAgentRequest) (*proto.RegisterAgentResponse, error) { func (s *WoodpeckerServer) RegisterAgent(c context.Context, req *proto.RegisterAgentRequest) (*proto.RegisterAgentResponse, error) {
res := new(proto.RegisterAgentResponse) res := new(proto.RegisterAgentResponse)
agentInfo := req.GetInfo() agentInfo := req.GetInfo()
@@ -184,11 +196,13 @@ func (s *WoodpeckerServer) RegisterAgent(c context.Context, req *proto.RegisterA
return res, err return res, err
} }
// UnregisterAgent unregisters the agent from the server.
func (s *WoodpeckerServer) UnregisterAgent(ctx context.Context, _ *proto.Empty) (*proto.Empty, error) { func (s *WoodpeckerServer) UnregisterAgent(ctx context.Context, _ *proto.Empty) (*proto.Empty, error) {
err := s.peer.UnregisterAgent(ctx) err := s.peer.UnregisterAgent(ctx)
return new(proto.Empty), err return new(proto.Empty), err
} }
// ReportHealth reports health status of the agent to the server.
func (s *WoodpeckerServer) ReportHealth(c context.Context, req *proto.ReportHealthRequest) (*proto.Empty, error) { func (s *WoodpeckerServer) ReportHealth(c context.Context, req *proto.ReportHealthRequest) (*proto.Empty, error) {
res := new(proto.Empty) res := new(proto.Empty)
err := s.peer.ReportHealth(c, req.GetStatus()) err := s.peer.ReportHealth(c, req.GetStatus())