From c749e52bb712bf8029bc8d9193297e32740305c6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 2 Jun 2026 16:53:27 +0000 Subject: [PATCH] fix(cleanup): kill Windows step process tree on cancel to avoid hang (#1011) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Cancelling a job on a Windows host runner can leave the spawned process tree running and hang the runner. When a step launches a shell that starts a child which in turn spawns further GUI/background processes, cancelling the job kills only the direct child (the default `exec.CommandContext` behaviour). The surviving descendants inherited the step's stdout/stderr pipe, so the read end never hit EOF and `cmd.Wait()` blocked forever. Because the step executor never returned: - the orphaned processes kept running (the cancelled work was not actually stopped), and - end-of-job cleanup (`Remove` → `terminateRunningProcesses`) was never reached, so the runner appeared to go offline / stop picking up jobs. `CREATE_NEW_PROCESS_GROUP` does not help here — it affects Ctrl-C signal delivery, not handle inheritance or tree termination. ## Fix - Assign each Windows step process to a **Job Object** immediately after `cmd.Start()`. Descendants created afterwards are automatically part of the job. - Override `cmd.Cancel` to `TerminateJobObject`, so cancellation kills the **entire descendant tree** atomically. This also closes the inherited pipe handles, so `cmd.Wait()` can return. - Set `cmd.WaitDelay` (10s) as a safety net: once the process has exited, Wait force-closes the pipes and returns rather than blocking forever — covering the case where the job-object setup fails (e.g. nested-job restrictions), in which we fall back to the previous single-process kill. - The Job Object is created **without** `JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE`, so closing the handle on normal completion does not kill legitimate background processes; the tree is only torn down on explicit cancel. Implemented behind `runtime.GOOS == "windows"` with a Windows-only `processKiller` (Job Object) and no-op stubs elsewhere, so non-Windows behaviour (default cancellation + `Setpgid`) is unchanged. ## Changes - `act/container/process_windows.go` — Job Object `processKiller` (create / assign / terminate). - `act/container/process_other.go` — no-op stubs (`//go:build !windows`). - `act/container/host_environment.go` — wire `cmd.Cancel` (tree kill) and `cmd.WaitDelay` into `exec()`. - `go.mod` / `go.sum` — promote `golang.org/x/sys` to a direct dependency. ## Testing I fully tested it already ## Notes Follow-up to the Windows leftover-process reaping in #996: that sweep now actually runs on cancellation because the step no longer hangs before reaching it. Reviewed-on: https://gitea.com/gitea/runner/pulls/1011 Reviewed-by: techknowlogick <9+techknowlogick@noreply.gitea.com> --- act/container/host_environment.go | 36 +++++++++++++ act/container/process_other.go | 19 +++++++ act/container/process_windows.go | 71 ++++++++++++++++++++++++ act/container/process_windows_test.go | 78 +++++++++++++++++++++++++++ go.mod | 2 +- go.sum | 4 -- 6 files changed, 205 insertions(+), 5 deletions(-) create mode 100644 act/container/process_other.go create mode 100644 act/container/process_windows.go create mode 100644 act/container/process_windows_test.go diff --git a/act/container/host_environment.go b/act/container/host_environment.go index 03f453a1..023e3fdc 100644 --- a/act/container/host_environment.go +++ b/act/container/host_environment.go @@ -322,6 +322,30 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st cmd.Stderr = e.StdOut cmd.Dir = wd cmd.SysProcAttr = getSysProcAttr(cmdline, false) + + // On Windows a step often launches a process tree (a shell that starts a + // child which spawns further GUI or background processes). The default + // context cancellation only kills the direct child, leaving the rest of the + // tree running; and because the orphans inherit cmd's stdout/stderr pipe, + // cmd.Wait() would block forever, hanging the runner. Kill the whole tree + // via a Job Object on cancellation, and bound the wait so a leftover pipe + // writer can never hang Wait indefinitely. + var killer atomic.Pointer[processKiller] + if runtime.GOOS == "windows" { + cmd.Cancel = func() error { + if k := killer.Load(); k != nil { + return k.Kill() + } + if cmd.Process != nil { + return cmd.Process.Kill() + } + return nil + } + // Once the step process has exited, give its I/O pipes at most this long + // to drain before Wait force-closes them and returns (Go's WaitDelay). + cmd.WaitDelay = 10 * time.Second + } + var ppty *os.File var tty *os.File defer func() { @@ -351,6 +375,18 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st if err := cmd.Start(); err != nil { return err } + if runtime.GOOS == "windows" { + // Assign the started process to a Job Object so cmd.Cancel can kill the + // whole descendant tree. Children spawned afterwards are auto-included. + // On failure (e.g. nested-job restrictions) we fall back to the default + // single-process kill; WaitDelay + end-of-job cleanup still apply. + if k, kerr := newProcessKiller(cmd.Process); kerr != nil { + common.Logger(ctx).Warnf("process tree kill setup failed, falling back to single-process kill: %v", kerr) + } else { + killer.Store(k) + defer k.Close() + } + } err = cmd.Wait() if err != nil { var exitErr *exec.ExitError diff --git a/act/container/process_other.go b/act/container/process_other.go new file mode 100644 index 00000000..a08e60b0 --- /dev/null +++ b/act/container/process_other.go @@ -0,0 +1,19 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +//go:build !windows + +package container + +import "os" + +// processKiller is a no-op on non-Windows platforms. The Job Object based +// tree-kill is only wired in on Windows (see exec()); elsewhere the default +// exec.CommandContext cancellation and Setpgid handling apply. +type processKiller struct{} + +func newProcessKiller(_ *os.Process) (*processKiller, error) { return &processKiller{}, nil } + +func (k *processKiller) Kill() error { return nil } + +func (k *processKiller) Close() error { return nil } diff --git a/act/container/process_windows.go b/act/container/process_windows.go new file mode 100644 index 00000000..7ad61dca --- /dev/null +++ b/act/container/process_windows.go @@ -0,0 +1,71 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package container + +import ( + "os" + + "golang.org/x/sys/windows" +) + +// processKiller terminates a step process together with its entire descendant +// tree via a Windows Job Object. +// +// Background: a step often launches a process tree (a shell that starts a +// child which in turn spawns further GUI or background processes). The default +// exec.CommandContext cancellation only kills the direct child, so cancelling a +// job left the rest of the tree running. Because those orphans inherited the +// step's stdout/stderr pipe, cmd.Wait() also blocked forever and the runner hung. +// +// Assigning the step process to a Job Object lets us kill the whole tree +// atomically on cancellation (TerminateJobObject), which also closes the +// inherited pipe handles so cmd.Wait() can return. +type processKiller struct { + job windows.Handle +} + +// newProcessKiller creates a Job Object and assigns p (an already-started +// process) to it. Children spawned by p afterwards are automatically part of +// the job. The job does NOT use JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE, so closing +// the handle on normal completion does not kill legitimate background +// processes; the tree is only torn down by an explicit Kill (cancellation). +func newProcessKiller(p *os.Process) (*processKiller, error) { + job, err := windows.CreateJobObject(nil, nil) + if err != nil { + return nil, err + } + + h, err := windows.OpenProcess(windows.PROCESS_SET_QUOTA|windows.PROCESS_TERMINATE, false, uint32(p.Pid)) + if err != nil { + windows.CloseHandle(job) + return nil, err + } + defer windows.CloseHandle(h) + + if err := windows.AssignProcessToJobObject(job, h); err != nil { + windows.CloseHandle(job) + return nil, err + } + + return &processKiller{job: job}, nil +} + +// Kill terminates every process currently assigned to the job (the step process +// and all of its descendants). +func (k *processKiller) Kill() error { + if k == nil || k.job == 0 { + return nil + } + return windows.TerminateJobObject(k.job, 1) +} + +// Close releases the job handle. It does not terminate the processes. +func (k *processKiller) Close() error { + if k == nil || k.job == 0 { + return nil + } + h := k.job + k.job = 0 + return windows.CloseHandle(h) +} diff --git a/act/container/process_windows_test.go b/act/container/process_windows_test.go new file mode 100644 index 00000000..c06cfbf8 --- /dev/null +++ b/act/container/process_windows_test.go @@ -0,0 +1,78 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package container + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + "golang.org/x/sys/windows" +) + +// processAlive reports whether pid refers to a still-running process. +func processAlive(pid int) bool { + h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid)) + if err != nil { + return false + } + defer windows.CloseHandle(h) + var code uint32 + if err := windows.GetExitCodeProcess(h, &code); err != nil { + return false + } + const stillActive = 259 // STILL_ACTIVE + return code == stillActive +} + +// TestProcessKillerKillsTree verifies that a process assigned to the Job Object +// is terminated together with a child it spawns afterwards. This mirrors a step +// that launches a child which spawns further processes, where cancelling the +// job must take down the whole tree, not just the direct child. +func TestProcessKillerKillsTree(t *testing.T) { + dir := t.TempDir() + pidFile := filepath.Join(dir, "child.pid") + + // Parent powershell spawns a detached, long-lived child powershell (writing + // its PID to a file) and then sleeps. The child is launched AFTER the parent + // has been assigned to the job, so it must be captured by the job too. + script := fmt.Sprintf( + `$c = Start-Process powershell -PassThru -ArgumentList '-NoProfile','-Command','Start-Sleep -Seconds 600'; `+ + `Set-Content -LiteralPath %q -Value $c.Id; Start-Sleep -Seconds 600`, pidFile) + cmd := exec.Command("powershell.exe", "-NoProfile", "-Command", script) + require.NoError(t, cmd.Start()) + t.Cleanup(func() { _ = cmd.Process.Kill() }) + + killer, err := newProcessKiller(cmd.Process) + require.NoError(t, err) + defer killer.Close() + + // Wait for the child PID to be reported. + var childPID int + require.Eventually(t, func() bool { + b, e := os.ReadFile(pidFile) + if e != nil { + return false + } + s := strings.TrimSpace(string(b)) + if s == "" { + return false + } + childPID, _ = strconv.Atoi(s) + return childPID > 0 && processAlive(childPID) + }, 20*time.Second, 200*time.Millisecond, "child process should start") + + // Killing the job must terminate both the parent and the detached child. + require.NoError(t, killer.Kill()) + + require.Eventually(t, func() bool { + return !processAlive(cmd.Process.Pid) && !processAlive(childPID) + }, 20*time.Second, 200*time.Millisecond, "parent and child should both be terminated") +} diff --git a/go.mod b/go.mod index 3cb863a1..c0a969e2 100644 --- a/go.mod +++ b/go.mod @@ -37,6 +37,7 @@ require ( github.com/timshannon/bolthold v0.0.0-20240314194003-30aac6950928 go.etcd.io/bbolt v1.4.3 go.yaml.in/yaml/v4 v4.0.0-rc.3 + golang.org/x/sys v0.44.0 golang.org/x/term v0.43.0 google.golang.org/protobuf v1.36.11 gotest.tools/v3 v3.5.2 @@ -106,7 +107,6 @@ require ( golang.org/x/crypto v0.50.0 // indirect golang.org/x/net v0.53.0 // indirect golang.org/x/sync v0.20.0 // indirect - golang.org/x/sys v0.44.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 6c013146..1689762f 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,5 @@ code.gitea.io/actions-proto-go v0.4.1 h1:l0EYhjsgpUe/1VABo2eK7zcoNX2W44WOnb0MSLrKfls= code.gitea.io/actions-proto-go v0.4.1/go.mod h1:mn7Wkqz6JbnTOHQpot3yDeHx+O5C9EGhMEE+htvHBas= -connectrpc.com/connect v1.19.2 h1:McQ83FGdzL+t60peksi0gXC7MQ/iLKgLduAnThbM0mo= -connectrpc.com/connect v1.19.2/go.mod h1:tN20fjdGlewnSFeZxLKb0xwIZ6ozc3OQs2hTXy4du9w= connectrpc.com/connect v1.20.0 h1:6TNDAB+WeNd2uolWNlYczB5E0KNNaVMNUEx8JEUsPmQ= connectrpc.com/connect v1.20.0/go.mod h1:A2ygJrukXwWy32vkCAAHNVguZrqZ+jeZ9rGRnGR4dN4= cyphar.com/go-pathrs v0.2.3 h1:0pH8gep37wB0BgaXrEaN1OtZhUMeS7VvaejSr6i822o= @@ -149,8 +147,6 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M= -github.com/opencontainers/selinux v1.14.1 h1:a7XlXV/nN/l5zFP1FWZYoExpClu1QOPMfWUV2CZ8kEQ= -github.com/opencontainers/selinux v1.14.1/go.mod h1:LenyElirjUHszfxrjuFqC85HIeXZKumHcKMQtnaDlQQ= github.com/opencontainers/selinux v1.15.0 h1:4Gs40e/R2FvM8PC1HPaPncLLaDor8Y2WDfk5gjU9o5M= github.com/opencontainers/selinux v1.15.0/go.mod h1:LenyElirjUHszfxrjuFqC85HIeXZKumHcKMQtnaDlQQ= github.com/pjbgf/sha1cd v0.6.0 h1:3WJ8Wz8gvDz29quX1OcEmkAlUg9diU4GxJHqs0/XiwU=