mirror of
https://github.com/replicatedhq/troubleshoot.git
synced 2026-02-14 10:19:54 +00:00
fix(collector): solving pod stuck in terminating with CNI issue (#1196)
* feat(collector): add force delete for pods when they're done within a collector
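For orientation before the hunks below: the commit's core pattern is to delete a collector pod (or the pods of a copy-from-host daemonset), poll until the object is actually gone, and fall back to a zero-grace-period delete when the wait times out, which is what happens when a broken CNI leaves the pod stuck in Terminating. A minimal, self-contained Go sketch of that pattern follows; the package name, the forceDeletePod helper, and the hard-coded 60-second timeout are illustrative assumptions, not the project's API.

package example

import (
	"context"
	"time"

	kerrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
)

// forceDeletePod is a hypothetical helper: request a normal delete, wait for
// the pod to disappear, and if the wait times out, retry the delete with a
// zero-second grace period so the API server removes the object immediately.
func forceDeletePod(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
	if err := client.CoreV1().Pods(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil && !kerrors.IsNotFound(err) {
		return err
	}

	// Poll every second, for up to 60 seconds, until the pod can no longer be fetched.
	waitErr := wait.PollUntilContextTimeout(ctx, time.Second, 60*time.Second, true, func(ctx context.Context) (bool, error) {
		_, getErr := client.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
		return kerrors.IsNotFound(getErr), nil
	})
	if waitErr == nil {
		return nil
	}

	// The pod is stuck (for example, CNI teardown is hanging); force delete it.
	zero := int64(0)
	return client.CoreV1().Pods(namespace).Delete(context.Background(), name, metav1.DeleteOptions{
		GracePeriodSeconds: &zero,
	})
}

Note that a zero grace period only removes the API object; whatever the broken CNI or runtime left behind on the node is not cleaned up, which is why the force delete is a last resort after the poll times out.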
@@ -8,7 +8,15 @@ spec:
         name: config/replicas.txt
         data: "5"
     - runPod:
-        collectorName: "static-hi"
+        collectorName: "static-hi-1"
         podSpec:
           containers:
             - name: static-hi
               image: alpine:3
               command: ["echo", "hi static!"]
+    ## This collector is intentionally duplicated to test that the same pod is terminated after success
+    - runPod:
+        collectorName: "static-hi-2"
+        podSpec:
+          containers:
+            - name: static-hi
@@ -47,7 +55,16 @@ spec:
             message: You have at least 5 replicas
     - textAnalyze:
         checkName: Said hi!
-        fileName: /static-hi.log
+        fileName: /static-hi-1.log
         regex: 'hi static'
         outcomes:
           - fail:
              message: Didn't say hi.
           - pass:
              message: Said hi!
+    - textAnalyze:
+        checkName: Said hi!
+        fileName: /static-hi-2.log
+        regex: 'hi static'
+        outcomes:
+          - fail:

@@ -4,13 +4,16 @@ import (
 	"archive/tar"
 	"bytes"
 	"context"
+	"fmt"
 	"io"
 	"os"
 	"path/filepath"
+	"strings"
 	"time"

 	"github.com/pkg/errors"
 	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
+	"github.com/replicatedhq/troubleshoot/pkg/constants"
 	"github.com/replicatedhq/troubleshoot/pkg/k8sutil"
 	"github.com/segmentio/ksuid"
 	appsv1 "k8s.io/api/apps/v1"
@@ -19,6 +22,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 	restclient "k8s.io/client-go/rest"
@@ -210,13 +214,12 @@ func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interfac
 	}

 	createdDS, err := client.AppsV1().DaemonSets(namespace).Create(ctx, &ds, metav1.CreateOptions{})
 	if err != nil {
 		return "", cleanup, errors.Wrap(err, "create daemonset")
 	}
 	cleanupFuncs = append(cleanupFuncs, func() {
-		if err := client.AppsV1().DaemonSets(namespace).Delete(context.Background(), createdDS.Name, metav1.DeleteOptions{}); err != nil {
-			klog.Errorf("Failed to delete daemonset %s: %v", createdDS.Name, err)
-		}
+		deleteDaemonSet(client, ctx, createdDS, namespace, labels)
 	})

 	// This timeout is different from collector timeout.
@@ -227,6 +230,7 @@ func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interfac
 		select {
 		case <-time.After(1 * time.Second):
 		case <-childCtx.Done():
+			klog.V(2).Infof("Timed out waiting for daemonset %s to be ready", createdDS.Name)
 			return createdDS.Name, cleanup, errors.Wrap(ctx.Err(), "wait for daemonset")
 		}

@@ -366,3 +370,57 @@ func copyFilesFromHost(ctx context.Context, dstPath string, clientConfig *restcl
 	return result, stderr.Bytes(), nil
 }
+
+func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS *appsv1.DaemonSet, namespace string, labels map[string]string) {
+	klog.V(2).Infof("Daemonset %s has been scheduled for deletion", createdDS.Name)
+	if err := client.AppsV1().DaemonSets(namespace).Delete(context.Background(), createdDS.Name, metav1.DeleteOptions{}); err != nil {
+		klog.Errorf("Failed to delete daemonset %s: %v", createdDS.Name, err)
+		return
+	}
+
+	var labelSelector []string
+	for k, v := range labels {
+		labelSelector = append(labelSelector, fmt.Sprintf("%s=%s", k, v))
+	}
+
+	dsPods := &corev1.PodList{}
+	klog.V(2).Infof("Polling every second for deletion of pods belonging to DaemonSet %s, for a maximum of %d seconds", createdDS.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
+
+	err := wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
+		pods, listErr := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
+			LabelSelector: strings.Join(labelSelector, ","),
+		})
+		if listErr != nil {
+			klog.Errorf("Failed to list pods created by %s daemonset: %v", createdDS.Name, listErr)
+		}
+		// If there are no pods remaining, return true to stop the polling.
+		if len(pods.Items) == 0 {
+			return true, nil
+		}
+		// If there is an error from the context (e.g. context deadline exceeded), return the error.
+		if ctx.Err() != nil {
+			return false, ctx.Err()
+		}
+		// If there are still pods remaining and there was no context error, save the list of pods and keep polling.
+		dsPods = pods
+		return false, nil
+	})
+
+	// If the polling returned an error (e.g. the context deadline was exceeded before all pods were deleted),
+	// delete each remaining pod with a zero-second grace period.
+	if err != nil {
+		zeroGracePeriod := int64(0)
+		for _, pod := range dsPods.Items {
+			klog.V(2).Infof("Pod %s forcefully deleted after reaching the maximum wait time of %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
+			err := client.CoreV1().Pods(namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{
+				GracePeriodSeconds: &zeroGracePeriod,
+			})
+			if err != nil {
+				klog.Errorf("Failed to force delete pod %s: %v", pod.Name, err)
+				return
+			}
+			klog.V(2).Infof("Daemonset pod %s in %s namespace has been deleted", pod.Name, pod.Namespace)
+		}
+	}
+}

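As an aside on the selector built above with fmt.Sprintf and strings.Join: the labels package this file already imports can produce the same comma-separated "key=value" selector string. A small sketch under that assumption (the listDaemonSetPods name and the example package are hypothetical):

package example

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/client-go/kubernetes"
)

// listDaemonSetPods lists the pods carrying the daemonset's labels.
// labels.SelectorFromSet builds the selector string from the label map.
func listDaemonSetPods(ctx context.Context, client kubernetes.Interface, namespace string, dsLabels map[string]string) (*corev1.PodList, error) {
	selector := labels.SelectorFromSet(labels.Set(dsLabels)).String()
	return client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: selector})
}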
@@ -13,6 +13,7 @@ import (

 	"github.com/pkg/errors"
 	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
+	"github.com/replicatedhq/troubleshoot/pkg/constants"
 	"github.com/replicatedhq/troubleshoot/pkg/k8sutil"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/client-go/kubernetes"
@@ -22,6 +23,7 @@ import (

 	kuberneteserrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
 )

 type CollectRunPod struct {
@@ -55,9 +57,7 @@ func (c *CollectRunPod) Collect(progressChan chan<- interface{}) (CollectorResul
 		return nil, errors.Wrap(err, "failed to run pod")
 	}
 	defer func() {
-		if err := client.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{}); err != nil {
-			klog.Errorf("Failed to delete pod %s: %v", pod.Name, err)
-		}
+		deletePod(ctx, client, pod)
 	}()

 	if c.Collector.ImagePullSecret != nil && c.Collector.ImagePullSecret.Data != nil {
@@ -152,6 +152,8 @@ func runPodWithSpec(ctx context.Context, client *kubernetes.Clientset, runPodCol
 	}

 	created, err := client.CoreV1().Pods(namespace).Create(ctx, &pod, metav1.CreateOptions{})
 	if err != nil {
 		return nil, errors.Wrap(err, "failed to create pod")
 	}
+	klog.V(2).Infof("Pod %s has been created", pod.Name)
@@ -180,8 +182,23 @@ func runWithoutTimeout(ctx context.Context, bundlePath string, clientConfig *res
 				if v.State.Waiting != nil && v.State.Waiting.Reason == "ImagePullBackOff" {
 					return nil, errors.Errorf("run pod aborted after getting pod status 'ImagePullBackOff'")
 				}
+
+				if v.State.Waiting != nil && v.State.Waiting.Reason == "ContainerCreating" {
+					podEvents, err := client.CoreV1().Events(pod.Namespace).List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s", pod.Name), TypeMeta: metav1.TypeMeta{Kind: "Pod"}})
+					if err != nil {
+						return nil, errors.Wrap(err, "failed to get pod events")
+					}
+
+					for _, podEvent := range podEvents.Items {
+						if podEvent.Reason == "FailedCreatePodSandBox" {
+							klog.V(2).Infof("Pod %s failed to set up network for sandbox", pod.Name)
+							return nil, errors.Errorf("run pod aborted after getting pod status 'FailedCreatePodSandBox'")
+						}
+					}
+				}
 			}
 		}

 		time.Sleep(time.Second * 1)
 	}
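The hunk above is where the CNI failure from the commit title is detected: while the container sits in ContainerCreating, the kubelet records FailedCreatePodSandBox events against the pod, and the collector now aborts instead of waiting out its full timeout. A small stand-alone sketch of the same event check, using the fields helper to build the selector (the podSandboxFailed name and the example package are hypothetical):

package example

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/client-go/kubernetes"
)

// podSandboxFailed lists the events recorded for the pod and reports whether
// any of them carry the kubelet's "FailedCreatePodSandBox" reason, which is
// the usual symptom of a CNI that cannot set up the pod's network sandbox.
func podSandboxFailed(ctx context.Context, client kubernetes.Interface, namespace, podName string) (bool, error) {
	selector := fields.OneTermEqualSelector("involvedObject.name", podName).String()
	events, err := client.CoreV1().Events(namespace).List(ctx, metav1.ListOptions{FieldSelector: selector})
	if err != nil {
		return false, err
	}
	for _, ev := range events.Items {
		if ev.Reason == "FailedCreatePodSandBox" {
			return true, nil
		}
	}
	return false, nil
}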
@@ -424,3 +441,39 @@ func savePodDetails(ctx context.Context, client *kubernetes.Clientset, output Co
 	}
 	return output, nil
 }
+
+func deletePod(ctx context.Context, client *kubernetes.Clientset, pod *corev1.Pod) {
+	if err := client.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{}); err != nil {
+		klog.Errorf("Failed to delete pod %s: %v", pod.Name, err)
+		return
+	}
+	klog.V(2).Infof("Pod %s has been scheduled for deletion", pod.Name)
+
+	// Wait until the pod is deleted.
+	// Poll every second to check whether the pod is gone, for at most MAX_TIME_TO_WAIT_FOR_POD_DELETION.
+	klog.V(2).Infof("Polling every second for deletion of pod %s, for a maximum of %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
+	err := wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
+		_, getErr := client.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
+		// If the pod is not found, it has been deleted.
+		if kuberneteserrors.IsNotFound(getErr) {
+			return true, nil
+		}
+		// If there is an error from the context (e.g. context deadline exceeded), return the error.
+		if ctx.Err() != nil {
+			return false, ctx.Err()
+		}
+		// Otherwise, the pod has not yet been deleted. Keep polling.
+		return false, nil
+	})
+	if err != nil {
+		zeroGracePeriod := int64(0)
+		klog.V(2).Infof("Pod %s forcefully deleted after reaching the maximum wait time of %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
+		if err := client.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{
+			GracePeriodSeconds: &zeroGracePeriod,
+		}); err != nil {
+			klog.Errorf("Failed to force delete pod %s: %v", pod.Name, err)
+			return
+		}
+		klog.V(2).Infof("Pod %s in %s namespace has been deleted", pod.Name, pod.Namespace)
+	}
+}

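To exercise the happy path of this delete-then-wait flow without a cluster, client-go's fake clientset is enough. The sketch below reuses the hypothetical forceDeletePod helper from the first sketch; note that the real deletePod above takes a concrete *kubernetes.Clientset, so it cannot be handed the fake directly, and the fake deletes objects synchronously, so the force-delete branch never triggers here.

package example

import (
	"context"
	"testing"

	corev1 "k8s.io/api/core/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"
)

// TestForceDeletePod checks that the pod is gone after the helper returns.
func TestForceDeletePod(t *testing.T) {
	pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "static-hi", Namespace: "default"}}
	client := fake.NewSimpleClientset(pod)

	if err := forceDeletePod(context.Background(), client, "default", "static-hi"); err != nil {
		t.Fatalf("forceDeletePod: %v", err)
	}

	if _, err := client.CoreV1().Pods("default").Get(context.Background(), "static-hi", metav1.GetOptions{}); !kerrors.IsNotFound(err) {
		t.Fatalf("expected pod to be deleted, got err=%v", err)
	}
}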
@@ -13,7 +13,8 @@ const (
 	VERSION_FILENAME = "version.yaml"
 	// DEFAULT_LOGS_COLLECTOR_TIMEOUT is the default timeout for logs collector.
 	DEFAULT_LOGS_COLLECTOR_TIMEOUT = 60 * time.Second
+	// MAX_TIME_TO_WAIT_FOR_POD_DELETION is the maximum time to wait for pod deletion.
+	MAX_TIME_TO_WAIT_FOR_POD_DELETION = 60 * time.Second

 	// Tracing constants
 	LIB_TRACER_NAME             = "github.com/replicatedhq/troubleshoot"
 	TROUBLESHOOT_ROOT_SPAN_NAME = "ReplicatedTroubleshootRootSpan"