mirror of
https://github.com/replicatedhq/troubleshoot.git
synced 2026-02-14 10:19:54 +00:00
fix(collector): solving pod stuck in terminating with CNI issue (#1196)
* feat(collector): add force delete for pods when they're done within a collector
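For orientation before the hunks below: the commit's core pattern is to delete a collector pod (or the pods of a copy-from-host daemonset), poll until the object is actually gone, and fall back to a zero-grace-period delete when the wait times out, which is what happens when a broken CNI leaves the pod stuck in Terminating. A minimal, self-contained Go sketch of that pattern follows; the package name, the forceDeletePod helper, and the hard-coded 60-second timeout are illustrative assumptions, not the project's API.

package example

import (
	"context"
	"time"

	kerrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
)

// forceDeletePod is a hypothetical helper: request a normal delete, wait for
// the pod to disappear, and if the wait times out, retry the delete with a
// zero-second grace period so the API server removes the object immediately.
func forceDeletePod(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
	if err := client.CoreV1().Pods(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil && !kerrors.IsNotFound(err) {
		return err
	}

	// Poll every second, for up to 60 seconds, until the pod can no longer be fetched.
	waitErr := wait.PollUntilContextTimeout(ctx, time.Second, 60*time.Second, true, func(ctx context.Context) (bool, error) {
		_, getErr := client.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
		return kerrors.IsNotFound(getErr), nil
	})
	if waitErr == nil {
		return nil
	}

	// The pod is stuck (for example, CNI teardown is hanging); force delete it.
	zero := int64(0)
	return client.CoreV1().Pods(namespace).Delete(context.Background(), name, metav1.DeleteOptions{
		GracePeriodSeconds: &zero,
	})
}

Note that a zero grace period only removes the API object; whatever the broken CNI or runtime left behind on the node is not cleaned up, which is why the force delete is a last resort after the poll times out.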
@@ -8,7 +8,15 @@ spec:
         name: config/replicas.txt
         data: "5"
     - runPod:
-        collectorName: "static-hi"
+        collectorName: "static-hi-1"
         podSpec:
           containers:
             - name: static-hi
               image: alpine:3
               command: ["echo", "hi static!"]
+    ## This collector is intentionally duplicated to test that the same pod is terminated after success
+    - runPod:
+        collectorName: "static-hi-2"
+        podSpec:
+          containers:
+            - name: static-hi
@@ -47,7 +55,16 @@ spec:
             message: You have at least 5 replicas
     - textAnalyze:
         checkName: Said hi!
-        fileName: /static-hi.log
+        fileName: /static-hi-1.log
         regex: 'hi static'
         outcomes:
           - fail:
              message: Didn't say hi.
           - pass:
              message: Said hi!
+    - textAnalyze:
+        checkName: Said hi!
+        fileName: /static-hi-2.log
+        regex: 'hi static'
+        outcomes:
+          - fail:

@@ -4,13 +4,16 @@ import (
 	"archive/tar"
 	"bytes"
 	"context"
+	"fmt"
 	"io"
 	"os"
 	"path/filepath"
+	"strings"
 	"time"

 	"github.com/pkg/errors"
 	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
+	"github.com/replicatedhq/troubleshoot/pkg/constants"
 	"github.com/replicatedhq/troubleshoot/pkg/k8sutil"
 	"github.com/segmentio/ksuid"
 	appsv1 "k8s.io/api/apps/v1"
@@ -19,6 +22,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 	restclient "k8s.io/client-go/rest"
@@ -210,13 +214,12 @@ func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interfac
 	}

 	createdDS, err := client.AppsV1().DaemonSets(namespace).Create(ctx, &ds, metav1.CreateOptions{})
 	if err != nil {
 		return "", cleanup, errors.Wrap(err, "create daemonset")
 	}
 	cleanupFuncs = append(cleanupFuncs, func() {
-		if err := client.AppsV1().DaemonSets(namespace).Delete(context.Background(), createdDS.Name, metav1.DeleteOptions{}); err != nil {
-			klog.Errorf("Failed to delete daemonset %s: %v", createdDS.Name, err)
-		}
+		deleteDaemonSet(client, ctx, createdDS, namespace, labels)
 	})

 	// This timeout is different from collector timeout.
@@ -227,6 +230,7 @@ func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interfac
 		select {
 		case <-time.After(1 * time.Second):
 		case <-childCtx.Done():
+			klog.V(2).Infof("Timed out waiting for daemonset %s to be ready", createdDS.Name)
 			return createdDS.Name, cleanup, errors.Wrap(ctx.Err(), "wait for daemonset")
 		}

@@ -366,3 +370,57 @@ func copyFilesFromHost(ctx context.Context, dstPath string, clientConfig *restcl
 	return result, stderr.Bytes(), nil
 }
+
+func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS *appsv1.DaemonSet, namespace string, labels map[string]string) {
+	klog.V(2).Infof("Daemonset %s has been scheduled for deletion", createdDS.Name)
+	if err := client.AppsV1().DaemonSets(namespace).Delete(context.Background(), createdDS.Name, metav1.DeleteOptions{}); err != nil {
+		klog.Errorf("Failed to delete daemonset %s: %v", createdDS.Name, err)
+		return
+	}
+
+	var labelSelector []string
+	for k, v := range labels {
+		labelSelector = append(labelSelector, fmt.Sprintf("%s=%s", k, v))
+	}
+
+	dsPods := &corev1.PodList{}
+	klog.V(2).Infof("Polling every second for deletion of pods belonging to DaemonSet %s, for a maximum of %d seconds", createdDS.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
+
+	err := wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
+		pods, listErr := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
+			LabelSelector: strings.Join(labelSelector, ","),
+		})
+		if listErr != nil {
+			klog.Errorf("Failed to list pods created by %s daemonset: %v", createdDS.Name, listErr)
+		}
+		// If there are no pods remaining, return true to stop the polling.
+		if len(pods.Items) == 0 {
+			return true, nil
+		}
+		// If there is an error from the context (e.g. context deadline exceeded), return the error.
+		if ctx.Err() != nil {
+			return false, ctx.Err()
+		}
+		// If there are still pods remaining and there was no context error, save the list of pods and keep polling.
+		dsPods = pods
+		return false, nil
+	})
+
+	// If the polling returned an error (e.g. the context deadline was exceeded before all pods were deleted),
+	// delete each remaining pod with a zero-second grace period.
+	if err != nil {
+		zeroGracePeriod := int64(0)
+		for _, pod := range dsPods.Items {
+			klog.V(2).Infof("Pod %s forcefully deleted after reaching the maximum wait time of %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
+			err := client.CoreV1().Pods(namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{
+				GracePeriodSeconds: &zeroGracePeriod,
+			})
+			if err != nil {
+				klog.Errorf("Failed to force delete pod %s: %v", pod.Name, err)
+				return
+			}
+			klog.V(2).Infof("Daemonset pod %s in %s namespace has been deleted", pod.Name, pod.Namespace)
+		}
+	}
+}

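As an aside on the selector built above with fmt.Sprintf and strings.Join: the labels package this file already imports can produce the same comma-separated "key=value" selector string. A small sketch under that assumption (the listDaemonSetPods name and the example package are hypothetical):

package example

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/client-go/kubernetes"
)

// listDaemonSetPods lists the pods carrying the daemonset's labels.
// labels.SelectorFromSet builds the selector string from the label map.
func listDaemonSetPods(ctx context.Context, client kubernetes.Interface, namespace string, dsLabels map[string]string) (*corev1.PodList, error) {
	selector := labels.SelectorFromSet(labels.Set(dsLabels)).String()
	return client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: selector})
}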
@@ -13,6 +13,7 @@ import (

 	"github.com/pkg/errors"
 	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
+	"github.com/replicatedhq/troubleshoot/pkg/constants"
 	"github.com/replicatedhq/troubleshoot/pkg/k8sutil"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/client-go/kubernetes"
@@ -22,6 +23,7 @@ import (

 	kuberneteserrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
 )

 type CollectRunPod struct {
@@ -55,9 +57,7 @@ func (c *CollectRunPod) Collect(progressChan chan<- interface{}) (CollectorResul
 		return nil, errors.Wrap(err, "failed to run pod")
 	}
 	defer func() {
-		if err := client.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{}); err != nil {
-			klog.Errorf("Failed to delete pod %s: %v", pod.Name, err)
-		}
+		deletePod(ctx, client, pod)
 	}()

 	if c.Collector.ImagePullSecret != nil && c.Collector.ImagePullSecret.Data != nil {
@@ -152,6 +152,8 @@ func runPodWithSpec(ctx context.Context, client *kubernetes.Clientset, runPodCol
 	}

 	created, err := client.CoreV1().Pods(namespace).Create(ctx, &pod, metav1.CreateOptions{})
 	if err != nil {
 		return nil, errors.Wrap(err, "failed to create pod")
 	}
+	klog.V(2).Infof("Pod %s has been created", pod.Name)
@@ -180,8 +182,23 @@ func runWithoutTimeout(ctx context.Context, bundlePath string, clientConfig *res
 				if v.State.Waiting != nil && v.State.Waiting.Reason == "ImagePullBackOff" {
 					return nil, errors.Errorf("run pod aborted after getting pod status 'ImagePullBackOff'")
 				}
+
+				if v.State.Waiting != nil && v.State.Waiting.Reason == "ContainerCreating" {
+					podEvents, err := client.CoreV1().Events(pod.Namespace).List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("involvedObject.name=%s", pod.Name), TypeMeta: metav1.TypeMeta{Kind: "Pod"}})
+					if err != nil {
+						return nil, errors.Wrap(err, "failed to get pod events")
+					}
+
+					for _, podEvent := range podEvents.Items {
+						if podEvent.Reason == "FailedCreatePodSandBox" {
+							klog.V(2).Infof("Pod %s failed to set up network for sandbox", pod.Name)
+							return nil, errors.Errorf("run pod aborted after getting pod status 'FailedCreatePodSandBox'")
+						}
+					}
+				}
 			}
 		}

 		time.Sleep(time.Second * 1)
 	}
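The hunk above is where the CNI failure from the commit title is detected: while the container sits in ContainerCreating, the kubelet records FailedCreatePodSandBox events against the pod, and the collector now aborts instead of waiting out its full timeout. A small stand-alone sketch of the same event check, using the fields helper to build the selector (the podSandboxFailed name and the example package are hypothetical):

package example

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/client-go/kubernetes"
)

// podSandboxFailed lists the events recorded for the pod and reports whether
// any of them carry the kubelet's "FailedCreatePodSandBox" reason, which is
// the usual symptom of a CNI that cannot set up the pod's network sandbox.
func podSandboxFailed(ctx context.Context, client kubernetes.Interface, namespace, podName string) (bool, error) {
	selector := fields.OneTermEqualSelector("involvedObject.name", podName).String()
	events, err := client.CoreV1().Events(namespace).List(ctx, metav1.ListOptions{FieldSelector: selector})
	if err != nil {
		return false, err
	}
	for _, ev := range events.Items {
		if ev.Reason == "FailedCreatePodSandBox" {
			return true, nil
		}
	}
	return false, nil
}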
@@ -424,3 +441,39 @@ func savePodDetails(ctx context.Context, client *kubernetes.Clientset, output Co
 	}
 	return output, nil
 }
+
+func deletePod(ctx context.Context, client *kubernetes.Clientset, pod *corev1.Pod) {
+	if err := client.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{}); err != nil {
+		klog.Errorf("Failed to delete pod %s: %v", pod.Name, err)
+		return
+	}
+	klog.V(2).Infof("Pod %s has been scheduled for deletion", pod.Name)
+
+	// Wait until the pod is deleted.
+	// Poll every second to check whether the pod is gone, for at most MAX_TIME_TO_WAIT_FOR_POD_DELETION.
+	klog.V(2).Infof("Polling every second for deletion of pod %s, for a maximum of %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
+	err := wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
+		_, getErr := client.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
+		// If the pod is not found, it has been deleted.
+		if kuberneteserrors.IsNotFound(getErr) {
+			return true, nil
+		}
+		// If there is an error from the context (e.g. context deadline exceeded), return the error.
+		if ctx.Err() != nil {
+			return false, ctx.Err()
+		}
+		// Otherwise, the pod has not yet been deleted. Keep polling.
+		return false, nil
+	})
+	if err != nil {
+		zeroGracePeriod := int64(0)
+		klog.V(2).Infof("Pod %s forcefully deleted after reaching the maximum wait time of %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
+		if err := client.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{
+			GracePeriodSeconds: &zeroGracePeriod,
+		}); err != nil {
+			klog.Errorf("Failed to force delete pod %s: %v", pod.Name, err)
+			return
+		}
+		klog.V(2).Infof("Pod %s in %s namespace has been deleted", pod.Name, pod.Namespace)
+	}
+}

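To exercise the happy path of this delete-then-wait flow without a cluster, client-go's fake clientset is enough. The sketch below reuses the hypothetical forceDeletePod helper from the first sketch; note that the real deletePod above takes a concrete *kubernetes.Clientset, so it cannot be handed the fake directly, and the fake deletes objects synchronously, so the force-delete branch never triggers here.

package example

import (
	"context"
	"testing"

	corev1 "k8s.io/api/core/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"
)

// TestForceDeletePod checks that the pod is gone after the helper returns.
func TestForceDeletePod(t *testing.T) {
	pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "static-hi", Namespace: "default"}}
	client := fake.NewSimpleClientset(pod)

	if err := forceDeletePod(context.Background(), client, "default", "static-hi"); err != nil {
		t.Fatalf("forceDeletePod: %v", err)
	}

	if _, err := client.CoreV1().Pods("default").Get(context.Background(), "static-hi", metav1.GetOptions{}); !kerrors.IsNotFound(err) {
		t.Fatalf("expected pod to be deleted, got err=%v", err)
	}
}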
@@ -13,7 +13,8 @@ const (
 	VERSION_FILENAME = "version.yaml"
 	// DEFAULT_LOGS_COLLECTOR_TIMEOUT is the default timeout for logs collector.
 	DEFAULT_LOGS_COLLECTOR_TIMEOUT = 60 * time.Second
+	// MAX_TIME_TO_WAIT_FOR_POD_DELETION is the maximum time to wait for pod deletion.
+	MAX_TIME_TO_WAIT_FOR_POD_DELETION = 60 * time.Second

 	// Tracing constants
 	LIB_TRACER_NAME             = "github.com/replicatedhq/troubleshoot"
 	TROUBLESHOOT_ROOT_SPAN_NAME = "ReplicatedTroubleshootRootSpan"