Mirror of https://github.com/replicatedhq/troubleshoot.git
Fix(support-bundle): improve copy from host collector performance by checking daemonset pod FailedMount event and retry (#1281)
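In short: a copyFromHost collector pointed at a hostPath that does not exist on the nodes leaves the DaemonSet pods stuck in Pending, and before this change the collector would typically sit through the full DaemonSet wait timeout before giving up. The change polls the DaemonSet pods once per second and inspects their Kubernetes events: when a pod has a FailedMount event, the collector either fails fast with "path does not exist" (RetryFailedMount: false, as the collectd collector now does) or keeps waiting so the kubelet can retry the mount (RetryFailedMount: true, as the generic copyFromHost collector does). Below is a minimal, runnable sketch of the core event check against a fake clientset; the helper name hasFailedMount and the surrounding setup are illustrative only and are not part of this commit.

// Sketch only: the FailedMount check at the heart of this change, exercised
// against a fake clientset. hasFailedMount is an illustrative helper name.
package main

import (
    "context"
    "fmt"

    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    testclient "k8s.io/client-go/kubernetes/fake"
)

// hasFailedMount reports whether any non-running pod in the namespace has a
// FailedMount event recorded against it.
func hasFailedMount(ctx context.Context, client kubernetes.Interface, namespace string) (bool, error) {
    pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{})
    if err != nil {
        return false, err
    }
    for _, pod := range pods.Items {
        if pod.Status.Phase == corev1.PodRunning {
            continue
        }
        // Events are requested via a field selector on the pod's UID, as the collector does.
        events, err := client.CoreV1().Events(namespace).List(ctx, metav1.ListOptions{
            FieldSelector: fmt.Sprintf("involvedObject.uid=%s", pod.UID),
        })
        if err != nil {
            return false, err
        }
        for _, ev := range events.Items {
            if ev.Reason == "FailedMount" {
                return true, nil
            }
        }
    }
    return false, nil
}

func main() {
    ctx := context.Background()
    client := testclient.NewSimpleClientset(
        &corev1.Pod{
            ObjectMeta: metav1.ObjectMeta{Name: "copy-pod", Namespace: "default", UID: "pod-uid"},
            Status:     corev1.PodStatus{Phase: corev1.PodPending},
        },
        &corev1.Event{
            ObjectMeta:     metav1.ObjectMeta{Name: "copy-pod.failedmount", Namespace: "default"},
            InvolvedObject: corev1.ObjectReference{UID: "pod-uid"},
            Reason:         "FailedMount",
        },
    )
    failed, err := hasFailedMount(ctx, client, "default")
    // Prints "true <nil>": the collector can stop waiting instead of running out the timeout.
    fmt.Println(failed, err)
}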
@@ -39,7 +39,16 @@ func (c *CollectCollectd) Collect(progressChan chan<- interface{}) (CollectorRes
 	}
 
 	rbacErrors := c.GetRBACErrors()
-	copyFromHostCollector := &CollectCopyFromHost{copyFromHost, c.BundlePath, c.Namespace, c.ClientConfig, c.Client, c.Context, rbacErrors}
+	copyFromHostCollector := &CollectCopyFromHost{
+		Collector:        copyFromHost,
+		BundlePath:       c.BundlePath,
+		Namespace:        c.Namespace,
+		ClientConfig:     c.ClientConfig,
+		Client:           c.Client,
+		Context:          c.Context,
+		RetryFailedMount: false,
+		RBACErrors:       rbacErrors,
+	}
 
 	return copyFromHostCollector.Collect(progressChan)
 }
@@ -82,7 +82,16 @@ func GetCollector(collector *troubleshootv1beta2.Collect, bundlePath string, nam
 	case collector.Copy != nil:
 		return &CollectCopy{collector.Copy, bundlePath, namespace, clientConfig, client, ctx, RBACErrors}, true
 	case collector.CopyFromHost != nil:
-		return &CollectCopyFromHost{collector.CopyFromHost, bundlePath, namespace, clientConfig, client, ctx, RBACErrors}, true
+		return &CollectCopyFromHost{
+			Collector:        collector.CopyFromHost,
+			BundlePath:       bundlePath,
+			Namespace:        namespace,
+			ClientConfig:     clientConfig,
+			Client:           client,
+			Context:          ctx,
+			RetryFailedMount: true,
+			RBACErrors:       RBACErrors,
+		}, true
 	case collector.HTTP != nil:
 		return &CollectHTTP{collector.HTTP, bundlePath, namespace, clientConfig, client, RBACErrors}, true
 	case collector.Postgres != nil:
@@ -31,12 +31,13 @@ import (
 )
 
 type CollectCopyFromHost struct {
-	Collector    *troubleshootv1beta2.CopyFromHost
-	BundlePath   string
-	Namespace    string
-	ClientConfig *rest.Config
-	Client       kubernetes.Interface
-	Context      context.Context
+	Collector        *troubleshootv1beta2.CopyFromHost
+	BundlePath       string
+	Namespace        string
+	ClientConfig     *rest.Config
+	Client           kubernetes.Interface
+	Context          context.Context
+	RetryFailedMount bool
 	RBACErrors
 }
 
@@ -73,7 +74,7 @@ func (c *CollectCopyFromHost) Collect(progressChan chan<- interface{}) (Collecto
 		namespace, _, _ = kubeconfig.Namespace()
 	}
 
-	_, cleanup, err := copyFromHostCreateDaemonSet(c.Context, c.Client, c.Collector, hostDir, namespace, "troubleshoot-copyfromhost-", labels)
+	_, cleanup, err := c.copyFromHostCreateDaemonSet(c.Context, c.Client, c.Collector, hostDir, namespace, "troubleshoot-copyfromhost-", labels)
 	defer cleanup()
 	if err != nil {
 		return nil, errors.Wrap(err, "create daemonset")
@@ -125,7 +126,7 @@ func (c *CollectCopyFromHost) Collect(progressChan chan<- interface{}) (Collecto
 	}
 }
 
-func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interface, collector *troubleshootv1beta2.CopyFromHost, hostPath string, namespace string, generateName string, labels map[string]string) (name string, cleanup func(), err error) {
+func (c *CollectCopyFromHost) copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interface, collector *troubleshootv1beta2.CopyFromHost, hostPath string, namespace string, generateName string, labels map[string]string) (name string, cleanup func(), err error) {
 	pullPolicy := corev1.PullIfNotPresent
 	volumeType := corev1.HostPathDirectory
 	if collector.ImagePullPolicy != "" {
@@ -229,6 +230,11 @@ func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interfac
 	for {
 		select {
 		case <-time.After(1 * time.Second):
+			err = checkDaemonPodStatus(client, ctx, labels, namespace, c.RetryFailedMount)
+			if err != nil {
+				return createdDS.Name, cleanup, err
+			}
+
 		case <-childCtx.Done():
 			klog.V(2).Infof("Timed out waiting for daemonset %s to be ready", createdDS.Name)
 			return createdDS.Name, cleanup, errors.Wrap(ctx.Err(), "wait for daemonset")
@@ -373,7 +379,14 @@ func copyFilesFromHost(ctx context.Context, dstPath string, clientConfig *restcl
 
 func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS *appsv1.DaemonSet, namespace string, labels map[string]string) {
 	klog.V(2).Infof("Daemonset %s has been scheduled for deletion", createdDS.Name)
-	if err := client.AppsV1().DaemonSets(namespace).Delete(context.Background(), createdDS.Name, metav1.DeleteOptions{}); err != nil {
+	zeroGracePeriod := int64(0)
+	// Foreground is used to delete the DaemonSet pods before deleting the DaemonSet
+	deletePropagationForeground := metav1.DeletePropagationForeground
+
+	if err := client.AppsV1().DaemonSets(namespace).Delete(ctx, createdDS.Name, metav1.DeleteOptions{
+		GracePeriodSeconds: &zeroGracePeriod,
+		PropagationPolicy:  &deletePropagationForeground,
+	}); err != nil {
 		klog.Errorf("Failed to delete daemonset %s: %v", createdDS.Name, err)
 		return
 	}
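The cleanup above now deletes the DaemonSet with a zero-second grace period and foreground propagation, which asks the API server to remove the DaemonSet's pods before the DaemonSet object itself. A minimal, runnable sketch of those delete options follows; the fake clientset and object names are only there to make the example self-contained and are not part of the commit.

// Sketch only: fast, ordered cleanup. Foreground propagation removes the
// DaemonSet's pods before the DaemonSet; the zero grace period skips the
// default graceful-shutdown wait.
package main

import (
    "context"
    "fmt"

    appsv1 "k8s.io/api/apps/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    testclient "k8s.io/client-go/kubernetes/fake"
)

func main() {
    ctx := context.Background()
    client := testclient.NewSimpleClientset(&appsv1.DaemonSet{
        ObjectMeta: metav1.ObjectMeta{Name: "troubleshoot-copyfromhost-abc", Namespace: "default"},
    })

    zeroGracePeriod := int64(0)
    foreground := metav1.DeletePropagationForeground
    err := client.AppsV1().DaemonSets("default").Delete(ctx, "troubleshoot-copyfromhost-abc", metav1.DeleteOptions{
        GracePeriodSeconds: &zeroGracePeriod,
        PropagationPolicy:  &foreground,
    })
    fmt.Println(err) // <nil>
}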
@@ -383,10 +396,24 @@ func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS
 		labelSelector = append(labelSelector, fmt.Sprintf("%s=%s", k, v))
 	}
 
-	dsPods := &corev1.PodList{}
+	dsPods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: strings.Join(labelSelector, ",")})
+	if err != nil {
+		klog.Errorf("Failed to list pods for DaemonSet %s: %v", createdDS.Name, err)
+		return
+	}
+
+	for _, pod := range dsPods.Items {
+		klog.V(2).Infof("Deleting pod %s", pod.Name)
+		if err := client.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{
+			GracePeriodSeconds: &zeroGracePeriod,
+		}); err != nil {
+			klog.Errorf("Failed to delete pod %s: %v", pod.Name, err)
+		}
+	}
+
 	klog.V(2).Infof("Continuously poll each second for Pod deletion of DaemontSet %s for maximum %d seconds", createdDS.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
 
-	err := wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
+	err = wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
 		pods, listErr := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
 			LabelSelector: strings.Join(labelSelector, ","),
 		})
@@ -410,7 +437,6 @@ func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS
 	// If there was an error from the polling (e.g., the context deadline was exceeded before all pods were deleted),
 	// delete each remaining pod with a zero-second grace period
 	if err != nil {
-		zeroGracePeriod := int64(0)
 		for _, pod := range dsPods.Items {
 			klog.V(2).Infof("Pod %s forcefully deleted after reaching the maximum wait time of %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
 			err := client.CoreV1().Pods(namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{
@@ -424,3 +450,34 @@ func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS
 		}
 	}
 }
+
+func checkDaemonPodStatus(client kubernetes.Interface, ctx context.Context, labels map[string]string, namespace string, retryFailedMount bool) error {
+	var labelSelector []string
+	for k, v := range labels {
+		labelSelector = append(labelSelector, fmt.Sprintf("%s=%s", k, v))
+	}
+	pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
+		LabelSelector: strings.Join(labelSelector, ","),
+	})
+	if err != nil {
+		return errors.Wrap(err, "get daemonset pods")
+	}
+
+	for _, pod := range pods.Items {
+		if pod.Status.Phase != corev1.PodRunning {
+			events, _ := client.CoreV1().Events(namespace).List(ctx, metav1.ListOptions{
+				FieldSelector: fmt.Sprintf("involvedObject.uid=%s", pod.UID),
+			})
+
+			for _, event := range events.Items {
+				// If the pod has a FailedMount event, it means that the pod failed to mount the volume and the pod will be stuck in the Pending state.
+				// In this case, we return an error to the caller to indicate that path does not exist.
+				if event.Reason == "FailedMount" && !retryFailedMount {
+					klog.V(2).Infof("pod %s has a FailedMount event: %s", pod.Name, event.Message)
+					return errors.Errorf("path does not exist")
+				}
+			}
+		}
+	}
+	return nil
+}
pkg/collect/copy_from_host_test.go (new file, 140 lines)
@@ -0,0 +1,140 @@
package collect

import (
	"context"
	"testing"

	"github.com/stretchr/testify/require"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	testclient "k8s.io/client-go/kubernetes/fake"
)

func Test_checkDaemonPodStatus(t *testing.T) {
	tests := []struct {
		name             string
		namespace        string
		podStatus        corev1.PodPhase
		mockPod          *corev1.Pod
		mockEvent        *corev1.Event
		labels           map[string]string
		retryFailedMount bool
		expectedErr      bool
		eventMessage     string
	}{
		{
			name:      "Pod running without FailedMount event",
			namespace: "test",
			podStatus: corev1.PodRunning,
			labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
			mockPod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-pod",
					Namespace: "test",
					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
				},
				Status: corev1.PodStatus{
					Phase: corev1.PodRunning,
				},
			},
			expectedErr: false,
		},
		{
			name:      "Pod not running without FailedMount event",
			namespace: "test",
			labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
			podStatus: corev1.PodPending,
			mockPod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-pod",
					Namespace: "test",
					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
				},
				Status: corev1.PodStatus{
					Phase: corev1.PodPending,
				},
			},
			expectedErr: false,
		},
		{
			name:      "Pod running with FailedMount event and retryFailedMount disabled",
			namespace: "test",
			podStatus: corev1.PodRunning,
			mockPod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-pod",
					Namespace: "test",
					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
				},
				Status: corev1.PodStatus{
					Phase: corev1.PodPending,
				},
			},
			mockEvent: &corev1.Event{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-event",
					Namespace: "test",
				},
				Reason: "FailedMount",
			},
			retryFailedMount: false,
			expectedErr:      true,
			eventMessage:     `MountVolume.SetUp failed for volume "host" : hostPath type check failed: /var/lib/collectd is not a directory`,
		},
		{
			name:      "Pod running with FailedMount event and retryFailedMount enabled",
			namespace: "test",
			podStatus: corev1.PodRunning,
			mockPod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-pod",
					Namespace: "test",
					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
				},
				Status: corev1.PodStatus{
					Phase: corev1.PodPending,
				},
			},
			mockEvent: &corev1.Event{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-event",
					Namespace: "test",
				},
				Reason: "FailedMount",
			},
			retryFailedMount: true,
			expectedErr:      false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			ctx := context.Background()
			client := testclient.NewSimpleClientset()

			if tt.mockPod != nil {
				pod, err := client.CoreV1().Pods(tt.namespace).Create(ctx, tt.mockPod, metav1.CreateOptions{})
				require.NoError(t, err)

				if tt.mockEvent != nil {
					event := tt.mockEvent
					event.InvolvedObject = corev1.ObjectReference{
						UID: pod.UID,
					}
					_, err = client.CoreV1().Events(tt.namespace).Create(ctx, event, metav1.CreateOptions{})
					require.NoError(t, err)
				}
			}

			err := checkDaemonPodStatus(client, ctx, tt.labels, tt.namespace, tt.retryFailedMount)
			if tt.expectedErr {
				require.Error(t, err)
				if tt.mockEvent != nil {
					require.Contains(t, err.Error(), "path does not exist")
				}
			} else {
				require.NoError(t, err)
			}
		})
	}
}
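Assuming the repository's usual Go module layout, the new test needs no cluster, since it only exercises checkDaemonPodStatus against the fake clientset; it can be run on its own with go test ./pkg/collect -run Test_checkDaemonPodStatus.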
@@ -14,6 +14,7 @@ const (
 	// DEFAULT_LOGS_COLLECTOR_TIMEOUT is the default timeout for logs collector.
 	DEFAULT_LOGS_COLLECTOR_TIMEOUT = 60 * time.Second
+	// MAX_TIME_TO_WAIT_FOR_POD_DELETION is the maximum time to wait for pod deletion.
+	// 0 seconds for force deletion.
+	MAX_TIME_TO_WAIT_FOR_POD_DELETION = 60 * time.Second
 	// Tracing constants
 	LIB_TRACER_NAME = "github.com/replicatedhq/troubleshoot"