Fix(support-bundle): improve copy-from-host collector performance by checking the daemonset pod's FailedMount event and retrying (#1281)

This commit is contained in:
Dexter Yan
2023-08-09 22:32:14 +12:00
committed by GitHub
parent 39314ef200
commit 5bb4358f7a
5 changed files with 230 additions and 14 deletions

View File

@@ -39,7 +39,16 @@ func (c *CollectCollectd) Collect(progressChan chan<- interface{}) (CollectorRes
}
rbacErrors := c.GetRBACErrors()
copyFromHostCollector := &CollectCopyFromHost{copyFromHost, c.BundlePath, c.Namespace, c.ClientConfig, c.Client, c.Context, rbacErrors}
copyFromHostCollector := &CollectCopyFromHost{
Collector: copyFromHost,
BundlePath: c.BundlePath,
Namespace: c.Namespace,
ClientConfig: c.ClientConfig,
Client: c.Client,
Context: c.Context,
RetryFailedMount: false,
RBACErrors: rbacErrors,
}
return copyFromHostCollector.Collect(progressChan)
}

View File

@@ -82,7 +82,16 @@ func GetCollector(collector *troubleshootv1beta2.Collect, bundlePath string, nam
case collector.Copy != nil:
return &CollectCopy{collector.Copy, bundlePath, namespace, clientConfig, client, ctx, RBACErrors}, true
case collector.CopyFromHost != nil:
return &CollectCopyFromHost{collector.CopyFromHost, bundlePath, namespace, clientConfig, client, ctx, RBACErrors}, true
return &CollectCopyFromHost{
Collector: collector.CopyFromHost,
BundlePath: bundlePath,
Namespace: namespace,
ClientConfig: clientConfig,
Client: client,
Context: ctx,
RetryFailedMount: true,
RBACErrors: RBACErrors,
}, true
case collector.HTTP != nil:
return &CollectHTTP{collector.HTTP, bundlePath, namespace, clientConfig, client, RBACErrors}, true
case collector.Postgres != nil:

View File

@@ -31,12 +31,13 @@ import (
)
type CollectCopyFromHost struct {
Collector *troubleshootv1beta2.CopyFromHost
BundlePath string
Namespace string
ClientConfig *rest.Config
Client kubernetes.Interface
Context context.Context
Collector *troubleshootv1beta2.CopyFromHost
BundlePath string
Namespace string
ClientConfig *rest.Config
Client kubernetes.Interface
Context context.Context
RetryFailedMount bool
RBACErrors
}
@@ -73,7 +74,7 @@ func (c *CollectCopyFromHost) Collect(progressChan chan<- interface{}) (Collecto
namespace, _, _ = kubeconfig.Namespace()
}
_, cleanup, err := copyFromHostCreateDaemonSet(c.Context, c.Client, c.Collector, hostDir, namespace, "troubleshoot-copyfromhost-", labels)
_, cleanup, err := c.copyFromHostCreateDaemonSet(c.Context, c.Client, c.Collector, hostDir, namespace, "troubleshoot-copyfromhost-", labels)
defer cleanup()
if err != nil {
return nil, errors.Wrap(err, "create daemonset")
@@ -125,7 +126,7 @@ func (c *CollectCopyFromHost) Collect(progressChan chan<- interface{}) (Collecto
}
}
func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interface, collector *troubleshootv1beta2.CopyFromHost, hostPath string, namespace string, generateName string, labels map[string]string) (name string, cleanup func(), err error) {
func (c *CollectCopyFromHost) copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interface, collector *troubleshootv1beta2.CopyFromHost, hostPath string, namespace string, generateName string, labels map[string]string) (name string, cleanup func(), err error) {
pullPolicy := corev1.PullIfNotPresent
volumeType := corev1.HostPathDirectory
if collector.ImagePullPolicy != "" {
@@ -229,6 +230,11 @@ func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interfac
for {
select {
case <-time.After(1 * time.Second):
err = checkDaemonPodStatus(client, ctx, labels, namespace, c.RetryFailedMount)
if err != nil {
return createdDS.Name, cleanup, err
}
case <-childCtx.Done():
klog.V(2).Infof("Timed out waiting for daemonset %s to be ready", createdDS.Name)
return createdDS.Name, cleanup, errors.Wrap(ctx.Err(), "wait for daemonset")
@@ -373,7 +379,14 @@ func copyFilesFromHost(ctx context.Context, dstPath string, clientConfig *restcl
func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS *appsv1.DaemonSet, namespace string, labels map[string]string) {
klog.V(2).Infof("Daemonset %s has been scheduled for deletion", createdDS.Name)
if err := client.AppsV1().DaemonSets(namespace).Delete(context.Background(), createdDS.Name, metav1.DeleteOptions{}); err != nil {
zeroGracePeriod := int64(0)
// Foreground is used to delete the DaemonSet pods before deleting the DaemonSet
deletePropagationForeground := metav1.DeletePropagationForeground
if err := client.AppsV1().DaemonSets(namespace).Delete(ctx, createdDS.Name, metav1.DeleteOptions{
GracePeriodSeconds: &zeroGracePeriod,
PropagationPolicy: &deletePropagationForeground,
}); err != nil {
klog.Errorf("Failed to delete daemonset %s: %v", createdDS.Name, err)
return
}
@@ -383,10 +396,24 @@ func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS
labelSelector = append(labelSelector, fmt.Sprintf("%s=%s", k, v))
}
dsPods := &corev1.PodList{}
dsPods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: strings.Join(labelSelector, ",")})
if err != nil {
klog.Errorf("Failed to list pods for DaemonSet %s: %v", createdDS.Name, err)
return
}
for _, pod := range dsPods.Items {
klog.V(2).Infof("Deleting pod %s", pod.Name)
if err := client.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{
GracePeriodSeconds: &zeroGracePeriod,
}); err != nil {
klog.Errorf("Failed to delete pod %s: %v", pod.Name, err)
}
}
klog.V(2).Infof("Continuously poll each second for Pod deletion of DaemontSet %s for maximum %d seconds", createdDS.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
err := wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
err = wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
pods, listErr := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
LabelSelector: strings.Join(labelSelector, ","),
})
@@ -410,7 +437,6 @@ func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS
// If there was an error from the polling (e.g., the context deadline was exceeded before all pods were deleted),
// delete each remaining pod with a zero-second grace period
if err != nil {
zeroGracePeriod := int64(0)
for _, pod := range dsPods.Items {
klog.V(2).Infof("Pod %s forcefully deleted after reaching the maximum wait time of %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
err := client.CoreV1().Pods(namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{
@@ -424,3 +450,34 @@ func deleteDaemonSet(client kubernetes.Interface, ctx context.Context, createdDS
}
}
}
// checkDaemonPodStatus lists the daemonset pods matching labels in namespace and
// inspects every non-running pod for a FailedMount event. A FailedMount event
// means the hostPath volume could not be mounted, so the pod will stay Pending
// forever; when retryFailedMount is false the function fails fast by returning
// a "path does not exist" error instead of letting the caller wait for the
// daemonset readiness timeout.
func checkDaemonPodStatus(client kubernetes.Interface, ctx context.Context, labels map[string]string, namespace string, retryFailedMount bool) error {
	// When retries are enabled, FailedMount events are tolerated and this
	// function always returns nil, so skip the pods/events API calls entirely.
	if retryFailedMount {
		return nil
	}

	labelSelector := make([]string, 0, len(labels))
	for k, v := range labels {
		labelSelector = append(labelSelector, fmt.Sprintf("%s=%s", k, v))
	}

	pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
		LabelSelector: strings.Join(labelSelector, ","),
	})
	if err != nil {
		return errors.Wrap(err, "get daemonset pods")
	}

	for _, pod := range pods.Items {
		if pod.Status.Phase == corev1.PodRunning {
			continue
		}
		events, err := client.CoreV1().Events(namespace).List(ctx, metav1.ListOptions{
			FieldSelector: fmt.Sprintf("involvedObject.uid=%s", pod.UID),
		})
		if err != nil {
			// Previously this error was discarded and events.Items was
			// dereferenced, which panics on a nil list. Treat the event
			// lookup as best effort: log and keep waiting for readiness.
			klog.V(2).Infof("failed to list events for pod %s: %v", pod.Name, err)
			continue
		}
		for _, event := range events.Items {
			// If the pod has a FailedMount event, it means that the pod failed to mount the volume and the pod will be stuck in the Pending state.
			// In this case, we return an error to the caller to indicate that path does not exist.
			if event.Reason == "FailedMount" {
				klog.V(2).Infof("pod %s has a FailedMount event: %s", pod.Name, event.Message)
				return errors.Errorf("path does not exist")
			}
		}
	}
	return nil
}

View File

@@ -0,0 +1,140 @@
package collect
import (
"context"
"testing"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
testclient "k8s.io/client-go/kubernetes/fake"
)
// Test_checkDaemonPodStatus verifies that checkDaemonPodStatus returns an error
// only when a non-running pod has a FailedMount event and retryFailedMount is
// disabled; in every other combination it must return nil.
func Test_checkDaemonPodStatus(t *testing.T) {
	tests := []struct {
		name             string
		namespace        string
		mockPod          *corev1.Pod
		mockEvent        *corev1.Event
		labels           map[string]string
		retryFailedMount bool
		expectedErr      bool
	}{
		{
			name:      "Pod running without FailedMount event",
			namespace: "test",
			labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
			mockPod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-pod",
					Namespace: "test",
					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
				},
				Status: corev1.PodStatus{
					Phase: corev1.PodRunning,
				},
			},
			expectedErr: false,
		},
		{
			name:      "Pod pending without FailedMount event",
			namespace: "test",
			labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
			mockPod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-pod",
					Namespace: "test",
					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
				},
				Status: corev1.PodStatus{
					Phase: corev1.PodPending,
				},
			},
			expectedErr: false,
		},
		{
			name:      "Pod pending with FailedMount event and retryFailedMount disabled",
			namespace: "test",
			labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
			mockPod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-pod",
					Namespace: "test",
					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
				},
				Status: corev1.PodStatus{
					Phase: corev1.PodPending,
				},
			},
			mockEvent: &corev1.Event{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-event",
					Namespace: "test",
				},
				Reason:  "FailedMount",
				Message: `MountVolume.SetUp failed for volume "host" : hostPath type check failed: /var/lib/collectd is not a directory`,
			},
			retryFailedMount: false,
			expectedErr:      true,
		},
		{
			name:      "Pod pending with FailedMount event and retryFailedMount enabled",
			namespace: "test",
			labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
			mockPod: &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-pod",
					Namespace: "test",
					Labels:    map[string]string{"app.kubernetes.io/managed-by": "troubleshoot.sh"},
				},
				Status: corev1.PodStatus{
					Phase: corev1.PodPending,
				},
			},
			mockEvent: &corev1.Event{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-event",
					Namespace: "test",
				},
				Reason: "FailedMount",
			},
			retryFailedMount: true,
			expectedErr:      false,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			ctx := context.Background()
			client := testclient.NewSimpleClientset()
			if tt.mockPod != nil {
				pod, err := client.CoreV1().Pods(tt.namespace).Create(ctx, tt.mockPod, metav1.CreateOptions{})
				require.NoError(t, err)
				if tt.mockEvent != nil {
					// Link the event to the created pod so the UID-based
					// field selector in checkDaemonPodStatus matches it.
					event := tt.mockEvent
					event.InvolvedObject = corev1.ObjectReference{
						UID: pod.UID,
					}
					_, err = client.CoreV1().Events(tt.namespace).Create(ctx, event, metav1.CreateOptions{})
					require.NoError(t, err)
				}
			}
			err := checkDaemonPodStatus(client, ctx, tt.labels, tt.namespace, tt.retryFailedMount)
			if tt.expectedErr {
				require.Error(t, err)
				require.Contains(t, err.Error(), "path does not exist")
			} else {
				require.NoError(t, err)
			}
		})
	}
}

View File

@@ -14,6 +14,7 @@ const (
// DEFAULT_LOGS_COLLECTOR_TIMEOUT is the default timeout for logs collector.
DEFAULT_LOGS_COLLECTOR_TIMEOUT = 60 * time.Second
// MAX_TIME_TO_WAIT_FOR_POD_DELETION is the maximum time to wait for pod deletion.
// 0 seconds for force deletion.
MAX_TIME_TO_WAIT_FOR_POD_DELETION = 60 * time.Second
// Tracing constants
LIB_TRACER_NAME = "github.com/replicatedhq/troubleshoot"