From 1ed8532663b34567f19ad12e0bb33a1dbfce2675 Mon Sep 17 00:00:00 2001 From: Andrew Reed Date: Thu, 1 Jul 2021 16:52:59 +0000 Subject: [PATCH] Speed up replica checksum --- pkg/collect/longhorn.go | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/pkg/collect/longhorn.go b/pkg/collect/longhorn.go index 8116bf49..551c84e5 100644 --- a/pkg/collect/longhorn.go +++ b/pkg/collect/longhorn.go @@ -7,12 +7,14 @@ import ( "fmt" "path/filepath" "regexp" + "sync" longhornv1beta1types "github.com/longhorn/longhorn-manager/k8s/pkg/apis/longhorn/v1beta1" longhornv1beta1 "github.com/longhorn/longhorn-manager/k8s/pkg/client/clientset/versioned/typed/longhorn/v1beta1" longhorntypes "github.com/longhorn/longhorn-manager/types" "github.com/pkg/errors" troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" + "github.com/replicatedhq/troubleshoot/pkg/logger" "gopkg.in/yaml.v2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -40,6 +42,7 @@ func Longhorn(c *Collector, longhornCollector *troubleshootv1beta2.Longhorn) (ma } final := map[string][]byte{} + var mtx sync.Mutex // collect nodes.longhorn.io nodes, err := client.Nodes(ns).List(ctx, metav1.ListOptions{}) @@ -213,6 +216,8 @@ func Longhorn(c *Collector, longhornCollector *troubleshootv1beta2.Longhorn) (ma // exec into that pod and get the sha256sum of all files in the replica data directory. var replicaPodsByNode map[string]string + var wg sync.WaitGroup + for _, volume := range volumes.Items { if volume.Status.State != longhorntypes.VolumeStateDetached { // cannot checksum volumes in use @@ -224,6 +229,9 @@ func Longhorn(c *Collector, longhornCollector *troubleshootv1beta2.Longhorn) (ma if replica.Spec.InstanceSpec.VolumeName != volume.Name { continue } + if replica.Spec.InstanceSpec.NodeID == "" { + continue + } volReplicas = append(volReplicas, replica) } if len(volReplicas) <= 1 { @@ -251,16 +259,25 @@ func Longhorn(c *Collector, longhornCollector *troubleshootv1beta2.Longhorn) (ma continue } - checksums, err := GetLonghornReplicaChecksum(c.ClientConfig, replica, podName) - if err != nil { - return nil, err - } - volsDir := GetLonghornVolumesDirectory(ns) - key := filepath.Join(volsDir, volume.Name, "replicachecksums", replica.Name+".txt") - final[key] = []byte(checksums) + wg.Add(1) + go func(replica longhornv1beta1types.Replica) { + defer wg.Done() + checksums, err := GetLonghornReplicaChecksum(c.ClientConfig, replica, podName) + if err != nil { + logger.Printf("Failed to get replica %s checksum: %v", replica.Name, err) + return + } + volsDir := GetLonghornVolumesDirectory(ns) + key := filepath.Join(volsDir, volume.Name, "replicachecksums", replica.Name+".txt") + mtx.Lock() + final[key] = []byte(checksums) + mtx.Unlock() + }(replica) } } + wg.Wait() + return final, nil } @@ -313,6 +330,7 @@ func GetLonghornReplicaChecksum(clientConfig *rest.Config, replica longhornv1bet if err != nil { return "", err } + dir := fmt.Sprintf("/host/var/lib/longhorn/replicas/%s", replica.Spec.DataDirectoryName) req := client. CoreV1(). @@ -327,7 +345,7 @@ func GetLonghornReplicaChecksum(clientConfig *rest.Config, replica longhornv1bet Param("stdin", "true"). Param("command", "/bin/bash"). Param("command", "-c"). - Param("command", fmt.Sprintf("sha256sum /host/var/lib/longhorn/replicas/%s/*", replica.Spec.DataDirectoryName)) + Param("command", fmt.Sprintf("if [ -d %s ]; then md5sum %s/*; fi", dir, dir)) executor, err := remotecommand.NewSPDYExecutor(clientConfig, "POST", req.URL()) if err != nil {