From cb3925a0af904ac693383f3d0ddc91a463158b3b Mon Sep 17 00:00:00 2001
From: Andrew Reed
Date: Tue, 22 Jun 2021 19:40:22 +0000
Subject: [PATCH] Longhorn replica corruption analyzer

This automates the procedure from
https://longhorn.io/docs/1.1.1/advanced-resources/data-recovery/corrupted-replica/
---
 pkg/analyze/longhorn.go      |  67 +++++++++++++
 pkg/analyze/longhorn_test.go | 178 +++++++++++++++++++++++++++++++++++
 pkg/collect/longhorn.go      | 154 ++++++++++++++++++++++++++++
 pkg/collect/longhorn_test.go |  25 +++++
 4 files changed, 424 insertions(+)
 create mode 100644 pkg/collect/longhorn_test.go

diff --git a/pkg/analyze/longhorn.go b/pkg/analyze/longhorn.go
index be15de2a..4f4c090b 100644
--- a/pkg/analyze/longhorn.go
+++ b/pkg/analyze/longhorn.go
@@ -5,6 +5,7 @@ import (
 	"bytes"
 	"fmt"
 	"path/filepath"
+	"reflect"
 	"strings"
 
 	longhornv1beta1 "github.com/longhorn/longhorn-manager/k8s/pkg/apis/longhorn/v1beta1"
@@ -76,6 +77,24 @@ func longhorn(analyzer *troubleshootv1beta2.LonghornAnalyze, getCollectedFileCon
 		engines = append(engines, engine)
 	}
 
+	// get volumes.longhorn.io
+	volumesDir := collect.GetLonghornVolumesDirectory(ns)
+	volumesGlob := filepath.Join(volumesDir, "*.yaml")
+	volumesYaml, err := findFiles(volumesGlob)
+	if err != nil {
+		return nil, errors.Wrapf(err, "Failed to find longhorn volumes files under %s", volumesDir)
+	}
+	volumes := []*longhornv1beta1.Volume{}
+	for key, volumeYaml := range volumesYaml {
+		volumeYaml = stripRedactedLines(volumeYaml)
+		volume := &longhornv1beta1.Volume{}
+		err := yaml.Unmarshal(volumeYaml, volume)
+		if err != nil {
+			return nil, errors.Wrapf(err, "failed to unmarshal volume yaml from %s", key)
+		}
+		volumes = append(volumes, volume)
+	}
+
 	results := []*AnalyzeResult{}
 
 	for _, node := range nodes {
@@ -90,6 +109,28 @@ func longhorn(analyzer *troubleshootv1beta2.LonghornAnalyze, getCollectedFileCon
 		results = append(results, analyzeLonghornEngine(engine))
 	}
 
+	// get replica checksums for each volume if provided
+	for _, volume := range volumes {
+		checksumsGlob := filepath.Join(volumesDir, volume.Name, "replicachecksums", "*")
+		checksumFiles, err := findFiles(checksumsGlob)
+		if err != nil {
+			return nil, errors.Wrapf(err, "Failed to find longhorn replica checksums under %s", checksumsGlob)
+		}
+
+		checksums := []map[string]string{}
+		for key, checksumTxt := range checksumFiles {
+			checksum, err := collect.ParseReplicaChecksum(checksumTxt)
+			if err != nil {
+				return nil, errors.Wrapf(err, "Failed to parse %s", key)
+			}
+			checksums = append(checksums, checksum)
+		}
+
+		if len(checksums) > 1 {
+			results = append(results, analyzeLonghornReplicaChecksums(volume.Name, checksums))
+		}
+	}
+
 	return results, nil
 }
 
@@ -179,3 +220,29 @@ func stripRedactedLines(yaml []byte) []byte {
 
 	return out
 }
+
+// analyzeLonghornReplicaChecksums compares the per-file checksum maps collected from each
+// replica of volumeName. It returns a warning as soon as two consecutive maps differ and a
+// pass when every map is identical.
+func analyzeLonghornReplicaChecksums(volumeName string, checksums []map[string]string) *AnalyzeResult {
+	result := &AnalyzeResult{
+		Title: fmt.Sprintf("Longhorn Volume Replica Corruption: %s", volumeName),
+	}
+
+	for i, checksum := range checksums {
+		if i == 0 {
+			continue
+		}
+		prior := checksums[i-1]
+		if !reflect.DeepEqual(prior, checksum) {
+			result.IsWarn = true
+			result.Message = "Replica corruption detected"
+			return result
+		}
+	}
+
+	result.IsPass = true
+	result.Message = "No replica corruption detected"
+
+	return result
+}
diff --git a/pkg/analyze/longhorn_test.go b/pkg/analyze/longhorn_test.go
index f1542780..fe47db32 100644
--- a/pkg/analyze/longhorn_test.go
+++ b/pkg/analyze/longhorn_test.go
@@ -204,3 +204,181 @@ func TestAnalyzeLonghornEngine(t *testing.T) {
 		})
 	}
 }
+
+func TestAnalyzeLonghornReplicaChecksums(t *testing.T) {
+	tests := []struct {
+		name       string
+		checksums  []map[string]string
+		volumeName string
+		expect     *AnalyzeResult
+	}{
+		{
+			name: "3 ok",
+			checksums: []map[string]string{
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+				{
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+				{
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+				},
+			},
+			volumeName: "pvc-uuid-123",
+			expect: &AnalyzeResult{
+				Title:   "Longhorn Volume Replica Corruption: pvc-uuid-123",
+				IsPass:  true,
+				Message: "No replica corruption detected",
+			},
+		},
+		{
+			name: "2 ok",
+			checksums: []map[string]string{
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+				{
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+				},
+			},
+			volumeName: "pvc-uuid-123",
+			expect: &AnalyzeResult{
+				Title:   "Longhorn Volume Replica Corruption: pvc-uuid-123",
+				IsPass:  true,
+				Message: "No replica corruption detected",
+			},
+		},
+		{
+			name: "1 of 3 corrupt",
+			checksums: []map[string]string{
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+			},
+			volumeName: "pvc-uuid-123",
+			expect: &AnalyzeResult{
+				Title:   "Longhorn Volume Replica Corruption: pvc-uuid-123",
+				IsWarn:  true,
+				Message: "Replica corruption detected",
+			},
+		},
+		{
+			name: "2 of 3 corrupt",
+			checksums: []map[string]string{
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+			},
+			volumeName: "pvc-uuid-123",
+			expect: &AnalyzeResult{
+				Title:   "Longhorn Volume Replica Corruption: pvc-uuid-123",
+				IsWarn:  true,
+				Message: "Replica corruption detected",
+			},
+		},
+		{
+			name: "3 of 3 corrupt",
+			checksums: []map[string]string{
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+			},
+			volumeName: "pvc-uuid-123",
+			expect: &AnalyzeResult{
+				Title:   "Longhorn Volume Replica Corruption: pvc-uuid-123",
+				IsWarn:  true,
+				Message: "Replica corruption detected",
+			},
+		},
+		{
+			name: "1 of 2 corrupt",
+			checksums: []map[string]string{
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+				{
+					"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+					"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+					"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+					"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+				},
+			},
+			volumeName: "pvc-uuid-123",
+			expect: &AnalyzeResult{
+				Title:   "Longhorn Volume Replica Corruption: pvc-uuid-123",
+				IsWarn:  true,
+				Message: "Replica corruption detected",
+			},
+		},
+	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			got := analyzeLonghornReplicaChecksums(test.volumeName, test.checksums)
+			assert.Equal(t, test.expect, got)
+		})
+	}
+}
diff --git a/pkg/collect/longhorn.go b/pkg/collect/longhorn.go
index 88c17968..446f399b 100644
--- a/pkg/collect/longhorn.go
+++ b/pkg/collect/longhorn.go
@@ -1,21 +1,31 @@
 package collect
 
 import (
+	"bufio"
+	"bytes"
 	"context"
 	"fmt"
 	"path/filepath"
+	"regexp"
 
+	longhornv1beta1types "github.com/longhorn/longhorn-manager/k8s/pkg/apis/longhorn/v1beta1"
 	longhornv1beta1 "github.com/longhorn/longhorn-manager/k8s/pkg/client/clientset/versioned/typed/longhorn/v1beta1"
+	longhorntypes "github.com/longhorn/longhorn-manager/types"
 	"github.com/pkg/errors"
 	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
 	"gopkg.in/yaml.v2"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/remotecommand"
 )
 
 const (
 	DefaultLonghornNamespace = "longhorn-system"
 )
 
+var checksumRX = regexp.MustCompile(`(\S+)\s+(\S+)`)
+
 func Longhorn(c *Collector, longhornCollector *troubleshootv1beta2.Longhorn) (map[string][]byte, error) {
 	ctx := context.TODO()
 
@@ -197,6 +207,60 @@ func Longhorn(c *Collector, longhornCollector *troubleshootv1beta2.Longhorn) (ma
 		final[key] = log
 	}
 
+	// https://longhorn.io/docs/1.1.1/advanced-resources/data-recovery/corrupted-replica/
+
+	// There is one instance manager replica pod per node. To checksum a replica we will
+	// exec into that pod and get the sha256sum of all files in the replica data directory.
+	var replicaPodsByNode map[string]string
+
+	for _, volume := range volumes.Items {
+		if volume.Status.State != longhorntypes.VolumeStateDetached {
+			// cannot checksum volumes in use
+			continue
+		}
+
+		var volReplicas []longhornv1beta1types.Replica
+		for _, replica := range replicas.Items {
+			if replica.Spec.InstanceSpec.VolumeName != volume.Name {
+				continue
+			}
+			volReplicas = append(volReplicas, replica)
+		}
+		if len(volReplicas) <= 1 {
+			// no reason to checksum volumes with a single replica
+			continue
+		}
+
+		// At this point we've found a detached volume with multiple replicas so we have to checksum
+		// each replica.
+
+		// First initialize the map of nodes to pods we will exec into
+		if replicaPodsByNode == nil {
+			pods, err := ListInstanceManagerReplicaPods(ctx, c.ClientConfig, ns)
+			if err != nil {
+				return nil, err
+			}
+			replicaPodsByNode = pods
+		}
+
+		for _, replica := range volReplicas {
+			// Find the name of the instance manager replica pod running on the node where this
+			// replica is scheduled
+			podName := replicaPodsByNode[replica.Spec.InstanceSpec.NodeID]
+			if podName == "" {
+				continue
+			}
+
+			checksums, err := GetLonghornReplicaChecksum(c.ClientConfig, replica, podName)
+			if err != nil {
+				return nil, err
+			}
+			volsDir := GetLonghornVolumesDirectory(ns)
+			key := filepath.Join(volsDir, volume.Name, "replicachecksums", replica.Name+".txt")
+			final[key] = []byte(checksums)
+		}
+	}
+
 	return final, nil
 }
 
@@ -276,3 +340,93 @@ func GetLonghornLogsDirectory(namespace string) string {
 	}
 	return "longhorn/logs"
 }
+
+// GetLonghornReplicaChecksum execs into the replica-manager container of podName and returns
+// the raw stdout of running sha256sum over every file in the replica's data directory under
+// /host/var/lib/longhorn/replicas.
+func GetLonghornReplicaChecksum(clientConfig *rest.Config, replica longhornv1beta1types.Replica, podName string) (string, error) {
+	client, err := kubernetes.NewForConfig(clientConfig)
+	if err != nil {
+		return "", err
+	}
+
+	req := client.
+		CoreV1().
+		RESTClient().
+		Post().
+		Namespace(replica.Namespace).
+		Name(podName).
+		Resource("pods").
+		SubResource("exec").
+		Param("container", "replica-manager").
+		Param("stdout", "true").
+		// Request stderr (not stdin) so the streams asked of the API server match the ones
+		// attached to executor.Stream below.
+		Param("stderr", "true").
+		Param("command", "/bin/bash").
+		Param("command", "-c").
+		Param("command", fmt.Sprintf("sha256sum /host/var/lib/longhorn/replicas/%s/*", replica.Spec.DataDirectoryName))
+
+	executor, err := remotecommand.NewSPDYExecutor(clientConfig, "POST", req.URL())
+	if err != nil {
+		return "", errors.Wrapf(err, "create remote exec")
+	}
+
+	var stdout bytes.Buffer
+	var stderr bytes.Buffer
+	err = executor.Stream(remotecommand.StreamOptions{
+		Stdout: &stdout,
+		Stderr: &stderr,
+	})
+	if err != nil {
+		return "", errors.Wrapf(err, "stream remote exec: %s", stderr.String())
+	}
+
+	return stdout.String(), nil
+}
+
+// ListInstanceManagerReplicaPods lists the pods in namespace labelled
+// longhorn.io/instance-manager-type=replica and returns a map of node name (taken from the
+// longhorn.io/node label) to pod name.
+func ListInstanceManagerReplicaPods(ctx context.Context, clientConfig *rest.Config, namespace string) (map[string]string, error) {
+	client, err := kubernetes.NewForConfig(clientConfig)
+	if err != nil {
+		return nil, err
+	}
+	options := metav1.ListOptions{
+		LabelSelector: "longhorn.io/instance-manager-type=replica",
+	}
+	pods, err := client.CoreV1().Pods(namespace).List(ctx, options)
+	if err != nil {
+		return nil, err
+	}
+
+	out := map[string]string{}
+	for _, pod := range pods.Items {
+		node := pod.Labels["longhorn.io/node"]
+		out[node] = pod.Name
+	}
+
+	return out, nil
+}
+
+// ParseReplicaChecksum parses sha256sum-style output, one "checksum path" pair per line,
+// into a map of file base name to checksum. Lines without two whitespace-separated fields
+// are skipped.
+func ParseReplicaChecksum(data []byte) (map[string]string, error) {
+	buf := bytes.NewBuffer(data)
+	scanner := bufio.NewScanner(buf)
+
+	out := map[string]string{}
+
+	for scanner.Scan() {
+		matches := checksumRX.FindStringSubmatch(scanner.Text())
+		if len(matches) < 3 {
+			continue
+		}
+		filename := filepath.Base(matches[2])
+		out[filename] = matches[1]
+	}
+
+	return out, scanner.Err()
+}
diff --git a/pkg/collect/longhorn_test.go b/pkg/collect/longhorn_test.go
new file mode 100644
index 00000000..d8840787
--- /dev/null
+++ b/pkg/collect/longhorn_test.go
@@ -0,0 +1,25 @@
+package collect
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestParseReplicaChecksums(t *testing.T) {
+	data := []byte(`
+7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9 /host/var/lib/longhorn/replicas/pvc-1f9ee2f6-078e-42a6-bf5c-3eaa0722fbfc-68bd18ca/revision.counter
+7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce /host/var/lib/longhorn/replicas/pvc-1f9ee2f6-078e-42a6-bf5c-3eaa0722fbfc-68bd18ca/volume-head-000.img
+ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9 /host/var/lib/longhorn/replicas/pvc-1f9ee2f6-078e-42a6-bf5c-3eaa0722fbfc-68bd18ca/volume-head-000.img.meta
+e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b /host/var/lib/longhorn/replicas/pvc-1f9ee2f6-078e-42a6-bf5c-3eaa0722fbfc-68bd18ca/volume.meta
+`)
+	got, err := ParseReplicaChecksum(data)
+	assert.Nil(t, err)
+	want := map[string]string{
+		"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
+		"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
+		"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
+		"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
+	}
+	assert.Equal(t, want, got)
+}