Longhorn replica corruption analyzer

This automates the procedure from
https://longhorn.io/docs/1.1.1/advanced-resources/data-recovery/corrupted-replica/
This commit is contained in:
Andrew Reed
2021-06-22 19:40:22 +00:00
parent 900dd9e417
commit cb3925a0af
4 changed files with 411 additions and 0 deletions

View File

@@ -5,6 +5,7 @@ import (
"bytes"
"fmt"
"path/filepath"
"reflect"
"strings"
longhornv1beta1 "github.com/longhorn/longhorn-manager/k8s/pkg/apis/longhorn/v1beta1"
@@ -76,6 +77,24 @@ func longhorn(analyzer *troubleshootv1beta2.LonghornAnalyze, getCollectedFileCon
engines = append(engines, engine)
}
// get volumes.longhorn.io
volumesDir := collect.GetLonghornVolumesDirectory(ns)
volumesGlob := filepath.Join(volumesDir, "*.yaml")
volumesYaml, err := findFiles(volumesGlob)
if err != nil {
return nil, errors.Wrapf(err, "Failed to find longhorn volumes files under %s", volumesDir)
}
volumes := []*longhornv1beta1.Volume{}
for key, volumeYaml := range volumesYaml {
volumeYaml = stripRedactedLines(volumeYaml)
volume := &longhornv1beta1.Volume{}
err := yaml.Unmarshal(volumeYaml, volume)
if err != nil {
return nil, errors.Wrapf(err, "failed to unmarshal volume yaml from %s", key)
}
volumes = append(volumes, volume)
}
results := []*AnalyzeResult{}
for _, node := range nodes {
@@ -90,6 +109,28 @@ func longhorn(analyzer *troubleshootv1beta2.LonghornAnalyze, getCollectedFileCon
results = append(results, analyzeLonghornEngine(engine))
}
// get replica checksums for each volume if provided
for _, volume := range volumes {
checksumsGlob := filepath.Join(volumesDir, volume.Name, "replicachecksums", "*")
checksumFiles, err := findFiles(checksumsGlob)
if err != nil {
return nil, errors.Wrapf(err, "Failed to find longhorn replica checksums under %s", checksumsGlob)
}
checksums := []map[string]string{}
for key, checksumTxt := range checksumFiles {
checksum, err := collect.ParseReplicaChecksum(checksumTxt)
if err != nil {
return nil, errors.Wrapf(err, "Failed to parse %s", key)
}
checksums = append(checksums, checksum)
}
if len(checksums) > 1 {
results = append(results, analyzeLonghornReplicaChecksums(volume.Name, checksums))
}
}
return results, nil
}
@@ -179,3 +220,26 @@ func stripRedactedLines(yaml []byte) []byte {
return out
}
// analyzeLonghornReplicaChecksums reports whether all replicas of a volume
// produced identical checksum maps.
//
// volumeName is only used to build the result title. Each element of
// checksums is one replica's map of file name to checksum. The result is a
// warning as soon as one replica's map differs from the others; otherwise it
// is a pass, which includes the case of zero or one replica.
func analyzeLonghornReplicaChecksums(volumeName string, checksums []map[string]string) *AnalyzeResult {
	title := fmt.Sprintf("Longhorn Volume Replica Corruption: %s", volumeName)

	// Map equality is transitive, so comparing every replica with the first
	// one gives the same verdict as comparing each neighbouring pair.
	for idx := 1; idx < len(checksums); idx++ {
		if reflect.DeepEqual(checksums[0], checksums[idx]) {
			continue
		}
		return &AnalyzeResult{
			Title:   title,
			IsWarn:  true,
			Message: "Replica corruption detected",
		}
	}

	return &AnalyzeResult{
		Title:   title,
		IsPass:  true,
		Message: "No replica corruption detected",
	}
}

View File

@@ -204,3 +204,181 @@ func TestAnalyzeLonghornEngine(t *testing.T) {
})
}
}
// TestAnalyzeLonghornReplicaChecksums checks that replica sets with identical
// checksum maps pass and that a single differing replica, in any position,
// produces a warning.
func TestAnalyzeLonghornReplicaChecksums(t *testing.T) {
	const (
		volume     = "pvc-uuid-123"
		title      = "Longhorn Volume Replica Corruption: pvc-uuid-123"
		corruptSum = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
	)

	// healthy builds a fresh checksum map for an intact replica.
	healthy := func() map[string]string {
		return map[string]string{
			"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
			"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
			"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
			"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
		}
	}
	// corrupt builds a replica whose volume head image checksum differs.
	corrupt := func() map[string]string {
		sums := healthy()
		sums["volume-head-000.img"] = corruptSum
		return sums
	}
	// pass and warn build the two outcomes the analyzer can produce.
	pass := func() *AnalyzeResult {
		return &AnalyzeResult{
			Title:   title,
			IsPass:  true,
			Message: "No replica corruption detected",
		}
	}
	warn := func() *AnalyzeResult {
		return &AnalyzeResult{
			Title:   title,
			IsWarn:  true,
			Message: "Replica corruption detected",
		}
	}

	cases := []struct {
		name       string
		checksums  []map[string]string
		volumeName string
		expect     *AnalyzeResult
	}{
		{
			name:       "3 ok",
			checksums:  []map[string]string{healthy(), healthy(), healthy()},
			volumeName: volume,
			expect:     pass(),
		},
		{
			name:       "2 ok",
			checksums:  []map[string]string{healthy(), healthy()},
			volumeName: volume,
			expect:     pass(),
		},
		{
			// The first of three replicas differs.
			name:       "1 of 3 corrupt",
			checksums:  []map[string]string{corrupt(), healthy(), healthy()},
			volumeName: volume,
			expect:     warn(),
		},
		{
			// The second of three replicas differs.
			name:       "2 of 3 corrupt",
			checksums:  []map[string]string{healthy(), corrupt(), healthy()},
			volumeName: volume,
			expect:     warn(),
		},
		{
			// The third of three replicas differs.
			name:       "3 of 3 corrupt",
			checksums:  []map[string]string{healthy(), healthy(), corrupt()},
			volumeName: volume,
			expect:     warn(),
		},
		{
			// The first of two replicas differs.
			name:       "1 of 2 corrupt",
			checksums:  []map[string]string{corrupt(), healthy()},
			volumeName: volume,
			expect:     warn(),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert.Equal(t, tc.expect, analyzeLonghornReplicaChecksums(tc.volumeName, tc.checksums))
		})
	}
}

View File

@@ -1,21 +1,31 @@
package collect
import (
"bufio"
"bytes"
"context"
"fmt"
"path/filepath"
"regexp"
longhornv1beta1types "github.com/longhorn/longhorn-manager/k8s/pkg/apis/longhorn/v1beta1"
longhornv1beta1 "github.com/longhorn/longhorn-manager/k8s/pkg/client/clientset/versioned/typed/longhorn/v1beta1"
longhorntypes "github.com/longhorn/longhorn-manager/types"
"github.com/pkg/errors"
troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
"gopkg.in/yaml.v2"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/remotecommand"
)
// Defaults for the Longhorn collector.
const (
// DefaultLonghornNamespace is the namespace name "longhorn-system".
// NOTE(review): presumably the fallback used when the collector spec does not
// name a namespace — confirm against Longhorn().
DefaultLonghornNamespace = "longhorn-system"
)
// checksumRX captures the first two whitespace-separated tokens on a line.
// ParseReplicaChecksum applies it to each line of checksum output to locate
// the checksum and the file path that follows it.
var checksumRX = regexp.MustCompile(`(\S+)\s+(\S+)`)
func Longhorn(c *Collector, longhornCollector *troubleshootv1beta2.Longhorn) (map[string][]byte, error) {
ctx := context.TODO()
@@ -197,6 +207,60 @@ func Longhorn(c *Collector, longhornCollector *troubleshootv1beta2.Longhorn) (ma
final[key] = log
}
// https://longhorn.io/docs/1.1.1/advanced-resources/data-recovery/corrupted-replica/
// There is one instance manager replica pod per node. To checksum a replica we will
// exec into that pod and get the sha256sum of all files in the replica data directory.
var replicaPodsByNode map[string]string
for _, volume := range volumes.Items {
if volume.Status.State != longhorntypes.VolumeStateDetached {
// cannot checksum volumes in use
continue
}
var volReplicas []longhornv1beta1types.Replica
for _, replica := range replicas.Items {
if replica.Spec.InstanceSpec.VolumeName != volume.Name {
continue
}
volReplicas = append(volReplicas, replica)
}
if len(volReplicas) <= 1 {
// no reason to checksum volumes with a single replica
continue
}
// At this point we've found a detached volume with multiple replicas so we have to checksum
// each replica.
// First initialize the map of nodes to pods we will exec into
if replicaPodsByNode == nil {
pods, err := ListInstanceManagerReplicaPods(ctx, c.ClientConfig, ns)
if err != nil {
return nil, err
}
replicaPodsByNode = pods
}
for _, replica := range volReplicas {
// Find the name of the instance manager replica pod running on the node where this
// replica is scheduled
podName := replicaPodsByNode[replica.Spec.InstanceSpec.NodeID]
if podName == "" {
continue
}
checksums, err := GetLonghornReplicaChecksum(c.ClientConfig, replica, podName)
if err != nil {
return nil, err
}
volsDir := GetLonghornVolumesDirectory(ns)
key := filepath.Join(volsDir, volume.Name, "replicachecksums", replica.Name+".txt")
final[key] = []byte(checksums)
}
}
return final, nil
}
@@ -276,3 +340,83 @@ func GetLonghornLogsDirectory(namespace string) string {
}
return "longhorn/logs"
}
// GetLonghornReplicaChecksum execs into the pod podName and returns the raw
// standard output of `sha256sum` run over every file in the replica's data
// directory.
//
// The pod is addressed in replica.Namespace and the command runs in its
// "replica-manager" container against
// /host/var/lib/longhorn/replicas/<replica.Spec.DataDirectoryName>/*.
// The output is returned unparsed; ParseReplicaChecksum can turn it into a
// map.
//
// An error is returned when the Kubernetes client or the SPDY executor cannot
// be created, or when streaming the remote command fails. In the last case
// anything the command wrote to stderr is included in the error message.
func GetLonghornReplicaChecksum(clientConfig *rest.Config, replica longhornv1beta1types.Replica, podName string) (string, error) {
	client, err := kubernetes.NewForConfig(clientConfig)
	if err != nil {
		return "", errors.Wrap(err, "create kubernetes client")
	}

	// NOTE(review): DataDirectoryName comes from the Replica resource and is
	// placed into the shell command unquoted. The trailing glob has to stay
	// unquoted so that bash expands it.
	command := fmt.Sprintf("sha256sum /host/var/lib/longhorn/replicas/%s/*", replica.Spec.DataDirectoryName)

	// Request exactly the streams attached in StreamOptions below: stdout and
	// stderr. No stdin is supplied, so it is not requested.
	req := client.
		CoreV1().
		RESTClient().
		Post().
		Namespace(replica.Namespace).
		Name(podName).
		Resource("pods").
		SubResource("exec").
		Param("container", "replica-manager").
		Param("stdout", "true").
		Param("stderr", "true").
		Param("command", "/bin/bash").
		Param("command", "-c").
		Param("command", command)

	executor, err := remotecommand.NewSPDYExecutor(clientConfig, "POST", req.URL())
	if err != nil {
		return "", errors.Wrapf(err, "create remote exec")
	}

	var stdout bytes.Buffer
	var stderr bytes.Buffer
	err = executor.Stream(remotecommand.StreamOptions{
		Stdout: &stdout,
		Stderr: &stderr,
	})
	if err != nil {
		return "", errors.Wrapf(err, "stream remote exec: %s", stderr.String())
	}

	return stdout.String(), nil
}
// ListInstanceManagerReplicaPods lists the pods in namespace that carry the
// label longhorn.io/instance-manager-type=replica and returns a map from the
// value of each pod's longhorn.io/node label to the pod's name.
//
// Pods without a longhorn.io/node label are left out, so an empty node name
// can never resolve to a pod. If several pods share the same node label the
// last one listed wins.
//
// An error is returned when the Kubernetes client cannot be created or the
// pod list request fails.
func ListInstanceManagerReplicaPods(ctx context.Context, clientConfig *rest.Config, namespace string) (map[string]string, error) {
	client, err := kubernetes.NewForConfig(clientConfig)
	if err != nil {
		return nil, errors.Wrap(err, "create kubernetes client")
	}

	options := metav1.ListOptions{
		LabelSelector: "longhorn.io/instance-manager-type=replica",
	}
	pods, err := client.CoreV1().Pods(namespace).List(ctx, options)
	if err != nil {
		return nil, errors.Wrap(err, "list instance manager replica pods")
	}

	out := make(map[string]string, len(pods.Items))
	for _, pod := range pods.Items {
		node := pod.Labels["longhorn.io/node"]
		if node == "" {
			// Without a node label the pod cannot be matched to a replica's
			// node; storing it under "" would match replicas with no node.
			continue
		}
		out[node] = pod.Name
	}

	return out, nil
}
// ParseReplicaChecksum parses checksum output in the `sha256sum` layout into
// a map of file base name to checksum.
//
// Each line is expected to hold a checksum, whitespace, and then a file path.
// Only the last element of the path is kept as the map key. The path is taken
// as everything after the checksum up to the end of the line, so file names
// containing whitespace are kept whole. Lines that do not contain two
// whitespace-separated fields are ignored.
//
// The returned error is the scanner's error, if any; the entries parsed
// before the error are still returned.
func ParseReplicaChecksum(data []byte) (map[string]string, error) {
	out := map[string]string{}

	scanner := bufio.NewScanner(bytes.NewReader(data))
	for scanner.Scan() {
		line := scanner.Bytes()

		// loc holds index pairs for the whole match, the checksum and the
		// first token of the path, in that order.
		loc := checksumRX.FindSubmatchIndex(line)
		if loc == nil {
			continue
		}

		// Convert to string before the next Scan call reuses the buffer.
		sum := string(line[loc[2]:loc[3]])
		// Use the rest of the line, not just the first token, so a path with
		// embedded whitespace is not cut short.
		path := string(bytes.TrimSpace(line[loc[4]:]))

		out[filepath.Base(path)] = sum
	}

	return out, scanner.Err()
}

View File

@@ -0,0 +1,25 @@
package collect
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestParseReplicaChecksums feeds sha256sum-style output for one replica
// directory to ParseReplicaChecksum and expects a map keyed by base file name.
func TestParseReplicaChecksums(t *testing.T) {
	expected := map[string]string{
		"revision.counter":         "7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9",
		"volume-head-000.img":      "7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce",
		"volume-head-000.img.meta": "ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9",
		"volume.meta":              "e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b",
	}

	// The leading and trailing blank lines are part of the fixture.
	input := []byte(`
7cc93e21d84bb7d0db0a72281f21500ba3847dea6467631cca91523d01ace8c9 /host/var/lib/longhorn/replicas/pvc-1f9ee2f6-078e-42a6-bf5c-3eaa0722fbfc-68bd18ca/revision.counter
7637cb563f796f8d6358ff4fc635ce596e5326a7f940cc2ea2eaee0acff843ce /host/var/lib/longhorn/replicas/pvc-1f9ee2f6-078e-42a6-bf5c-3eaa0722fbfc-68bd18ca/volume-head-000.img
ca21027be32ef389de0b21d0c4713e824cad7114a287e05e56de49c948492fc9 /host/var/lib/longhorn/replicas/pvc-1f9ee2f6-078e-42a6-bf5c-3eaa0722fbfc-68bd18ca/volume-head-000.img.meta
e9ce811b3f11dfe3af0bdd46581f23ba2c570be5dc3b807652ad6142322c706b /host/var/lib/longhorn/replicas/pvc-1f9ee2f6-078e-42a6-bf5c-3eaa0722fbfc-68bd18ca/volume.meta
`)

	actual, err := ParseReplicaChecksum(input)

	assert.Nil(t, err)
	assert.Equal(t, expected, actual)
}