Add e2e tests for reporting filesystem problems

Also added support for running e2e tests in parallel.
This commit is contained in:
Xuewei Zhang
2019-09-19 11:52:11 -07:00
parent b3f811d171
commit dd37dfe12c
7 changed files with 132 additions and 55 deletions

View File

@@ -67,6 +67,9 @@ function install-npd() {
echo "Installing NPD systemd service."
cp "${workdir}"/config/systemd/node-problem-detector-metric-only.service /etc/systemd/system/node-problem-detector.service
echo "Installing problem maker binary, used only for e2e testing."
cp "${workdir}"/test/bin/problem-maker "${BIN_DIR}"
rm -rf "${workdir}"
# Start systemd service.

View File

@@ -6,7 +6,7 @@ Currently the tests only support Google Compute Engine (GCE) environment. Suppor
## Prerequisites
1. Set up [Google Application Default Credentials](https://developers.google.com/identity/protocols/application-default-credentials), which is [required for authentication](https://godoc.org/google.golang.org/api/compute/v1#hdr-Creating_a_client) by the Compute Engine API.
1. Set up [Google Application Default Credentials (ADC)](https://developers.google.com/identity/protocols/application-default-credentials), which is [required for authentication](https://godoc.org/google.golang.org/api/compute/v1#hdr-Creating_a_client) by the Compute Engine API.
2. Set up a [project-wide SSH key](https://cloud.google.com/compute/docs/instances/adding-removing-ssh-keys#project-wide) that can be used to SSH into the GCE VMs.
## Running tests
@@ -21,5 +21,6 @@ export VM_IMAGE=[TESTED_OS_IMAGE:cos-73-11647-217-0]
export IMAGE_PROJECT=[TESTED_OS_IMAGE_PROJECT:cos-cloud]
export SSH_USER=${USER}
export SSH_KEY=~/.ssh/id_rsa
export ARTIFACTS=/tmp/npd
make e2e-test
```

View File

@@ -23,6 +23,7 @@ import (
"k8s.io/node-problem-detector/test/e2e/lib/ssh"
. "github.com/onsi/gomega"
compute "google.golang.org/api/compute/v1"
)
@@ -145,6 +146,14 @@ func (ins *Instance) RunCommand(cmd string) ssh.Result {
return ssh.Run(cmd, ins.ExternalIP, ins.SshUser, ins.SshKey)
}
// RunCommandOrFail runs a command on the GCE instance and returns the command result.
//
// Unlike RunCommand, it fails the running test (through Gomega assertions) when
// SSH-ing to the instance fails or when the command exits with a non-zero code,
// so callers do not need to inspect the result for errors themselves.
func (ins *Instance) RunCommandOrFail(cmd string) ssh.Result {
	result := ins.RunCommand(cmd)
	// Check the transport first: a non-zero exit code is only meaningful if SSH itself worked.
	Expect(result.SSHError).ToNot(HaveOccurred(), "SSH-ing to the instance failed: %v\n", result)
	Expect(result.Code).To(Equal(0), "Running command failed: %v\n", result)
	return result
}
// PushFile pushes a local file to a GCE instance.
func (ins *Instance) PushFile(srcPath, destPath string) error {
if ins.ExternalIP == "" {

View File

@@ -29,9 +29,6 @@ import (
"github.com/avast/retry-go"
)
const npdMetricsFilename = "node-problem-detector-metrics.txt"
const npdLogsFilename = "node-problem-detector.log"
// SetupNPD installs NPD from the test tarball onto the provided GCE instance.
//
// Here is how it works:
@@ -91,6 +88,20 @@ func FetchNPDMetrics(ins gce.Instance) ([]metrics.Float64MetricRepresentation, e
return npdMetrics, nil
}
// FetchNPDMetric fetches all metrics reported by NPD on the provided GCE instance
// and returns the value of the one identified by metricName and labels.
//
// The lookup itself is delegated to metrics.GetFloat64Metric. An error is
// returned when the metrics cannot be fetched or when the lookup fails; in the
// latter case the error lists every metric NPD exported to ease debugging.
func FetchNPDMetric(ins gce.Instance, metricName string, labels map[string]string) (float64, error) {
	exported, fetchErr := FetchNPDMetrics(ins)
	if fetchErr != nil {
		return 0.0, fetchErr
	}

	found, lookupErr := metrics.GetFloat64Metric(exported, metricName, labels, true)
	if lookupErr == nil {
		return found.Value, nil
	}

	return 0.0, fmt.Errorf("Failed to find %s metric with label %v: %v.\nHere is all NPD exported metrics: %v",
		metricName, labels, lookupErr, exported)
}
// WaitForNPD waits for NPD to become ready by waiting for expected metrics.
func WaitForNPD(ins gce.Instance, metricNames []string, timeoutSeconds uint) error {
verifyMetricExist := func() error {
@@ -116,30 +127,33 @@ func WaitForNPD(ins gce.Instance, metricNames []string, timeoutSeconds uint) err
}
// SaveTestArtifacts saves debugging data from NPD.
func SaveTestArtifacts(ins gce.Instance, directory string) []error {
func SaveTestArtifacts(ins gce.Instance, artifactDirectory string, testID int) []error {
var errs []error
npdMetrics := ins.RunCommand("curl http://localhost:20257/metrics")
if npdMetrics.SSHError != nil || npdMetrics.Code != 0 {
errs = append(errs, fmt.Errorf("Error fetching NPD metrics: %v\n", npdMetrics))
} else {
npdMetricsPath := path.Join(directory, npdMetricsFilename)
err := ioutil.WriteFile(npdMetricsPath, []byte(npdMetrics.Stdout), 0644)
if err != nil {
errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdMetricsPath, err))
}
if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
"curl http://localhost:20257/metrics", "node-problem-detector-metrics"); err != nil {
errs = append(errs, err)
}
npdLog := ins.RunCommand("sudo journalctl -u node-problem-detector.service")
if npdLog.SSHError != nil || npdLog.Code != 0 {
errs = append(errs, fmt.Errorf("Error fetching NPD logs: %v\n", npdLog))
} else {
npdLogsPath := path.Join(directory, npdLogsFilename)
err := ioutil.WriteFile(npdLogsPath, []byte(npdLog.Stdout), 0644)
if err != nil {
errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdLogsPath, err))
}
if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
"sudo journalctl -u node-problem-detector.service", "node-problem-detector"); err != nil {
errs = append(errs, err)
}
if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
"sudo journalctl -k", "kernel-logs"); err != nil {
errs = append(errs, err)
}
return errs
}
// saveCommandResultAsArtifact runs command on the instance and stores its stdout
// as a test artifact.
//
// The artifact is written to artifactDirectory under the name
// "<artifactPrefix>-<testID>.txt", where testID is zero-padded to two digits.
// An error is returned when SSH fails, when the command exits with a non-zero
// code, or when the artifact file cannot be written.
func saveCommandResultAsArtifact(ins gce.Instance, artifactDirectory string, testID int, command string, artifactPrefix string) error {
	artifactName := fmt.Sprintf("%v-%02d.txt", artifactPrefix, testID)
	artifactPath := path.Join(artifactDirectory, artifactName)

	cmdResult := ins.RunCommand(command)
	if succeeded := cmdResult.SSHError == nil && cmdResult.Code == 0; !succeeded {
		return fmt.Errorf("Error running command: %v\n", cmdResult)
	}

	writeErr := ioutil.WriteFile(artifactPath, []byte(cmdResult.Stdout), 0644)
	if writeErr != nil {
		return fmt.Errorf("Error writing artifact to %v: %v\n", artifactPath, writeErr)
	}
	return nil
}

View File

@@ -29,12 +29,12 @@ import (
"k8s.io/test-infra/boskos/client"
"github.com/onsi/ginkgo"
"github.com/onsi/ginkgo/config"
"github.com/onsi/ginkgo/reporters"
. "github.com/onsi/gomega"
compute "google.golang.org/api/compute/v1"
)
const junitFileName = "junit.xml"
var zone = flag.String("zone", "", "gce zone the hosts live in")
var project = flag.String("project", "", "gce project the hosts live in")
var image = flag.String("image", "", "image to test")
@@ -80,7 +80,7 @@ func TestNPD(t *testing.T) {
}
// The junit formatted result output is for showing test results on testgrid.
junitReporter := reporters.NewJUnitReporter(path.Join(*artifactsDir, junitFileName))
junitReporter := reporters.NewJUnitReporter(path.Join(*artifactsDir, fmt.Sprintf("junit-%02d.xml", config.GinkgoConfig.ParallelNode)))
ginkgo.RunSpecsWithDefaultAndCustomReporters(t, "NPD Metric-only Suite", []ginkgo.Reporter{junitReporter})
}
@@ -89,9 +89,8 @@ func acquireProjectOrDie(boskosClient *client.Client) string {
ctx, cancel := context.WithTimeout(context.Background(), *boskosWaitDuration)
defer cancel()
p, err := boskosClient.AcquireWait(ctx, *boskosProjectType, "free", "busy")
if err != nil {
panic(fmt.Sprintf("Unable to rent project from Boskos: %v\n", err))
}
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to rent project from Boskos: %v\n", err))
fmt.Printf("Rented project %s from Boskos", p.Name)
go func(boskosClient *client.Client, projectName string) {
@@ -110,12 +109,11 @@ func releaseProjectOrDie(boskosClient *client.Client) {
return
}
err := boskosClient.ReleaseAll("dirty")
if err != nil {
panic(fmt.Sprintf("Failed to release project to Boskos: %v", err))
}
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to release project to Boskos: %v", err))
}
func TestMain(m *testing.M) {
RegisterFailHandler(ginkgo.Fail)
flag.Parse()
os.Exit(m.Run())

View File

@@ -21,12 +21,15 @@ import (
"os"
"path"
"strings"
"time"
"k8s.io/node-problem-detector/pkg/util/metrics"
"k8s.io/node-problem-detector/test/e2e/lib/gce"
"k8s.io/node-problem-detector/test/e2e/lib/npd"
"github.com/onsi/ginkgo"
"github.com/onsi/ginkgo/config"
. "github.com/onsi/gomega"
"github.com/pborman/uuid"
)
@@ -57,42 +60,77 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
},
*image,
*imageProject)
if err != nil {
ginkgo.Fail(fmt.Sprintf("Unable to create test instance: %v", err))
}
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to create test instance: %v", err))
err = npd.SetupNPD(instance, *npdBuildTar)
if err != nil {
ginkgo.Fail(fmt.Sprintf("Unable to setup NPD: %v", err))
}
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to setup NPD: %v", err))
})
ginkgo.Context("On a clean node", func() {
ginkgo.It("NPD should export host_uptime metric", func() {
err := npd.WaitForNPD(instance, []string{"host_uptime"}, 120)
if err != nil {
ginkgo.Fail(fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
}
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
gotMetrics, err := npd.FetchNPDMetrics(instance)
if err != nil {
ginkgo.Fail(fmt.Sprintf("Error fetching NPD metrics: %v", err))
}
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Error fetching NPD metrics: %v", err))
_, err = metrics.GetFloat64Metric(gotMetrics, "host_uptime", map[string]string{}, false)
if err != nil {
ginkgo.Fail(fmt.Sprintf("Failed to find uptime metric: %v.\nHere is all NPD exported metrics: %v",
err, gotMetrics))
}
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to find uptime metric: %v.\nHere is all NPD exported metrics: %v", err, gotMetrics))
})
ginkgo.It("NPD should not report any problem", func() {
	readyErr := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120)
	Expect(readyErr).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", readyErr))

	// The problem gauge for the checked condition must read exactly zero.
	assertMetricValueInBound(instance,
		"problem_gauge", map[string]string{"reason": "DockerHung", "type": "KernelDeadlock"},
		0.0, 0.0)

	// Every checked problem counter must read exactly zero as well.
	for _, reason := range []string{"DockerHung", "FilesystemIsReadOnly", "KernelOops", "OOMKilling"} {
		assertMetricValueInBound(instance,
			"problem_counter", map[string]string{"reason": reason},
			0.0, 0.0)
	}
})
})
// Specs in this context run after an ext4 filesystem error has been injected on the test instance.
ginkgo.Context("When ext4 filesystem error happens", func() {
ginkgo.BeforeEach(func() {
// Wait (up to 120 seconds) for NPD to export problem_gauge before injecting the problem.
err := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120)
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
// This will trigger an ext4 error on the boot disk, causing the boot disk to be mounted as read-only and systemd-journald to crash.
instance.RunCommandOrFail("sudo /home/kubernetes/bin/problem-maker --problem Ext4FilesystemError")
})
ginkgo.It("NPD should update problem_counter{reason:Ext4Error} and problem_gauge{type:ReadonlyFilesystem}", func() {
// NOTE(review): fixed pause before reading metrics, presumably to give NPD time to
// pick up the injected error -- confirm 5s is long enough on slow VMs.
time.Sleep(5 * time.Second)
// The Ext4Error counter must be within [1, 2].
assertMetricValueInBound(instance,
"problem_counter", map[string]string{"reason": "Ext4Error"},
1.0, 2.0)
// The read-only filesystem gauge must be exactly 1.
assertMetricValueInBound(instance,
"problem_gauge", map[string]string{"reason": "FilesystemIsReadOnly", "type": "ReadonlyFilesystem"},
1.0, 1.0)
})
ginkgo.It("NPD should remain healthy", func() {
// Ask systemd for the ActiveState and SubState properties of the node-problem-detector unit.
npdStates := instance.RunCommandOrFail("sudo systemctl show node-problem-detector -p ActiveState -p SubState")
Expect(npdStates.Stdout).To(ContainSubstring("ActiveState=active"), "NPD is no longer active: %v", npdStates)
Expect(npdStates.Stdout).To(ContainSubstring("SubState=running"), "NPD is no longer running: %v", npdStates)
})
})
ginkgo.AfterEach(func() {
defer func() {
err := instance.DeleteInstance()
if err != nil {
ginkgo.Fail(fmt.Sprintf("Failed to clean up the test VM: %v", err))
}
// Fail the spec when the test VM could not be deleted.
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to clean up the test VM: %v", err))
}()
artifactSubDir := ""
@@ -109,9 +147,20 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
}
}
errs := npd.SaveTestArtifacts(instance, artifactSubDir)
errs := npd.SaveTestArtifacts(instance, artifactSubDir, config.GinkgoConfig.ParallelNode)
if len(errs) != 0 {
fmt.Printf("Error storing debugging data to test artifacts: %v", errs)
}
})
})
// assertMetricValueInBound fetches the NPD metric identified by metricName and
// labels from the instance and asserts that lowBound <= value <= highBound.
//
// The running spec fails when the metric cannot be fetched or when its value
// falls outside the inclusive bounds. Pass equal bounds to assert an exact value.
func assertMetricValueInBound(instance gce.Instance, metricName string, labels map[string]string, lowBound float64, highBound float64) {
	value, err := npd.FetchNPDMetric(instance, metricName, labels)
	// Use a Gomega assertion, like the rest of this file, rather than calling ginkgo.Fail directly.
	Expect(err).NotTo(HaveOccurred(),
		"Failed to find %s metric with label %v: %v", metricName, labels, err)

	Expect(value).Should(BeNumerically(">=", lowBound),
		"Got value for metric %s with label %v: %v, expect at least %v.", metricName, labels, value, lowBound)
	Expect(value).Should(BeNumerically("<=", highBound),
		"Got value for metric %s with label %v: %v, expect at most %v.", metricName, labels, value, highBound)
}