Add e2e tests for reporting filesystem problems

Also added support for running e2e tests in parallel.
2026-05-06 17:27:16 +00:00 · 2019-09-19 11:52:11 -07:00
parent b3f811d171
commit dd37dfe12c
7 changed files with 132 additions and 55 deletions
--- a/test/e2e-install.sh
+++ b/test/e2e-install.sh
@@ -67,6 +67,9 @@ function install-npd() {
  echo "Installing NPD systemd service."
  cp "${workdir}"/config/systemd/node-problem-detector-metric-only.service /etc/systemd/system/node-problem-detector.service

+  echo "Installing problem maker binary, used only for e2e testing."
+  cp "${workdir}"/test/bin/problem-maker "${BIN_DIR}"
+
  rm -rf "${workdir}"

  # Start systemd service.
--- a/test/e2e/README.md
+++ b/test/e2e/README.md
@@ -6,7 +6,7 @@ Currently the tests only support Google Compute Engine (GCE) environment. Suppor

 ## Prerequisites

-1. Setup [Google Application Default Credentials](https://developers.google.com/identity/protocols/application-default-credentials), which is [required for authentication](https://godoc.org/google.golang.org/api/compute/v1#hdr-Creating_a_client) by the Compute Engine API.
+1. Setup [Google Application Default Credentials (ADC)](https://developers.google.com/identity/protocols/application-default-credentials), which is [required for authentication](https://godoc.org/google.golang.org/api/compute/v1#hdr-Creating_a_client) by the Compute Engine API.
 2. Setup a [project-wide SSH key](https://cloud.google.com/compute/docs/instances/adding-removing-ssh-keys#project-wide) that can be used to SSH into the GCE VMs.

 ## Running tests
@@ -21,5 +21,6 @@ export VM_IMAGE=[TESTED_OS_IMAGE:cos-73-11647-217-0]
 export IMAGE_PROJECT=[TESTED_OS_IMAGE_PROJECT:cos-cloud]
 export SSH_USER=${USER}
 export SSH_KEY=~/.ssh/id_rsa
+export ARTIFACTS=/tmp/npd
 make e2e-test
 ```
--- a/test/e2e/lib/gce/instance.go
+++ b/test/e2e/lib/gce/instance.go
@@ -23,6 +23,7 @@ import (

 	"k8s.io/node-problem-detector/test/e2e/lib/ssh"

+	. "github.com/onsi/gomega"
 	compute "google.golang.org/api/compute/v1"
 )

@@ -145,6 +146,14 @@ func (ins *Instance) RunCommand(cmd string) ssh.Result {
 	return ssh.Run(cmd, ins.ExternalIP, ins.SshUser, ins.SshKey)
 }

+// RunCommandOrFail runs a command on the GCE instance and returns its result, failing the test if SSH fails or the exit code is non-zero.
+func (ins *Instance) RunCommandOrFail(cmd string) ssh.Result {
+	result := ins.RunCommand(cmd)
+	Expect(result.SSHError).ToNot(HaveOccurred(), "SSH-ing to the instance failed: %v\n", result)
+	Expect(result.Code).To(Equal(0), "Running command failed: %v\n", result)
+	return result
+}
+
 // PushFile pushes a local file to a GCE instance.
 func (ins *Instance) PushFile(srcPath, destPath string) error {
 	if ins.ExternalIP == "" {
--- a/test/e2e/lib/npd/npd.go
+++ b/test/e2e/lib/npd/npd.go
@@ -29,9 +29,6 @@ import (
 	"github.com/avast/retry-go"
 )

-const npdMetricsFilename = "node-problem-detector-metrics.txt"
-const npdLogsFilename = "node-problem-detector.log"
-
 // SetupNPD installs NPD from the test tarball onto the provided GCE instance.
 //
 // Here is how it works:
@@ -91,6 +88,20 @@ func FetchNPDMetrics(ins gce.Instance) ([]metrics.Float64MetricRepresentation, e
 	return npdMetrics, nil
 }

+// FetchNPDMetric returns the value of the single NPD metric on the given GCE instance that matches metricName and labels.
+func FetchNPDMetric(ins gce.Instance, metricName string, labels map[string]string) (float64, error) {
+	allMetrics, fetchErr := FetchNPDMetrics(ins)
+	if fetchErr != nil {
+		return 0.0, fetchErr
+	}
+	found, lookupErr := metrics.GetFloat64Metric(allMetrics, metricName, labels, true)
+	if lookupErr == nil {
+		return found.Value, nil
+	}
+	return 0.0, fmt.Errorf("Failed to find %s metric with label %v: %v.\nHere is all NPD exported metrics: %v",
+		metricName, labels, lookupErr, allMetrics)
+}
+
 // WaitForNPD waits for NPD to become ready by waiting for expected metrics.
 func WaitForNPD(ins gce.Instance, metricNames []string, timeoutSeconds uint) error {
 	verifyMetricExist := func() error {
@@ -116,30 +127,33 @@ func WaitForNPD(ins gce.Instance, metricNames []string, timeoutSeconds uint) err
 }

 // SaveTestArtifacts saves debugging data from NPD.
-func SaveTestArtifacts(ins gce.Instance, directory string) []error {
+func SaveTestArtifacts(ins gce.Instance, artifactDirectory string, testID int) []error {
 	var errs []error

-	npdMetrics := ins.RunCommand("curl http://localhost:20257/metrics")
-	if npdMetrics.SSHError != nil || npdMetrics.Code != 0 {
-		errs = append(errs, fmt.Errorf("Error fetching NPD metrics: %v\n", npdMetrics))
-	} else {
-		npdMetricsPath := path.Join(directory, npdMetricsFilename)
-		err := ioutil.WriteFile(npdMetricsPath, []byte(npdMetrics.Stdout), 0644)
-		if err != nil {
-			errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdMetricsPath, err))
-		}
+	if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
+		"curl http://localhost:20257/metrics", "node-problem-detector-metrics"); err != nil {
+		errs = append(errs, err)
 	}
-
-	npdLog := ins.RunCommand("sudo journalctl -u node-problem-detector.service")
-	if npdLog.SSHError != nil || npdLog.Code != 0 {
-		errs = append(errs, fmt.Errorf("Error fetching NPD logs: %v\n", npdLog))
-	} else {
-		npdLogsPath := path.Join(directory, npdLogsFilename)
-		err := ioutil.WriteFile(npdLogsPath, []byte(npdLog.Stdout), 0644)
-		if err != nil {
-			errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdLogsPath, err))
-		}
+	if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
+		"sudo journalctl -u node-problem-detector.service", "node-problem-detector"); err != nil {
+		errs = append(errs, err)
+	}
+	if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
+		"sudo journalctl -k", "kernel-logs"); err != nil {
+		errs = append(errs, err)
 	}

 	return errs
 }
+
+func saveCommandResultAsArtifact(ins gce.Instance, artifactDirectory string, testID int, command string, artifactPrefix string) error {
+	cmdResult := ins.RunCommand(command) // stdout of this command becomes the artifact content
+	if cmdResult.SSHError != nil || cmdResult.Code != 0 {
+		return fmt.Errorf("Error running command: %v\n", cmdResult)
+	}
+	artifactPath := path.Join(artifactDirectory, fmt.Sprintf("%v-%02d.txt", artifactPrefix, testID)) // <prefix>-<testID>.txt
+	if writeErr := ioutil.WriteFile(artifactPath, []byte(cmdResult.Stdout), 0644); writeErr != nil {
+		return fmt.Errorf("Error writing artifact to %v: %v\n", artifactPath, writeErr)
+	}
+	return nil
+}
--- a/test/e2e/metriconly/e2e_npd_test.go
+++ b/test/e2e/metriconly/e2e_npd_test.go
@@ -29,12 +29,12 @@ import (
 	"k8s.io/test-infra/boskos/client"

 	"github.com/onsi/ginkgo"
+	"github.com/onsi/ginkgo/config"
 	"github.com/onsi/ginkgo/reporters"
+	. "github.com/onsi/gomega"
 	compute "google.golang.org/api/compute/v1"
 )

-const junitFileName = "junit.xml"
-
 var zone = flag.String("zone", "", "gce zone the hosts live in")
 var project = flag.String("project", "", "gce project the hosts live in")
 var image = flag.String("image", "", "image to test")
@@ -80,7 +80,7 @@ func TestNPD(t *testing.T) {
 	}

 	// The junit formatted result output is for showing test results on testgrid.
-	junitReporter := reporters.NewJUnitReporter(path.Join(*artifactsDir, junitFileName))
+	junitReporter := reporters.NewJUnitReporter(path.Join(*artifactsDir, fmt.Sprintf("junit-%02d.xml", config.GinkgoConfig.ParallelNode)))
 	ginkgo.RunSpecsWithDefaultAndCustomReporters(t, "NPD Metric-only Suite", []ginkgo.Reporter{junitReporter})
 }

@@ -89,9 +89,8 @@ func acquireProjectOrDie(boskosClient *client.Client) string {
 	ctx, cancel := context.WithTimeout(context.Background(), *boskosWaitDuration)
 	defer cancel()
 	p, err := boskosClient.AcquireWait(ctx, *boskosProjectType, "free", "busy")
-	if err != nil {
-		panic(fmt.Sprintf("Unable to rent project from Boskos: %v\n", err))
-	}
+	Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to rent project from Boskos: %v\n", err))
+
 	fmt.Printf("Rented project %s from Boskos", p.Name)

 	go func(boskosClient *client.Client, projectName string) {
@@ -110,12 +109,11 @@ func releaseProjectOrDie(boskosClient *client.Client) {
 		return
 	}
 	err := boskosClient.ReleaseAll("dirty")
-	if err != nil {
-		panic(fmt.Sprintf("Failed to release project to Boskos: %v", err))
-	}
+	Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to release project to Boskos: %v", err))
 }

 func TestMain(m *testing.M) {
+	RegisterFailHandler(ginkgo.Fail)
 	flag.Parse()

 	os.Exit(m.Run())
--- a/test/e2e/metriconly/metrics_test.go
+++ b/test/e2e/metriconly/metrics_test.go
@@ -21,12 +21,15 @@ import (
 	"os"
 	"path"
 	"strings"
+	"time"

 	"k8s.io/node-problem-detector/pkg/util/metrics"
 	"k8s.io/node-problem-detector/test/e2e/lib/gce"
 	"k8s.io/node-problem-detector/test/e2e/lib/npd"

 	"github.com/onsi/ginkgo"
+	"github.com/onsi/ginkgo/config"
+	. "github.com/onsi/gomega"
 	"github.com/pborman/uuid"
 )

@@ -57,42 +60,77 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
 			},
 			*image,
 			*imageProject)
-		if err != nil {
-			ginkgo.Fail(fmt.Sprintf("Unable to create test instance: %v", err))
-		}
+		Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to create test instance: %v", err))

 		err = npd.SetupNPD(instance, *npdBuildTar)
-		if err != nil {
-			ginkgo.Fail(fmt.Sprintf("Unable to setup NPD: %v", err))
-		}
+		Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to setup NPD: %v", err))
 	})

 	ginkgo.Context("On a clean node", func() {

 		ginkgo.It("NPD should export host_uptime metric", func() {
 			err := npd.WaitForNPD(instance, []string{"host_uptime"}, 120)
-			if err != nil {
-				ginkgo.Fail(fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
-			}
+			Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))

 			gotMetrics, err := npd.FetchNPDMetrics(instance)
-			if err != nil {
-				ginkgo.Fail(fmt.Sprintf("Error fetching NPD metrics: %v", err))
-			}
+			Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Error fetching NPD metrics: %v", err))
+
 			_, err = metrics.GetFloat64Metric(gotMetrics, "host_uptime", map[string]string{}, false)
-			if err != nil {
-				ginkgo.Fail(fmt.Sprintf("Failed to find uptime metric: %v.\nHere is all NPD exported metrics: %v",
-					err, gotMetrics))
-			}
+			Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to find uptime metric: %v.\nHere is all NPD exported metrics: %v", err, gotMetrics))
+		})
+
+		ginkgo.It("NPD should not report any problem", func() {
+			err := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120)
+			Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
+
+			assertMetricValueInBound(instance,
+				"problem_gauge", map[string]string{"reason": "DockerHung", "type": "KernelDeadlock"},
+				0.0, 0.0)
+			assertMetricValueInBound(instance,
+				"problem_counter", map[string]string{"reason": "DockerHung"},
+				0.0, 0.0)
+			assertMetricValueInBound(instance,
+				"problem_counter", map[string]string{"reason": "FilesystemIsReadOnly"},
+				0.0, 0.0)
+			assertMetricValueInBound(instance,
+				"problem_counter", map[string]string{"reason": "KernelOops"},
+				0.0, 0.0)
+			assertMetricValueInBound(instance,
+				"problem_counter", map[string]string{"reason": "OOMKilling"},
+				0.0, 0.0)
+		})
+	})
+
+	ginkgo.Context("When ext4 filesystem error happens", func() {
+
+		ginkgo.BeforeEach(func() {
+			err := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120)
+			Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
+			// This will trigger a ext4 error on the boot disk, causing the boot disk mounted as read-only and systemd-journald crashing.
+			instance.RunCommandOrFail("sudo /home/kubernetes/bin/problem-maker --problem Ext4FilesystemError")
+		})
+
+		ginkgo.It("NPD should update problem_counter{reason:Ext4Error} and problem_gauge{type:ReadonlyFilesystem}", func() {
+			time.Sleep(5 * time.Second)
+			assertMetricValueInBound(instance,
+				"problem_counter", map[string]string{"reason": "Ext4Error"},
+				1.0, 2.0)
+			assertMetricValueInBound(instance,
+				"problem_gauge", map[string]string{"reason": "FilesystemIsReadOnly", "type": "ReadonlyFilesystem"},
+				1.0, 1.0)
+		})
+
+		ginkgo.It("NPD should remain healthy", func() {
+			npdStates := instance.RunCommandOrFail("sudo systemctl show node-problem-detector -p ActiveState -p SubState")
+			Expect(npdStates.Stdout).To(ContainSubstring("ActiveState=active"), "NPD is no longer active: %v", npdStates)
+			Expect(npdStates.Stdout).To(ContainSubstring("SubState=running"), "NPD is no longer running: %v", npdStates)
 		})
 	})

 	ginkgo.AfterEach(func() {
 		defer func() {
 			err := instance.DeleteInstance()
-			if err != nil {
-				ginkgo.Fail(fmt.Sprintf("Failed to clean up the test VM: %v", err))
-			}
+			Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to clean up the test VM: %v", err))
 		}()

 		artifactSubDir := ""
@@ -109,9 +147,20 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
 			}
 		}

-		errs := npd.SaveTestArtifacts(instance, artifactSubDir)
+		errs := npd.SaveTestArtifacts(instance, artifactSubDir, config.GinkgoConfig.ParallelNode)
 		if len(errs) != 0 {
 			fmt.Printf("Error storing debugging data to test artifacts: %v", errs)
 		}
 	})
 })
+
+// assertMetricValueInBound fails the current spec unless the NPD metric matching metricName
+// and labels can be fetched from instance and its value lies within [lowBound, highBound].
+func assertMetricValueInBound(instance gce.Instance, metricName string, labels map[string]string, lowBound float64, highBound float64) {
+	value, err := npd.FetchNPDMetric(instance, metricName, labels)
+	Expect(err).NotTo(HaveOccurred(), "Failed to find %s metric with label %v: %v", metricName, labels, err)
+	Expect(value).Should(BeNumerically(">=", lowBound),
+		"Got value for metric %s with label %v: %v, expect at least %v.", metricName, labels, value, lowBound)
+	Expect(value).Should(BeNumerically("<=", highBound),
+		"Got value for metric %s with label %v: %v, expect at most %v.", metricName, labels, value, highBound)
+}