diff --git a/Makefile b/Makefile index 78faa759..e528df84 100644 --- a/Makefile +++ b/Makefile @@ -40,6 +40,9 @@ PKG:=k8s.io/node-problem-detector # PKG_SOURCES are all the go source code. PKG_SOURCES:=$(shell find pkg cmd -name '*.go') +# PARALLEL specifies the number of parallel test nodes to run for e2e tests. +PARALLEL?=3 + # TARBALL is the name of release tar. Include binary version by default. TARBALL?=node-problem-detector-$(VERSION).tar.gz @@ -122,8 +125,8 @@ test: vet fmt GO111MODULE=on go test -mod vendor -timeout=1m -v -race -short -tags "$(BUILD_TAGS)" ./... e2e-test: vet fmt build-tar - GO111MODULE=on go test -mod vendor -timeout=10m -v -tags "$(BUILD_TAGS)" \ - ./test/e2e/metriconly/... \ + GO111MODULE=on ginkgo -nodes=$(PARALLEL) -mod vendor -timeout=10m -v -tags "$(BUILD_TAGS)" \ + ./test/e2e/metriconly/... -- \ -project=$(PROJECT) -zone=$(ZONE) \ -image=$(VM_IMAGE) -image-family=$(IMAGE_FAMILY) -image-project=$(IMAGE_PROJECT) \ -ssh-user=$(SSH_USER) -ssh-key=$(SSH_KEY) \ diff --git a/test/e2e-install.sh b/test/e2e-install.sh index ee5d29a4..a789ca46 100755 --- a/test/e2e-install.sh +++ b/test/e2e-install.sh @@ -67,6 +67,9 @@ function install-npd() { echo "Installing NPD systemd service." cp "${workdir}"/config/systemd/node-problem-detector-metric-only.service /etc/systemd/system/node-problem-detector.service + echo "Installing problem maker binary, used only for e2e testing." + cp "${workdir}"/test/bin/problem-maker "${BIN_DIR}" + rm -rf "${workdir}" # Start systemd service. diff --git a/test/e2e/README.md b/test/e2e/README.md index 2c9e23da..938edbae 100644 --- a/test/e2e/README.md +++ b/test/e2e/README.md @@ -6,7 +6,7 @@ Currently the tests only support Google Compute Engine (GCE) environment. Suppor ## Prerequisites -1. 
Setup [Google Application Default Credentials](https://developers.google.com/identity/protocols/application-default-credentials), which is [required for authentication](https://godoc.org/google.golang.org/api/compute/v1#hdr-Creating_a_client) by the Compute Engine API. +1. Setup [Google Application Default Credentials (ADC)](https://developers.google.com/identity/protocols/application-default-credentials), which is [required for authentication](https://godoc.org/google.golang.org/api/compute/v1#hdr-Creating_a_client) by the Compute Engine API. 2. Setup a [project-wide SSH key](https://cloud.google.com/compute/docs/instances/adding-removing-ssh-keys#project-wide) that can be used to SSH into the GCE VMs. ## Running tests @@ -21,5 +21,6 @@ export VM_IMAGE=[TESTED_OS_IMAGE:cos-73-11647-217-0] export IMAGE_PROJECT=[TESTED_OS_IMAGE_PROJECT:cos-cloud] export SSH_USER=${USER} export SSH_KEY=~/.ssh/id_rsa +export ARTIFACTS=/tmp/npd make e2e-test ``` diff --git a/test/e2e/lib/gce/instance.go b/test/e2e/lib/gce/instance.go index 69d13556..b8c0f9fc 100644 --- a/test/e2e/lib/gce/instance.go +++ b/test/e2e/lib/gce/instance.go @@ -23,6 +23,7 @@ import ( "k8s.io/node-problem-detector/test/e2e/lib/ssh" + . "github.com/onsi/gomega" compute "google.golang.org/api/compute/v1" ) @@ -145,6 +146,14 @@ func (ins *Instance) RunCommand(cmd string) ssh.Result { return ssh.Run(cmd, ins.ExternalIP, ins.SshUser, ins.SshKey) } +// RunCommandOrFail runs a command on the GCE instance and returns the command result, and fails the test when the command fails. +func (ins *Instance) RunCommandOrFail(cmd string) ssh.Result { + result := ins.RunCommand(cmd) + Expect(result.SSHError).ToNot(HaveOccurred(), "SSH-ing to the instance failed: %v\n", result) + Expect(result.Code).To(Equal(0), "Running command failed: %v\n", result) + return result +} + // PushFile pushes a local file to a GCE instance. 
func (ins *Instance) PushFile(srcPath, destPath string) error { if ins.ExternalIP == "" { diff --git a/test/e2e/lib/npd/npd.go b/test/e2e/lib/npd/npd.go index e897b885..230a3fd7 100644 --- a/test/e2e/lib/npd/npd.go +++ b/test/e2e/lib/npd/npd.go @@ -29,9 +29,6 @@ import ( "github.com/avast/retry-go" ) -const npdMetricsFilename = "node-problem-detector-metrics.txt" -const npdLogsFilename = "node-problem-detector.log" - // SetupNPD installs NPD from the test tarball onto the provided GCE instance. // // Here is how it works: @@ -91,6 +88,20 @@ func FetchNPDMetrics(ins gce.Instance) ([]metrics.Float64MetricRepresentation, e return npdMetrics, nil } +// FetchNPDMetric fetches and parses a specific metric reported by NPD on the provided GCE instance. +func FetchNPDMetric(ins gce.Instance, metricName string, labels map[string]string) (float64, error) { + gotMetrics, err := FetchNPDMetrics(ins) + if err != nil { + return 0.0, err + } + metric, err := metrics.GetFloat64Metric(gotMetrics, metricName, labels, true) + if err != nil { + return 0.0, fmt.Errorf("Failed to find %s metric with label %v: %v.\nHere is all NPD exported metrics: %v", + metricName, labels, err, gotMetrics) + } + return metric.Value, nil +} + // WaitForNPD waits for NPD to become ready by waiting for expected metrics. func WaitForNPD(ins gce.Instance, metricNames []string, timeoutSeconds uint) error { verifyMetricExist := func() error { @@ -116,30 +127,33 @@ func WaitForNPD(ins gce.Instance, metricNames []string, timeoutSeconds uint) err } // SaveTestArtifacts saves debugging data from NPD. 
-func SaveTestArtifacts(ins gce.Instance, directory string) []error { +func SaveTestArtifacts(ins gce.Instance, artifactDirectory string, testID int) []error { var errs []error - npdMetrics := ins.RunCommand("curl http://localhost:20257/metrics") - if npdMetrics.SSHError != nil || npdMetrics.Code != 0 { - errs = append(errs, fmt.Errorf("Error fetching NPD metrics: %v\n", npdMetrics)) - } else { - npdMetricsPath := path.Join(directory, npdMetricsFilename) - err := ioutil.WriteFile(npdMetricsPath, []byte(npdMetrics.Stdout), 0644) - if err != nil { - errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdMetricsPath, err)) - } + if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID, + "curl http://localhost:20257/metrics", "node-problem-detector-metrics"); err != nil { + errs = append(errs, err) } - - npdLog := ins.RunCommand("sudo journalctl -u node-problem-detector.service") - if npdLog.SSHError != nil || npdLog.Code != 0 { - errs = append(errs, fmt.Errorf("Error fetching NPD logs: %v\n", npdLog)) - } else { - npdLogsPath := path.Join(directory, npdLogsFilename) - err := ioutil.WriteFile(npdLogsPath, []byte(npdLog.Stdout), 0644) - if err != nil { - errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdLogsPath, err)) - } + if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID, + "sudo journalctl -u node-problem-detector.service", "node-problem-detector"); err != nil { + errs = append(errs, err) + } + if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID, + "sudo journalctl -k", "kernel-logs"); err != nil { + errs = append(errs, err) } return errs } + +func saveCommandResultAsArtifact(ins gce.Instance, artifactDirectory string, testID int, command string, artifactPrefix string) error { + artifactPath := path.Join(artifactDirectory, fmt.Sprintf("%v-%02d.txt", artifactPrefix, testID)) + result := ins.RunCommand(command) + if result.SSHError != nil || result.Code != 0 { + return fmt.Errorf("Error running command: 
%v\n", result) + } + if err := ioutil.WriteFile(artifactPath, []byte(result.Stdout), 0644); err != nil { + return fmt.Errorf("Error writing artifact to %v: %v\n", artifactPath, err) + } + return nil +} diff --git a/test/e2e/metriconly/e2e_npd_test.go b/test/e2e/metriconly/e2e_npd_test.go index d496a5a1..6e6108f0 100644 --- a/test/e2e/metriconly/e2e_npd_test.go +++ b/test/e2e/metriconly/e2e_npd_test.go @@ -29,12 +29,12 @@ import ( "k8s.io/test-infra/boskos/client" "github.com/onsi/ginkgo" + "github.com/onsi/ginkgo/config" "github.com/onsi/ginkgo/reporters" + . "github.com/onsi/gomega" compute "google.golang.org/api/compute/v1" ) -const junitFileName = "junit.xml" - var zone = flag.String("zone", "", "gce zone the hosts live in") var project = flag.String("project", "", "gce project the hosts live in") var image = flag.String("image", "", "image to test") @@ -80,7 +80,7 @@ func TestNPD(t *testing.T) { } // The junit formatted result output is for showing test results on testgrid. - junitReporter := reporters.NewJUnitReporter(path.Join(*artifactsDir, junitFileName)) + junitReporter := reporters.NewJUnitReporter(path.Join(*artifactsDir, fmt.Sprintf("junit-%02d.xml", config.GinkgoConfig.ParallelNode))) ginkgo.RunSpecsWithDefaultAndCustomReporters(t, "NPD Metric-only Suite", []ginkgo.Reporter{junitReporter}) } @@ -89,9 +89,8 @@ func acquireProjectOrDie(boskosClient *client.Client) string { ctx, cancel := context.WithTimeout(context.Background(), *boskosWaitDuration) defer cancel() p, err := boskosClient.AcquireWait(ctx, *boskosProjectType, "free", "busy") - if err != nil { - panic(fmt.Sprintf("Unable to rent project from Boskos: %v\n", err)) - } + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to rent project from Boskos: %v\n", err)) + fmt.Printf("Rented project %s from Boskos", p.Name) go func(boskosClient *client.Client, projectName string) { @@ -110,12 +109,11 @@ func releaseProjectOrDie(boskosClient *client.Client) { return } err := 
boskosClient.ReleaseAll("dirty") - if err != nil { - panic(fmt.Sprintf("Failed to release project to Boskos: %v", err)) - } + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to release project to Boskos: %v", err)) } func TestMain(m *testing.M) { + RegisterFailHandler(ginkgo.Fail) flag.Parse() os.Exit(m.Run()) diff --git a/test/e2e/metriconly/metrics_test.go b/test/e2e/metriconly/metrics_test.go index a1001aff..ad3dbecf 100644 --- a/test/e2e/metriconly/metrics_test.go +++ b/test/e2e/metriconly/metrics_test.go @@ -21,12 +21,15 @@ import ( "os" "path" "strings" + "time" "k8s.io/node-problem-detector/pkg/util/metrics" "k8s.io/node-problem-detector/test/e2e/lib/gce" "k8s.io/node-problem-detector/test/e2e/lib/npd" "github.com/onsi/ginkgo" + "github.com/onsi/ginkgo/config" + . "github.com/onsi/gomega" "github.com/pborman/uuid" ) @@ -57,42 +60,77 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() { }, *image, *imageProject) - if err != nil { - ginkgo.Fail(fmt.Sprintf("Unable to create test instance: %v", err)) - } + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to create test instance: %v", err)) err = npd.SetupNPD(instance, *npdBuildTar) - if err != nil { - ginkgo.Fail(fmt.Sprintf("Unable to setup NPD: %v", err)) - } + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to setup NPD: %v", err)) }) ginkgo.Context("On a clean node", func() { ginkgo.It("NPD should export host_uptime metric", func() { err := npd.WaitForNPD(instance, []string{"host_uptime"}, 120) - if err != nil { - ginkgo.Fail(fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err)) - } + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err)) gotMetrics, err := npd.FetchNPDMetrics(instance) - if err != nil { - ginkgo.Fail(fmt.Sprintf("Error fetching NPD metrics: %v", err)) - } + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Error fetching NPD metrics: %v", err)) + _, err = 
metrics.GetFloat64Metric(gotMetrics, "host_uptime", map[string]string{}, false) - if err != nil { - ginkgo.Fail(fmt.Sprintf("Failed to find uptime metric: %v.\nHere is all NPD exported metrics: %v", - err, gotMetrics)) - } + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to find uptime metric: %v.\nHere is all NPD exported metrics: %v", err, gotMetrics)) + }) + + ginkgo.It("NPD should not report any problem", func() { + err := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120) + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err)) + + assertMetricValueInBound(instance, + "problem_gauge", map[string]string{"reason": "DockerHung", "type": "KernelDeadlock"}, + 0.0, 0.0) + assertMetricValueInBound(instance, + "problem_counter", map[string]string{"reason": "DockerHung"}, + 0.0, 0.0) + assertMetricValueInBound(instance, + "problem_counter", map[string]string{"reason": "FilesystemIsReadOnly"}, + 0.0, 0.0) + assertMetricValueInBound(instance, + "problem_counter", map[string]string{"reason": "KernelOops"}, + 0.0, 0.0) + assertMetricValueInBound(instance, + "problem_counter", map[string]string{"reason": "OOMKilling"}, + 0.0, 0.0) + }) + }) + + ginkgo.Context("When ext4 filesystem error happens", func() { + + ginkgo.BeforeEach(func() { + err := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120) + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err)) + // This will trigger an ext4 error on the boot disk, causing the boot disk to be remounted read-only and systemd-journald to crash. 
+ instance.RunCommandOrFail("sudo /home/kubernetes/bin/problem-maker --problem Ext4FilesystemError") + }) + + ginkgo.It("NPD should update problem_counter{reason:Ext4Error} and problem_gauge{type:ReadonlyFilesystem}", func() { + time.Sleep(5 * time.Second) + assertMetricValueInBound(instance, + "problem_counter", map[string]string{"reason": "Ext4Error"}, + 1.0, 2.0) + assertMetricValueInBound(instance, + "problem_gauge", map[string]string{"reason": "FilesystemIsReadOnly", "type": "ReadonlyFilesystem"}, + 1.0, 1.0) + }) + + ginkgo.It("NPD should remain healthy", func() { + npdStates := instance.RunCommandOrFail("sudo systemctl show node-problem-detector -p ActiveState -p SubState") + Expect(npdStates.Stdout).To(ContainSubstring("ActiveState=active"), "NPD is no longer active: %v", npdStates) + Expect(npdStates.Stdout).To(ContainSubstring("SubState=running"), "NPD is no longer running: %v", npdStates) }) }) ginkgo.AfterEach(func() { defer func() { err := instance.DeleteInstance() - if err != nil { - ginkgo.Fail(fmt.Sprintf("Failed to clean up the test VM: %v", err)) - } + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to clean up the test VM: %v", err)) }() artifactSubDir := "" @@ -109,9 +147,20 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() { } } - errs := npd.SaveTestArtifacts(instance, artifactSubDir) + errs := npd.SaveTestArtifacts(instance, artifactSubDir, config.GinkgoConfig.ParallelNode) if len(errs) != 0 { fmt.Printf("Error storing debugging data to test artifacts: %v", errs) } }) }) + +func assertMetricValueInBound(instance gce.Instance, metricName string, labels map[string]string, lowBound float64, highBound float64) { + value, err := npd.FetchNPDMetric(instance, metricName, labels) + if err != nil { + ginkgo.Fail(fmt.Sprintf("Failed to find %s metric with label %v: %v", metricName, labels, err)) + } + Expect(value).Should(BeNumerically(">=", lowBound), + "Got value for metric %s with label %v: %v, expect at least %v.", 
metricName, labels, value, lowBound) + Expect(value).Should(BeNumerically("<=", highBound), + "Got value for metric %s with label %v: %v, expect at most %v.", metricName, labels, value, highBound) +}