mirror of
https://github.com/kubernetes/node-problem-detector.git
synced 2026-03-03 10:10:52 +00:00
Add e2e tests for reporting filesystem problems
Also added support for running e2e tests in parallel.
This commit is contained in:
@@ -67,6 +67,9 @@ function install-npd() {
|
||||
echo "Installing NPD systemd service."
|
||||
cp "${workdir}"/config/systemd/node-problem-detector-metric-only.service /etc/systemd/system/node-problem-detector.service
|
||||
|
||||
echo "Installing problem maker binary, used only for e2e testing."
|
||||
cp "${workdir}"/test/bin/problem-maker "${BIN_DIR}"
|
||||
|
||||
rm -rf "${workdir}"
|
||||
|
||||
# Start systemd service.
|
||||
|
||||
@@ -6,7 +6,7 @@ Currently the tests only support Google Compute Engine (GCE) environment. Suppor
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. Setup [Google Application Default Credentials](https://developers.google.com/identity/protocols/application-default-credentials), which is [required for authentication](https://godoc.org/google.golang.org/api/compute/v1#hdr-Creating_a_client) by the Compute Engine API.
|
||||
1. Set up [Google Application Default Credentials (ADC)](https://developers.google.com/identity/protocols/application-default-credentials), which is [required for authentication](https://godoc.org/google.golang.org/api/compute/v1#hdr-Creating_a_client) by the Compute Engine API.
|
||||
2. Set up a [project-wide SSH key](https://cloud.google.com/compute/docs/instances/adding-removing-ssh-keys#project-wide) that can be used to SSH into the GCE VMs.
|
||||
|
||||
## Running tests
|
||||
@@ -21,5 +21,6 @@ export VM_IMAGE=[TESTED_OS_IMAGE:cos-73-11647-217-0]
|
||||
export IMAGE_PROJECT=[TESTED_OS_IMAGE_PROJECT:cos-cloud]
|
||||
export SSH_USER=${USER}
|
||||
export SSH_KEY=~/.ssh/id_rsa
|
||||
export ARTIFACTS=/tmp/npd
|
||||
make e2e-test
|
||||
```
|
||||
|
||||
@@ -23,6 +23,7 @@ import (
|
||||
|
||||
"k8s.io/node-problem-detector/test/e2e/lib/ssh"
|
||||
|
||||
. "github.com/onsi/gomega"
|
||||
compute "google.golang.org/api/compute/v1"
|
||||
)
|
||||
|
||||
@@ -145,6 +146,14 @@ func (ins *Instance) RunCommand(cmd string) ssh.Result {
|
||||
return ssh.Run(cmd, ins.ExternalIP, ins.SshUser, ins.SshKey)
|
||||
}
|
||||
|
||||
// RunCommand runs a command on the GCE instance and returns the command result, and fails the test when the command failed.
|
||||
func (ins *Instance) RunCommandOrFail(cmd string) ssh.Result {
|
||||
result := ins.RunCommand(cmd)
|
||||
Expect(result.SSHError).ToNot(HaveOccurred(), "SSH-ing to the instance failed: %v\n", result)
|
||||
Expect(result.Code).To(Equal(0), "Running command failed: %v\n", result)
|
||||
return result
|
||||
}
|
||||
|
||||
// PushFile pushes a local file to a GCE instance.
|
||||
func (ins *Instance) PushFile(srcPath, destPath string) error {
|
||||
if ins.ExternalIP == "" {
|
||||
|
||||
@@ -29,9 +29,6 @@ import (
|
||||
"github.com/avast/retry-go"
|
||||
)
|
||||
|
||||
const npdMetricsFilename = "node-problem-detector-metrics.txt"
|
||||
const npdLogsFilename = "node-problem-detector.log"
|
||||
|
||||
// SetupNPD installs NPD from the test tarball onto the provided GCE instance.
|
||||
//
|
||||
// Here is how it works:
|
||||
@@ -91,6 +88,20 @@ func FetchNPDMetrics(ins gce.Instance) ([]metrics.Float64MetricRepresentation, e
|
||||
return npdMetrics, nil
|
||||
}
|
||||
|
||||
// FetchNPDMetric fetches and parses a specific metric reported by NPD on the provided GCE instance.
|
||||
func FetchNPDMetric(ins gce.Instance, metricName string, labels map[string]string) (float64, error) {
|
||||
gotMetrics, err := FetchNPDMetrics(ins)
|
||||
if err != nil {
|
||||
return 0.0, err
|
||||
}
|
||||
metric, err := metrics.GetFloat64Metric(gotMetrics, metricName, labels, true)
|
||||
if err != nil {
|
||||
return 0.0, fmt.Errorf("Failed to find %s metric with label %v: %v.\nHere is all NPD exported metrics: %v",
|
||||
metricName, labels, err, gotMetrics)
|
||||
}
|
||||
return metric.Value, nil
|
||||
}
|
||||
|
||||
// WaitForNPD waits for NPD to become ready by waiting for expected metrics.
|
||||
func WaitForNPD(ins gce.Instance, metricNames []string, timeoutSeconds uint) error {
|
||||
verifyMetricExist := func() error {
|
||||
@@ -116,30 +127,33 @@ func WaitForNPD(ins gce.Instance, metricNames []string, timeoutSeconds uint) err
|
||||
}
|
||||
|
||||
// SaveTestArtifacts saves debugging data from NPD.
|
||||
func SaveTestArtifacts(ins gce.Instance, directory string) []error {
|
||||
func SaveTestArtifacts(ins gce.Instance, artifactDirectory string, testID int) []error {
|
||||
var errs []error
|
||||
|
||||
npdMetrics := ins.RunCommand("curl http://localhost:20257/metrics")
|
||||
if npdMetrics.SSHError != nil || npdMetrics.Code != 0 {
|
||||
errs = append(errs, fmt.Errorf("Error fetching NPD metrics: %v\n", npdMetrics))
|
||||
} else {
|
||||
npdMetricsPath := path.Join(directory, npdMetricsFilename)
|
||||
err := ioutil.WriteFile(npdMetricsPath, []byte(npdMetrics.Stdout), 0644)
|
||||
if err != nil {
|
||||
errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdMetricsPath, err))
|
||||
}
|
||||
if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
|
||||
"curl http://localhost:20257/metrics", "node-problem-detector-metrics"); err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
|
||||
npdLog := ins.RunCommand("sudo journalctl -u node-problem-detector.service")
|
||||
if npdLog.SSHError != nil || npdLog.Code != 0 {
|
||||
errs = append(errs, fmt.Errorf("Error fetching NPD logs: %v\n", npdLog))
|
||||
} else {
|
||||
npdLogsPath := path.Join(directory, npdLogsFilename)
|
||||
err := ioutil.WriteFile(npdLogsPath, []byte(npdLog.Stdout), 0644)
|
||||
if err != nil {
|
||||
errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdLogsPath, err))
|
||||
}
|
||||
if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
|
||||
"sudo journalctl -u node-problem-detector.service", "node-problem-detector"); err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
|
||||
"sudo journalctl -k", "kernel-logs"); err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
|
||||
return errs
|
||||
}
|
||||
|
||||
func saveCommandResultAsArtifact(ins gce.Instance, artifactDirectory string, testID int, command string, artifactPrefix string) error {
|
||||
artifactPath := path.Join(artifactDirectory, fmt.Sprintf("%v-%02d.txt", artifactPrefix, testID))
|
||||
result := ins.RunCommand(command)
|
||||
if result.SSHError != nil || result.Code != 0 {
|
||||
return fmt.Errorf("Error running command: %v\n", result)
|
||||
}
|
||||
if err := ioutil.WriteFile(artifactPath, []byte(result.Stdout), 0644); err != nil {
|
||||
return fmt.Errorf("Error writing artifact to %v: %v\n", artifactPath, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -29,12 +29,12 @@ import (
|
||||
"k8s.io/test-infra/boskos/client"
|
||||
|
||||
"github.com/onsi/ginkgo"
|
||||
"github.com/onsi/ginkgo/config"
|
||||
"github.com/onsi/ginkgo/reporters"
|
||||
. "github.com/onsi/gomega"
|
||||
compute "google.golang.org/api/compute/v1"
|
||||
)
|
||||
|
||||
const junitFileName = "junit.xml"
|
||||
|
||||
var zone = flag.String("zone", "", "gce zone the hosts live in")
|
||||
var project = flag.String("project", "", "gce project the hosts live in")
|
||||
var image = flag.String("image", "", "image to test")
|
||||
@@ -80,7 +80,7 @@ func TestNPD(t *testing.T) {
|
||||
}
|
||||
|
||||
// The junit formatted result output is for showing test results on testgrid.
|
||||
junitReporter := reporters.NewJUnitReporter(path.Join(*artifactsDir, junitFileName))
|
||||
junitReporter := reporters.NewJUnitReporter(path.Join(*artifactsDir, fmt.Sprintf("junit-%02d.xml", config.GinkgoConfig.ParallelNode)))
|
||||
ginkgo.RunSpecsWithDefaultAndCustomReporters(t, "NPD Metric-only Suite", []ginkgo.Reporter{junitReporter})
|
||||
}
|
||||
|
||||
@@ -89,9 +89,8 @@ func acquireProjectOrDie(boskosClient *client.Client) string {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), *boskosWaitDuration)
|
||||
defer cancel()
|
||||
p, err := boskosClient.AcquireWait(ctx, *boskosProjectType, "free", "busy")
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Unable to rent project from Boskos: %v\n", err))
|
||||
}
|
||||
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to rent project from Boskos: %v\n", err))
|
||||
|
||||
fmt.Printf("Rented project %s from Boskos", p.Name)
|
||||
|
||||
go func(boskosClient *client.Client, projectName string) {
|
||||
@@ -110,12 +109,11 @@ func releaseProjectOrDie(boskosClient *client.Client) {
|
||||
return
|
||||
}
|
||||
err := boskosClient.ReleaseAll("dirty")
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("Failed to release project to Boskos: %v", err))
|
||||
}
|
||||
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to release project to Boskos: %v", err))
|
||||
}
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
RegisterFailHandler(ginkgo.Fail)
|
||||
flag.Parse()
|
||||
|
||||
os.Exit(m.Run())
|
||||
|
||||
@@ -21,12 +21,15 @@ import (
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"k8s.io/node-problem-detector/pkg/util/metrics"
|
||||
"k8s.io/node-problem-detector/test/e2e/lib/gce"
|
||||
"k8s.io/node-problem-detector/test/e2e/lib/npd"
|
||||
|
||||
"github.com/onsi/ginkgo"
|
||||
"github.com/onsi/ginkgo/config"
|
||||
. "github.com/onsi/gomega"
|
||||
"github.com/pborman/uuid"
|
||||
)
|
||||
|
||||
@@ -57,42 +60,77 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
|
||||
},
|
||||
*image,
|
||||
*imageProject)
|
||||
if err != nil {
|
||||
ginkgo.Fail(fmt.Sprintf("Unable to create test instance: %v", err))
|
||||
}
|
||||
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to create test instance: %v", err))
|
||||
|
||||
err = npd.SetupNPD(instance, *npdBuildTar)
|
||||
if err != nil {
|
||||
ginkgo.Fail(fmt.Sprintf("Unable to setup NPD: %v", err))
|
||||
}
|
||||
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to setup NPD: %v", err))
|
||||
})
|
||||
|
||||
ginkgo.Context("On a clean node", func() {
|
||||
|
||||
ginkgo.It("NPD should export host_uptime metric", func() {
|
||||
err := npd.WaitForNPD(instance, []string{"host_uptime"}, 120)
|
||||
if err != nil {
|
||||
ginkgo.Fail(fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
|
||||
}
|
||||
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
|
||||
|
||||
gotMetrics, err := npd.FetchNPDMetrics(instance)
|
||||
if err != nil {
|
||||
ginkgo.Fail(fmt.Sprintf("Error fetching NPD metrics: %v", err))
|
||||
}
|
||||
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Error fetching NPD metrics: %v", err))
|
||||
|
||||
_, err = metrics.GetFloat64Metric(gotMetrics, "host_uptime", map[string]string{}, false)
|
||||
if err != nil {
|
||||
ginkgo.Fail(fmt.Sprintf("Failed to find uptime metric: %v.\nHere is all NPD exported metrics: %v",
|
||||
err, gotMetrics))
|
||||
}
|
||||
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to find uptime metric: %v.\nHere is all NPD exported metrics: %v", err, gotMetrics))
|
||||
})
|
||||
|
||||
ginkgo.It("NPD should not report any problem", func() {
|
||||
err := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120)
|
||||
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
|
||||
|
||||
assertMetricValueInBound(instance,
|
||||
"problem_gauge", map[string]string{"reason": "DockerHung", "type": "KernelDeadlock"},
|
||||
0.0, 0.0)
|
||||
assertMetricValueInBound(instance,
|
||||
"problem_counter", map[string]string{"reason": "DockerHung"},
|
||||
0.0, 0.0)
|
||||
assertMetricValueInBound(instance,
|
||||
"problem_counter", map[string]string{"reason": "FilesystemIsReadOnly"},
|
||||
0.0, 0.0)
|
||||
assertMetricValueInBound(instance,
|
||||
"problem_counter", map[string]string{"reason": "KernelOops"},
|
||||
0.0, 0.0)
|
||||
assertMetricValueInBound(instance,
|
||||
"problem_counter", map[string]string{"reason": "OOMKilling"},
|
||||
0.0, 0.0)
|
||||
})
|
||||
})
|
||||
|
||||
ginkgo.Context("When ext4 filesystem error happens", func() {
|
||||
|
||||
ginkgo.BeforeEach(func() {
|
||||
err := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120)
|
||||
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
|
||||
// This triggers an ext4 error on the boot disk, which causes the boot disk to be mounted read-only and systemd-journald to crash.
|
||||
instance.RunCommandOrFail("sudo /home/kubernetes/bin/problem-maker --problem Ext4FilesystemError")
|
||||
})
|
||||
|
||||
ginkgo.It("NPD should update problem_counter{reason:Ext4Error} and problem_gauge{type:ReadonlyFilesystem}", func() {
|
||||
time.Sleep(5 * time.Second)
|
||||
assertMetricValueInBound(instance,
|
||||
"problem_counter", map[string]string{"reason": "Ext4Error"},
|
||||
1.0, 2.0)
|
||||
assertMetricValueInBound(instance,
|
||||
"problem_gauge", map[string]string{"reason": "FilesystemIsReadOnly", "type": "ReadonlyFilesystem"},
|
||||
1.0, 1.0)
|
||||
})
|
||||
|
||||
ginkgo.It("NPD should remain healthy", func() {
|
||||
npdStates := instance.RunCommandOrFail("sudo systemctl show node-problem-detector -p ActiveState -p SubState")
|
||||
Expect(npdStates.Stdout).To(ContainSubstring("ActiveState=active"), "NPD is no longer active: %v", npdStates)
|
||||
Expect(npdStates.Stdout).To(ContainSubstring("SubState=running"), "NPD is no longer running: %v", npdStates)
|
||||
})
|
||||
})
|
||||
|
||||
ginkgo.AfterEach(func() {
|
||||
defer func() {
|
||||
err := instance.DeleteInstance()
|
||||
if err != nil {
|
||||
ginkgo.Fail(fmt.Sprintf("Failed to clean up the test VM: %v", err))
|
||||
}
|
||||
// Fail the spec if the test VM could not be deleted.
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to clean up the test VM: %v", err))
|
||||
}()
|
||||
|
||||
artifactSubDir := ""
|
||||
@@ -109,9 +147,20 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
|
||||
}
|
||||
}
|
||||
|
||||
errs := npd.SaveTestArtifacts(instance, artifactSubDir)
|
||||
errs := npd.SaveTestArtifacts(instance, artifactSubDir, config.GinkgoConfig.ParallelNode)
|
||||
if len(errs) != 0 {
|
||||
fmt.Printf("Error storing debugging data to test artifacts: %v", errs)
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
func assertMetricValueInBound(instance gce.Instance, metricName string, labels map[string]string, lowBound float64, highBound float64) {
|
||||
value, err := npd.FetchNPDMetric(instance, metricName, labels)
|
||||
if err != nil {
|
||||
ginkgo.Fail(fmt.Sprintf("Failed to find %s metric with label %v: %v", metricName, labels, err))
|
||||
}
|
||||
Expect(value).Should(BeNumerically(">=", lowBound),
|
||||
"Got value for metric %s with label %v: %v, expect at least %v.", metricName, labels, value, lowBound)
|
||||
Expect(value).Should(BeNumerically("<=", highBound),
|
||||
"Got value for metric %s with label %v: %v, expect at most %v.", metricName, labels, value, highBound)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user