Add problem maker to simulate problems for e2e test

This commit is contained in:
Xuewei Zhang
2019-11-20 14:49:21 -08:00
parent 40cb3e0fec
commit 5da72e86bb
9 changed files with 302 additions and 2 deletions

1
.gitignore vendored
View File

@@ -1,5 +1,6 @@
/bin/
/Dockerfile
/test/bin/
/*.tar.gz
ci.env
pr.env

View File

@@ -103,6 +103,13 @@ endif
-tags "$(BUILD_TAGS)" \
./cmd/nodeproblemdetector
# Build the problem-maker binary (Linux target, vendored modules) from
# ./test/e2e/problemmaker/problem_maker.go into test/bin/problem-maker.
./test/bin/problem-maker: $(PKG_SOURCES)
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
-mod vendor \
-o test/bin/problem-maker \
-tags "$(BUILD_TAGS)" \
./test/e2e/problemmaker/problem_maker.go
Dockerfile: Dockerfile.in
sed -e 's|@BASEIMAGE@|$(BASEIMAGE)|g' $< >$@
ifneq ($(ENABLE_JOURNALD), 1)
@@ -129,8 +136,8 @@ build-binaries: ./bin/node-problem-detector ./bin/log-counter
build-container: build-binaries Dockerfile
docker build -t $(IMAGE) .
build-tar: ./bin/node-problem-detector ./bin/log-counter
tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh
build-tar: ./bin/node-problem-detector ./bin/log-counter ./test/bin/problem-maker
tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh test/bin/problem-maker
sha1sum $(TARBALL)
md5sum $(TARBALL)
@@ -156,4 +163,5 @@ push: push-container push-tar
clean:
rm -f bin/log-counter
rm -f bin/node-problem-detector
rm -f test/bin/problem-maker
rm -f node-problem-detector-*.tar.gz

View File

@@ -249,6 +249,26 @@ Kubernetes cluster to a healthy state. The following remedy systems exist:
[this issue](https://github.com/kubernetes/node-problem-detector/issues/199)
for an example production use case for Draino.
# Testing
NPD is tested via unit tests, [NPD e2e tests](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/README.md), Kubernetes e2e tests and Kubernetes nodes e2e tests. Prow handles the [pre-submit tests](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/node-problem-detector/node-problem-detector-presubmits.yaml) and [CI tests](https://github.com/kubernetes/test-infra/blob/master/config/jobs/kubernetes/node-problem-detector/node-problem-detector-ci.yaml).
CI test results can be found below:
1. [Unit tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-test)
2. [NPD e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-test)
3. [Kubernetes e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-kubernetes-gce-gci)
4. [Kubernetes nodes e2e tests](https://k8s-testgrid.appspot.com/sig-node-node-problem-detector#ci-npd-e2e-node)
## Running tests
Unit tests are run via `make test`.
See [NPD e2e test documentation](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/README.md) for how to setup and run NPD e2e tests.
## Problem Maker
[Problem maker](https://github.com/kubernetes/node-problem-detector/blob/master/test/e2e/problemmaker/README.md) is a program used in NPD e2e tests to generate/simulate node problems. It is ONLY intended to be used by NPD e2e tests. Please do NOT run it on your workstation, as it could cause real node problems.
# Docs
* [Custom plugin monitor](docs/custom_plugin_monitor.md)

View File

@@ -0,0 +1,20 @@
# Problem Maker
Problem maker is a program to generate/simulate various kinds of node problems. It is used in NPD e2e tests to verify NPD's behavior when node problems happen:
1. NPD should report the problems correctly.
2. NPD should survive the problems as much as possible.
**Problem maker is NOT intended to be used in any other places. And please do NOT run this directly on your workstation, as it can cause real OS failures.** For example, running `sudo problem-maker --problem Ext4FilesystemError` will cause an ext4 file system error, which could result in the boot disk being mounted as readonly, requiring a reboot to recover from the failure.
You shouldn't need to run it anyway. If you want to test NPD, it's best to run the NPD e2e tests.
## Developing/Testing Problem Maker
If you want to extend the set of problems that problem maker can generate, you may want to run it to test the new behavior. The recommended way to do so is to run it in a VM:
```
sudo problem-maker --help
sudo problem-maker --problem DockerHung
sudo problem-maker --problem Ext4FilesystemError
```
Problem maker tries to generate real node problems, and can cause real node failures. When we do not have a good way to generate a problem, we instruct problem maker to simulate it by injecting logs. In most (if not all) scenarios, generating real problems is preferred over injecting logs. This is because log patterns can change when the kernel is upgraded, and the NPD e2e tests are supposed to verify whether NPD can correctly understand the tested kernel.

View File

@@ -0,0 +1,45 @@
/*
Copyright 2019 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package makers
// init registers makeDockerHung in ProblemGenerators under the key
// "DockerHung".
func init() {
	ProblemGenerators["DockerHung"] = makeDockerHung
}
// makeDockerHung hands a canned kernel "task docker blocked for more than 120
// seconds" report, including its call trace, to writeKernelMessageOrDie.
func makeDockerHung() {
	const hungTaskReport = `INFO: task docker:20744 blocked for more than 120 seconds.
Tainted: G C 3.16.0-4-amd64 #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
docker D ffff8801a8f2b078 0 20744 1 0x00000000
ffff8801a8f2ac20 0000000000000082 0000000000012f00 ffff880057a17fd8
0000000000012f00 ffff8801a8f2ac20 ffffffff818bb4a0 ffff880057a17d80
ffffffff818bb4a4 ffff8801a8f2ac20 00000000ffffffff ffffffff818bb4a8
Call Trace:
[<ffffffff81510915>] ? schedule_preempt_disabled+0x25/0x70
[<ffffffff815123c3>] ? __mutex_lock_slowpath+0xd3/0x1c0
[<ffffffff815124cb>] ? mutex_lock+0x1b/0x2a
[<ffffffff814175bc>] ? copy_net_ns+0x6c/0x130
[<ffffffff8108bdf4>] ? create_new_namespaces+0xf4/0x180
[<ffffffff8108beec>] ? copy_namespaces+0x6c/0x90
[<ffffffff810654f6>] ? copy_process.part.25+0x966/0x1c30
[<ffffffff81066991>] ? do_fork+0xe1/0x390
[<ffffffff811c442c>] ? __alloc_fd+0x7c/0x120
[<ffffffff81514079>] ? stub_clone+0x69/0x90
[<ffffffff81513d0d>] ? system_call_fast_compare_end+0x10/0x15`

	writeKernelMessageOrDie(hungTaskReport)
}

View File

@@ -0,0 +1,37 @@
/*
Copyright 2019 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package makers
import (
"io/ioutil"
"github.com/golang/glog"
)
// init registers makeFilesystemError in ProblemGenerators under the key
// "Ext4FilesystemError".
func init() {
	ProblemGenerators["Ext4FilesystemError"] = makeFilesystemError
}
// ext4ErrorTrigger is the sysfs file that makeFilesystemError writes to.
// Note that the device name (sda1) is fixed.
const ext4ErrorTrigger = "/sys/fs/ext4/sda1/trigger_fs_error"

// makeFilesystemError writes a fixed message to ext4ErrorTrigger and
// terminates the program through glog.Fatalf if the write fails.
func makeFilesystemError() {
	payload := []byte("fake filesystem error from problem-maker")
	if err := ioutil.WriteFile(ext4ErrorTrigger, payload, 0200); err != nil {
		glog.Fatalf("Failed writting log to %q: %v", ext4ErrorTrigger, err)
	}
}

View File

@@ -0,0 +1,46 @@
/*
Copyright 2019 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package makers
import (
"io/ioutil"
"strings"
"github.com/golang/glog"
)
// init registers makeOOMKill in ProblemGenerators under the key "OOMKill".
func init() {
	ProblemGenerators["OOMKill"] = makeOOMKill
}
// kmsgPath is the file that writeKernelMessageOrDie writes messages to.
const kmsgPath = "/dev/kmsg"

// makeOOMKill hands a canned two-line memory-cgroup OOM kill report to
// writeKernelMessageOrDie.
func makeOOMKill() {
	const oomKillReport = `Memory cgroup out of memory: Kill process 1012 (heapster) score 1035 or sacrifice child
Killed process 1012 (heapster) total-vm:327128kB, anon-rss:306328kB, file-rss:11132kB, shmem-rss:12345kB`

	writeKernelMessageOrDie(oomKillReport)
}
// writeKernelMessageOrDie splits msg on newlines and writes every line to
// kmsgPath with its own ioutil.WriteFile call. The first failed write
// terminates the program through glog.Fatalf.
func writeKernelMessageOrDie(msg string) {
	lines := strings.Split(msg, "\n")
	for i := range lines {
		if err := ioutil.WriteFile(kmsgPath, []byte(lines[i]), 0644); err != nil {
			glog.Fatalf("Failed writting to %q: %v", kmsgPath, err)
		}
	}
}

View File

@@ -0,0 +1,27 @@
/*
Copyright 2019 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package makers
// ProblemGenerators maps a problem type name to the function that generates
// that problem. Entries are added by the init functions of this package.
var ProblemGenerators = map[string]func(){}

// GetProblemTypes returns the names of all registered problem types. The
// order follows map iteration and is therefore not stable between calls; the
// result is nil when nothing is registered.
func GetProblemTypes() []string {
	var names []string
	for name := range ProblemGenerators {
		names = append(names, name)
	}
	return names
}

View File

@@ -0,0 +1,96 @@
/*
Copyright 2019 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"flag"
"fmt"
"os"
"strings"
"time"
"github.com/golang/glog"
"github.com/spf13/pflag"
"k8s.io/node-problem-detector/test/e2e/problemmaker/makers"
)
// init adds the flags registered on the standard library flag.CommandLine to
// pflag.CommandLine.
func init() {
	pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
}
// options holds the values of the problem maker command line flags.
type options struct {
	// Rate is the number of times the problem is generated per second
	// (--rate).
	Rate float32
	// Duration is how long problem maker keeps generating problems
	// (--duration).
	Duration time.Duration
	// Problem is the type of problem to generate (--problem).
	Problem string
}
// AddFlags registers the problem maker command line options (--rate,
// --duration and --problem) on fs, binding them to the fields of o.
func (o *options) AddFlags(fs *pflag.FlagSet) {
	fs.Float32Var(&o.Rate, "rate", 1.0,
		"Number of times the problem should be generated per second")
	fs.DurationVar(&o.Duration, "duration", time.Second,
		"Duration for problem maker to keep generating problems")

	// The usage text lists whatever problem types are registered at this point.
	supported := strings.Join(makers.GetProblemTypes(), ", ")
	problemUsage := fmt.Sprintf("The type of problem to be generated. Supported types: %q", supported)
	fs.StringVar(&o.Problem, "problem", "", problemUsage)
}
// main parses the command line, looks up the generator for the requested
// --problem and invokes it --rate times per second until --duration has
// elapsed. Invalid arguments terminate the program through glog.Fatalf.
func main() {
	// Set glog flag so that it does not log to files.
	if err := flag.Set("logtostderr", "true"); err != nil {
		fmt.Printf("Failed to set logtostderr=true: %v\n", err)
		os.Exit(1)
	}

	o := options{}
	o.AddFlags(pflag.CommandLine)
	pflag.Parse()

	if o.Problem == "" {
		glog.Fatalf("Please specify the type of problem to make using the --problem argument.")
	}

	problemGenerator, ok := makers.ProblemGenerators[o.Problem]
	if !ok {
		glog.Fatalf("Expected to see a problem type of one of %q, but got %q.",
			makers.GetProblemTypes(), o.Problem)
	}

	// time.NewTicker panics on a non-positive interval, so reject rates that
	// cannot be turned into a valid period. The negated comparison also
	// rejects NaN.
	if !(o.Rate > 0) {
		glog.Fatalf("Expected --rate to be a positive number, but got %v.", o.Rate)
	}
	// Compute the period in nanoseconds so that rates above 1000 per second
	// do not truncate to a zero interval.
	periodNanos := float64(time.Second) / float64(o.Rate)
	if periodNanos < 1 || periodNanos >= float64(1<<63) {
		glog.Fatalf("The --rate value %v results in an unsupported interval between problems.", o.Rate)
	}

	ticker := time.NewTicker(time.Duration(periodNanos))
	defer ticker.Stop()

	// The timer fires once after --duration and ends the loop below.
	deadline := time.NewTimer(o.Duration)
	defer deadline.Stop()

	// NOTE(review): the first problem is generated one full period after
	// start. When the period is not shorter than --duration (as with the
	// default flags), the deadline can win and no problem is generated.
	// Confirm whether the callers rely on this.
	for {
		select {
		case <-deadline.C:
			return
		case <-ticker.C:
			glog.Infof("Generating problem: %q", o.Problem)
			problemGenerator()
		}
	}
}