call out to fio for host filesystem performance (#1275)

* stashing changes

* split filesystem collector into fio and legacy functions

* read fio results into analyzer

* remove test script

* update go.mod

* remove old notes

* go mod tidy

* fix up go.mod

* fix up go.mod

* refactor tests for fio

* make schemas

* remove local scripts

* local watch script for building troubleshoot

* document watch script

* fix var names

* handle errors if run as non-root

* go mod tidy

* use String interface

* collector happy path test

* invalid filesize

* invalid filesize

* tests

* remove old code

* remove old init function

* let actions tests run this

* clean up tests

* go mod tidy

* remove duplicated type declaration

* remove old file create code
ada mancini authored on 2023-10-03 14:21:56 -04:00 (committed by GitHub)
parent 359475ba87
commit e3adc1cb35
9 changed files with 1725 additions and 178 deletions

go.mod

@@ -86,6 +86,7 @@ require (
 	github.com/mistifyio/go-zfs/v3 v3.0.0 // indirect
 	github.com/mitchellh/copystructure v1.2.0 // indirect
 	github.com/mitchellh/reflectwalk v1.0.2 // indirect
+	github.com/onsi/ginkgo v1.14.0 // indirect
 	github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
 	github.com/rubenv/sql-migrate v1.3.1 // indirect
 	github.com/russross/blackfriday/v2 v2.1.0 // indirect
@@ -118,7 +119,7 @@ require (
 	github.com/Microsoft/go-winio v0.6.0 // indirect
 	github.com/Microsoft/hcsshim v0.10.0-rc.7 // indirect
 	github.com/andybalholm/brotli v1.0.1 // indirect
-	github.com/aws/aws-sdk-go v1.44.122 // indirect
+	github.com/aws/aws-sdk-go v1.44.198 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d // indirect
 	github.com/c9s/goprocinfo v0.0.0-20170724085704-0010a05ce49f // indirect
@@ -221,7 +222,7 @@ require (
 	go.opencensus.io v0.24.0 // indirect
 	go.starlark.net v0.0.0-20230525235612-a134d8f9ddca // indirect
 	golang.org/x/crypto v0.12.0 // indirect
-	golang.org/x/net v0.14.0 // indirect
+	golang.org/x/net v0.14.0
 	golang.org/x/oauth2 v0.8.0 // indirect
 	golang.org/x/sys v0.12.0 // indirect
 	golang.org/x/term v0.11.0 // indirect

go.sum

@@ -244,8 +244,9 @@ github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgI
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
github.com/aws/aws-sdk-go v1.44.122 h1:p6mw01WBaNpbdP2xrisz5tIkcNwzj/HysobNoaAHjgo=
github.com/aws/aws-sdk-go v1.44.122/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo=
github.com/aws/aws-sdk-go v1.44.198 h1:kgnvxQv4/kP5M0nbxBx0Ac0so9ndr9f8Ti0g+NmPQF8=
github.com/aws/aws-sdk-go v1.44.198/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -609,7 +610,6 @@ github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO
github.com/hashicorp/mdns v1.0.0/go.mod h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0mNTz8vQ=
github.com/hashicorp/memberlist v0.1.3/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I=
github.com/hashicorp/serf v0.8.2/go.mod h1:6hOLApaqBFA1NXqRQAsxw9QxuDEvNxSQRwA/JwenrHc=
github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/huandu/xstrings v1.3.1/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
github.com/huandu/xstrings v1.3.2/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
@@ -800,13 +800,19 @@ github.com/nsf/termbox-go v0.0.0-20190121233118-02980233997d/go.mod h1:IuKpRQcYE
github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
github.com/nwaples/rardecode v1.1.2 h1:Cj0yZY6T1Zx1R7AhTbyGSALm44/Mmq+BAPc4B/p/d3M=
github.com/nwaples/rardecode v1.1.2/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78=
github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/ginkgo v1.10.1 h1:q/mM8GF/n0shIN8SaAZ0V+jnLPzen6WIVZdiwrRlMlo=
github.com/onsi/ginkgo v1.10.1/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk=
github.com/onsi/ginkgo v1.14.0 h1:2mOpI4JVVPBN+WQRa0WKH2eXR+Ey+uK4n7Zj0aYpIQA=
github.com/onsi/ginkgo v1.14.0/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY=
github.com/onsi/ginkgo/v2 v2.11.0 h1:WgqUCUt/lT6yXoQ8Wef0fsNn5cAuMK7+KT9UFRz2tcU=
github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY=
github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo=
github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
@@ -1138,6 +1144,7 @@ golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/
golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
@@ -1234,12 +1241,14 @@ golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190801041406-cbf593c0f2f3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191010194322-b09406accb47/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -1254,6 +1263,7 @@ golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200519105757-fe76b779f299/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200905004654-be1d3432aa8f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -1648,7 +1658,6 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntN
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/cheggaaa/pb.v1 v1.0.27/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qStrOgw=
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4=
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=


@@ -42,7 +42,18 @@ func (a *AnalyzeHostFilesystemPerformance) Analyze(
 		return nil, errors.Wrapf(err, "failed to get collected file %s", name)
 	}
-	fsPerf := collect.FSPerfResults{}
+	fioResult := collect.FioResult{}
+	if err := json.Unmarshal(contents, &fioResult); err != nil {
+		return nil, errors.Wrapf(err, "failed to unmarshal fio results from %s", name)
+	}
+	if len(fioResult.Jobs) == 0 {
+		return nil, errors.Errorf("no jobs found in fio results from %s", name)
+	}
+	fioWriteLatency := fioResult.Jobs[0].Sync
+	fsPerf := fioWriteLatency.FSPerfResults()
-	if err := json.Unmarshal(contents, &fsPerf); err != nil {
-		return nil, errors.Wrapf(err, "failed to unmarshal filesystem performance results from %s", name)
-	}
@@ -179,7 +190,7 @@ func compareHostFilesystemPerformanceConditionalToActual(conditional string, fsP
 		return doCompareHostFilesystemPerformance(comparator, fsPerf.P9999, desiredDuration)
 	}
-	return false, fmt.Errorf("Unknown filesystem performance keyword %q", keyword)
+	return false, fmt.Errorf("unknown filesystem performance keyword %q", keyword)
 }
func doCompareHostFilesystemPerformance(operator string, actual time.Duration, desired time.Duration) (bool, error) {
@@ -196,7 +207,7 @@ func doCompareHostFilesystemPerformance(operator string, actual time.Duration, d
 		return actual == desired, nil
 	}
-	return false, fmt.Errorf("Unknown filesystem performance operator %q", operator)
+	return false, fmt.Errorf("unknown filesystem performance operator %q", operator)
 }
func renderFSPerfOutcome(outcome string, fsPerf collect.FSPerfResults) string {
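
For reference, the comparison path above is otherwise unchanged by this commit: a conditional such as "p99 < 10ms" splits into a keyword, an operator, and a threshold, and the threshold is compared against the collected percentile as a time.Duration. A minimal runnable sketch of that evaluation (self-contained; the evaluate helper is illustrative, not the analyzer's exact code):

package main

import (
	"fmt"
	"strings"
	"time"
)

// evaluate mimics the keyword/operator/threshold split used by the analyzer:
// parse the threshold with time.ParseDuration, then compare durations directly.
func evaluate(conditional string, p99 time.Duration) (bool, error) {
	parts := strings.Fields(conditional) // e.g. ["p99", "<", "10ms"]
	if len(parts) != 3 || parts[0] != "p99" {
		return false, fmt.Errorf("unsupported conditional %q", conditional)
	}
	desired, err := time.ParseDuration(parts[2])
	if err != nil {
		return false, err
	}
	switch parts[1] {
	case "<":
		return p99 < desired, nil
	case ">":
		return p99 > desired, nil
	}
	return false, fmt.Errorf("unknown filesystem performance operator %q", parts[1])
}

func main() {
	ok, err := evaluate("p99 < 10ms", 9*time.Millisecond)
	fmt.Println(ok, err) // true <nil>
}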


@@ -1,12 +1,9 @@
 package analyzer

 import (
-	"encoding/json"
 	"testing"
-	"time"

 	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
-	"github.com/replicatedhq/troubleshoot/pkg/collect"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -14,35 +11,305 @@ import (
 func TestAnalyzeHostFilesystemPerformance(t *testing.T) {
 	tests := []struct {
 		name         string
-		fsPerf       *collect.FSPerfResults
+		fioResult    string
 		hostAnalyzer *troubleshootv1beta2.FilesystemPerformanceAnalyze
 		result       []*AnalyzeResult
 		expectErr    bool
 	}{
{
name: "Cover",
-			fsPerf: &collect.FSPerfResults{
-				Min: 200 * time.Nanosecond,
-				Max: time.Second,
-				Average: 55 * time.Microsecond,
-				P1: 1 * time.Microsecond,
-				P5: 5 * time.Microsecond,
-				P10: 10 * time.Microsecond,
-				P20: 20 * time.Microsecond,
-				P30: 30 * time.Microsecond,
-				P40: 40 * time.Microsecond,
-				P50: 50 * time.Microsecond,
-				P60: 60 * time.Microsecond,
-				P70: 70 * time.Microsecond,
-				P80: 80 * time.Microsecond,
-				P90: 90 * time.Microsecond,
-				P95: 95 * time.Microsecond,
-				P99: 99 * time.Microsecond,
-				P995: 995 * time.Microsecond,
-				P999: 999 * time.Microsecond,
-				P9995: 5 * time.Millisecond,
-				P9999: 9 * time.Millisecond,
-			},
+			fioResult: `{
"fio version" : "fio-3.28",
"timestamp" : 1691679955,
"timestamp_ms" : 1691679955590,
"time" : "Thu Aug 10 15:05:55 2023",
"global options" : {
"rw" : "write",
"ioengine" : "sync",
"fdatasync" : "1",
"directory" : "/var/lib/etcd",
"size" : "23068672",
"bs" : "1024"
},
"jobs" : [
{
"jobname" : "fsperf",
"groupid" : 0,
"error" : 0,
"eta" : 0,
"elapsed" : 15,
"job options" : {
"name" : "fsperf",
"runtime" : "120"
},
"read" : {
"io_bytes" : 0,
"io_kbytes" : 0,
"bw_bytes" : 0,
"bw" : 0,
"iops" : 0.000000,
"runtime" : 0,
"total_ios" : 0,
"short_ios" : 22527,
"drop_ios" : 0,
"slat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"clat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"lat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"bw_min" : 0,
"bw_max" : 0,
"bw_agg" : 0.000000,
"bw_mean" : 0.000000,
"bw_dev" : 0.000000,
"bw_samples" : 0,
"iops_min" : 0,
"iops_max" : 0,
"iops_mean" : 0.000000,
"iops_stddev" : 0.000000,
"iops_samples" : 0
},
"write" : {
"io_bytes" : 23068672,
"io_kbytes" : 22528,
"bw_bytes" : 1651182,
"bw" : 1612,
"iops" : 1612.483001,
"runtime" : 13971,
"total_ios" : 22528,
"short_ios" : 0,
"drop_ios" : 0,
"slat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"clat_ns" : {
"min" : 200,
"max" : 1000000000,
"mean" : 55000,
"stddev" : 12345.6789,
"N" : 32400,
"percentile" : {
"1.000000" : 1000,
"5.000000" : 5000,
"10.000000" : 10000,
"20.000000" : 20000,
"30.000000" : 30000,
"40.000000" : 40000,
"50.000000" : 50000,
"60.000000" : 60000,
"70.000000" : 70000,
"80.000000" : 80000,
"90.000000" : 90000,
"95.000000" : 95000,
"99.000000" : 99000,
"99.500000" : 995000,
"99.900000" : 999000,
"99.950000" : 5000000,
"99.990000" : 9000000
}
},
"lat_ns" : {
"min" : 2684,
"max" : 8710446,
"mean" : 95169.335405,
"stddev" : 172145.383902,
"N" : 22528
},
"bw_min" : 1516,
"bw_max" : 1706,
"bw_agg" : 100.000000,
"bw_mean" : 1613.629630,
"bw_dev" : 35.708379,
"bw_samples" : 27,
"iops_min" : 1516,
"iops_max" : 1706,
"iops_mean" : 1613.629630,
"iops_stddev" : 35.708379,
"iops_samples" : 27
},
"trim" : {
"io_bytes" : 0,
"io_kbytes" : 0,
"bw_bytes" : 0,
"bw" : 0,
"iops" : 0.000000,
"runtime" : 0,
"total_ios" : 0,
"short_ios" : 0,
"drop_ios" : 0,
"slat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"clat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"lat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"bw_min" : 0,
"bw_max" : 0,
"bw_agg" : 0.000000,
"bw_mean" : 0.000000,
"bw_dev" : 0.000000,
"bw_samples" : 0,
"iops_min" : 0,
"iops_max" : 0,
"iops_mean" : 0.000000,
"iops_stddev" : 0.000000,
"iops_samples" : 0
},
"sync" : {
"total_ios" : 0,
"lat_ns" : {
"min" : 200,
"max" : 1000000000,
"mean" : 55000,
"stddev" : 12345.6789,
"N" : 32400,
"percentile" : {
"1.000000" : 1000,
"5.000000" : 5000,
"10.000000" : 10000,
"20.000000" : 20000,
"30.000000" : 30000,
"40.000000" : 40000,
"50.000000" : 50000,
"60.000000" : 60000,
"70.000000" : 70000,
"80.000000" : 80000,
"90.000000" : 90000,
"95.000000" : 95000,
"99.000000" : 99000,
"99.500000" : 995000,
"99.900000" : 999000,
"99.950000" : 5000000,
"99.990000" : 9000000
}
}
},
"job_runtime" : 13970,
"usr_cpu" : 1.410165,
"sys_cpu" : 5.454545,
"ctx" : 72137,
"majf" : 0,
"minf" : 16,
"iodepth_level" : {
"1" : 199.995561,
"2" : 0.000000,
"4" : 0.000000,
"8" : 0.000000,
"16" : 0.000000,
"32" : 0.000000,
">=64" : 0.000000
},
"iodepth_submit" : {
"0" : 0.000000,
"4" : 100.000000,
"8" : 0.000000,
"16" : 0.000000,
"32" : 0.000000,
"64" : 0.000000,
">=64" : 0.000000
},
"iodepth_complete" : {
"0" : 0.000000,
"4" : 100.000000,
"8" : 0.000000,
"16" : 0.000000,
"32" : 0.000000,
"64" : 0.000000,
">=64" : 0.000000
},
"latency_ns" : {
"2" : 0.000000,
"4" : 0.000000,
"10" : 0.000000,
"20" : 0.000000,
"50" : 0.000000,
"100" : 0.000000,
"250" : 0.000000,
"500" : 0.000000,
"750" : 0.000000,
"1000" : 0.000000
},
"latency_us" : {
"2" : 0.000000,
"4" : 27.077415,
"10" : 42.032138,
"20" : 5.450994,
"50" : 0.306286,
"100" : 0.026634,
"250" : 0.461648,
"500" : 23.291016,
"750" : 1.269531,
"1000" : 0.035511
},
"latency_ms" : {
"2" : 0.026634,
"4" : 0.017756,
"10" : 0.010000,
"20" : 0.000000,
"50" : 0.000000,
"100" : 0.000000,
"250" : 0.000000,
"500" : 0.000000,
"750" : 0.000000,
"1000" : 0.000000,
"2000" : 0.000000,
">=2000" : 0.000000
},
"latency_depth" : 1,
"latency_target" : 0,
"latency_percentile" : 100.000000,
"latency_window" : 0
}
],
"disk_util" : [
{
"name" : "sda",
"read_ios" : 5610,
"write_ios" : 45550,
"read_merges" : 0,
"write_merges" : 568,
"read_ticks" : 1863,
"write_ticks" : 11605,
"in_queue" : 14353,
"util" : 99.435028
}
]
}`,
hostAnalyzer: &troubleshootv1beta2.FilesystemPerformanceAnalyze{
CollectorName: "etcd",
Outcomes: []*troubleshootv1beta2.Outcome{
@@ -298,9 +565,298 @@ func TestAnalyzeHostFilesystemPerformance(t *testing.T) {
},
{
name: "skip warn if pass first",
-			fsPerf: &collect.FSPerfResults{
-				P99: 9 * time.Millisecond,
-			},
+			fioResult: `{
"fio version" : "fio-3.28",
"timestamp" : 1691679955,
"timestamp_ms" : 1691679955590,
"time" : "Thu Aug 10 15:05:55 2023",
"global options" : {
"rw" : "write",
"ioengine" : "sync",
"fdatasync" : "1",
"directory" : "/var/lib/etcd",
"size" : "23068672",
"bs" : "1024"
},
"jobs" : [
{
"jobname" : "fsperf",
"groupid" : 0,
"error" : 0,
"eta" : 0,
"elapsed" : 15,
"job options" : {
"name" : "fsperf",
"runtime" : "120"
},
"read" : {
"io_bytes" : 0,
"io_kbytes" : 0,
"bw_bytes" : 0,
"bw" : 0,
"iops" : 0.000000,
"runtime" : 0,
"total_ios" : 0,
"short_ios" : 22527,
"drop_ios" : 0,
"slat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"clat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"lat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"bw_min" : 0,
"bw_max" : 0,
"bw_agg" : 0.000000,
"bw_mean" : 0.000000,
"bw_dev" : 0.000000,
"bw_samples" : 0,
"iops_min" : 0,
"iops_max" : 0,
"iops_mean" : 0.000000,
"iops_stddev" : 0.000000,
"iops_samples" : 0
},
"write" : {
"io_bytes" : 23068672,
"io_kbytes" : 22528,
"bw_bytes" : 1651182,
"bw" : 1612,
"iops" : 1612.483001,
"runtime" : 13971,
"total_ios" : 22528,
"short_ios" : 0,
"drop_ios" : 0,
"slat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"clat_ns" : {
"min" : 200,
"max" : 1000000000,
"mean" : 55000,
"stddev" : 12345.6789,
"N" : 32400,
"percentile" : {
"1.000000" : 1000,
"5.000000" : 5000,
"10.000000" : 10000,
"20.000000" : 20000,
"30.000000" : 30000,
"40.000000" : 40000,
"50.000000" : 50000,
"60.000000" : 60000,
"70.000000" : 70000,
"80.000000" : 80000,
"90.000000" : 90000,
"95.000000" : 95000,
"99.000000" : 99000,
"99.500000" : 995000,
"99.900000" : 999000,
"99.950000" : 5000000,
"99.990000" : 9000000
}
},
"lat_ns" : {
"min" : 2684,
"max" : 8710446,
"mean" : 95169.335405,
"stddev" : 172145.383902,
"N" : 22528
},
"bw_min" : 1516,
"bw_max" : 1706,
"bw_agg" : 100.000000,
"bw_mean" : 1613.629630,
"bw_dev" : 35.708379,
"bw_samples" : 27,
"iops_min" : 1516,
"iops_max" : 1706,
"iops_mean" : 1613.629630,
"iops_stddev" : 35.708379,
"iops_samples" : 27
},
"trim" : {
"io_bytes" : 0,
"io_kbytes" : 0,
"bw_bytes" : 0,
"bw" : 0,
"iops" : 0.000000,
"runtime" : 0,
"total_ios" : 0,
"short_ios" : 0,
"drop_ios" : 0,
"slat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"clat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"lat_ns" : {
"min" : 0,
"max" : 0,
"mean" : 0.000000,
"stddev" : 0.000000,
"N" : 0
},
"bw_min" : 0,
"bw_max" : 0,
"bw_agg" : 0.000000,
"bw_mean" : 0.000000,
"bw_dev" : 0.000000,
"bw_samples" : 0,
"iops_min" : 0,
"iops_max" : 0,
"iops_mean" : 0.000000,
"iops_stddev" : 0.000000,
"iops_samples" : 0
},
"sync" : {
"total_ios" : 0,
"lat_ns" : {
"min" : 200,
"max" : 1000000000,
"mean" : 55000,
"stddev" : 12345.6789,
"N" : 32400,
"percentile" : {
"1.000000" : 1000,
"5.000000" : 5000,
"10.000000" : 10000,
"20.000000" : 20000,
"30.000000" : 30000,
"40.000000" : 40000,
"50.000000" : 50000,
"60.000000" : 60000,
"70.000000" : 70000,
"80.000000" : 80000,
"90.000000" : 90000,
"95.000000" : 95000,
"99.000000" : 9000000,
"99.500000" : 995000,
"99.900000" : 999000,
"99.950000" : 5000000,
"99.990000" : 9000000
}
}
},
"job_runtime" : 13970,
"usr_cpu" : 1.410165,
"sys_cpu" : 5.454545,
"ctx" : 72137,
"majf" : 0,
"minf" : 16,
"iodepth_level" : {
"1" : 199.995561,
"2" : 0.000000,
"4" : 0.000000,
"8" : 0.000000,
"16" : 0.000000,
"32" : 0.000000,
">=64" : 0.000000
},
"iodepth_submit" : {
"0" : 0.000000,
"4" : 100.000000,
"8" : 0.000000,
"16" : 0.000000,
"32" : 0.000000,
"64" : 0.000000,
">=64" : 0.000000
},
"iodepth_complete" : {
"0" : 0.000000,
"4" : 100.000000,
"8" : 0.000000,
"16" : 0.000000,
"32" : 0.000000,
"64" : 0.000000,
">=64" : 0.000000
},
"latency_ns" : {
"2" : 0.000000,
"4" : 0.000000,
"10" : 0.000000,
"20" : 0.000000,
"50" : 0.000000,
"100" : 0.000000,
"250" : 0.000000,
"500" : 0.000000,
"750" : 0.000000,
"1000" : 0.000000
},
"latency_us" : {
"2" : 0.000000,
"4" : 27.077415,
"10" : 42.032138,
"20" : 5.450994,
"50" : 0.306286,
"100" : 0.026634,
"250" : 0.461648,
"500" : 23.291016,
"750" : 1.269531,
"1000" : 0.035511
},
"latency_ms" : {
"2" : 0.026634,
"4" : 0.017756,
"10" : 0.010000,
"20" : 0.000000,
"50" : 0.000000,
"100" : 0.000000,
"250" : 0.000000,
"500" : 0.000000,
"750" : 0.000000,
"1000" : 0.000000,
"2000" : 0.000000,
">=2000" : 0.000000
},
"latency_depth" : 1,
"latency_target" : 0,
"latency_percentile" : 100.000000,
"latency_window" : 0
}
],
"disk_util" : [
{
"name" : "sda",
"read_ios" : 5610,
"write_ios" : 45550,
"read_merges" : 0,
"write_merges" : 568,
"read_ticks" : 1863,
"write_ticks" : 11605,
"in_queue" : 14353,
"util" : 99.435028
}
]
}`,
hostAnalyzer: &troubleshootv1beta2.FilesystemPerformanceAnalyze{
CollectorName: "file system performance",
Outcomes: []*troubleshootv1beta2.Outcome{
@@ -332,20 +888,66 @@ func TestAnalyzeHostFilesystemPerformance(t *testing.T) {
},
},
},
{
name: "bail if malformed JSON",
fioResult: `{
bad JSON
}`,
hostAnalyzer: &troubleshootv1beta2.FilesystemPerformanceAnalyze{
CollectorName: "file system performance",
Outcomes: []*troubleshootv1beta2.Outcome{
{
Fail: &troubleshootv1beta2.SingleOutcome{
Message: "bad JSON should not be analyzed",
},
},
},
},
expectErr: true,
},
{
name: "bail if fio ran no jobs",
fioResult: `{
"fio version" : "fio-3.28",
"timestamp" : 1691679955,
"timestamp_ms" : 1691679955590,
"time" : "Thu Aug 10 15:05:55 2023",
"global options" : {
"rw" : "write",
"ioengine" : "sync",
"fdatasync" : "1",
"directory" : "/var/lib/etcd",
"size" : "23068672",
"bs" : "1024"
},
"jobs" : [
]
}`,
hostAnalyzer: &troubleshootv1beta2.FilesystemPerformanceAnalyze{
CollectorName: "file system performance",
Outcomes: []*troubleshootv1beta2.Outcome{
{
Fail: &troubleshootv1beta2.SingleOutcome{
Message: "an empty Jobs array should not be analyzed",
},
},
},
},
expectErr: true,
},
}
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			req := require.New(t)
-			b, err := json.Marshal(test.fsPerf)
-			if err != nil {
-				t.Fatal(err)
-			}
+			b := []byte(test.fioResult)
 			getCollectedFileContents := func(filename string) ([]byte, error) {
 				return b, nil
 			}
-			result, err := (&AnalyzeHostFilesystemPerformance{test.hostAnalyzer}).Analyze(getCollectedFileContents, nil)
+			a := AnalyzeHostFilesystemPerformance{test.hostAnalyzer}
+			result, err := a.Analyze(getCollectedFileContents, nil)
 			if test.expectErr {
 				req.Error(err)
 			} else {


@@ -2,16 +2,35 @@ package collect
import (
"bytes"
"encoding/json"
"fmt"
"math"
"math/rand"
"os/exec"
"reflect"
"strconv"
"strings"
"text/template"
"time"
"github.com/pkg/errors"
troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
"golang.org/x/net/context"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/klog/v2"
)
-func init() {
-	rand.Seed(time.Now().UnixNano())
-}
+type Durations []time.Duration
+
+func (d Durations) Len() int {
+	return len(d)
+}
+
+func (d Durations) Less(i, j int) bool {
+	return d[i] < d[j]
+}
+
+func (d Durations) Swap(i, j int) {
+	d[i], d[j] = d[j], d[i]
+}
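
For context, Durations exists so a slice of raw latency samples satisfies sort.Interface and can be ordered before percentiles are picked. A runnable sketch, with the type re-declared from the diff above so it is self-contained:

package main

import (
	"fmt"
	"sort"
	"time"
)

// Durations re-declared here for a standalone example.
type Durations []time.Duration

func (d Durations) Len() int           { return len(d) }
func (d Durations) Less(i, j int) bool { return d[i] < d[j] }
func (d Durations) Swap(i, j int)      { d[i], d[j] = d[j], d[i] }

func main() {
	d := Durations{3 * time.Millisecond, time.Millisecond, 2 * time.Millisecond}
	sort.Sort(d)
	fmt.Println(d) // [1ms 2ms 3ms]
}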
type CollectHostFilesystemPerformance struct {
@@ -90,3 +109,324 @@ func (f FSPerfResults) String() string {
return buf.String()
}
type FioResult struct {
FioVersion string `json:"fio version,omitempty"`
Timestamp int64 `json:"timestamp,omitempty"`
TimestampMS int64 `json:"timestamp_ms,omitempty"`
Time string `json:"time,omitempty"`
GlobalOptions FioGlobalOptions `json:"global options,omitempty"`
Jobs []FioJobs `json:"jobs,omitempty"`
DiskUtil []FioDiskUtil `json:"disk_util,omitempty"`
}
func (f FioResult) String() string {
var res string
res += fmt.Sprintf("FIO version - %s\n", f.FioVersion)
res += fmt.Sprintf("Global options - %s\n\n", f.GlobalOptions)
for _, job := range f.Jobs {
res += fmt.Sprintf("%s\n", job)
}
res += "Disk stats (read/write):\n"
for _, du := range f.DiskUtil {
res += fmt.Sprintf("%s\n", du)
}
return res
}
type FioGlobalOptions struct {
Directory string `json:"directory,omitempty"`
RandRepeat string `json:"randrepeat,omitempty"`
Verify string `json:"verify,omitempty"`
IOEngine string `json:"ioengine,omitempty"`
Direct string `json:"direct,omitempty"`
GtodReduce string `json:"gtod_reduce,omitempty"`
}
func (g FioGlobalOptions) String() string {
return fmt.Sprintf("ioengine=%s verify=%s direct=%s gtod_reduce=%s", g.IOEngine, g.Verify, g.Direct, g.GtodReduce)
}
type FioJobs struct {
JobName string `json:"jobname,omitempty"`
GroupID int `json:"groupid,omitempty"`
Error int `json:"error,omitempty"`
Eta int `json:"eta,omitempty"`
Elapsed int `json:"elapsed,omitempty"`
JobOptions FioJobOptions `json:"job options,omitempty"`
Read FioStats `json:"read,omitempty"`
Write FioStats `json:"write,omitempty"`
Trim FioStats `json:"trim,omitempty"`
Sync FioStats `json:"sync,omitempty"`
JobRuntime int32 `json:"job_runtime,omitempty"`
UsrCpu float32 `json:"usr_cpu,omitempty"`
SysCpu float32 `json:"sys_cpu,omitempty"`
Ctx int32 `json:"ctx,omitempty"`
MajF int32 `json:"majf,omitempty"`
MinF int32 `json:"minf,omitempty"`
IoDepthLevel FioDepth `json:"iodepth_level,omitempty"`
IoDepthSubmit FioDepth `json:"iodepth_submit,omitempty"`
IoDepthComplete FioDepth `json:"iodepth_complete,omitempty"`
LatencyNs FioLatency `json:"latency_ns,omitempty"`
LatencyUs FioLatency `json:"latency_us,omitempty"`
LatencyMs FioLatency `json:"latency_ms,omitempty"`
LatencyDepth int32 `json:"latency_depth,omitempty"`
LatencyTarget int32 `json:"latency_target,omitempty"`
LatencyPercentile float32 `json:"latency_percentile,omitempty"`
LatencyWindow int32 `json:"latency_window,omitempty"`
}
func (j FioJobs) String() string {
var job string
job += fmt.Sprintf("%s\n", j.JobOptions)
if j.Read.Iops != 0 || j.Read.BW != 0 {
job += fmt.Sprintf("read:\n%s\n", j.Read)
}
if j.Write.Iops != 0 || j.Write.BW != 0 {
job += fmt.Sprintf("write:\n%s\n", j.Write)
}
return job
}
type FioJobOptions struct {
Name string `json:"name,omitempty"`
BS string `json:"bs,omitempty"`
Directory string `json:"directory,omitempty"`
RW string `json:"rw,omitempty"`
IOEngine string `json:"ioengine,omitempty"`
FDataSync string `json:"fdatasync,omitempty"`
Size string `json:"size,omitempty"`
RunTime string `json:"runtime,omitempty"`
}
func (o FioJobOptions) String() string {
return fmt.Sprintf("JobName: %s\n blocksize=%s filesize=%s rw=%s", o.Name, o.BS, o.Size, o.RW)
}
type FioStats struct {
IOBytes int64 `json:"io_bytes,omitempty"`
IOKBytes int64 `json:"io_kbytes,omitempty"`
BWBytes int64 `json:"bw_bytes,omitempty"`
BW int64 `json:"bw,omitempty"`
Iops float32 `json:"iops,omitempty"`
Runtime int64 `json:"runtime,omitempty"`
TotalIos int64 `json:"total_ios,omitempty"`
ShortIos int64 `json:"short_ios,omitempty"`
DropIos int64 `json:"drop_ios,omitempty"`
SlatNs FioNS `json:"slat_ns,omitempty"`
ClatNs FioNS `json:"clat_ns,omitempty"`
LatNs FioNS `json:"lat_ns,omitempty"`
Percentile FioPercentile `json:"percentile,omitempty"`
BwMin int64 `json:"bw_min,omitempty"`
BwMax int64 `json:"bw_max,omitempty"`
BwAgg float32 `json:"bw_agg,omitempty"`
BwMean float32 `json:"bw_mean,omitempty"`
BwDev float32 `json:"bw_dev,omitempty"`
BwSamples int32 `json:"bw_samples,omitempty"`
IopsMin int32 `json:"iops_min,omitempty"`
IopsMax int32 `json:"iops_max,omitempty"`
IopsMean float32 `json:"iops_mean,omitempty"`
IopsStdDev float32 `json:"iops_stddev,omitempty"`
IopsSamples int32 `json:"iops_samples,omitempty"`
}
func (s FioStats) String() string {
var stats string
stats += fmt.Sprintf(" IOPS=%f BW(KiB/s)=%d\n", s.Iops, s.BW)
stats += fmt.Sprintf(" iops: min=%d max=%d avg=%f\n", s.IopsMin, s.IopsMax, s.IopsMean)
stats += fmt.Sprintf(" bw(KiB/s): min=%d max=%d avg=%f", s.BwMin, s.BwMax, s.BwMean)
return stats
}
func (s FioStats) FSPerfResults() FSPerfResults {
return FSPerfResults{
Min: time.Duration(s.LatNs.Min),
Max: time.Duration(s.LatNs.Max),
Average: time.Duration(s.LatNs.Mean),
P1: time.Duration(s.LatNs.Percentile.P1),
P5: time.Duration(s.LatNs.Percentile.P5),
P10: time.Duration(s.LatNs.Percentile.P10),
P20: time.Duration(s.LatNs.Percentile.P20),
P30: time.Duration(s.LatNs.Percentile.P30),
P40: time.Duration(s.LatNs.Percentile.P40),
P50: time.Duration(s.LatNs.Percentile.P50),
P60: time.Duration(s.LatNs.Percentile.P60),
P70: time.Duration(s.LatNs.Percentile.P70),
P80: time.Duration(s.LatNs.Percentile.P80),
P90: time.Duration(s.LatNs.Percentile.P90),
P95: time.Duration(s.LatNs.Percentile.P95),
P99: time.Duration(s.LatNs.Percentile.P99),
P995: time.Duration(s.LatNs.Percentile.P995),
P999: time.Duration(s.LatNs.Percentile.P999),
P9995: time.Duration(s.LatNs.Percentile.P9995),
P9999: time.Duration(s.LatNs.Percentile.P9999),
}
}
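
The direct time.Duration conversions above work because fio reports these latencies in nanoseconds and Go's time.Duration is an int64 nanosecond count. A worked example using the "99.990000" : 9000000 value from the test fixtures earlier in this commit:

package main

import (
	"fmt"
	"time"
)

func main() {
	// 9000000 ns, as reported under the lat_ns percentiles in the fio JSON
	p9999 := time.Duration(9000000)
	fmt.Println(p9999)                       // 9ms
	fmt.Println(p9999 == 9*time.Millisecond) // true
}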
type FioNS struct {
Min int64 `json:"min,omitempty"`
Max int64 `json:"max,omitempty"`
Mean float32 `json:"mean,omitempty"`
StdDev float32 `json:"stddev,omitempty"`
N int64 `json:"N,omitempty"`
Percentile FioPercentile `json:"percentile,omitempty"`
}
type FioDepth struct {
FioDepth0 float32 `json:"0,omitempty"`
FioDepth1 float32 `json:"1,omitempty"`
FioDepth2 float32 `json:"2,omitempty"`
FioDepth4 float32 `json:"4,omitempty"`
FioDepth8 float32 `json:"8,omitempty"`
FioDepth16 float32 `json:"16,omitempty"`
FioDepth32 float32 `json:"32,omitempty"`
FioDepth64 float32 `json:"64,omitempty"`
FioDepthGE64 float32 `json:">=64,omitempty"`
}
type FioLatency struct {
FioLat2 float32 `json:"2,omitempty"`
FioLat4 float32 `json:"4,omitempty"`
FioLat10 float32 `json:"10,omitempty"`
FioLat20 float32 `json:"20,omitempty"`
FioLat50 float32 `json:"50,omitempty"`
FioLat100 float32 `json:"100,omitempty"`
FioLat250 float32 `json:"250,omitempty"`
FioLat500 float32 `json:"500,omitempty"`
FioLat750 float32 `json:"750,omitempty"`
FioLat1000 float32 `json:"1000,omitempty"`
FioLat2000 float32 `json:"2000,omitempty"`
FioLatGE2000 float32 `json:">=2000,omitempty"`
}
type FioDiskUtil struct {
Name string `json:"name,omitempty"`
ReadIos int64 `json:"read_ios,omitempty"`
WriteIos int64 `json:"write_ios,omitempty"`
ReadMerges int64 `json:"read_merges,omitempty"`
WriteMerges int64 `json:"write_merges,omitempty"`
ReadTicks int64 `json:"read_ticks,omitempty"`
WriteTicks int64 `json:"write_ticks,omitempty"`
InQueue int64 `json:"in_queue,omitempty"`
Util float32 `json:"util,omitempty"`
}
type FioPercentile struct {
P1 int `json:"1.000000,omitempty"`
P5 int `json:"5.000000,omitempty"`
P10 int `json:"10.000000,omitempty"`
P20 int `json:"20.000000,omitempty"`
P30 int `json:"30.000000,omitempty"`
P40 int `json:"40.000000,omitempty"`
P50 int `json:"50.000000,omitempty"`
P60 int `json:"60.000000,omitempty"`
P70 int `json:"70.000000,omitempty"`
P80 int `json:"80.000000,omitempty"`
P90 int `json:"90.000000,omitempty"`
P95 int `json:"95.000000,omitempty"`
P99 int `json:"99.000000,omitempty"`
P995 int `json:"99.500000,omitempty"`
P999 int `json:"99.900000,omitempty"`
P9995 int `json:"99.950000,omitempty"`
P9999 int `json:"99.990000,omitempty"`
}
func (d FioDiskUtil) String() string {
//Disk stats (read/write):
//rbd4: ios=30022/11982, merge=0/313, ticks=1028675/1022768, in_queue=2063740, util=99.67%
var du string
du += fmt.Sprintf(" %s: ios=%d/%d merge=%d/%d ticks=%d/%d in_queue=%d, util=%f%%", d.Name, d.ReadIos,
d.WriteIos, d.ReadMerges, d.WriteMerges, d.ReadTicks, d.WriteTicks, d.InQueue, d.Util)
return du
}
func parseCollectorOptions(hostCollector *troubleshootv1beta2.FilesystemPerformance) ([]string, *FioJobOptions, error) {
var operationSize uint64 = 1024
if hostCollector.OperationSizeBytes > 0 {
operationSize = hostCollector.OperationSizeBytes
}
var fileSize uint64 = 10 * 1024 * 1024
if hostCollector.FileSize != "" {
quantity, err := resource.ParseQuantity(hostCollector.FileSize)
if err != nil {
return nil, nil, errors.Wrapf(err, "failed to parse fileSize %q", hostCollector.FileSize)
}
	fileSizeInt64, ok := quantity.AsInt64()
	if !ok {
		// err is nil on this path; errors.Wrapf(nil, ...) would return nil and swallow the failure
		return nil, nil, errors.Errorf("failed to parse fileSize %q", hostCollector.FileSize)
	}
	if fileSizeInt64 <= 0 {
		return nil, nil, errors.Errorf("fileSize %q must be greater than 0", hostCollector.FileSize)
	}
fileSize = uint64(fileSizeInt64)
}
if hostCollector.Directory == "" {
return nil, nil, errors.New("Directory is required to collect filesystem performance info")
}
latencyBenchmarkOptions := FioJobOptions{
RW: "write",
IOEngine: "sync",
FDataSync: "1",
Directory: hostCollector.Directory,
Size: strconv.FormatUint(fileSize, 10),
BS: strconv.FormatUint(operationSize, 10),
Name: "fsperf",
RunTime: "120",
}
command := buildFioCommand(latencyBenchmarkOptions)
return command, &latencyBenchmarkOptions, nil
}
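
The fileSize handling above relies on Kubernetes quantity parsing, which is how a spec value of "22Mi" becomes the --size=23068672 (22 * 1024 * 1024 bytes) expected by the happy-path test later in this commit. A standalone illustration:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	quantity, err := resource.ParseQuantity("22Mi")
	if err != nil {
		panic(err)
	}
	fileSize, ok := quantity.AsInt64()
	if !ok || fileSize <= 0 {
		panic("fileSize must be a positive integer byte count")
	}
	fmt.Println(fileSize) // 23068672
}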
func buildFioCommand(opts FioJobOptions) []string {
command := []string{"fio"}
v := reflect.ValueOf(opts)
t := reflect.TypeOf(opts)
for i := 0; i < v.NumField(); i++ {
field := t.Field(i)
value := v.Field(i)
if !value.IsZero() {
command = append(command, fmt.Sprintf("--%s=%v", strings.ToLower(field.Name), value.Interface()))
}
}
command = append(command, "--output-format=json")
return command
}
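
The reflection loop above turns every non-zero field of FioJobOptions into a --<lowercased field name>=<value> argument, which is why FDataSync becomes --fdatasync=1 and RunTime becomes --runtime=120. A self-contained sketch of the same technique over a trimmed-down, illustrative struct:

package main

import (
	"fmt"
	"reflect"
	"strings"
)

type opts struct {
	Name string
	BS   string
	Size string
	Dir  string // zero-valued fields are skipped
}

func main() {
	o := opts{Name: "fsperf", BS: "1024", Size: "23068672"}
	command := []string{"fio"}
	v := reflect.ValueOf(o)
	t := reflect.TypeOf(o)
	for i := 0; i < v.NumField(); i++ {
		if !v.Field(i).IsZero() {
			command = append(command, fmt.Sprintf("--%s=%v", strings.ToLower(t.Field(i).Name), v.Field(i).Interface()))
		}
	}
	fmt.Println(strings.Join(command, " ")) // fio --name=fsperf --bs=1024 --size=23068672
}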
func collectFioResults(ctx context.Context, hostCollector *troubleshootv1beta2.FilesystemPerformance) (*FioResult, error) {
command, opts, err := parseCollectorOptions(hostCollector)
if err != nil {
return nil, errors.Wrap(err, "failed to parse collector options")
}
klog.V(2).Infof("collecting fio results: %s", strings.Join(command, " "))
output, err := exec.CommandContext(ctx, command[0], command[1:]...).Output()
if err != nil {
if exitErr, ok := err.(*exec.ExitError); ok {
if exitErr.ExitCode() == 1 {
return nil, errors.Wrapf(err, "fio failed; permission denied opening %s. ensure this collector runs as root", opts.Directory)
} else {
return nil, errors.Wrapf(err, "fio failed with exit status %d", exitErr.ExitCode())
}
} else if e, ok := err.(*exec.Error); ok && e.Err == exec.ErrNotFound {
return nil, errors.Wrapf(err, "command not found: %v. ensure fio is installed", command)
} else {
return nil, errors.Wrapf(err, "failed to run command: %v", command)
}
}
var result FioResult
err = json.Unmarshal([]byte(output), &result)
if err != nil {
return nil, errors.Wrap(err, "failed to unmarshal fio result")
}
return &result, nil
}
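
collectFioResults distinguishes three failure modes: fio ran and exited non-zero (*exec.ExitError, with exit status 1 treated as a likely permission problem when not run as root), the fio binary was not found on PATH (*exec.Error wrapping exec.ErrNotFound), and any other execution error. A self-contained sketch of that discrimination, written with errors.As instead of the type assertions used above:

package main

import (
	"errors"
	"fmt"
	"os/exec"
)

func main() {
	_, err := exec.Command("fio-not-installed").Output()
	var exitErr *exec.ExitError
	var execErr *exec.Error
	switch {
	case err == nil:
		fmt.Println("ok")
	case errors.As(err, &exitErr):
		fmt.Println("fio failed with exit status", exitErr.ExitCode())
	case errors.As(err, &execErr) && errors.Is(execErr.Err, exec.ErrNotFound):
		fmt.Println("command not found: ensure fio is installed")
	default:
		fmt.Println("failed to run command:", err)
	}
}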


@@ -10,36 +10,22 @@ import (
"math/rand"
"os"
"path/filepath"
"sort"
"sync"
"syscall"
"time"
"github.com/pkg/errors"
troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
"k8s.io/apimachinery/pkg/api/resource"
)
-func init() {
-	rand.Seed(time.Now().UnixNano())
-}
-
-type Durations []time.Duration
-
-func (d Durations) Len() int {
-	return len(d)
-}
-
-func (d Durations) Less(i, j int) bool {
-	return d[i] < d[j]
-}
-
-func (d Durations) Swap(i, j int) {
-	d[i], d[j] = d[j], d[i]
-}
// Today we only care about checking for write latency so the options struct
// only has what we need for that. we'll collect all the results from a single run of fio
// and filter out the fsync results for analysis. TODO: update the analyzer so any/all results
// from fio can be analyzed.
func collectHostFilesystemPerformance(hostCollector *troubleshootv1beta2.FilesystemPerformance, bundlePath string) (map[string][]byte, error) {
timeout := time.Minute
if hostCollector.Timeout != "" {
d, err := time.ParseDuration(hostCollector.Timeout)
if err != nil {
@@ -50,46 +36,15 @@ func collectHostFilesystemPerformance(hostCollector *troubleshootv1beta2.Filesys
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	defer cancel()
-	var operationSize uint64 = 1024
-	if hostCollector.OperationSizeBytes != 0 {
-		operationSize = hostCollector.OperationSizeBytes
+	collectorName := hostCollector.CollectorName
+	if collectorName == "" {
+		collectorName = "filesystemPerformance"
 	}
+	name := filepath.Join("host-collectors/filesystemPerformance", collectorName+".json")
-	var fileSize uint64 = 10 * 1024 * 1024
-	if hostCollector.FileSize != "" {
-		quantity, err := resource.ParseQuantity(hostCollector.FileSize)
-		if err != nil {
-			return nil, errors.Wrapf(err, "failed to parse fileSize %q", hostCollector.FileSize)
-		}
-		fileSizeInt64, ok := quantity.AsInt64()
-		if !ok {
-			return nil, errors.Wrapf(err, "failed to parse fileSize %q", hostCollector.FileSize)
-		}
-		fileSize = uint64(fileSizeInt64)
-	}
-	if hostCollector.Directory == "" {
-		return nil, errors.New("Directory is required to collect filesystem performance info")
-	}
-	// TODO: clean up this directory if its created
-	if err := os.MkdirAll(hostCollector.Directory, 0700); err != nil {
-		return nil, errors.Wrapf(err, "failed to mkdir %q", hostCollector.Directory)
-	}
-	filename := filepath.Join(hostCollector.Directory, "fsperf")
-	f, err := os.OpenFile(filename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
-	if err != nil {
-		log.Panic(err)
-		return nil, errors.Wrapf(err, "open %s", filename)
-	}
-	defer func() {
-		if err := f.Close(); err != nil {
-			log.Println(err.Error())
-		}
-		if err := os.Remove(filename); err != nil {
-			log.Println(err.Error())
-		}
-	}()
// Start the background IOPS task and wait for warmup
if hostCollector.EnableBackgroundIOPS {
@@ -123,86 +78,16 @@ func collectHostFilesystemPerformance(hostCollector *troubleshootv1beta2.Filesys
time.Sleep(time.Second * time.Duration(hostCollector.BackgroundIOPSWarmupSeconds))
}
// Sequential writes benchmark
-	var written uint64 = 0
-	var results Durations
+	var fioResult *FioResult
-	for {
-		if written >= fileSize {
-			break
-		}
-		data := make([]byte, int(operationSize))
-		rand.Read(data)
-		start := time.Now()
-		n, err := f.Write(data)
-		if err != nil {
-			return nil, errors.Wrapf(err, "write to %s", filename)
-		}
-		if hostCollector.Sync {
-			if err := f.Sync(); err != nil {
-				return nil, errors.Wrapf(err, "sync %s", filename)
-			}
-		} else if hostCollector.Datasync {
-			if err := syscall.Fdatasync(int(f.Fd())); err != nil {
-				return nil, errors.Wrapf(err, "datasync %s", filename)
-			}
-		}
-		d := time.Now().Sub(start)
-		results = append(results, d)
-		written += uint64(n)
-		if ctx.Err() != nil {
-			break
-		}
-	}
-	if len(results) == 0 {
-		return nil, errors.New("No filesystem performance results collected")
-	}
-	sort.Sort(results)
-	var sum time.Duration
-	for _, d := range results {
-		sum += d
-	}
-	fsPerf := &FSPerfResults{
-		Min:     results[0],
-		Max:     results[len(results)-1],
-		Average: sum / time.Duration(len(results)),
-		P1:      results[getPercentileIndex(.01, len(results))],
-		P5:      results[getPercentileIndex(.05, len(results))],
-		P10:     results[getPercentileIndex(.1, len(results))],
-		P20:     results[getPercentileIndex(.2, len(results))],
-		P30:     results[getPercentileIndex(.3, len(results))],
-		P40:     results[getPercentileIndex(.4, len(results))],
-		P50:     results[getPercentileIndex(.5, len(results))],
-		P60:     results[getPercentileIndex(.6, len(results))],
-		P70:     results[getPercentileIndex(.7, len(results))],
-		P80:     results[getPercentileIndex(.8, len(results))],
-		P90:     results[getPercentileIndex(.9, len(results))],
-		P95:     results[getPercentileIndex(.95, len(results))],
-		P99:     results[getPercentileIndex(.99, len(results))],
-		P995:    results[getPercentileIndex(.995, len(results))],
-		P999:    results[getPercentileIndex(.999, len(results))],
-		P9995:   results[getPercentileIndex(.9995, len(results))],
-		P9999:   results[getPercentileIndex(.9999, len(results))],
-	}
-	collectorName := hostCollector.CollectorName
-	if collectorName == "" {
-		collectorName = "filesystemPerformance"
-	}
-	name := filepath.Join("host-collectors/filesystemPerformance", collectorName+".json")
-	b, err := json.Marshal(fsPerf)
+	fioResult, err := collectFioResults(ctx, hostCollector)
 	if err != nil {
-		return nil, errors.Wrap(err, "failed to marshal fs perf results")
+		return nil, errors.Wrap(err, "failed to collect fio results")
 	}
+	b, err := json.Marshal(fioResult)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to marshal fio results")
+	}
 	output := NewResult()


@@ -2,7 +2,10 @@ package collect
import (
"fmt"
"reflect"
"testing"
troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
)
func TestGetPercentileIndex(t *testing.T) {
@@ -57,3 +60,128 @@ func TestGetPercentileIndex(t *testing.T) {
})
}
}
func Test_parseCollectorOptions(t *testing.T) {
type args struct {
hostCollector *troubleshootv1beta2.FilesystemPerformance
}
tests := []struct {
name string
args args
wantCommand []string
wantOptions *FioJobOptions
wantErr bool
}{
{
name: "Happy spec",
args: args{
hostCollector: &troubleshootv1beta2.FilesystemPerformance{
HostCollectorMeta: troubleshootv1beta2.HostCollectorMeta{
CollectorName: "fsperf",
},
OperationSizeBytes: 1024,
Directory: "/var/lib/etcd",
FileSize: "22Mi",
Sync: true,
Datasync: true,
Timeout: "120",
EnableBackgroundIOPS: true,
BackgroundIOPSWarmupSeconds: 10,
BackgroundWriteIOPS: 100,
BackgroundReadIOPS: 100,
BackgroundWriteIOPSJobs: 1,
BackgroundReadIOPSJobs: 1,
},
},
wantCommand: []string{
"fio",
"--name=fsperf",
"--bs=1024",
"--directory=/var/lib/etcd",
"--rw=write",
"--ioengine=sync",
"--fdatasync=1",
"--size=23068672",
"--runtime=120",
"--output-format=json",
},
wantOptions: &FioJobOptions{
RW: "write",
IOEngine: "sync",
FDataSync: "1",
Directory: "/var/lib/etcd",
Size: "23068672",
BS: "1024",
Name: "fsperf",
RunTime: "120",
},
wantErr: false,
},
{
name: "Empty spec fails",
args: args{
hostCollector: &troubleshootv1beta2.FilesystemPerformance{
HostCollectorMeta: troubleshootv1beta2.HostCollectorMeta{
CollectorName: "fsperf",
},
},
},
wantCommand: nil,
wantOptions: nil,
wantErr: true,
},
{
name: "Invalid filesize",
args: args{
hostCollector: &troubleshootv1beta2.FilesystemPerformance{
HostCollectorMeta: troubleshootv1beta2.HostCollectorMeta{
CollectorName: "fsperf",
},
OperationSizeBytes: 1024,
Directory: "/var/lib/etcd",
FileSize: "abcd",
Sync: true,
Datasync: true,
Timeout: "120",
},
},
wantCommand: nil,
wantOptions: nil,
wantErr: true,
},
{
name: "invalid path parameter",
args: args{
hostCollector: &troubleshootv1beta2.FilesystemPerformance{
HostCollectorMeta: troubleshootv1beta2.HostCollectorMeta{
CollectorName: "fsperf",
},
OperationSizeBytes: 1024,
Directory: "",
FileSize: "22Mi",
Sync: true,
Datasync: true,
Timeout: "120",
},
},
wantCommand: nil,
wantOptions: nil,
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
gotCommand, gotOptions, err := parseCollectorOptions(tt.args.hostCollector)
if (err != nil) != tt.wantErr {
t.Errorf("parseCollectorOptions() error = %v, wantErr %v", err, tt.wantErr)
} else {
if !reflect.DeepEqual(gotCommand, tt.wantCommand) {
t.Errorf("parseCollectorOptions() got command = %v, want %v", gotCommand, tt.wantCommand)
}
if !reflect.DeepEqual(gotOptions, tt.wantOptions) {
t.Errorf("parseCollectorOptions() got options = %v, want %v", gotOptions, tt.wantOptions)
}
}
})
}
}


@@ -0,0 +1,19 @@
apiVersion: troubleshoot.sh/v1beta2
kind: HostPreflight
metadata:
name: sample
spec:
collectors:
- filesystemPerformance:
collectorName: Filesystem Latency Two Minute Benchmark
timeout: 2m
directory: /var/lib/etcd
fileSize: 22Mi
operationSizeBytes: 2300
datasync: true
enableBackgroundIOPS: true
backgroundIOPSWarmupSeconds: 10
backgroundWriteIOPS: 300
backgroundWriteIOPSJobs: 6
backgroundReadIOPS: 50
backgroundReadIOPSJobs: 1

testdata/kurl_preflights.yaml (new vendored file, 552 lines)

@@ -0,0 +1,552 @@
# https://kurl.sh/docs/install-with-kurl/system-requirements
apiVersion: troubleshoot.sh/v1beta2
kind: HostPreflight
metadata:
name: kurl-builtin
spec:
collectors:
- time: {}
- cpu: {}
- memory: {}
- hostServices: {}
- hostOS: {}
- diskUsage:
collectorName: "Ephemeral Disk Usage /var/lib/kubelet"
path: /var/lib/kubelet
- diskUsage:
collectorName: "Ephemeral Disk Usage /var/lib/docker"
path: /var/lib/docker
exclude: '{{kurl not .Installer.Spec.Docker.Version}}'
- diskUsage:
collectorName: "Ephemeral Disk Usage /var/lib/containerd"
path: /var/lib/containerd
exclude: '{{kurl not .Installer.Spec.Containerd.Version}}'
- diskUsage:
collectorName: "Ephemeral Disk Usage /var/lib/rook"
path: /var/lib/rook
exclude: '{{kurl not .Installer.Spec.Rook.Version}}'
- diskUsage:
collectorName: "Ephemeral Disk Usage /var/openebs"
path: /var/openebs
exclude: '{{kurl not .Installer.Spec.OpenEBS.Version}}'
- tcpLoadBalancer:
collectorName: "Kubernetes API Server Load Balancer"
port: 6443
address: '{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }}'
timeout: 3m
# ha and is first master (primary and not join) and not is upgrade
exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary (not .IsJoin) (not .IsUpgrade) | not }}'
- http:
collectorName: "Kubernetes API Server Load Balancer Upgrade"
get:
url: https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress | trimSuffix "/" }}/healthz
insecureSkipVerify: true
# ha and is first master (primary and not join) and is upgrade (the load balancer backend should already be available)
exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary .IsUpgrade (not .IsJoin) | not }}'
- tcpPortStatus:
collectorName: "Kubernetes API TCP Port Status"
port: 6443
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
- tcpPortStatus:
collectorName: "ETCD Client API TCP Port Status"
port: 2379
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
- tcpPortStatus:
collectorName: "ETCD Server API TCP Port Status"
port: 2380
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
- tcpPortStatus:
collectorName: "ETCD Health Server TCP Port Status"
port: 2381
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
interface: lo
- tcpPortStatus:
collectorName: "Kubelet Health Server TCP Port Status"
port: 10248
exclude: '{{kurl and (not .IsUpgrade) | not }}'
interface: lo
- tcpPortStatus:
collectorName: "Kubelet API TCP Port Status"
port: 10250
exclude: '{{kurl and (not .IsUpgrade) | not }}'
- tcpPortStatus:
collectorName: "Kube Controller Manager Health Server TCP Port Status"
port: 10257
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
interface: lo
- tcpPortStatus:
collectorName: "Kube Scheduler Health Server TCP Port Status"
port: 10259
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
interface: lo
- tcpConnect:
collectorName: "Kubernetes API TCP Connection Status"
address: '{{kurl .Installer.Spec.Kubernetes.MasterAddress }}'
# run the collector if 1. there is a master address set AND this is a node joining the cluster AND this is not an EKCO internalLB install
exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.MasterAddress .IsJoin (and .Installer.Spec.Ekco.Version .Installer.Spec.Ekco.EnableInternalLoadBalancer | not) | not }}'
- filesystemPerformance:
collectorName: Filesystem Latency Two Minute Benchmark
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
timeout: 2m
directory: /var/lib/etcd
fileSize: 22Mi
operationSizeBytes: 2300
datasync: true
enableBackgroundIOPS: true
backgroundIOPSWarmupSeconds: 10
backgroundWriteIOPS: 300
backgroundWriteIOPSJobs: 6
backgroundReadIOPS: 50
backgroundReadIOPSJobs: 1
- certificate:
collectorName: "Kubernetes API key pair certificate"
exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
certificatePath: /etc/kubernetes/pki/apiserver.crt
keyPath: /etc/kubernetes/pki/apiserver.key
- certificate:
collectorName: "Kubernetes ETCD key pair certificate"
exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
certificatePath: /etc/kubernetes/pki/etcd/server.crt
keyPath: /etc/kubernetes/pki/etcd/server.key
- http:
collectorName: "Kubernetes API Health"
exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
get:
url: https://localhost:6443/healthz
insecureSkipVerify: true
analyzers:
- certificate:
collectorName: "Kubernetes API key pair certificate"
exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
outcomes:
- fail:
when: "key-pair-missing"
message: Kubernetes API key pair certificate not found in /etc/kubernetes/pki/apiserver.*
- fail:
when: "key-pair-switched"
message: Kubernetes API key pair certificate and key pair are switched
- fail:
when: "key-pair-encrypted"
message: Kubernetes API key pair certificate private key is encrypted
- fail:
when: "key-pair-mismatch"
message: Kubernetes API key pair certificate and key do not match
- fail:
when: "key-pair-invalid"
message: Kubernetes API key pair certificate is invalid
- pass:
when: "key-pair-valid"
message: Kubernetes API key pair certificate is valid
- certificate:
collectorName: "Kubernetes ETCD key pair certificate"
exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
outcomes:
- fail:
when: "key-pair-missing"
message: Kubernetes ETCD key pair certificate not found in /etc/kubernetes/pki/etcd/server.*
- fail:
when: "key-pair-switched"
message: Kubernetes ETCD certificate and key pair are switched
- fail:
when: "key-pair-encrypted"
message: Kubernetes ETCD certificate private key is encrypted
- fail:
when: "key-pair-mismatch"
message: Kubernetes ETCD certificate and key do not match
- fail:
when: "key-pair-invalid"
message: Kubernetes ETCD key pair certificate is invalid
- pass:
when: "key-pair-valid"
message: Kubernetes ETCD key pair certificate is valid
- http:
checkName: "Kubernetes API Health"
exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
collectorName: "Kubernetes API Health"
outcomes:
- warn:
when: "error"
message: Error connecting to Kubernetes API at https://localhost:6443/healthz
- pass:
when: "statusCode == 200"
message: OK HTTP status response from Kubernetes API at https://localhost:6443/healthz
- warn:
message: Unexpected status code response from Kubernetes API at https://localhost:6443/healthz
- cpu:
checkName: "Number of CPUs"
outcomes:
- fail:
when: "count < 2"
message: At least 2 CPU cores are required, and 4 CPU cores are recommended
- warn:
when: "count < 4"
message: At least 4 CPU cores are recommended
- pass:
message: This server has at least 4 CPU cores
- memory:
checkName: "Amount of Memory"
outcomes:
- fail:
when: "< 4G"
message: At least 4G of memory is required, and 8G of memory is recommended
- warn:
when: "< 8G"
message: At least 8G of memory is recommended
- pass:
message: The system has at least 8G of memory
- diskUsage:
checkName: "Ephemeral Disk Usage /var/lib/kubelet"
collectorName: "Ephemeral Disk Usage /var/lib/kubelet"
outcomes:
- fail:
when: "total < 30Gi"
message: The disk containing directory /var/lib/kubelet has less than 30Gi of total space
- fail:
when: "used/total > 80%"
message: The disk containing directory /var/lib/kubelet is more than 80% full
- warn:
when: "used/total > 60%"
message: The disk containing directory /var/lib/kubelet is more than 60% full
- warn:
when: "available < 10Gi"
message: The disk containing directory /var/lib/kubelet has less than 10Gi of disk space available
- pass:
message: The disk containing directory /var/lib/kubelet has at least 30Gi of total space, has at least 10Gi of disk space available, and is less than 60% full
- diskUsage:
checkName: "Ephemeral Disk Usage /var/lib/docker"
collectorName: "Ephemeral Disk Usage /var/lib/docker"
exclude: '{{kurl not .Installer.Spec.Docker.Version}}'
outcomes:
- fail:
when: "total < 30Gi"
message: The disk containing directory /var/lib/docker has less than 30Gi of total space
- fail:
when: "used/total > 80%"
message: The disk containing directory /var/lib/docker is more than 80% full
- warn:
when: "used/total > 60%"
message: The disk containing directory /var/lib/docker is more than 60% full
- warn:
when: "available < 10Gi"
message: The disk containing directory /var/lib/docker has less than 10Gi of disk space available
- pass:
message: The disk containing directory /var/lib/docker has at least 30Gi of total space, has at least 10Gi of disk space available, and is less than 60% full.
- diskUsage:
checkName: "Ephemeral Disk Usage /var/lib/containerd"
collectorName: "Ephemeral Disk Usage /var/lib/containerd"
exclude: '{{kurl not .Installer.Spec.Containerd.Version}}'
outcomes:
- fail:
when: "total < 30Gi"
message: The disk containing directory /var/lib/containerd has less than 30Gi of total space
- fail:
when: "used/total > 80%"
message: The disk containing directory /var/lib/containerd is more than 80% full
- warn:
when: "used/total > 60%"
message: The disk containing directory /var/lib/containerd is more than 60% full
- warn:
when: "available < 10Gi"
message: The disk containing directory /var/lib/containerd has less than 10Gi of disk space available
- pass:
message: The disk containing directory /var/lib/containerd has at least 30Gi of total space, has at least 10Gi of disk space available, and is less than 60% full.
- diskUsage:
checkName: "Ephemeral Disk Usage /var/lib/rook"
collectorName: "Ephemeral Disk Usage /var/lib/rook"
exclude: '{{kurl not .Installer.Spec.Rook.Version}}'
outcomes:
- fail:
when: "used/total > 80%"
message: The disk containing directory /var/lib/rook is more than 80% full
- fail:
when: "available < 10Gi"
message: The disk containing directory /var/lib/rook has less than 10Gi of disk space available
- pass:
message: The disk containing directory /var/lib/rook has sufficient space
- diskUsage:
checkName: "Ephemeral Disk Usage /var/openebs"
collectorName: "Ephemeral Disk Usage /var/openebs"
exclude: '{{kurl not .Installer.Spec.OpenEBS.Version}}'
outcomes:
- warn:
when: "used/total > 80%"
message: The disk containing directory /var/openebs is more than 80% full
- warn:
when: "available < 10Gi"
message: The disk containing directory /var/openebs has less than 10Gi of disk space available
- pass:
message: The disk containing directory /var/openebs has sufficient space
- tcpLoadBalancer:
checkName: "Kubernetes API Server Load Balancer"
collectorName: "Kubernetes API Server Load Balancer"
# ha and is first master (primary and not join) and not is upgrade
exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary (not .IsJoin) (not .IsUpgrade) | not }}'
outcomes:
- fail:
when: "invalid-address"
message: The load balancer address {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} is not valid.
- warn:
when: "connection-refused"
message: Connection to {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} via load balancer was refused.
- warn:
when: "connection-timeout"
message: Timed out connecting to {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} via load balancer. Check your firewall.
- warn:
when: "error"
message: Unexpected port status
- warn:
when: "address-in-use"
message: Port 6443 is unavailable
- pass:
when: "connected"
message: Successfully connected to {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} via load balancer
- warn:
message: Unexpected port status
- http:
checkName: "Kubernetes API Server Load Balancer Upgrade"
collectorName: "Kubernetes API Server Load Balancer Upgrade"
exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary .IsUpgrade (not .IsJoin) | not }}'
outcomes:
- fail:
when: "error"
message: Error connecting to load balancer at https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }}/healthz
- pass:
when: "statusCode == 200"
message: OK HTTP status response from load balancer at https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }}/healthz
- fail:
message: Unexpected status code response from load balancer at https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }}/healthz
- tcpPortStatus:
checkName: "Kubernetes API TCP Port Status"
collectorName: "Kubernetes API TCP Port Status"
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
outcomes:
- fail:
when: "connection-refused"
message: Connection to port 6443 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
- warn:
when: "address-in-use"
message: Another process was already listening on port 6443.
- fail:
when: "connection-timeout"
message: Timed out connecting to port 6443. Check your firewall.
- fail:
when: "error"
message: Unexpected port status
- pass:
when: "connected"
message: Port 6443 is open
- warn:
message: Unexpected port status
- tcpPortStatus:
checkName: "ETCD Client API TCP Port Status"
collectorName: "ETCD Client API TCP Port Status"
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
outcomes:
- fail:
when: "connection-refused"
message: Connection to port 2379 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
- warn:
when: "address-in-use"
message: Another process was already listening on port 2379.
- fail:
when: "connection-timeout"
message: Timed out connecting to port 2379. Check your firewall.
- fail:
when: "error"
message: Unexpected port status
- pass:
when: "connected"
message: Port 2379 is open
- warn:
message: Unexpected port status
- tcpPortStatus:
checkName: "ETCD Server API TCP Port Status"
collectorName: "ETCD Server API TCP Port Status"
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
outcomes:
- fail:
when: "connection-refused"
message: Connection to port 2380 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
- warn:
when: "address-in-use"
message: Another process was already listening on port 2380.
- fail:
when: "connection-timeout"
message: Timed out connecting to port 2380. Check your firewall.
- fail:
when: "error"
message: Unexpected port status
- pass:
when: "connected"
message: Port 2380 is open
- warn:
message: Unexpected port status
- tcpPortStatus:
checkName: "ETCD Health Server TCP Port Status"
collectorName: "ETCD Health Server TCP Port Status"
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
outcomes:
- fail:
when: "connection-refused"
message: Connection to port 2381 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
- warn:
when: "address-in-use"
message: Another process was already listening on port 2381.
- fail:
when: "connection-timeout"
message: Timed out connecting to port 2381. Check your firewall.
- fail:
when: "error"
message: Unexpected port status
- pass:
when: "connected"
message: Port 2381 is available
- warn:
message: Unexpected port status
- tcpPortStatus:
checkName: "Kubelet Health Server TCP Port Status"
collectorName: "Kubelet Health Server TCP Port Status"
exclude: '{{kurl and (not .IsUpgrade) | not }}'
outcomes:
- fail:
when: "connection-refused"
message: Connection to port 10248 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
- warn:
when: "address-in-use"
message: Another process was already listening on port 10248.
- fail:
when: "connection-timeout"
message: Timed out connecting to port 10248. Check your firewall.
- fail:
when: "error"
message: Unexpected port status
- pass:
when: "connected"
message: Port 10248 is available
- warn:
message: Unexpected port status
- tcpPortStatus:
checkName: "Kubelet API TCP Port Status"
collectorName: "Kubelet API TCP Port Status"
exclude: '{{kurl and (not .IsUpgrade) | not }}'
outcomes:
- fail:
when: "connection-refused"
message: Connection to port 10250 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
- warn:
when: "address-in-use"
message: Another process was already listening on port 10250.
- fail:
when: "connection-timeout"
message: Timed out connecting to port 10250. Check your firewall.
- fail:
when: "error"
message: Unexpected port status
- pass:
when: "connected"
message: Port 10250 is open
- warn:
message: Unexpected port status
- tcpPortStatus:
checkName: "Kube Controller Manager Health Server TCP Port Status"
collectorName: "Kube Controller Manager Health Server TCP Port Status"
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
outcomes:
- fail:
when: "connection-refused"
message: Connection to port 10257 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
- warn:
when: "address-in-use"
message: Another process was already listening on port 10257.
- fail:
when: "connection-timeout"
message: Timed out connecting to port 10257. Check your firewall.
- fail:
when: "error"
message: Unexpected port status
- pass:
when: "connected"
message: Port 10257 is available
- warn:
message: Unexpected port status
- tcpPortStatus:
checkName: "Kube Scheduler Health Server TCP Port Status"
collectorName: "Kube Scheduler Health Server TCP Port Status"
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
outcomes:
- fail:
when: "connection-refused"
message: Connection to port 10259 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
- warn:
when: "address-in-use"
message: Another process was already listening on port 10259.
- fail:
when: "connection-timeout"
message: Timed out connecting to port 10259. Check your firewall.
- fail:
when: "error"
message: Unexpected port status
- pass:
when: "connected"
message: Port 10259 is available
- warn:
message: Unexpected port status
- tcpConnect:
checkName: "Kubernetes API TCP Connection Status"
collectorName: "Kubernetes API TCP Connection Status"
# run the analyzer if there is a master address set AND this is a node joining the cluster AND this is not an EKCO internal load balancer install
exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.MasterAddress .IsJoin (and .Installer.Spec.Ekco.Version .Installer.Spec.Ekco.EnableInternalLoadBalancer | not) | not }}'
outcomes:
- fail:
when: "connection-refused"
message: Connection to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }} was refused
- fail:
when: "connection-timeout"
message: Timed out connecting to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }}
- fail:
when: "error"
message: Unexpected error connecting to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }}
- pass:
when: "connected"
message: Successfully connected to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }}
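# etcd is sensitive to fsync latency, and the p99 < 10ms target below mirrors etcd's storage guidance.
# For reference, a minimal sketch of the collector this analyzer consumes (field names taken from the
# troubleshoot filesystemPerformance host collector; the directory and sizes are illustrative
# assumptions, not part of this file):
#
#   - filesystemPerformance:
#       collectorName: Filesystem Latency Two Minute Benchmark
#       directory: /var/lib/etcd
#       fileSize: 22Mi
#       operationSizeBytes: 2300
#       datasync: true
#       timeout: 2m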
- filesystemPerformance:
collectorName: Filesystem Latency Two Minute Benchmark
exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
outcomes:
- pass:
when: "p99 < 10ms"
message: "Write latency is ok (p99 target < 10ms, actual: {{ .P99 }})"
- warn:
message: "Write latency is high. p99 target < 10ms, actual:{{ .String }}"
- time:
checkName: "NTP Status"
outcomes:
- fail:
when: "ntp == unsynchronized+inactive"
message: "System clock is not synchronized"
- warn:
when: "ntp == unsynchronized+active"
message: "System clock is not yet synchronized"
- pass:
when: "ntp == synchronized+active"
message: "System clock is synchronized"
- warn:
when: "timezone != UTC"
message: "Non UTC timezone can interfere with system function"
- pass:
when: "timezone == UTC"
message: "Timezone is set to UTC"
- hostOS:
checkName: "Docker Support"
exclude: '{{kurl or (not .Installer.Spec.Docker.Version) (semverCompare ">= 20.10.17" .Installer.Spec.Docker.Version) }}'
outcomes:
- fail:
when: "ubuntu = 22.04"
message: "Docker versions < 20.10.17 not supported on ubuntu 22.04"
# hijack hostOS analyzer in order to analyze the kURL Installer spec
- hostOS:
checkName: "Containerd and Weave Compatibility"
exclude: '{{kurl or (not .Installer.Spec.Weave.Version) (not .Installer.Spec.Containerd.Version) (semverCompare "1.6.0 - 1.6.4" .Installer.Spec.Containerd.Version | not) }}'
outcomes:
- fail:
message: "Weave is not compatible with containerd versions 1.6.0 - 1.6.4"