Files
troubleshoot/testdata/kurl_preflights.yaml
ada mancini e3adc1cb35 call out to fio for host filesystem performance (#1275)
* stashing changes

* split filesystem collector into fio and legacy functions

* read fio results into analyzer

* remove test script

* update go.mod

* remove old notes

* go mod tidy

* fix up go.mod

* fix up go.mod

* refactor tests for fio

* make schemas

* remove local scripts

* local watch script for building troubleshoot

* document watch script

* fix var names

* handle errors if run as non-root

* go mod tidy

* use String interface

* collector happy path test

* invalid filesize

* invalid filesize

* tests

* remove old code

* remove old init function

* let actions tests run this

* clean up tests

* go mod tidy

* remove duplicated type declaration

* remove old file create code
2023-10-03 14:21:56 -04:00

553 lines
25 KiB
YAML

# Built-in kURL host preflight spec. System requirements are documented at:
# https://kurl.sh/docs/install-with-kurl/system-requirements
# NOTE(review): nesting indentation appears to be missing in this copy.
# `name` presumably belongs under `metadata`, and `collectors` (plus the later
# `analyzers` key) under `spec` - confirm against the upstream file.
apiVersion: troubleshoot.sh/v1beta2
kind: HostPreflight
metadata:
name: kurl-builtin
spec:
collectors:
# The five collectors below take no options (empty mappings) and carry no
# `exclude` expression.
- time: {}
- cpu: {}
- memory: {}
- hostServices: {}
- hostOS: {}
# Disk usage is collected once per directory. /var/lib/kubelet is always
# collected; each other directory is excluded when the installer spec sets no
# version for the corresponding add-on (Docker, containerd, Rook, OpenEBS).
- diskUsage:
    path: /var/lib/kubelet
    collectorName: 'Ephemeral Disk Usage /var/lib/kubelet'
- diskUsage:
    path: /var/lib/docker
    collectorName: 'Ephemeral Disk Usage /var/lib/docker'
    exclude: '{{kurl not .Installer.Spec.Docker.Version}}'
- diskUsage:
    path: /var/lib/containerd
    collectorName: 'Ephemeral Disk Usage /var/lib/containerd'
    exclude: '{{kurl not .Installer.Spec.Containerd.Version}}'
- diskUsage:
    path: /var/lib/rook
    collectorName: 'Ephemeral Disk Usage /var/lib/rook'
    exclude: '{{kurl not .Installer.Spec.Rook.Version}}'
- diskUsage:
    path: /var/openebs
    collectorName: 'Ephemeral Disk Usage /var/openebs'
    exclude: '{{kurl not .Installer.Spec.OpenEBS.Version}}'
# TCP check of the configured Kubernetes API load balancer address, using
# port 6443 and a 3 minute timeout.
- tcpLoadBalancer:
    collectorName: "Kubernetes API Server Load Balancer"
    port: 6443
    # Quoted so the unrendered template is an ordinary string to YAML tooling
    # (a value that begins with a bare `{{` reads as a flow mapping). This also
    # matches the quoted `address` of the tcpConnect collector in this file.
    address: '{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }}'
    timeout: 3m
    # ha and is first master (primary and not join) and not is upgrade
    exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary (not .IsJoin) (not .IsUpgrade) | not }}'
# HTTPS GET against /healthz on the configured load balancer address (a
# trailing "/" is trimmed from the address); TLS verification is skipped.
- http:
    collectorName: 'Kubernetes API Server Load Balancer Upgrade'
    # ha and is first master (primary and not join) and is upgrade (the load balancer backend should already be available)
    exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary .IsUpgrade (not .IsJoin) | not }}'
    get:
      url: https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress | trimSuffix "/" }}/healthz
      insecureSkipVerify: true
# Port checks for the control-plane and kubelet listeners. Every entry is
# excluded on upgrades; all but the two kubelet entries are also excluded on
# non-primary nodes. The health-server ports additionally set `interface: lo`.
- tcpPortStatus:
    collectorName: 'Kubernetes API TCP Port Status'
    port: 6443
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
- tcpPortStatus:
    collectorName: 'ETCD Client API TCP Port Status'
    port: 2379
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
- tcpPortStatus:
    collectorName: 'ETCD Server API TCP Port Status'
    port: 2380
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
- tcpPortStatus:
    collectorName: 'ETCD Health Server TCP Port Status'
    port: 2381
    interface: lo
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
- tcpPortStatus:
    collectorName: 'Kubelet Health Server TCP Port Status'
    port: 10248
    interface: lo
    exclude: '{{kurl and (not .IsUpgrade) | not }}'
- tcpPortStatus:
    collectorName: 'Kubelet API TCP Port Status'
    port: 10250
    exclude: '{{kurl and (not .IsUpgrade) | not }}'
- tcpPortStatus:
    collectorName: 'Kube Controller Manager Health Server TCP Port Status'
    port: 10257
    interface: lo
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
- tcpPortStatus:
    collectorName: 'Kube Scheduler Health Server TCP Port Status'
    port: 10259
    interface: lo
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
# TCP connection attempt to the configured Kubernetes master address.
# run the collector if 1. there is a master address set AND this is a node joining the cluster AND this is not an EKCO internalLB install
- tcpConnect:
    collectorName: 'Kubernetes API TCP Connection Status'
    exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.MasterAddress .IsJoin (and .Installer.Spec.Ekco.Version .Installer.Spec.Ekco.EnableInternalLoadBalancer | not) | not }}'
    address: '{{kurl .Installer.Spec.Kubernetes.MasterAddress }}'
# Filesystem benchmark against /var/lib/etcd with a 2m timeout. Excluded
# unless this is a primary node that is not upgrading. Background IOPS load is
# switched on via `enableBackgroundIOPS`.
- filesystemPerformance:
    collectorName: Filesystem Latency Two Minute Benchmark
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
    directory: /var/lib/etcd
    timeout: 2m
    # Benchmark file and per-operation sizes.
    fileSize: 22Mi
    operationSizeBytes: 2300
    datasync: true
    # Background load settings.
    enableBackgroundIOPS: true
    backgroundIOPSWarmupSeconds: 10
    backgroundWriteIOPS: 300
    backgroundWriteIOPSJobs: 6
    backgroundReadIOPS: 50
    backgroundReadIOPSJobs: 1
# Key pair collectors for the API server and etcd server certificates.
# Both are excluded unless this is a primary node performing an upgrade.
- certificate:
    collectorName: 'Kubernetes API key pair certificate'
    certificatePath: /etc/kubernetes/pki/apiserver.crt
    keyPath: /etc/kubernetes/pki/apiserver.key
    exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
- certificate:
    collectorName: 'Kubernetes ETCD key pair certificate'
    certificatePath: /etc/kubernetes/pki/etcd/server.crt
    keyPath: /etc/kubernetes/pki/etcd/server.key
    exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
# HTTPS GET against the local API server's /healthz endpoint with TLS
# verification skipped. Excluded unless this is a primary node upgrading.
- http:
    collectorName: 'Kubernetes API Health'
    get:
      url: https://localhost:6443/healthz
      insecureSkipVerify: true
    exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
analyzers:
# API server key pair: `key-pair-valid` is the only passing state; every other
# listed state is a failure. Same exclude expression as the matching collector.
- certificate:
    exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
    collectorName: 'Kubernetes API key pair certificate'
    outcomes:
      - fail:
          when: 'key-pair-missing'
          message: Kubernetes API key pair certificate not found in /etc/kubernetes/pki/apiserver.*
      - fail:
          when: 'key-pair-switched'
          message: Kubernetes API key pair certificate and key pair are switched
      - fail:
          when: 'key-pair-encrypted'
          message: Kubernetes API key pair certificate private key is encrypted
      - fail:
          when: 'key-pair-mismatch'
          message: Kubernetes API key pair certificate and key do not match
      - fail:
          when: 'key-pair-invalid'
          message: Kubernetes API key pair certificate is invalid
      - pass:
          when: 'key-pair-valid'
          message: Kubernetes API key pair certificate is valid
# etcd server key pair: `key-pair-valid` is the only passing state; every other
# listed state is a failure. Same exclude expression as the matching collector.
- certificate:
    exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
    collectorName: 'Kubernetes ETCD key pair certificate'
    outcomes:
      - fail:
          when: 'key-pair-missing'
          message: Kubernetes ETCD key pair certificate not found in /etc/kubernetes/pki/etcd/server.*
      - fail:
          when: 'key-pair-switched'
          message: Kubernetes ETCD certificate and key pair are switched
      - fail:
          when: 'key-pair-encrypted'
          message: Kubernetes ETCD certificate private key is encrypted
      - fail:
          when: 'key-pair-mismatch'
          message: Kubernetes ETCD certificate and key do not match
      - fail:
          when: 'key-pair-invalid'
          message: Kubernetes ETCD key pair certificate is invalid
      - pass:
          when: 'key-pair-valid'
          message: Kubernetes ETCD key pair certificate is valid
# Local API server health: a 200 status passes; a request error and the
# condition-less final outcome are both warnings, never failures.
- http:
    checkName: 'Kubernetes API Health'
    collectorName: 'Kubernetes API Health'
    exclude: '{{kurl or (not .IsPrimary) (not .IsUpgrade) }}'
    outcomes:
      - warn:
          when: 'error'
          message: Error connecting to Kubernetes API at https://localhost:6443/healthz
      - pass:
          when: 'statusCode == 200'
          message: OK HTTP status response from Kubernetes API at https://localhost:6443/healthz
      - warn:
          message: Unexpected status code response from Kubernetes API at https://localhost:6443/healthz
# CPU count thresholds: fewer than 2 cores fails, fewer than 4 warns, and the
# condition-less final outcome passes.
- cpu:
    checkName: 'Number of CPUs'
    outcomes:
      - fail:
          when: 'count < 2'
          message: At least 2 CPU cores are required, and 4 CPU cores are recommended
      - warn:
          when: 'count < 4'
          message: At least 4 CPU cores are recommended
      - pass:
          message: This server has at least 4 CPU cores
# Memory thresholds: under 4G fails, under 8G warns, and the condition-less
# final outcome passes.
- memory:
    checkName: 'Amount of Memory'
    outcomes:
      - fail:
          when: '< 4G'
          message: At least 4G of memory is required, and 8G of memory is recommended
      - warn:
          when: '< 8G'
          message: At least 8G of memory is recommended
      - pass:
          message: The system has at least 8G of memory
# /var/lib/kubelet disk: fails under 30Gi total or above 80% used; warns above
# 60% used or under 10Gi available. This analyzer has no exclude expression.
- diskUsage:
    collectorName: 'Ephemeral Disk Usage /var/lib/kubelet'
    checkName: 'Ephemeral Disk Usage /var/lib/kubelet'
    outcomes:
      - fail:
          when: 'total < 30Gi'
          message: The disk containing directory /var/lib/kubelet has less than 30Gi of total space
      - fail:
          when: 'used/total > 80%'
          message: The disk containing directory /var/lib/kubelet is more than 80% full
      - warn:
          when: 'used/total > 60%'
          message: The disk containing directory /var/lib/kubelet is more than 60% full
      - warn:
          when: 'available < 10Gi'
          message: The disk containing directory /var/lib/kubelet has less than 10Gi of disk space available
      - pass:
          message: The disk containing directory /var/lib/kubelet has at least 30Gi of total space, has at least 10Gi of disk space available, and is less than 60% full
# /var/lib/docker disk: fails under 30Gi total or above 80% used; warns above
# 60% used or under 10Gi available. Excluded when no Docker version is set.
- diskUsage:
    collectorName: 'Ephemeral Disk Usage /var/lib/docker'
    checkName: 'Ephemeral Disk Usage /var/lib/docker'
    exclude: '{{kurl not .Installer.Spec.Docker.Version}}'
    outcomes:
      - fail:
          when: 'total < 30Gi'
          message: The disk containing directory /var/lib/docker has less than 30Gi of total space
      - fail:
          when: 'used/total > 80%'
          message: The disk containing directory /var/lib/docker is more than 80% full
      - warn:
          when: 'used/total > 60%'
          message: The disk containing directory /var/lib/docker is more than 60% full
      - warn:
          when: 'available < 10Gi'
          message: The disk containing directory /var/lib/docker has less than 10Gi of disk space available
      - pass:
          message: The disk containing directory /var/lib/docker has at least 30Gi of total space, has at least 10Gi of disk space available, and is less than 60% full.
# /var/lib/containerd disk: fails under 30Gi total or above 80% used; warns
# above 60% used or under 10Gi available. Excluded when no containerd version
# is set.
- diskUsage:
    collectorName: 'Ephemeral Disk Usage /var/lib/containerd'
    checkName: 'Ephemeral Disk Usage /var/lib/containerd'
    exclude: '{{kurl not .Installer.Spec.Containerd.Version}}'
    outcomes:
      - fail:
          when: 'total < 30Gi'
          message: The disk containing directory /var/lib/containerd has less than 30Gi of total space
      - fail:
          when: 'used/total > 80%'
          message: The disk containing directory /var/lib/containerd is more than 80% full
      - warn:
          when: 'used/total > 60%'
          message: The disk containing directory /var/lib/containerd is more than 60% full
      - warn:
          when: 'available < 10Gi'
          message: The disk containing directory /var/lib/containerd has less than 10Gi of disk space available
      - pass:
          message: The disk containing directory /var/lib/containerd has at least 30Gi of total space, has at least 10Gi of disk space available, and is less than 60% full.
# /var/lib/rook disk: both thresholds (above 80% used, under 10Gi available)
# are failures. Excluded when no Rook version is set.
- diskUsage:
    collectorName: 'Ephemeral Disk Usage /var/lib/rook'
    checkName: 'Ephemeral Disk Usage /var/lib/rook'
    exclude: '{{kurl not .Installer.Spec.Rook.Version}}'
    outcomes:
      - fail:
          when: 'used/total > 80%'
          message: The disk containing directory /var/lib/rook is more than 80% full
      - fail:
          when: 'available < 10Gi'
          message: The disk containing directory /var/lib/rook has less than 10Gi of disk space available
      - pass:
          message: The disk containing directory /var/lib/rook has sufficient space
# /var/openebs disk: both thresholds (above 80% used, under 10Gi available)
# are warnings only. Excluded when no OpenEBS version is set.
- diskUsage:
    collectorName: 'Ephemeral Disk Usage /var/openebs'
    checkName: 'Ephemeral Disk Usage /var/openebs'
    exclude: '{{kurl not .Installer.Spec.OpenEBS.Version}}'
    outcomes:
      - warn:
          when: 'used/total > 80%'
          message: The disk containing directory /var/openebs is more than 80% full
      - warn:
          when: 'available < 10Gi'
          message: The disk containing directory /var/openebs has less than 10Gi of disk space available
      - pass:
          message: The disk containing directory /var/openebs has sufficient space
# Load balancer reachability: only `invalid-address` fails and only
# `connected` passes; every other outcome, including the condition-less final
# one, is a warning.
- tcpLoadBalancer:
    collectorName: 'Kubernetes API Server Load Balancer'
    checkName: 'Kubernetes API Server Load Balancer'
    # ha and is first master (primary and not join) and not is upgrade
    exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary (not .IsJoin) (not .IsUpgrade) | not }}'
    outcomes:
      - fail:
          when: 'invalid-address'
          message: The load balancer address {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} is not valid.
      - warn:
          when: 'connection-refused'
          message: Connection to {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} via load balancer was refused.
      - warn:
          when: 'connection-timeout'
          message: Timed out connecting to {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} via load balancer. Check your firewall.
      - warn:
          when: 'error'
          message: Unexpected port status
      - warn:
          when: 'address-in-use'
          message: Port 6443 is unavailable
      - pass:
          when: 'connected'
          message: Successfully connected to {{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress }} via load balancer
      - warn:
          message: Unexpected port status
# Load balancer health during an upgrade: a 200 status passes; a request error
# and the condition-less final outcome both fail.
- http:
    checkName: "Kubernetes API Server Load Balancer Upgrade"
    collectorName: "Kubernetes API Server Load Balancer Upgrade"
    exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.LoadBalancerAddress .IsPrimary .IsUpgrade (not .IsJoin) | not }}'
    outcomes:
      # The messages apply the same `trimSuffix "/"` as the collector's URL, so
      # the address reported to the user is the one that was actually requested
      # even when the configured address ends in "/".
      - fail:
          when: "error"
          message: Error connecting to load balancer at https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress | trimSuffix "/" }}/healthz
      - pass:
          when: "statusCode == 200"
          message: OK HTTP status response from load balancer at https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress | trimSuffix "/" }}/healthz
      - fail:
          message: Unexpected status code response from load balancer at https://{{kurl .Installer.Spec.Kubernetes.LoadBalancerAddress | trimSuffix "/" }}/healthz
# Port 6443: `connected` passes; `address-in-use` and the condition-less final
# outcome warn; refused, timeout and error fail.
- tcpPortStatus:
    collectorName: 'Kubernetes API TCP Port Status'
    checkName: 'Kubernetes API TCP Port Status'
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
    outcomes:
      - fail:
          when: 'connection-refused'
          message: Connection to port 6443 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
      - warn:
          when: 'address-in-use'
          message: Another process was already listening on port 6443.
      - fail:
          when: 'connection-timeout'
          message: Timed out connecting to port 6443. Check your firewall.
      - fail:
          when: 'error'
          message: Unexpected port status
      - pass:
          when: 'connected'
          message: Port 6443 is open
      - warn:
          message: Unexpected port status
# Port 2379: `connected` passes; `address-in-use` and the condition-less final
# outcome warn; refused, timeout and error fail.
- tcpPortStatus:
    collectorName: 'ETCD Client API TCP Port Status'
    checkName: 'ETCD Client API TCP Port Status'
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
    outcomes:
      - fail:
          when: 'connection-refused'
          message: Connection to port 2379 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
      - warn:
          when: 'address-in-use'
          message: Another process was already listening on port 2379.
      - fail:
          when: 'connection-timeout'
          message: Timed out connecting to port 2379. Check your firewall.
      - fail:
          when: 'error'
          message: Unexpected port status
      - pass:
          when: 'connected'
          message: Port 2379 is open
      - warn:
          message: Unexpected port status
# Port 2380: `connected` passes; `address-in-use` and the condition-less final
# outcome warn; refused, timeout and error fail.
- tcpPortStatus:
    collectorName: 'ETCD Server API TCP Port Status'
    checkName: 'ETCD Server API TCP Port Status'
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
    outcomes:
      - fail:
          when: 'connection-refused'
          message: Connection to port 2380 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
      - warn:
          when: 'address-in-use'
          message: Another process was already listening on port 2380.
      - fail:
          when: 'connection-timeout'
          message: Timed out connecting to port 2380. Check your firewall.
      - fail:
          when: 'error'
          message: Unexpected port status
      - pass:
          when: 'connected'
          message: Port 2380 is open
      - warn:
          message: Unexpected port status
# Port 2381: `connected` passes; `address-in-use` and the condition-less final
# outcome warn; refused, timeout and error fail.
- tcpPortStatus:
    collectorName: 'ETCD Health Server TCP Port Status'
    checkName: 'ETCD Health Server TCP Port Status'
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
    outcomes:
      - fail:
          when: 'connection-refused'
          message: Connection to port 2381 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
      - warn:
          when: 'address-in-use'
          message: Another process was already listening on port 2381.
      - fail:
          when: 'connection-timeout'
          message: Timed out connecting to port 2381. Check your firewall.
      - fail:
          when: 'error'
          message: Unexpected port status
      - pass:
          when: 'connected'
          message: Port 2381 is available
      - warn:
          message: Unexpected port status
# Port 10248: `connected` passes; `address-in-use` and the condition-less final
# outcome warn; refused, timeout and error fail. Excluded only on upgrades.
- tcpPortStatus:
    collectorName: 'Kubelet Health Server TCP Port Status'
    checkName: 'Kubelet Health Server TCP Port Status'
    exclude: '{{kurl and (not .IsUpgrade) | not }}'
    outcomes:
      - fail:
          when: 'connection-refused'
          message: Connection to port 10248 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
      - warn:
          when: 'address-in-use'
          message: Another process was already listening on port 10248.
      - fail:
          when: 'connection-timeout'
          message: Timed out connecting to port 10248. Check your firewall.
      - fail:
          when: 'error'
          message: Unexpected port status
      - pass:
          when: 'connected'
          message: Port 10248 is available
      - warn:
          message: Unexpected port status
# Port 10250: `connected` passes; `address-in-use` and the condition-less final
# outcome warn; refused, timeout and error fail. Excluded only on upgrades.
- tcpPortStatus:
    collectorName: 'Kubelet API TCP Port Status'
    checkName: 'Kubelet API TCP Port Status'
    exclude: '{{kurl and (not .IsUpgrade) | not }}'
    outcomes:
      - fail:
          when: 'connection-refused'
          message: Connection to port 10250 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
      - warn:
          when: 'address-in-use'
          message: Another process was already listening on port 10250.
      - fail:
          when: 'connection-timeout'
          message: Timed out connecting to port 10250. Check your firewall.
      - fail:
          when: 'error'
          message: Unexpected port status
      - pass:
          when: 'connected'
          message: Port 10250 is open
      - warn:
          message: Unexpected port status
# Port 10257: `connected` passes; `address-in-use` and the condition-less final
# outcome warn; refused, timeout and error fail.
- tcpPortStatus:
    collectorName: 'Kube Controller Manager Health Server TCP Port Status'
    checkName: 'Kube Controller Manager Health Server TCP Port Status'
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
    outcomes:
      - fail:
          when: 'connection-refused'
          message: Connection to port 10257 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
      - warn:
          when: 'address-in-use'
          message: Another process was already listening on port 10257.
      - fail:
          when: 'connection-timeout'
          message: Timed out connecting to port 10257. Check your firewall.
      - fail:
          when: 'error'
          message: Unexpected port status
      - pass:
          when: 'connected'
          message: Port 10257 is available
      - warn:
          message: Unexpected port status
# Port 10259: `connected` passes; `address-in-use` and the condition-less final
# outcome warn; refused, timeout and error fail.
- tcpPortStatus:
    collectorName: 'Kube Scheduler Health Server TCP Port Status'
    checkName: 'Kube Scheduler Health Server TCP Port Status'
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
    outcomes:
      - fail:
          when: 'connection-refused'
          message: Connection to port 10259 was refused. This is likely to be a routing problem since this preflight configures a test server to listen on this port.
      - warn:
          when: 'address-in-use'
          message: Another process was already listening on port 10259.
      - fail:
          when: 'connection-timeout'
          message: Timed out connecting to port 10259. Check your firewall.
      - fail:
          when: 'error'
          message: Unexpected port status
      - pass:
          when: 'connected'
          message: Port 10259 is available
      - warn:
          message: Unexpected port status
# Connectivity to the configured master address: `connected` passes;
# refused, timeout and error all fail. There is no condition-less fallback.
# run the analyzer if 1. there is a master address set AND this is a node joining the cluster AND this is not an EKCO internalLB install
- tcpConnect:
    collectorName: 'Kubernetes API TCP Connection Status'
    checkName: 'Kubernetes API TCP Connection Status'
    exclude: '{{kurl and .Installer.Spec.Kubernetes.Version .Installer.Spec.Kubernetes.MasterAddress .IsJoin (and .Installer.Spec.Ekco.Version .Installer.Spec.Ekco.EnableInternalLoadBalancer | not) | not }}'
    outcomes:
      - fail:
          when: 'connection-refused'
          message: Connection to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }} was refused
      - fail:
          when: 'connection-timeout'
          message: Timed out connecting to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }}
      - fail:
          when: 'error'
          message: Unexpected error connecting to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }}
      - pass:
          when: 'connected'
          message: Successfully connected to the Kubernetes API at address {{kurl .Installer.Spec.Kubernetes.MasterAddress }}
# Filesystem latency: passes when p99 is under 10ms; otherwise the
# condition-less outcome warns. Same exclude expression as the collector.
# The `{{ .P99 }}` / `{{ .String }}` placeholders are not `{{kurl …}}` actions;
# presumably they are filled in by the analyzer - TODO confirm.
- filesystemPerformance:
    exclude: '{{kurl and .IsPrimary (not .IsUpgrade) | not }}'
    collectorName: Filesystem Latency Two Minute Benchmark
    outcomes:
      - pass:
          when: 'p99 < 10ms'
          message: "Write latency is ok (p99 target < 10ms, actual: {{ .P99 }})"
      - warn:
          message: "Write latency is high. p99 target < 10ms, actual:{{ .String }}"
# Clock checks: three NTP states (fail / warn / pass) followed by two timezone
# outcomes (warn when not UTC, pass when UTC).
- time:
    checkName: 'NTP Status'
    outcomes:
      - fail:
          when: 'ntp == unsynchronized+inactive'
          message: 'System clock is not synchronized'
      - warn:
          when: 'ntp == unsynchronized+active'
          message: 'System clock not yet synchronized'
      - pass:
          when: 'ntp == synchronized+active'
          message: 'System clock is synchronized'
      - warn:
          when: 'timezone != UTC'
          message: 'Non UTC timezone can interfere with system function'
      - pass:
          when: 'timezone == UTC'
          message: 'Timezone is set to UTC'
# Docker on Ubuntu 22.04: excluded when no Docker version is set or when the
# version satisfies ">= 20.10.17"; otherwise the single outcome fails on
# ubuntu 22.04.
- hostOS:
    checkName: 'Docker Support'
    exclude: '{{kurl or (not .Installer.Spec.Docker.Version) (semverCompare ">= 20.10.17" .Installer.Spec.Docker.Version) }}'
    outcomes:
      - fail:
          when: 'ubuntu = 22.04'
          message: 'Docker versions < 20.10.17 not supported on ubuntu 22.04'
# hijack hostOS analyzer in order to analyze the kURL Installer spec
# The only outcome is a condition-less fail. The check is excluded unless both
# Weave and containerd versions are set and the containerd version falls in
# the range 1.6.0 - 1.6.4.
- hostOS:
    checkName: 'Containerd and Weave Compatibility'
    exclude: '{{kurl or (not .Installer.Spec.Weave.Version) (not .Installer.Spec.Containerd.Version) (semverCompare "1.6.0 - 1.6.4" .Installer.Spec.Containerd.Version | not) }}'
    outcomes:
      - fail:
          message: 'Weave is not compatible with containerd versions 1.6.0 - 1.6.4'