mirror of
https://github.com/replicatedhq/troubleshoot.git
synced 2026-04-15 07:16:34 +00:00
* Add collect command and remote host collectors
Adds the ability to run a host collector on a set of remote k8s nodes.
Target nodes can be filtered using the --selector flag, with the same
syntax as kubectl. Existing flags for --collector-image,
--collector-pullpolicy and --request-timeout are used. To run on a
specified node, --selector="kubernetes.io/hostname=kind-worker2" could
be used.
The collect command is used by the remote collector to output the
results using a "raw" format, which uses the filename as the key and
the output, as an escaped JSON string, as the value. When run manually
it defaults to fully decoded JSON. The existing block devices,
ipv4interfaces and services host collectors don't decode properly - the
fix is to convert their slice output to a map (fix not included, as it
is unclear what depends on the existing format).
The collect command is also useful for troubleshooting preflight issues.
Examples are included to show remote collector usage.
```
bin/collect --collector-image=croomes/troubleshoot:latest examples/collect/remote/memory.yaml --namespace test
{
"kind-control-plane": {
"system/memory.json": {
"total": 1304207360
}
},
"kind-worker": {
"system/memory.json": {
"total": 1695780864
}
},
"kind-worker2": {
"system/memory.json": {
"total": 1726353408
}
}
}
```
The preflight command has been updated to run remote collectors. To run
a host collector remotely it must be specified in the spec as a
`remoteCollector`:
```
apiVersion: troubleshoot.sh/v1beta2
kind: HostPreflight
metadata:
name: memory
spec:
remoteCollectors:
- memory:
collectorName: memory
analyzers:
- memory:
outcomes:
- fail:
when: "< 8Gi"
message: At least 8Gi of memory is required
- warn:
when: "< 32Gi"
message: At least 32Gi of memory is recommended
- pass:
message: The system has as sufficient memory
```
Results for each node are analyzed separately, with the node name
appended to the title:
```
bin/preflight --interactive=false --collector-image=croomes/troubleshoot:latest examples/preflight/remote/memory.yaml --format=json
{memory running 0 1}
{memory completed 1 1}
{
"fail": [
{
"title": "Amount of Memory (kind-worker2)",
"message": "At least 8Gi of memory is required"
},
{
"title": "Amount of Memory (kind-worker)",
"message": "At least 8Gi of memory is required"
},
{
"title": "Amount of Memory (kind-control-plane)",
"message": "At least 8Gi of memory is required"
}
]
}
```
Also added a host collector to allow preflight checks of required kernel
modules, which is the main driver for this change.
187 lines
4.8 KiB
Go
187 lines
4.8 KiB
Go
package collect
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"encoding/json"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/pkg/errors"
|
|
troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
|
|
)
|
|
|
|
// Status values reported for a kernel module. They are untyped string
// constants, so they can be used both as plain strings and as
// KernelModuleStatus values.
const (
	// KernelModuleLoadable marks a module file found under
	// /lib/modules/<kernel release> that is not otherwise reported by the
	// kernel.
	KernelModuleLoadable = "loadable"

	// KernelModuleLoading corresponds to the "Loading" state in /proc/modules.
	KernelModuleLoading = "loading"

	// KernelModuleLoaded corresponds to the "Live" state in /proc/modules.
	KernelModuleLoaded = "loaded"

	// KernelModuleUnloading corresponds to the "Unloading" state in
	// /proc/modules.
	KernelModuleUnloading = "unloading"

	// KernelModuleUnknown is used when /proc/modules reports a state that is
	// none of the above.
	KernelModuleUnknown = "unknown"
)
|
|
|
|
// KernelModuleStatus is the state of a kernel module as reported in
// KernelModuleInfo. The collectors in this file only ever assign one of the
// KernelModule* constants to it.
type KernelModuleStatus string
|
|
|
|
type KernelModuleInfo struct {
|
|
Size uint64 `json:"size"`
|
|
Instances uint `json:"instances"`
|
|
Status KernelModuleStatus `json:"status"`
|
|
}
|
|
|
|
// kernelModuleCollector defines the interface used to collect modules from the
|
|
// underlying host.
|
|
type kernelModuleCollector interface {
|
|
collect() (map[string]KernelModuleInfo, error)
|
|
}
|
|
|
|
// CollectHostKernelModules is responsible for collecting kernel module status
|
|
// from the host.
|
|
type CollectHostKernelModules struct {
|
|
hostCollector *troubleshootv1beta2.HostKernelModules
|
|
loadable kernelModuleCollector
|
|
loaded kernelModuleCollector
|
|
}
|
|
|
|
// Title is the name of the collector.
|
|
func (c *CollectHostKernelModules) Title() string {
|
|
return hostCollectorTitleOrDefault(c.hostCollector.HostCollectorMeta, "Kernel Modules")
|
|
}
|
|
|
|
// IsExcluded returns true if the collector has been excluded from the results.
|
|
func (c *CollectHostKernelModules) IsExcluded() (bool, error) {
|
|
return isExcluded(c.hostCollector.Exclude)
|
|
}
|
|
|
|
// Collect the kernel module status from the host. Modules are returned as a
|
|
// map keyed on the module name used by the kernel, e.g:
|
|
//
|
|
// {
|
|
// "system/kernel_modules.json": {
|
|
// ...
|
|
// "dm_snapshot": {
|
|
// "instances": 8,
|
|
// "size": 45056,
|
|
// "status": "loaded"
|
|
// },
|
|
// ...
|
|
// },
|
|
// }
|
|
//
|
|
// Module status may be: loaded, loadable, loading, unloading or unknown. When
|
|
// a module is loaded, it may have one or more instances. The size represents
|
|
// the amount of memory (in bytes) that the module is using.
|
|
func (c *CollectHostKernelModules) Collect(progressChan chan<- interface{}) (map[string][]byte, error) {
|
|
modules, err := c.loadable.collect()
|
|
if err != nil {
|
|
return nil, errors.Wrap(err, "failed to read loadable kernel modules")
|
|
}
|
|
if modules == nil {
|
|
modules = map[string]KernelModuleInfo{}
|
|
}
|
|
loaded, err := c.loaded.collect()
|
|
if err != nil {
|
|
return nil, errors.Wrap(err, "failed to read loaded kernel modules")
|
|
}
|
|
|
|
// Overlay with loaded modules.
|
|
for name, module := range loaded {
|
|
modules[name] = module
|
|
}
|
|
|
|
b, err := json.Marshal(modules)
|
|
if err != nil {
|
|
return nil, errors.Wrap(err, "failed to marshal kernel modules")
|
|
}
|
|
|
|
return map[string][]byte{
|
|
"system/kernel_modules.json": b,
|
|
}, nil
|
|
}
|
|
|
|
// kernelModulesLoadable is the module source that reports the modules
// available on disk for the kernel to load. It carries no state.
type kernelModulesLoadable struct{}
|
|
|
|
// collect the list of modules that can be loaded by the kernel.
|
|
func (l kernelModulesLoadable) collect() (map[string]KernelModuleInfo, error) {
|
|
modules := make(map[string]KernelModuleInfo)
|
|
|
|
out, err := exec.Command("uname", "-r").Output()
|
|
if err != nil {
|
|
return nil, errors.Wrap(err, "failed to determine kernel release")
|
|
}
|
|
kernel := strings.TrimSpace(string(out))
|
|
|
|
cmd := exec.Command("/usr/bin/find", "/lib/modules/"+kernel, "-type", "f", "-name", "*.ko*")
|
|
stdout, err := cmd.Output()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
buf := bytes.NewBuffer(stdout)
|
|
scanner := bufio.NewScanner(buf)
|
|
|
|
for scanner.Scan() {
|
|
_, file := filepath.Split(scanner.Text())
|
|
name := strings.TrimSuffix(file, filepath.Ext(file))
|
|
|
|
if name == "" {
|
|
continue
|
|
}
|
|
|
|
modules[name] = KernelModuleInfo{
|
|
Status: KernelModuleLoadable,
|
|
}
|
|
}
|
|
return modules, nil
|
|
}
|
|
|
|
// kernelModulesLoaded is the module source that reports the modules the
// kernel itself lists in /proc/modules. Each reported module is in the
// loaded, loading or unloading state, or unknown if the state is not
// recognised. It carries no state.
type kernelModulesLoaded struct{}
|
|
|
|
// collect the list of modules that the kernel is aware of.
|
|
func (l kernelModulesLoaded) collect() (map[string]KernelModuleInfo, error) {
|
|
modules := make(map[string]KernelModuleInfo)
|
|
|
|
file, err := os.Open("/proc/modules")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer file.Close()
|
|
scanner := bufio.NewScanner(file)
|
|
|
|
for scanner.Scan() {
|
|
s := strings.Split(scanner.Text(), " ")
|
|
name, size, instances, state := s[0], s[1], s[2], s[4]
|
|
|
|
sizeInt, err := strconv.Atoi(size)
|
|
if err != nil {
|
|
sizeInt = 0
|
|
}
|
|
|
|
instancesInt, err := strconv.Atoi(instances)
|
|
if err != nil {
|
|
instancesInt = 0
|
|
}
|
|
|
|
var status KernelModuleStatus = KernelModuleUnknown
|
|
switch state {
|
|
case "Live":
|
|
status = KernelModuleLoaded
|
|
case "Loading":
|
|
status = KernelModuleLoading
|
|
case "Unloading":
|
|
status = KernelModuleUnloading
|
|
}
|
|
|
|
modules[name] = KernelModuleInfo{
|
|
Size: uint64(sizeInt),
|
|
Instances: uint(instancesInt),
|
|
Status: status,
|
|
}
|
|
}
|
|
return modules, nil
|
|
}
|