* Add collect command and remote host collectors
Adds the ability to run a host collector on a set of remote k8s nodes.
Target nodes can be filtered using the --selector flag, with the same
syntax as kubectl. Existing flags for --collector-image,
--collector-pullpolicy, and --request-timeout are reused. To run on a
specific node, --selector="kubernetes.io/hostname=kind-worker2" could
be used, as in the sketch below.
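For example, an illustrative invocation (not from the source, but combining the flags described above with the example spec used later) that limits collection to a single node:
```
bin/collect --selector="kubernetes.io/hostname=kind-worker2" --collector-image=croomes/troubleshoot:latest examples/collect/remote/memory.yaml --namespace test
```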
The collect command is used by the remote collector to output the
results in a "raw" format, which uses the filename as the key and the
output as an escaped JSON string for the value. When run manually, it
defaults to fully decoded JSON. The existing block devices,
ipv4interfaces, and services host collectors don't decode properly; the
fix is to convert their slice output to a map (not included here, as it
is unclear what depends on the existing format).
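A sketch of the difference for a single file entry, using the values from the memory example below:
```
raw:     {"system/memory.json": "{\"total\":1304207360}"}
decoded: {"system/memory.json": {"total": 1304207360}}
```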
The collect command is also useful for troubleshooting preflight issues.
Examples are included to show remote collector usage.
```
bin/collect --collector-image=croomes/troubleshoot:latest examples/collect/remote/memory.yaml --namespace test
{
  "kind-control-plane": {
    "system/memory.json": {
      "total": 1304207360
    }
  },
  "kind-worker": {
    "system/memory.json": {
      "total": 1695780864
    }
  },
  "kind-worker2": {
    "system/memory.json": {
      "total": 1726353408
    }
  }
}
```
The preflight command has been updated to run remote collectors. To run
a host collector remotely, it must be specified in the spec under
`remoteCollectors`:
```
apiVersion: troubleshoot.sh/v1beta2
kind: HostPreflight
metadata:
  name: memory
spec:
  remoteCollectors:
    - memory:
        collectorName: memory
  analyzers:
    - memory:
        outcomes:
          - fail:
              when: "< 8Gi"
              message: At least 8Gi of memory is required
          - warn:
              when: "< 32Gi"
              message: At least 32Gi of memory is recommended
          - pass:
              message: The system has sufficient memory
```
Results for each node are analyzed separately, with the node name
appended to the title:
```
bin/preflight --interactive=false --collector-image=croomes/troubleshoot:latest examples/preflight/remote/memory.yaml --format=json
{memory running 0 1}
{memory completed 1 1}
{
  "fail": [
    {
      "title": "Amount of Memory (kind-worker2)",
      "message": "At least 8Gi of memory is required"
    },
    {
      "title": "Amount of Memory (kind-worker)",
      "message": "At least 8Gi of memory is required"
    },
    {
      "title": "Amount of Memory (kind-control-plane)",
      "message": "At least 8Gi of memory is required"
    }
  ]
}
```
Also added a host collector to allow preflight checks of required kernel
modules, which is the main driver for this change.
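A minimal sketch of how that collector might appear in a spec, assuming it is exposed as `kernelModules` (the field name is not confirmed in this excerpt):
```
spec:
  remoteCollectors:
    - kernelModules: {}
```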
```go
package collect

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"reflect"
	"regexp"
	"strings"

	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// DeterministicIDForCollector returns a stable, RFC 1035-safe identifier for
// a collector, derived from its type and distinguishing fields.
func DeterministicIDForCollector(collector *troubleshootv1beta2.Collect) string {
	unsafeID := ""

	if collector.ClusterInfo != nil {
		unsafeID = "cluster-info"
	}

	if collector.ClusterResources != nil {
		unsafeID = "cluster-resources"
	}

	if collector.Secret != nil {
		if collector.Secret.Name != "" {
			unsafeID = fmt.Sprintf("secret-%s-%s", collector.Secret.Namespace, collector.Secret.Name)
		} else {
			unsafeID = fmt.Sprintf("secret-%s-%s", collector.Secret.Namespace, selectorToString(collector.Secret.Selector))
		}
	}

	if collector.ConfigMap != nil {
		if collector.ConfigMap.Name != "" {
			unsafeID = fmt.Sprintf("configmap-%s-%s", collector.ConfigMap.Namespace, collector.ConfigMap.Name)
		} else {
			unsafeID = fmt.Sprintf("configmap-%s-%s", collector.ConfigMap.Namespace, selectorToString(collector.ConfigMap.Selector))
		}
	}

	if collector.Logs != nil {
		unsafeID = fmt.Sprintf("logs-%s-%s", collector.Logs.Namespace, selectorToString(collector.Logs.Selector))
	}

	if collector.Run != nil {
		unsafeID = "run"
		if collector.Run.CollectorName != "" {
			unsafeID = fmt.Sprintf("%s-%s", unsafeID, strings.ToLower(collector.Run.CollectorName))
		}
	}

	if collector.Exec != nil {
		unsafeID = "exec"
		if collector.Exec.CollectorName != "" {
			unsafeID = fmt.Sprintf("%s-%s", unsafeID, strings.ToLower(collector.Exec.CollectorName))
		}
	}

	if collector.Copy != nil {
		unsafeID = fmt.Sprintf("copy-%s-%s", selectorToString(collector.Copy.Selector), pathToString(collector.Copy.ContainerPath))
	}

	if collector.HTTP != nil {
		unsafeID = "http"
		if collector.HTTP.CollectorName != "" {
			unsafeID = fmt.Sprintf("%s-%s", unsafeID, strings.ToLower(collector.HTTP.CollectorName))
		}
	}

	return rfc1035(unsafeID)
}

// selectorToString flattens a label selector into a dash-separated string.
func selectorToString(selector []string) string {
	return strings.Replace(strings.Join(selector, "-"), "=", "-", -1)
}

// pathToString converts a file path into a dash-separated string.
func pathToString(path string) string {
	return strings.Replace(path, "/", "-", -1)
}

// rfc1035 replaces any character that is not valid in an RFC 1035 label with
// a dash and truncates the result to the 63-character limit.
func rfc1035(in string) string {
	reg := regexp.MustCompile("[^a-z0-9\\-]+")
	out := reg.ReplaceAllString(in, "-")

	if len(out) > 63 {
		out = out[:63]
	}

	return out
}

// marshalErrors is used for error maps and arrays. These are guaranteed not
// to result in an error when marshaling.
func marshalErrors(errors interface{}) io.Reader {
	if errors == nil {
		return nil
	}

	val := reflect.ValueOf(errors)
	switch val.Kind() {
	case reflect.Array, reflect.Slice, reflect.Map:
		if val.Len() == 0 {
			return nil
		}
	}

	m, _ := json.MarshalIndent(errors, "", " ")
	return bytes.NewBuffer(m)
}

// listNodesNamesInSelector returns the names of the nodes matching the label
// selector.
func listNodesNamesInSelector(ctx context.Context, client *kubernetes.Clientset, selector string) ([]string, error) {
	var names []string
	nodes, err := listNodesInSelector(ctx, client, selector)
	if err != nil {
		return nil, err
	}
	for _, node := range nodes {
		names = append(names, node.GetName())
	}
	return names, nil
}

// listNodesInSelector returns the nodes matching the label selector.
func listNodesInSelector(ctx context.Context, client *kubernetes.Clientset, selector string) ([]corev1.Node, error) {
	listOptions := metav1.ListOptions{
		LabelSelector: selector,
	}

	nodes, err := client.CoreV1().Nodes().List(ctx, listOptions)
	if err != nil {
		return nil, fmt.Errorf("Can't get the list of nodes, got: %w", err)
	}

	return nodes.Items, nil
}
```
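For reference, a minimal sketch of how `DeterministicIDForCollector` behaves for a logs collector; the expected output is an illustration, not taken from the source:
```go
package main

import (
	"fmt"

	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
	"github.com/replicatedhq/troubleshoot/pkg/collect"
)

func main() {
	// A logs collector scoped by namespace and label selector.
	c := &troubleshootv1beta2.Collect{
		Logs: &troubleshootv1beta2.Logs{
			Namespace: "kube-system",
			Selector:  []string{"app=nginx"},
		},
	}

	// The "=" in the selector is flattened to "-", then rfc1035 strips
	// anything outside [a-z0-9-] and truncates to 63 characters.
	fmt.Println(collect.DeterministicIDForCollector(c))
	// expected: logs-kube-system-app-nginx
}
```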