From 100f9a13b60c2e14b6231abc4a1331bb4d6c231e Mon Sep 17 00:00:00 2001 From: Evans Mungai Date: Tue, 7 Feb 2023 09:50:21 +0000 Subject: [PATCH] feat: Record summary of execution times of support bundle operations (collect/redact/analyse) (#935) When running a support bundle, we want to know how long each operation (collect, redact, analyze) takes. This commit adds a new trace exporter that records the start and end times of each operation, and then prints a summary of the execution. The summary is also stored in the support bundle. Related to #923 --- Makefile | 15 +- cmd/preflight/cli/root.go | 16 +- cmd/troubleshoot/cli/root.go | 17 +- cmd/troubleshoot/cli/version.go | 2 +- .../support-bundle/sample-supportbundle.yaml | 2 + go.mod | 6 +- go.sum | 9 + internal/traces/exporter.go | 273 ++++++++++++++++++ internal/traces/exporter_test.go | 187 ++++++++++++ internal/traces/otel.go | 60 ++++ pkg/analyze/analyzer.go | 34 ++- pkg/analyze/analyzer_test.go | 3 +- pkg/analyze/download.go | 20 +- pkg/collect/host_udpportstatus_test.go | 4 - pkg/constants/constants.go | 10 +- pkg/preflight/analyze.go | 19 +- pkg/preflight/collect.go | 59 +++- pkg/preflight/run.go | 30 +- pkg/supportbundle/collect.go | 45 ++- pkg/supportbundle/supportbundle.go | 44 ++- 20 files changed, 795 insertions(+), 60 deletions(-) create mode 100644 internal/traces/exporter.go create mode 100644 internal/traces/exporter_test.go create mode 100644 internal/traces/otel.go diff --git a/Makefile b/Makefile index 19600434..e472bf82 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,7 @@ define LDFLAGS endef BUILDFLAGS = -tags "netgo containers_image_ostree_stub exclude_graphdriver_devicemapper exclude_graphdriver_btrfs containers_image_openpgp" -installsuffix netgo +BUILDPATHS = ./pkg/... ./cmd/... ./internal/... all: test support-bundle preflight collect analyze @@ -45,16 +46,16 @@ ffi: fmt vet .PHONY: test test: generate fmt vet if [ -n $(RUN) ]; then \ - go test ${BUILDFLAGS} ./pkg/... ./cmd/... -coverprofile cover.out -run $(RUN); \ + go test ${BUILDFLAGS} ${BUILDPATHS} -coverprofile cover.out -run $(RUN); \ else \ - go test ${BUILDFLAGS} ./pkg/... ./cmd/... -coverprofile cover.out; \ + go test ${BUILDFLAGS} ${BUILDPATHS} -coverprofile cover.out; \ fi # Go tests that require a K8s instance # TODOLATER: merge with test, so we get unified coverage reports? it'll add 21~sec to the test job though... .PHONY: test-integration test-integration: - go test -v --tags "integration exclude_graphdriver_devicemapper exclude_graphdriver_btrfs" ./pkg/... ./cmd/... + go test -v --tags "integration exclude_graphdriver_devicemapper exclude_graphdriver_btrfs" ${BUILDPATHS} .PHONY: preflight-e2e-test preflight-e2e-test: @@ -82,11 +83,11 @@ collect: .PHONY: fmt fmt: - go fmt ./pkg/... ./cmd/... + go fmt ${BUILDPATHS} .PHONY: vet vet: - go vet ${BUILDFLAGS} ./pkg/... ./cmd/... + go vet ${BUILDFLAGS} ${BUILDPATHS} .PHONY: generate generate: controller-gen client-gen @@ -213,8 +214,8 @@ scan: .PHONY: lint lint: - golangci-lint run --new -c .golangci.yaml pkg/... cmd/... + golangci-lint run --new -c .golangci.yaml ${BUILDPATHS} .PHONY: lint-and-fix lint-and-fix: - golangci-lint run --new --fix -c .golangci.yaml pkg/... cmd/... + golangci-lint run --new --fix -c .golangci.yaml ${BUILDPATHS} diff --git a/cmd/preflight/cli/root.go b/cmd/preflight/cli/root.go index 41dd9bca..12d91c61 100644 --- a/cmd/preflight/cli/root.go +++ b/cmd/preflight/cli/root.go @@ -1,11 +1,13 @@ package cli import ( + "fmt" "os" "strings" "github.com/go-logr/logr" "github.com/replicatedhq/troubleshoot/cmd/util" + "github.com/replicatedhq/troubleshoot/internal/traces" "github.com/replicatedhq/troubleshoot/pkg/k8sutil" "github.com/replicatedhq/troubleshoot/pkg/logger" "github.com/replicatedhq/troubleshoot/pkg/preflight" @@ -37,7 +39,19 @@ that a cluster meets the requirements to run an application.`, }, RunE: func(cmd *cobra.Command, args []string) error { v := viper.GetViper() - return preflight.RunPreflights(v.GetBool("interactive"), v.GetString("output"), v.GetString("format"), args) + closer, err := traces.ConfigureTracing("preflight") + if err != nil { + // Do not fail running preflights if tracing fails + logger.Printf("Failed to initialize open tracing provider: %v", err) + } else { + defer closer() + } + + err = preflight.RunPreflights(v.GetBool("interactive"), v.GetString("output"), v.GetString("format"), args) + if v.GetBool("debug") { + fmt.Printf("\n%s", traces.GetExporterInstance().GetSummary()) + } + return err }, PostRun: func(cmd *cobra.Command, args []string) { if err := util.StopProfiling(); err != nil { diff --git a/cmd/troubleshoot/cli/root.go b/cmd/troubleshoot/cli/root.go index 3e4f2a71..0e52b4b6 100644 --- a/cmd/troubleshoot/cli/root.go +++ b/cmd/troubleshoot/cli/root.go @@ -1,11 +1,13 @@ package cli import ( + "fmt" "os" "strings" "github.com/go-logr/logr" "github.com/replicatedhq/troubleshoot/cmd/util" + "github.com/replicatedhq/troubleshoot/internal/traces" "github.com/replicatedhq/troubleshoot/pkg/k8sutil" "github.com/replicatedhq/troubleshoot/pkg/logger" "github.com/spf13/cobra" @@ -40,7 +42,20 @@ from a server that can be used to assist when troubleshooting a Kubernetes clust RunE: func(cmd *cobra.Command, args []string) error { v := viper.GetViper() - return runTroubleshoot(v, args) + closer, err := traces.ConfigureTracing("support-bundle") + if err != nil { + // Do not fail running support-bundle if tracing fails + logger.Printf("Failed to initialize open tracing provider: %v", err) + } else { + defer closer() + } + + err = runTroubleshoot(v, args) + if v.GetBool("debug") { + fmt.Printf("\n%s", traces.GetExporterInstance().GetSummary()) + } + + return err }, PersistentPostRun: func(cmd *cobra.Command, args []string) { if err := util.StopProfiling(); err != nil { diff --git a/cmd/troubleshoot/cli/version.go b/cmd/troubleshoot/cli/version.go index 7baf5805..38bb26d1 100644 --- a/cmd/troubleshoot/cli/version.go +++ b/cmd/troubleshoot/cli/version.go @@ -40,7 +40,7 @@ func writeVersionFile(path string) error { return err } - filename := filepath.Join(path, constants.VersionFilename) + filename := filepath.Join(path, constants.VERSION_FILENAME) err = ioutil.WriteFile(filename, b, 0644) if err != nil { return err diff --git a/examples/support-bundle/sample-supportbundle.yaml b/examples/support-bundle/sample-supportbundle.yaml index 3baf560c..c395b0dd 100644 --- a/examples/support-bundle/sample-supportbundle.yaml +++ b/examples/support-bundle/sample-supportbundle.yaml @@ -3,6 +3,8 @@ kind: SupportBundle metadata: name: example spec: + hostCollectors: + - hostOS: {} collectors: - logs: selector: diff --git a/go.mod b/go.mod index 4e76bca1..f857d5c4 100644 --- a/go.mod +++ b/go.mod @@ -34,6 +34,8 @@ require ( github.com/spf13/viper v1.15.0 github.com/stretchr/testify v1.8.1 github.com/tj/go-spin v1.1.0 + go.opentelemetry.io/otel v1.12.0 + go.opentelemetry.io/otel/sdk v1.11.2 golang.org/x/sync v0.1.0 gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.26.1 @@ -53,6 +55,7 @@ require ( github.com/docker/distribution v2.8.1+incompatible // indirect github.com/emicklei/go-restful/v3 v3.9.0 // indirect github.com/evanphx/json-patch/v5 v5.6.0 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/googleapis/enterprise-certificate-proxy v0.2.1 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20200714003250-2b9c44734f2b // indirect @@ -61,6 +64,7 @@ require ( github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sylabs/sif/v2 v2.9.0 // indirect + go.opentelemetry.io/otel/trace v1.12.0 // indirect golang.org/x/mod v0.7.0 // indirect golang.org/x/tools v0.4.0 // indirect ) @@ -186,7 +190,7 @@ require ( golang.org/x/oauth2 v0.4.0 // indirect golang.org/x/sys v0.4.0 // indirect golang.org/x/term v0.4.0 // indirect - golang.org/x/text v0.6.0 // indirect + golang.org/x/text v0.6.0 golang.org/x/time v0.3.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/api v0.107.0 // indirect diff --git a/go.sum b/go.sum index 27020945..26517275 100644 --- a/go.sum +++ b/go.sum @@ -399,8 +399,11 @@ github.com/go-logfmt/logfmt v0.5.1/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KE github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= github.com/go-logr/logr v0.2.0/go.mod h1:z6/tIYblkpsD+a4lm/fGIIU9mZ+XfAiaFtq7xTgseGU= github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.2.3 h1:a9vnzlIBPQBBkeaR9IuMUfmVOrQlkoC4YfPoFkX3T7A= github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= @@ -1031,6 +1034,12 @@ go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= +go.opentelemetry.io/otel v1.12.0 h1:IgfC7kqQrRccIKuB7Cl+SRUmsKbEwSGPr0Eu+/ht1SQ= +go.opentelemetry.io/otel v1.12.0/go.mod h1:geaoz0L0r1BEOR81k7/n9W4TCXYCJ7bPO7K374jQHG0= +go.opentelemetry.io/otel/sdk v1.11.2 h1:GF4JoaEx7iihdMFu30sOyRx52HDHOkl9xQ8SMqNXUiU= +go.opentelemetry.io/otel/sdk v1.11.2/go.mod h1:wZ1WxImwpq+lVRo4vsmSOxdd+xwoUJ6rqyLc3SyX9aU= +go.opentelemetry.io/otel/trace v1.12.0 h1:p28in++7Kd0r2d8gSt931O57fdjUyWxkVbESuILAeUc= +go.opentelemetry.io/otel/trace v1.12.0/go.mod h1:pHlgBynn6s25qJ2szD+Bv+iwKJttjHSI3lUAyf0GNuQ= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 h1:+FNtrFTmVw0YZGpBGX56XDee331t6JAXeK2bcyhLOOc= go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5/go.mod h1:nmDLcffg48OtT/PSW0Hg7FvpRQsQh5OSqIylirxKC7o= diff --git a/internal/traces/exporter.go b/internal/traces/exporter.go new file mode 100644 index 00000000..8f9d8271 --- /dev/null +++ b/internal/traces/exporter.go @@ -0,0 +1,273 @@ +package traces + +import ( + "context" + "sort" + "strings" + "sync" + "time" + + "github.com/replicatedhq/troubleshoot/pkg/constants" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/sdk/trace" + "go.opentelemetry.io/otel/sdk/trace/tracetest" + "golang.org/x/text/language" + "golang.org/x/text/message" +) + +var ( + _ trace.SpanExporter = (*Exporter)(nil) + once sync.Once + exporter *Exporter + printer = message.NewPrinter(language.English) +) + +const legend = "Suceeded (S), eXcluded (X), Failed (F)\n" + +// FUTURE WORK: This exporter should only be used by troubleshoot CLIs +// until the following issue is addressed: +// 1. The cache of spans grows infinitely at the moment. This is OK for short lived +// invocations such as CLI applications running one-shot commands. For long running +// applications, this will be a problem. +// * We can drop non-troubleshoot spans from the cache. There is an OTEP to add +// "inheritable attributes" to spans which is unsupported as of today. Once/if +// it gets accepted and implemented, it will allow us to add a "troubleshoot" +// attribute to the "root" span which gets propagated to downstream spans which +// we can use to filter out non-troubleshoot spans. +// https://github.com/dynatrace-oss-contrib/oteps/blob/feature/context-scoped-attributes/text/0207-context-scoped-attributes.md +// https://github.com/open-telemetry/opentelemetry-specification/issues/1337 +// * We can "search" for the "root" span by traversing up the span tree and drop +// all spans that are not descendants of the "root" span. This can be slow, but +// we can do it in a background goroutine (adds a bit of complexity). +// 2. At the moment, the summary of an execution is the only case this exporter +// is being used for. + +// GetExporterInstance creates a singleton exporter instance +func GetExporterInstance() *Exporter { + once.Do(func() { + exporter = &Exporter{ + allSpans: make([]trace.ReadOnlySpan, 1024), + } + }) + return exporter +} + +// Exporter is an implementation of trace.SpanExporter that writes to a destination. +type Exporter struct { + spansMu sync.Mutex + allSpans []trace.ReadOnlySpan + + stoppedMu sync.RWMutex + stopped bool +} + +// ExportSpans writes spans to an in-memory cache +// This function can/will be called on every span.End() at worst. +func (e *Exporter) ExportSpans(ctx context.Context, spans []trace.ReadOnlySpan) error { + // This is a no-op if the context is canceled. + if ctx.Err() != nil { + return ctx.Err() + } + + e.stoppedMu.RLock() + stopped := e.stopped + e.stoppedMu.RUnlock() + if stopped { + return nil + } + + if len(spans) == 0 { + return nil + } + + e.spansMu.Lock() + defer e.spansMu.Unlock() + + // Cache received span updates allSpans + e.allSpans = append(e.allSpans, spans...) + + return nil +} + +func isType(stub *tracetest.SpanStub, t string) bool { + if stub == nil { + return false + } + + for _, attr := range stub.Attributes { + if string(attr.Key) == "type" && strings.Contains(attr.Value.AsString(), t) { + return true + } + } + return false +} + +func formatStubName(stub *tracetest.SpanStub) string { + if stub == nil { + return "" + } + + for _, attr := range stub.Attributes { + if stub.Status.Code == codes.Error { + return stub.Name + " (F)" + } + if string(attr.Key) == constants.EXCLUDED && attr.Value.AsBool() { + return stub.Name + " (X)" + } + } + return stub.Name + " (S)" // Assume success +} + +// maxInt returns the larger of x or y. +func maxInt(x, y int) int { + if x < y { + return y + } + return x +} + +// GetSummary returns the runtime summary of the execution +// so far. Call this function after your "root" span has ended +// and the program operations needing tracing have completed. +func (e *Exporter) GetSummary() string { + e.spansMu.Lock() + stubs := tracetest.SpanStubsFromReadOnlySpans(e.allSpans) + e.spansMu.Unlock() + + // No spans to log + if len(stubs) == 0 { + return "" + } + + // TODO: We may want to collect more information about the + // execution of the program. For example, we can collect + // the number of times a collector was executed, whether + // it was successful or not, if it was skipped, etc. + collectors := make(map[string]time.Duration) + redactors := make(map[string]time.Duration) + analysers := make(map[string]time.Duration) + + totalDuration := time.Duration(0) + + for i := range stubs { + stub := &stubs[i] + + // Summary of span stubs + duration := stub.EndTime.Sub(stub.StartTime) + stubName := formatStubName(stub) + switch { + case stub.Name == constants.TROUBLESHOOT_ROOT_SPAN_NAME: + totalDuration = duration + case isType(stub, "Collect"): + collectors[stubName] = duration + case isType(stub, "Redactors"): + redactors[stub.Name] = duration + case isType(stub, "Analyze"): + analysers[stubName] = duration + default: + continue + } + } + + sb := strings.Builder{} + + collectorsSummary(collectors, &sb) + redactorsSummary(redactors, &sb) + analysersSummary(analysers, &sb) + sb.WriteString(printer.Sprintf("\nDuration: %dms\n", totalDuration/time.Millisecond)) + + return sb.String() +} + +// summary of collector runtimes +func collectorsSummary(summary map[string]time.Duration, sb *strings.Builder) { + padding, keys := sortedKeysAndPadding(summary) + + sb.WriteString("============ Collectors summary =============\n") + sb.WriteString(legend) + sb.WriteString("=============================================\n") + if len(summary) == 0 { + sb.WriteString("No collectors executed\n") + return + } + + for _, name := range keys { + sb.WriteString(printer.Sprintf("%-*s : %dms\n", padding, name, summary[name]/time.Millisecond)) + } +} + +// summary of redactor runtime +func redactorsSummary(summary map[string]time.Duration, sb *strings.Builder) { + padding, keys := sortedKeysAndPadding(summary) + + sb.WriteString("\n============ Redactors summary =============\n") + if len(summary) == 0 { + sb.WriteString("No redactors executed\n") + return + } + + for _, name := range keys { + sb.WriteString(printer.Sprintf("%-*s : %dms\n", padding, name, summary[name]/time.Millisecond)) + } +} + +// summary of analyser runtime +func analysersSummary(summary map[string]time.Duration, sb *strings.Builder) { + padding, keys := sortedKeysAndPadding(summary) + + sb.WriteString("\n============= Analyzers summary =============\n") + sb.WriteString(legend) + sb.WriteString("=============================================\n") + if len(summary) == 0 { + sb.WriteString("No analyzers executed\n") + return + } + + for _, name := range keys { + sb.WriteString(printer.Sprintf("%-*s : %dms\n", padding, name, summary[name]/time.Millisecond)) + } +} + +func sortedKeysAndPadding(summary map[string]time.Duration) (int, []string) { + keys := make([]string, 0, len(summary)) + padding := 0 + for k := range summary { + padding = maxInt(padding, len(k)) + keys = append(keys, k) + } + sort.SliceStable(keys, func(l, r int) bool { + return summary[keys[l]] > summary[keys[r]] + }) + return padding, keys +} + +// Shutdown is called to stop the exporter, it preforms no action. +func (e *Exporter) Shutdown(ctx context.Context) error { + e.stoppedMu.Lock() + e.stopped = true + e.stoppedMu.Unlock() + + e.Reset() + + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + return nil +} + +func (e *Exporter) Reset() { + e.spansMu.Lock() + e.allSpans = e.allSpans[:0] // clear the slice + e.spansMu.Unlock() +} + +// MarshalLog is the marshaling function used by the logging system to represent this exporter. +func (e *Exporter) MarshalLog() interface{} { + return struct { + Type string + }{ + Type: "troubleshoot", + } +} diff --git a/internal/traces/exporter_test.go b/internal/traces/exporter_test.go new file mode 100644 index 00000000..e7fa9ae7 --- /dev/null +++ b/internal/traces/exporter_test.go @@ -0,0 +1,187 @@ +package traces + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + "github.com/replicatedhq/troubleshoot/pkg/constants" + "github.com/replicatedhq/troubleshoot/pkg/logger" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/sdk/trace" + "go.opentelemetry.io/otel/sdk/trace/tracetest" +) + +func TestExporter_GetSummary(t *testing.T) { + logger.SetQuiet(true) + + tests := []struct { + name string + spans tracetest.SpanStubs + want string + }{ + { + name: "with no spans", + spans: tracetest.SpanStubs{}, + want: "", + }, + { + name: "with root span only", + spans: tracetest.SpanStubs{ + tracetest.SpanStub{ + Name: constants.TROUBLESHOOT_ROOT_SPAN_NAME, + StartTime: time.Now(), + EndTime: time.Now().Add(time.Second), + }, + }, + want: "Duration: 1,000ms", + }, + { + name: "with collectors", + spans: tracetest.SpanStubs{ + tracetest.SpanStub{ + Name: "all-logs", StartTime: time.Now(), EndTime: time.Now().Add(time.Minute), + Attributes: []attribute.KeyValue{ + attribute.String("type", "*collect.CollectLogs"), + }, + }, + tracetest.SpanStub{ + Name: "host-os", StartTime: time.Now(), EndTime: time.Now().Add(time.Second), + Attributes: []attribute.KeyValue{ + attribute.String("type", "*collect.CollectHostOS"), + }, + }, + tracetest.SpanStub{ + Name: "excluded-collector", StartTime: time.Now(), EndTime: time.Now().Add(time.Millisecond * 2), + Attributes: []attribute.KeyValue{ + attribute.String("type", "*collect.CollectHostOS"), + attribute.Bool("excluded", true), + }, + }, + tracetest.SpanStub{ + Name: "failed-collector", StartTime: time.Now(), EndTime: time.Now().Add(time.Millisecond), + Attributes: []attribute.KeyValue{ + attribute.String("type", "*collect.CollectHostOS"), + }, + Status: trace.Status{ + Code: codes.Error, + Description: "some error", + }, + }, + }, + want: ` +============ Collectors summary ============= +Suceeded (S), eXcluded (X), Failed (F) +============================================= +all-logs (S) : 60,000ms +host-os (S) : 1,000ms +excluded-collector (X) : 2ms +failed-collector (F) : 1ms`, + }, + { + name: "with analyzers", + spans: tracetest.SpanStubs{ + tracetest.SpanStub{ + Name: "cluster-version", StartTime: time.Now(), EndTime: time.Now().Add(time.Second), + Attributes: []attribute.KeyValue{ + attribute.String("type", "*analyzer.AnalyzeClusterVersion"), + }, + }, + tracetest.SpanStub{ + Name: "host-cpu", StartTime: time.Now(), EndTime: time.Now().Add(time.Minute), + Attributes: []attribute.KeyValue{ + attribute.String("type", "*analyzer.AnalyzeHostCPU"), + }, + }, + tracetest.SpanStub{ + Name: "excluded-analyser", StartTime: time.Now(), EndTime: time.Now().Add(time.Millisecond * 2), + Attributes: []attribute.KeyValue{ + attribute.String("type", "*collect.AnalyzeHostCPU"), + attribute.Bool("excluded", true), + }, + }, + tracetest.SpanStub{ + Name: "failed-analyser", StartTime: time.Now(), EndTime: time.Now().Add(time.Millisecond), + Attributes: []attribute.KeyValue{ + attribute.String("type", "*collect.AnalyzeHostCPU"), + }, + Status: trace.Status{ + Code: codes.Error, + Description: "some error", + }, + }, + }, + want: ` +============= Analyzers summary ============= +Suceeded (S), eXcluded (X), Failed (F) +============================================= +host-cpu (S) : 60,000ms +cluster-version (S) : 1,000ms +excluded-analyser (X) : 2ms +failed-analyser (F) : 1ms`, + }, + { + name: "with redactors", + spans: tracetest.SpanStubs{ + tracetest.SpanStub{ + Name: "cluster redactor", StartTime: time.Now(), EndTime: time.Now().Add(time.Second), + Attributes: []attribute.KeyValue{ + attribute.String("type", "Redactors"), + }, + }, + }, + want: ` +============ Redactors summary ============= +cluster redactor : 1,000ms`, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + e := &Exporter{} + + ctx := context.Background() + err := e.ExportSpans(ctx, tt.spans.Snapshots()) + require.NoError(t, err) + + assert.Contains(t, e.GetSummary(), strings.TrimSpace(tt.want)) + }) + } +} + +func TestExporter_ExportSpansWithDoneContext(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + e := &Exporter{} + spans := tracetest.SpanStubs{} + + assert.EqualError(t, e.ExportSpans(ctx, spans.Snapshots()), context.Canceled.Error()) +} + +func TestExporter_Shutdown(t *testing.T) { + e := &Exporter{} + + ctx := context.Background() + spans := tracetest.SpanStubs{} + for i := 0; i < 5; i++ { + spans = append(spans, tracetest.SpanStub{Name: fmt.Sprintf("span-%d", i)}) + } + + err := e.ExportSpans(ctx, spans.Snapshots()) + require.NoError(t, err) + + assert.Len(t, e.allSpans, 5) + + require.NoError(t, e.Shutdown(ctx)) + assert.Len(t, e.allSpans, 0) + + err = e.ExportSpans(ctx, spans.Snapshots()) + require.NoError(t, err) + + assert.Len(t, e.allSpans, 0) +} diff --git a/internal/traces/otel.go b/internal/traces/otel.go new file mode 100644 index 00000000..01a201ea --- /dev/null +++ b/internal/traces/otel.go @@ -0,0 +1,60 @@ +package traces + +import ( + "context" + + "github.com/replicatedhq/troubleshoot/pkg/logger" + "github.com/replicatedhq/troubleshoot/pkg/version" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/sdk/resource" + "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.12.0" +) + +// ConfigureTracing configures the OpenTelemetry trace provider for CLI +// commands. Projects using troubleshoot as a library would need to register +// troubleshoot's exporter like so. +// +// var tp *trace.TracerProvider // client application's trace provider +// tp.RegisterSpanProcessor( +// trace.NewSimpleSpanProcessor( +// traces.GetExporterInstance(), // Troubleshoot's exporter +// ), +// ) +// +// The client application is responsible for constructing the trace provider +// and registering the exporter. Multiple exporters can be registered. +func ConfigureTracing(processName string) (func(), error) { + r, err := resource.Merge( + resource.Default(), + resource.NewWithAttributes( + resource.Default().SchemaURL(), + semconv.ProcessCommandKey.String(processName), + semconv.ProcessRuntimeVersionKey.String(version.Version()), + attribute.String("environment", "cli"), + ), + ) + + if err != nil { + return nil, err + } + + // Trace provider for support bundle cli. Each application is required + // to have its own trace provider. + tp := trace.NewTracerProvider( + trace.WithSampler(trace.AlwaysSample()), + trace.WithSyncer( + GetExporterInstance(), + ), + trace.WithResource(r), + ) + + otel.SetTracerProvider(tp) + + return func() { + if err := tp.Shutdown(context.Background()); err != nil { + logger.Printf("Failed to shutdown trace provider: %v", err) + } + }, nil +} diff --git a/pkg/analyze/analyzer.go b/pkg/analyze/analyzer.go index 14ccc166..7aa93e8a 100644 --- a/pkg/analyze/analyzer.go +++ b/pkg/analyze/analyzer.go @@ -1,13 +1,19 @@ package analyzer import ( + "context" "fmt" "reflect" "strconv" "github.com/pkg/errors" troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" + "github.com/replicatedhq/troubleshoot/pkg/constants" + "github.com/replicatedhq/troubleshoot/pkg/logger" "github.com/replicatedhq/troubleshoot/pkg/multitype" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" corev1 "k8s.io/api/core/v1" ) @@ -50,14 +56,25 @@ func isExcluded(excludeVal *multitype.BoolOrString) (bool, error) { return parsed, nil } -func HostAnalyze(hostAnalyzer *troubleshootv1beta2.HostAnalyze, getFile getCollectedFileContents, findFiles getChildCollectedFileContents) []*AnalyzeResult { +func HostAnalyze( + ctx context.Context, + hostAnalyzer *troubleshootv1beta2.HostAnalyze, + getFile getCollectedFileContents, + findFiles getChildCollectedFileContents, +) []*AnalyzeResult { analyzer, ok := GetHostAnalyzer(hostAnalyzer) if !ok { return NewAnalyzeResultError(analyzer, errors.New("invalid host analyzer")) } + _, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, analyzer.Title()) + span.SetAttributes(attribute.String("type", reflect.TypeOf(analyzer).String())) + defer span.End() + isExcluded, _ := analyzer.IsExcluded() if isExcluded { + logger.Printf("Excluding %q analyzer", analyzer.Title()) + span.SetAttributes(attribute.Bool(constants.EXCLUDED, true)) return nil } @@ -83,7 +100,12 @@ func NewAnalyzeResultError(analyzer HostAnalyzer, err error) []*AnalyzeResult { }} } -func Analyze(analyzer *troubleshootv1beta2.Analyze, getFile getCollectedFileContents, findFiles getChildCollectedFileContents) ([]*AnalyzeResult, error) { +func Analyze( + ctx context.Context, + analyzer *troubleshootv1beta2.Analyze, + getFile getCollectedFileContents, + findFiles getChildCollectedFileContents, +) ([]*AnalyzeResult, error) { if analyzer == nil { return nil, errors.New("nil analyzer") } @@ -97,16 +119,24 @@ func Analyze(analyzer *troubleshootv1beta2.Analyze, getFile getCollectedFileCont }}, nil } + _, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, analyzerInst.Title()) + span.SetAttributes(attribute.String("type", reflect.TypeOf(analyzerInst).String())) + defer span.End() + isExcluded, err := analyzerInst.IsExcluded() if err != nil { + span.SetStatus(codes.Error, err.Error()) return nil, err } if isExcluded { + logger.Printf("Excluding %q analyzer", analyzerInst.Title()) + span.SetAttributes(attribute.Bool(constants.EXCLUDED, true)) return nil, nil } results, err := analyzerInst.Analyze(getFile, findFiles) if err != nil { + span.SetStatus(codes.Error, err.Error()) return nil, err } diff --git a/pkg/analyze/analyzer_test.go b/pkg/analyze/analyzer_test.go index 0b8b4ad9..59739664 100644 --- a/pkg/analyze/analyzer_test.go +++ b/pkg/analyze/analyzer_test.go @@ -1,6 +1,7 @@ package analyzer import ( + "context" "testing" troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" @@ -67,7 +68,7 @@ func Test_GetExcludeFlag(t *testing.T) { } func TestAnalyzeWithNilAnalyzer(t *testing.T) { - got, err := Analyze(nil, nil, nil) + got, err := Analyze(context.Background(), nil, nil, nil) assert.Error(t, err) assert.Nil(t, got) } diff --git a/pkg/analyze/download.go b/pkg/analyze/download.go index 49594ac2..2f7a19f2 100644 --- a/pkg/analyze/download.go +++ b/pkg/analyze/download.go @@ -3,6 +3,7 @@ package analyzer import ( "archive/tar" "compress/gzip" + "context" "io" "io/fs" "os" @@ -24,7 +25,12 @@ type fileContentProvider struct { } // Analyze local will analyze a locally available (already downloaded) bundle -func AnalyzeLocal(localBundlePath string, analyzers []*troubleshootv1beta2.Analyze, hostAnalyzers []*troubleshootv1beta2.HostAnalyze) ([]*AnalyzeResult, error) { +func AnalyzeLocal( + ctx context.Context, + localBundlePath string, + analyzers []*troubleshootv1beta2.Analyze, + hostAnalyzers []*troubleshootv1beta2.HostAnalyze, +) ([]*AnalyzeResult, error) { rootDir, err := FindBundleRootDir(localBundlePath) if err != nil { return nil, errors.Wrap(err, "failed to find root dir") @@ -34,7 +40,7 @@ func AnalyzeLocal(localBundlePath string, analyzers []*troubleshootv1beta2.Analy analyzeResults := []*AnalyzeResult{} for _, analyzer := range analyzers { - analyzeResult, err := Analyze(analyzer, fcp.getFileContents, fcp.getChildFileContents) + analyzeResult, err := Analyze(ctx, analyzer, fcp.getFileContents, fcp.getChildFileContents) if err != nil { logger.Printf("An analyzer failed to run: %v", err) continue @@ -49,7 +55,7 @@ func AnalyzeLocal(localBundlePath string, analyzers []*troubleshootv1beta2.Analy } for _, hostAnalyzer := range hostAnalyzers { - analyzeResult := HostAnalyze(hostAnalyzer, fcp.getFileContents, fcp.getChildFileContents) + analyzeResult := HostAnalyze(ctx, hostAnalyzer, fcp.getFileContents, fcp.getChildFileContents) analyzeResults = append(analyzeResults, analyzeResult...) } @@ -81,7 +87,7 @@ func DownloadAndAnalyze(bundleURL string, analyzersSpec string) ([]*AnalyzeResul hostAnalyzers = parsedHostAnalyzers } - return AnalyzeLocal(rootDir, analyzers, hostAnalyzers) + return AnalyzeLocal(context.Background(), rootDir, analyzers, hostAnalyzers) } func DownloadAndExtractSupportBundle(bundleURL string) (string, string, error) { @@ -101,10 +107,10 @@ func DownloadAndExtractSupportBundle(bundleURL string) (string, string, error) { return "", "", errors.Wrap(err, "failed to find root dir") } - _, err = os.Stat(filepath.Join(bundleDir, constants.VersionFilename)) + _, err = os.Stat(filepath.Join(bundleDir, constants.VERSION_FILENAME)) if err != nil { os.RemoveAll(tmpDir) - return "", "", errors.Wrap(err, "failed to read "+constants.VersionFilename) + return "", "", errors.Wrap(err, "failed to read "+constants.VERSION_FILENAME) } return tmpDir, bundleDir, nil @@ -283,7 +289,7 @@ func FindBundleRootDir(localBundlePath string) (string, error) { isInSubDir := true for _, name := range names { - if name == constants.VersionFilename { + if name == constants.VERSION_FILENAME { isInSubDir = false break } diff --git a/pkg/collect/host_udpportstatus_test.go b/pkg/collect/host_udpportstatus_test.go index 569cbfe0..32ef6a98 100644 --- a/pkg/collect/host_udpportstatus_test.go +++ b/pkg/collect/host_udpportstatus_test.go @@ -29,13 +29,9 @@ func TestCollectHostUDPPortStatus_Collect(t *testing.T) { return port, conn, err } - type fields struct { - hostCollector *troubleshootv1beta2.UDPPortStatus - } tests := []struct { name string getPort func(t *testing.T) (port int, closeFn func() error) - fields fields want map[string][]byte }{ { diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index 40f1781b..494119f7 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -9,11 +9,17 @@ const ( DEFAULT_CLIENT_BURST = 100 // DEFAULT_CLIENT_USER_AGENT is an field that specifies the caller of troubleshoot request. DEFAULT_CLIENT_USER_AGENT = "ReplicatedTroubleshoot" - // VersionFilename is the name of the file that contains the support bundle version. - VersionFilename = "version.yaml" + // VERSION_FILENAME is the name of the file that contains the support bundle version. + VERSION_FILENAME = "version.yaml" // DEFAULT_LOGS_COLLECTOR_TIMEOUT is the default timeout for logs collector. DEFAULT_LOGS_COLLECTOR_TIMEOUT = 60 * time.Second + // Tracing constants + + LIB_TRACER_NAME = "github.com/replicatedhq/troubleshoot" + TROUBLESHOOT_ROOT_SPAN_NAME = "ReplicatedTroubleshootRootSpan" + EXCLUDED = "excluded" + // Cluster Resources Collector Directories CLUSTER_RESOURCES_DIR = "cluster-resources" CLUSTER_RESOURCES_NAMESPACES = "namespace" diff --git a/pkg/preflight/analyze.go b/pkg/preflight/analyze.go index 246b3fbc..9edb5bc3 100644 --- a/pkg/preflight/analyze.go +++ b/pkg/preflight/analyze.go @@ -1,6 +1,7 @@ package preflight import ( + "context" "encoding/json" "fmt" "path/filepath" @@ -13,12 +14,12 @@ import ( // Analyze runs the analyze phase of preflight checks func (c ClusterCollectResult) Analyze() []*analyze.AnalyzeResult { - return doAnalyze(c.AllCollectedData, c.Spec.Spec.Analyzers, nil, "") + return doAnalyze(c.Context, c.AllCollectedData, c.Spec.Spec.Analyzers, nil, "") } // Analyze runs the analyze phase of host preflight checks func (c HostCollectResult) Analyze() []*analyze.AnalyzeResult { - return doAnalyze(c.AllCollectedData, nil, c.Spec.Spec.Analyzers, "") + return doAnalyze(c.Context, c.AllCollectedData, nil, c.Spec.Spec.Analyzers, "") } // Analyze runs the analyze phase of host preflight checks. @@ -43,12 +44,18 @@ func (c RemoteCollectResult) Analyze() []*analyze.AnalyzeResult { byteResult[k] = []byte(v) } - results = append(results, doAnalyze(byteResult, nil, c.Spec.Spec.Analyzers, nodeName)...) + results = append(results, doAnalyze(c.Context, byteResult, nil, c.Spec.Spec.Analyzers, nodeName)...) } return results } -func doAnalyze(allCollectedData map[string][]byte, analyzers []*troubleshootv1beta2.Analyze, hostAnalyzers []*troubleshootv1beta2.HostAnalyze, nodeName string) []*analyze.AnalyzeResult { +func doAnalyze( + ctx context.Context, + allCollectedData map[string][]byte, + analyzers []*troubleshootv1beta2.Analyze, + hostAnalyzers []*troubleshootv1beta2.HostAnalyze, + nodeName string, +) []*analyze.AnalyzeResult { getCollectedFileContents := func(fileName string) ([]byte, error) { contents, ok := allCollectedData[fileName] if !ok { @@ -89,7 +96,7 @@ func doAnalyze(allCollectedData map[string][]byte, analyzers []*troubleshootv1be analyzeResults := []*analyze.AnalyzeResult{} for _, analyzer := range analyzers { - analyzeResult, err := analyze.Analyze(analyzer, getCollectedFileContents, getChildCollectedFileContents) + analyzeResult, err := analyze.Analyze(ctx, analyzer, getCollectedFileContents, getChildCollectedFileContents) if err != nil { strict, strictErr := HasStrictAnalyzer(analyzer) if strictErr != nil { @@ -112,7 +119,7 @@ func doAnalyze(allCollectedData map[string][]byte, analyzers []*troubleshootv1be } for _, hostAnalyzer := range hostAnalyzers { - analyzeResult := analyze.HostAnalyze(hostAnalyzer, getCollectedFileContents, getChildCollectedFileContents) + analyzeResult := analyze.HostAnalyze(ctx, hostAnalyzer, getCollectedFileContents, getChildCollectedFileContents) analyzeResults = append(analyzeResults, analyzeResult...) } diff --git a/pkg/preflight/collect.go b/pkg/preflight/collect.go index 7cf174d8..fbcbbc1e 100644 --- a/pkg/preflight/collect.go +++ b/pkg/preflight/collect.go @@ -12,7 +12,11 @@ import ( troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" "github.com/replicatedhq/troubleshoot/pkg/collect" "github.com/replicatedhq/troubleshoot/pkg/constants" + "github.com/replicatedhq/troubleshoot/pkg/logger" "github.com/replicatedhq/troubleshoot/pkg/version" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" ) @@ -55,6 +59,7 @@ type ClusterCollectResult struct { RemoteCollectors collect.RemoteCollectors isRBACAllowed bool Spec *troubleshootv1beta2.Preflight + Context context.Context } func (cr ClusterCollectResult) IsRBACAllowed() bool { @@ -65,6 +70,7 @@ type HostCollectResult struct { AllCollectedData map[string][]byte Collectors []collect.HostCollector Spec *troubleshootv1beta2.HostPreflight + Context context.Context } func (cr HostCollectResult) IsRBACAllowed() bool { @@ -75,6 +81,7 @@ type RemoteCollectResult struct { AllCollectedData map[string][]byte Collectors collect.RemoteCollectors Spec *troubleshootv1beta2.HostPreflight + Context context.Context } func (cr RemoteCollectResult) IsRBACAllowed() bool { @@ -83,6 +90,12 @@ func (cr RemoteCollectResult) IsRBACAllowed() bool { // CollectHost runs the collection phase of host preflight checks func CollectHost(opts CollectOpts, p *troubleshootv1beta2.HostPreflight) (CollectResult, error) { + return CollectHostWithContext(context.Background(), opts, p) +} + +func CollectHostWithContext( + ctx context.Context, opts CollectOpts, p *troubleshootv1beta2.HostPreflight, +) (CollectResult, error) { collectSpecs := make([]*troubleshootv1beta2.HostCollect, 0, 0) if p != nil && p.Spec.Collectors != nil { collectSpecs = append(collectSpecs, p.Spec.Collectors...) @@ -101,11 +114,18 @@ func CollectHost(opts CollectOpts, p *troubleshootv1beta2.HostPreflight) (Collec collectResult := HostCollectResult{ Collectors: collectors, Spec: p, + Context: ctx, } for _, collector := range collectors { + _, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, collector.Title()) + span.SetAttributes(attribute.String("type", reflect.TypeOf(collector).String())) + isExcluded, _ := collector.IsExcluded() if isExcluded { + logger.Printf("Excluding %q collector", collector.Title()) + span.SetAttributes(attribute.Bool(constants.EXCLUDED, true)) + span.End() continue } @@ -117,6 +137,7 @@ func CollectHost(opts CollectOpts, p *troubleshootv1beta2.HostPreflight) (Collec for k, v := range result { allCollectedData[k] = v } + span.End() } collectResult.AllCollectedData = allCollectedData @@ -126,6 +147,10 @@ func CollectHost(opts CollectOpts, p *troubleshootv1beta2.HostPreflight) (Collec // Collect runs the collection phase of preflight checks func Collect(opts CollectOpts, p *troubleshootv1beta2.Preflight) (CollectResult, error) { + return CollectWithContext(context.Background(), opts, p) +} + +func CollectWithContext(ctx context.Context, opts CollectOpts, p *troubleshootv1beta2.Preflight) (CollectResult, error) { var allCollectors []collect.Collector var foundForbidden bool @@ -133,8 +158,12 @@ func Collect(opts CollectOpts, p *troubleshootv1beta2.Preflight) (CollectResult, if p != nil && p.Spec.Collectors != nil { collectSpecs = append(collectSpecs, p.Spec.Collectors...) } - collectSpecs = collect.EnsureCollectorInList(collectSpecs, troubleshootv1beta2.Collect{ClusterInfo: &troubleshootv1beta2.ClusterInfo{}}) - collectSpecs = collect.EnsureCollectorInList(collectSpecs, troubleshootv1beta2.Collect{ClusterResources: &troubleshootv1beta2.ClusterResources{}}) + collectSpecs = collect.EnsureCollectorInList( + collectSpecs, troubleshootv1beta2.Collect{ClusterInfo: &troubleshootv1beta2.ClusterInfo{}}, + ) + collectSpecs = collect.EnsureCollectorInList( + collectSpecs, troubleshootv1beta2.Collect{ClusterResources: &troubleshootv1beta2.ClusterResources{}}, + ) collectSpecs = collect.DedupCollectors(collectSpecs) collectSpecs = collect.EnsureClusterResourcesFirst(collectSpecs) @@ -153,7 +182,7 @@ func Collect(opts CollectOpts, p *troubleshootv1beta2.Preflight) (CollectResult, for _, desiredCollector := range collectSpecs { if collectorInterface, ok := collect.GetCollector(desiredCollector, "", opts.Namespace, opts.KubernetesRestConfig, k8sClient, nil); ok { if collector, ok := collectorInterface.(collect.Collector); ok { - err := collector.CheckRBAC(context.Background(), collector, desiredCollector, opts.KubernetesRestConfig, opts.Namespace) + err := collector.CheckRBAC(ctx, collector, desiredCollector, opts.KubernetesRestConfig, opts.Namespace) if err != nil { return nil, errors.Wrap(err, "failed to check RBAC for collectors") } @@ -193,6 +222,7 @@ func Collect(opts CollectOpts, p *troubleshootv1beta2.Preflight) (CollectResult, collectResult := ClusterCollectResult{ Collectors: allCollectors, Spec: p, + Context: ctx, } if foundForbidden && !opts.IgnorePermissionErrors { @@ -201,8 +231,14 @@ func Collect(opts CollectOpts, p *troubleshootv1beta2.Preflight) (CollectResult, } for i, collector := range allCollectors { + _, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, collector.Title()) + span.SetAttributes(attribute.String("type", reflect.TypeOf(collector).String())) + isExcluded, _ := collector.IsExcluded() if isExcluded { + logger.Printf("Excluding %q collector", collector.Title()) + span.SetAttributes(attribute.Bool(constants.EXCLUDED, true)) + span.End() continue } @@ -217,6 +253,8 @@ func Collect(opts CollectOpts, p *troubleshootv1beta2.Preflight) (CollectResult, TotalCount: len(allCollectors), Collectors: collectorList, } + span.SetStatus(codes.Error, "skipping collector, insufficient RBAC permissions") + span.End() continue } } @@ -245,6 +283,8 @@ func Collect(opts CollectOpts, p *troubleshootv1beta2.Preflight) (CollectResult, TotalCount: len(allCollectors), Collectors: collectorList, } + span.SetStatus(codes.Error, err.Error()) + span.End() continue } @@ -262,6 +302,7 @@ func Collect(opts CollectOpts, p *troubleshootv1beta2.Preflight) (CollectResult, for k, v := range result { allCollectedData[k] = v } + span.End() } collectResult.AllCollectedData = allCollectedData @@ -271,6 +312,10 @@ func Collect(opts CollectOpts, p *troubleshootv1beta2.Preflight) (CollectResult, // Collect runs the collection phase of preflight checks func CollectRemote(opts CollectOpts, p *troubleshootv1beta2.HostPreflight) (CollectResult, error) { + return CollectRemoteWithContext(context.Background(), opts, p) +} + +func CollectRemoteWithContext(ctx context.Context, opts CollectOpts, p *troubleshootv1beta2.HostPreflight) (CollectResult, error) { collectSpecs := make([]*troubleshootv1beta2.RemoteCollect, 0, 0) if p != nil && p.Spec.RemoteCollectors != nil { collectSpecs = append(collectSpecs, p.Spec.RemoteCollectors...) @@ -296,6 +341,7 @@ func CollectRemote(opts CollectOpts, p *troubleshootv1beta2.HostPreflight) (Coll collectResult := RemoteCollectResult{ Collectors: collectors, Spec: p, + Context: ctx, } // generate a map of all collectors for atomic status messages @@ -308,6 +354,9 @@ func CollectRemote(opts CollectOpts, p *troubleshootv1beta2.HostPreflight) (Coll // Run preflights collectors synchronously for i, collector := range collectors { + _, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, collector.GetDisplayName()) + span.SetAttributes(attribute.String("type", reflect.TypeOf(collector).String())) + collectorList[collector.GetDisplayName()] = CollectorStatus{ Status: "running", } @@ -334,6 +383,8 @@ func CollectRemote(opts CollectOpts, p *troubleshootv1beta2.HostPreflight) (Coll TotalCount: len(collectors), Collectors: collectorList, } + span.SetStatus(codes.Error, err.Error()) + span.End() continue } @@ -375,6 +426,8 @@ func CollectRemote(opts CollectOpts, p *troubleshootv1beta2.HostPreflight) (Coll } } + + span.End() } collectResult.AllCollectedData = allCollectedData diff --git a/pkg/preflight/run.go b/pkg/preflight/run.go index 9cb246db..9a0dd50f 100644 --- a/pkg/preflight/run.go +++ b/pkg/preflight/run.go @@ -3,7 +3,7 @@ package preflight import ( "context" "fmt" - "io/ioutil" + "io" "net/http" "net/url" "os" @@ -18,12 +18,14 @@ import ( analyzer "github.com/replicatedhq/troubleshoot/pkg/analyze" troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" troubleshootclientsetscheme "github.com/replicatedhq/troubleshoot/pkg/client/troubleshootclientset/scheme" + "github.com/replicatedhq/troubleshoot/pkg/constants" "github.com/replicatedhq/troubleshoot/pkg/docrewrite" "github.com/replicatedhq/troubleshoot/pkg/k8sutil" "github.com/replicatedhq/troubleshoot/pkg/oci" "github.com/replicatedhq/troubleshoot/pkg/specs" "github.com/spf13/viper" spin "github.com/tj/go-spin" + "go.opentelemetry.io/otel" "golang.org/x/sync/errgroup" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -31,6 +33,10 @@ import ( ) func RunPreflights(interactive bool, output string, format string, args []string) error { + ctx, root := otel.Tracer( + constants.LIB_TRACER_NAME).Start(context.Background(), constants.TROUBLESHOOT_ROOT_SPAN_NAME) + defer root.End() + if interactive { fmt.Print(cursor.Hide()) defer fmt.Print(cursor.Show()) @@ -64,7 +70,7 @@ func RunPreflights(interactive bool, output string, format string, args []string preflightContent = spec } else if _, err = os.Stat(v); err == nil { - b, err := ioutil.ReadFile(v) + b, err := os.ReadFile(v) if err != nil { return err } @@ -103,7 +109,7 @@ func RunPreflights(interactive bool, output string, format string, args []string } defer resp.Body.Close() - body, err := ioutil.ReadAll(resp.Body) + body, err := io.ReadAll(resp.Body) if err != nil { return err } @@ -142,7 +148,7 @@ func RunPreflights(interactive bool, output string, format string, args []string progressCh := make(chan interface{}) defer close(progressCh) - ctx, stopProgressCollection := context.WithCancel(context.Background()) + ctx, stopProgressCollection := context.WithCancel(ctx) // make sure we shut down progress collection goroutines if an error occurs defer stopProgressCollection() progressCollection, ctx := errgroup.WithContext(ctx) @@ -156,7 +162,7 @@ func RunPreflights(interactive bool, output string, format string, args []string uploadResultsMap := make(map[string][]CollectResult) if preflightSpec != nil { - r, err := collectInCluster(preflightSpec, progressCh) + r, err := collectInCluster(ctx, preflightSpec, progressCh) if err != nil { return errors.Wrap(err, "failed to collect in cluster") } @@ -165,7 +171,7 @@ func RunPreflights(interactive bool, output string, format string, args []string } if uploadResultSpecs != nil { for _, spec := range uploadResultSpecs { - r, err := collectInCluster(spec, progressCh) + r, err := collectInCluster(ctx, spec, progressCh) if err != nil { return errors.Wrap(err, "failed to collect in cluster") } @@ -176,14 +182,14 @@ func RunPreflights(interactive bool, output string, format string, args []string } if hostPreflightSpec != nil { if len(hostPreflightSpec.Spec.Collectors) > 0 { - r, err := collectHost(hostPreflightSpec, progressCh) + r, err := collectHost(ctx, hostPreflightSpec, progressCh) if err != nil { return errors.Wrap(err, "failed to collect from host") } collectResults = append(collectResults, *r) } if len(hostPreflightSpec.Spec.RemoteCollectors) > 0 { - r, err := collectRemote(hostPreflightSpec, progressCh) + r, err := collectRemote(ctx, hostPreflightSpec, progressCh) if err != nil { return errors.Wrap(err, "failed to collect remotely") } @@ -284,7 +290,7 @@ func collectNonInteractiveProgess(ctx context.Context, progressCh <-chan interfa } } -func collectInCluster(preflightSpec *troubleshootv1beta2.Preflight, progressCh chan interface{}) (*CollectResult, error) { +func collectInCluster(ctx context.Context, preflightSpec *troubleshootv1beta2.Preflight, progressCh chan interface{}) (*CollectResult, error) { v := viper.GetViper() restConfig, err := k8sutil.GetRESTConfig() @@ -306,7 +312,7 @@ func collectInCluster(preflightSpec *troubleshootv1beta2.Preflight, progressCh c } } - collectResults, err := Collect(collectOpts, preflightSpec) + collectResults, err := CollectWithContext(ctx, collectOpts, preflightSpec) if err != nil { if collectResults != nil && !collectResults.IsRBACAllowed() { if preflightSpec.Spec.UploadResultsTo != "" { @@ -323,7 +329,7 @@ func collectInCluster(preflightSpec *troubleshootv1beta2.Preflight, progressCh c return &collectResults, nil } -func collectRemote(preflightSpec *troubleshootv1beta2.HostPreflight, progressCh chan interface{}) (*CollectResult, error) { +func collectRemote(ctx context.Context, preflightSpec *troubleshootv1beta2.HostPreflight, progressCh chan interface{}) (*CollectResult, error) { v := viper.GetViper() restConfig, err := k8sutil.GetRESTConfig() @@ -365,7 +371,7 @@ func collectRemote(preflightSpec *troubleshootv1beta2.HostPreflight, progressCh return &collectResults, nil } -func collectHost(hostPreflightSpec *troubleshootv1beta2.HostPreflight, progressCh chan interface{}) (*CollectResult, error) { +func collectHost(ctx context.Context, hostPreflightSpec *troubleshootv1beta2.HostPreflight, progressCh chan interface{}) (*CollectResult, error) { collectOpts := CollectOpts{ ProgressChan: progressCh, } diff --git a/pkg/supportbundle/collect.go b/pkg/supportbundle/collect.go index 63c0adcc..9971aa6e 100644 --- a/pkg/supportbundle/collect.go +++ b/pkg/supportbundle/collect.go @@ -15,13 +15,17 @@ import ( "github.com/replicatedhq/troubleshoot/pkg/collect" "github.com/replicatedhq/troubleshoot/pkg/constants" "github.com/replicatedhq/troubleshoot/pkg/convert" + "github.com/replicatedhq/troubleshoot/pkg/logger" "github.com/replicatedhq/troubleshoot/pkg/version" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" "gopkg.in/yaml.v2" "k8s.io/client-go/kubernetes" ) -func runHostCollectors(hostCollectors []*troubleshootv1beta2.HostCollect, additionalRedactors *troubleshootv1beta2.Redactor, bundlePath string, opts SupportBundleCreateOpts) (collect.CollectorResult, error) { - collectSpecs := make([]*troubleshootv1beta2.HostCollect, 0, 0) +func runHostCollectors(ctx context.Context, hostCollectors []*troubleshootv1beta2.HostCollect, additionalRedactors *troubleshootv1beta2.Redactor, bundlePath string, opts SupportBundleCreateOpts) (collect.CollectorResult, error) { + collectSpecs := make([]*troubleshootv1beta2.HostCollect, 0) collectSpecs = append(collectSpecs, hostCollectors...) allCollectedData := make(map[string][]byte) @@ -35,16 +39,25 @@ func runHostCollectors(hostCollectors []*troubleshootv1beta2.HostCollect, additi } for _, collector := range collectors { + // TODO: Add context to host collectors + _, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, collector.Title()) + span.SetAttributes(attribute.String("type", reflect.TypeOf(collector).String())) + isExcluded, _ := collector.IsExcluded() if isExcluded { + logger.Printf("Excluding %q collector", collector.Title()) + span.SetAttributes(attribute.Bool(constants.EXCLUDED, true)) + span.End() continue } opts.ProgressChan <- fmt.Sprintf("[%s] Running host collector...", collector.Title()) result, err := collector.Collect(opts.ProgressChan) if err != nil { + span.SetStatus(codes.Error, err.Error()) opts.ProgressChan <- errors.Errorf("failed to run host collector: %s: %v", collector.Title(), err) } + span.End() for k, v := range result { allCollectedData[k] = v } @@ -58,17 +71,21 @@ func runHostCollectors(hostCollectors []*troubleshootv1beta2.HostCollect, additi } if opts.Redact { + _, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, "Host collectors") + span.SetAttributes(attribute.String("type", "Redactors")) err := collect.RedactResult(bundlePath, collectResult, globalRedactors) if err != nil { err = errors.Wrap(err, "failed to redact host collector results") + span.SetStatus(codes.Error, err.Error()) return collectResult, err } + span.End() } return collectResult, nil } -func runCollectors(collectors []*troubleshootv1beta2.Collect, additionalRedactors *troubleshootv1beta2.Redactor, bundlePath string, opts SupportBundleCreateOpts) (collect.CollectorResult, error) { +func runCollectors(ctx context.Context, collectors []*troubleshootv1beta2.Collect, additionalRedactors *troubleshootv1beta2.Redactor, bundlePath string, opts SupportBundleCreateOpts) (collect.CollectorResult, error) { var allCollectors []collect.Collector var foundForbidden bool @@ -94,7 +111,7 @@ func runCollectors(collectors []*troubleshootv1beta2.Collect, additionalRedactor for _, desiredCollector := range collectSpecs { if collectorInterface, ok := collect.GetCollector(desiredCollector, bundlePath, opts.Namespace, opts.KubernetesRestConfig, k8sClient, opts.SinceTime); ok { if collector, ok := collectorInterface.(collect.Collector); ok { - err := collector.CheckRBAC(context.Background(), collector, desiredCollector, opts.KubernetesRestConfig, opts.Namespace) + err := collector.CheckRBAC(ctx, collector, desiredCollector, opts.KubernetesRestConfig, opts.Namespace) if err != nil { return nil, errors.Wrap(err, "failed to check RBAC for collectors") } @@ -130,8 +147,14 @@ func runCollectors(collectors []*troubleshootv1beta2.Collect, additionalRedactor } for _, collector := range allCollectors { + _, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, collector.Title()) + span.SetAttributes(attribute.String("type", reflect.TypeOf(collector).String())) + isExcluded, _ := collector.IsExcluded() if isExcluded { + logger.Printf("Excluding %q collector", collector.Title()) + span.SetAttributes(attribute.Bool(constants.EXCLUDED, true)) + span.End() continue } @@ -140,17 +163,22 @@ func runCollectors(collectors []*troubleshootv1beta2.Collect, additionalRedactor if _, ok := collector.(*collect.CollectClusterResources); !ok { msg := fmt.Sprintf("skipping collector %s with insufficient RBAC permissions", collector.Title()) opts.CollectorProgressCallback(opts.ProgressChan, msg) + span.SetStatus(codes.Error, "skipping collector, insufficient RBAC permissions") + span.End() continue } } opts.CollectorProgressCallback(opts.ProgressChan, collector.Title()) result, err := collector.Collect(opts.ProgressChan) if err != nil { + span.SetStatus(codes.Error, err.Error()) opts.ProgressChan <- errors.Errorf("failed to run collector: %s: %v", collector.Title(), err) } + for k, v := range result { allCollectedData[k] = v } + span.End() } collectResult := allCollectedData @@ -161,10 +189,17 @@ func runCollectors(collectors []*troubleshootv1beta2.Collect, additionalRedactor } if opts.Redact { + // TODO: Should we record how long each redactor takes? + _, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, "In-cluster collectors") + span.SetAttributes(attribute.String("type", "Redactors")) err := collect.RedactResult(bundlePath, collectResult, globalRedactors) if err != nil { - return collectResult, errors.Wrap(err, "failed to redact in cluster collector results") + err := errors.Wrap(err, "failed to redact in cluster collector results") + span.SetStatus(codes.Error, err.Error()) + span.End() + return collectResult, err } + span.End() } return collectResult, nil diff --git a/pkg/supportbundle/supportbundle.go b/pkg/supportbundle/supportbundle.go index 8cbe1dfc..e68eb7c7 100644 --- a/pkg/supportbundle/supportbundle.go +++ b/pkg/supportbundle/supportbundle.go @@ -1,6 +1,8 @@ package supportbundle import ( + "bytes" + "context" "fmt" "net/http" "os" @@ -11,11 +13,14 @@ import ( cursor "github.com/ahmetalpbalkan/go-cursor" "github.com/fatih/color" "github.com/pkg/errors" + "github.com/replicatedhq/troubleshoot/internal/traces" analyzer "github.com/replicatedhq/troubleshoot/pkg/analyze" troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" "github.com/replicatedhq/troubleshoot/pkg/collect" "github.com/replicatedhq/troubleshoot/pkg/constants" "github.com/replicatedhq/troubleshoot/pkg/convert" + "github.com/replicatedhq/troubleshoot/pkg/logger" + "go.opentelemetry.io/otel" "k8s.io/client-go/rest" "k8s.io/klog/v2" ) @@ -43,7 +48,10 @@ type SupportBundleResponse struct { // collectors, analyzers and after collection steps. Input arguments are specifications. // if FromCLI option is set to true, the output is the name of the archive on disk in the cwd. // if FromCLI option is set to false, the support bundle is archived in the OS temp folder (os.TempDir()). -func CollectSupportBundleFromSpec(spec *troubleshootv1beta2.SupportBundleSpec, additionalRedactors *troubleshootv1beta2.Redactor, opts SupportBundleCreateOpts) (*SupportBundleResponse, error) { +func CollectSupportBundleFromSpec( + spec *troubleshootv1beta2.SupportBundleSpec, additionalRedactors *troubleshootv1beta2.Redactor, opts SupportBundleCreateOpts, +) (*SupportBundleResponse, error) { + resultsResponse := SupportBundleResponse{} if opts.KubernetesRestConfig == nil { @@ -90,13 +98,23 @@ func CollectSupportBundleFromSpec(spec *troubleshootv1beta2.SupportBundleSpec, a var result, files, hostFiles collect.CollectorResult + ctx, root := otel.Tracer(constants.LIB_TRACER_NAME).Start( + context.Background(), constants.TROUBLESHOOT_ROOT_SPAN_NAME, + ) + defer func() { + // If this function returns an error, root.End() may not be called. + // We want to ensure this happens, so we defer it. It is safe to call + // root.End() multiple times. + root.End() + }() + // Cache error returned by collectors and return it at the end of the function // so as to have a chance to run analyzers and archive the support bundle after. // If both host and in cluster collectors fail, the errors will be wrapped collectorsErrs := []string{} if spec.HostCollectors != nil { // Run host collectors - hostFiles, err = runHostCollectors(spec.HostCollectors, additionalRedactors, bundlePath, opts) + hostFiles, err = runHostCollectors(ctx, spec.HostCollectors, additionalRedactors, bundlePath, opts) if err != nil { collectorsErrs = append(collectorsErrs, fmt.Sprintf("failed to run host collectors: %s", err)) } @@ -104,7 +122,7 @@ func CollectSupportBundleFromSpec(spec *troubleshootv1beta2.SupportBundleSpec, a if spec.Collectors != nil { // Run collectors - files, err = runCollectors(spec.Collectors, additionalRedactors, bundlePath, opts) + files, err = runCollectors(ctx, spec.Collectors, additionalRedactors, bundlePath, opts) if err != nil { collectorsErrs = append(collectorsErrs, fmt.Sprintf("failed to run collectors: %s", err)) } @@ -128,13 +146,13 @@ func CollectSupportBundleFromSpec(spec *troubleshootv1beta2.SupportBundleSpec, a return nil, errors.Wrap(err, "failed to get version file") } - err = result.SaveResult(bundlePath, constants.VersionFilename, version) + err = result.SaveResult(bundlePath, constants.VERSION_FILENAME, version) if err != nil { return nil, errors.Wrap(err, "failed to write version") } // Run Analyzers - analyzeResults, err := AnalyzeSupportBundle(spec, bundlePath) + analyzeResults, err := AnalyzeSupportBundle(ctx, spec, bundlePath) if err != nil { if opts.FromCLI { c := color.New(color.FgHiRed) @@ -156,6 +174,17 @@ func CollectSupportBundleFromSpec(spec *troubleshootv1beta2.SupportBundleSpec, a return nil, errors.Wrap(err, "failed to write analysis") } + // Complete tracing by ending the root span and collecting + // the summary of the traces. Store them in the support bundle. + root.End() + summary := traces.GetExporterInstance().GetSummary() + err = result.SaveResult(bundlePath, "execution-data/summary.txt", bytes.NewReader([]byte(summary))) + if err != nil { + // Don't fail the support bundle if we can't save the execution summary + logger.Printf("failed to save execution summary file in the support bundle: %v", err) + } + + // Archive Support Bundle if err := result.ArchiveSupportBundle(bundlePath, filename); err != nil { return nil, errors.Wrap(err, "create bundle file") } @@ -174,6 +203,7 @@ func CollectSupportBundleFromSpec(spec *troubleshootv1beta2.SupportBundleSpec, a if len(collectorsErrs) > 0 { // TODO: Consider a collectors error type + // TODO: use errors.Join in go 1.20 (https://pkg.go.dev/errors#Join) return &resultsResponse, fmt.Errorf(strings.Join(collectorsErrs, "\n")) } @@ -222,11 +252,11 @@ func ProcessSupportBundleAfterCollection(spec *troubleshootv1beta2.SupportBundle // AnalyzeSupportBundle performs analysis on a support bundle using the support bundle spec and an already unpacked support // bundle on disk -func AnalyzeSupportBundle(spec *troubleshootv1beta2.SupportBundleSpec, tmpDir string) ([]*analyzer.AnalyzeResult, error) { +func AnalyzeSupportBundle(ctx context.Context, spec *troubleshootv1beta2.SupportBundleSpec, tmpDir string) ([]*analyzer.AnalyzeResult, error) { if len(spec.Analyzers) == 0 && len(spec.HostAnalyzers) == 0 { return nil, nil } - analyzeResults, err := analyzer.AnalyzeLocal(tmpDir, spec.Analyzers, spec.HostAnalyzers) + analyzeResults, err := analyzer.AnalyzeLocal(ctx, tmpDir, spec.Analyzers, spec.HostAnalyzers) if err != nil { return nil, errors.Wrap(err, "failed to analyze support bundle") }