package report

import (
	"context"
	"fmt"
	"math/rand"
	"strings"
	"time"

	opentracing "github.com/opentracing/opentracing-go"

	"github.com/weaveworks/scope/common/xfer"
)

// Names of the various topologies.
const (
	Endpoint              = "endpoint"
	Process               = "process"
	Container             = "container"
	Pod                   = "pod"
	Service               = "service"
	Deployment            = "deployment"
	ReplicaSet            = "replica_set"
	DaemonSet             = "daemon_set"
	StatefulSet           = "stateful_set"
	CronJob               = "cron_job"
	Namespace             = "namespace"
	ContainerImage        = "container_image"
	Host                  = "host"
	Overlay               = "overlay"
	ECSService            = "ecs_service"
	ECSTask               = "ecs_task"
	SwarmService          = "swarm_service"
	PersistentVolume      = "persistent_volume"
	PersistentVolumeClaim = "persistent_volume_claim"
	StorageClass          = "storage_class"
	VolumeSnapshot        = "volume_snapshot"
	VolumeSnapshotData    = "volume_snapshot_data"
	Job                   = "job"

	// Shapes used for different nodes.
	Circle         = "circle"
	Triangle       = "triangle"
	Square         = "square"
	Pentagon       = "pentagon"
	Hexagon        = "hexagon"
	Heptagon       = "heptagon"
	Octagon        = "octagon"
	Cloud          = "cloud"
	Cylinder       = "cylinder"
	DottedCylinder = "dottedcylinder"
	StorageSheet   = "sheet"
	Camera         = "camera"
	DottedTriangle = "dottedtriangle"

	// ContainersKey is used when counting the number of containers.
	ContainersKey = "containers"
)

// topologyNames are the names of all report topologies.
var topologyNames = []string{
	Endpoint,
	Process,
	Container,
	ContainerImage,
	Pod,
	Service,
	Deployment,
	ReplicaSet,
	DaemonSet,
	StatefulSet,
	CronJob,
	Namespace,
	Host,
	Overlay,
	ECSTask,
	ECSService,
	SwarmService,
	PersistentVolume,
	PersistentVolumeClaim,
	StorageClass,
	VolumeSnapshot,
	VolumeSnapshotData,
	Job,
}

// Report is the core data type. It's produced by probes, and consumed and
// stored by apps. It's composed of multiple topologies, each representing
// a different (related, but not equivalent) view of the network.
type Report struct {
	// TS is the time this report was generated.
	TS time.Time

	// Endpoint nodes are individual (address, port) tuples on each host.
	// They come from inspecting active connections and can (theoretically)
	// be traced back to a process. Edges are present.
	Endpoint Topology

	// Process nodes are processes on each host. Edges are not present.
	Process Topology

	// Container nodes represent all Docker containers on hosts running probes.
	// Metadata includes things like container ID, name, image ID, etc.
	// Edges are not present.
	Container Topology

	// Pod nodes represent all Kubernetes pods running on hosts running probes.
	// Metadata includes things like pod ID, name, etc. Edges are not
	// present.
	Pod Topology

	// Service nodes represent all Kubernetes services running on hosts running
	// probes. Metadata includes things like service ID, name, etc. Edges are
	// not present.
	Service Topology

	// Deployment nodes represent all Kubernetes deployments running on hosts
	// running probes. Metadata includes things like deployment ID, name, etc.
	// Edges are not present.
	Deployment Topology

	// ReplicaSet nodes represent all Kubernetes ReplicaSets running on hosts
	// running probes. Metadata includes things like ReplicaSet ID, name, etc.
	// Edges are not present.
	ReplicaSet Topology

	// DaemonSet nodes represent all Kubernetes DaemonSets running on hosts
	// running probes. Metadata includes things like DaemonSet ID, name, etc.
	// Edges are not present.
	DaemonSet Topology

	// StatefulSet nodes represent all Kubernetes StatefulSets running on hosts
	// running probes. Metadata includes things like StatefulSet ID, name, etc.
	// Edges are not present.
	StatefulSet Topology

	// CronJob nodes represent all Kubernetes CronJobs running on hosts running
	// probes. Metadata includes things like CronJob ID, name, etc. Edges are
	// not present.
	CronJob Topology

	// Namespace nodes represent all Kubernetes namespaces on hosts running
	// probes. Metadata includes things like namespace ID, name, etc. Edges
	// are not present.
	Namespace Topology

	// ContainerImage nodes represent all Docker container images on
	// hosts running probes. Metadata includes things like image ID, name, etc.
	// Edges are not present.
	ContainerImage Topology

	// Host nodes are physical hosts that run probes. Metadata includes things
	// like operating system, load, etc. The information is scraped by the
	// probes with each published report. Edges are not present.
	Host Topology

	// ECSTask nodes are AWS ECS tasks, which represent a group of containers.
	// Metadata is limited for now; more to come later. Edges are not present.
	ECSTask Topology

	// ECSService nodes are AWS ECS services, which represent a specification
	// for a desired count of tasks with a task definition template.
	// Metadata is limited for now; more to come later. Edges are not present.
	ECSService Topology

	// SwarmService nodes are Docker Swarm services, which represent a
	// specification for a group of tasks (either one per host, or a desired
	// count). Edges are not present.
	SwarmService Topology

	// Overlay nodes are active peers in any software-defined network that's
	// overlaid on the infrastructure. The information is scraped by polling
	// their status endpoints. Edges are present.
	Overlay Topology

	// PersistentVolume nodes represent all Kubernetes PersistentVolumes on
	// hosts running probes. Metadata is limited for now; more to come later.
	PersistentVolume Topology

	// PersistentVolumeClaim nodes represent all Kubernetes
	// PersistentVolumeClaims on hosts running probes. Metadata is limited for
	// now; more to come later.
	PersistentVolumeClaim Topology

	// StorageClass nodes represent all Kubernetes StorageClasses on hosts
	// running probes. Metadata is limited for now; more to come later.
	StorageClass Topology

	// VolumeSnapshot nodes represent all Kubernetes VolumeSnapshots on hosts
	// running probes.
	VolumeSnapshot Topology

	// VolumeSnapshotData nodes represent all Kubernetes VolumeSnapshotData on
	// hosts running probes.
	VolumeSnapshotData Topology

	// Job nodes represent all Kubernetes Jobs on hosts running probes.
	Job Topology

	DNS DNSRecords `json:"DNS,omitempty" deepequal:"nil==empty"`
	// BugDNS provides backwards compatibility for an accident in commit
	// 951629a / release 1.11.6.
	BugDNS DNSRecords `json:"nodes,omitempty"`

	// Sampling data for this report.
	Sampling Sampling

	// Window is the amount of time that this report purports to represent.
	// Windows must be carefully merged. They should only be added when
	// reports cover non-overlapping periods of time. By default, we assume
	// that's true, and add windows in merge operations. When that's not true,
	// such as in the app, we expect the component to overwrite the window
	// before serving it to consumers.
	Window time.Duration

	// Shortcut reports should be propagated to the UI as quickly as possible,
	// bypassing the usual spy interval, publish interval and app ws interval.
	Shortcut bool

	Plugins xfer.PluginSpecs

	// ID is a random identifier for this report, used when caching
	// rendered views of the report. Reports with the same ID
	// must be equal, but we don't require that equal reports have
	// the same ID.
	ID string `deepequal:"skip"`
}

// MakeReport makes a clean report, ready to Merge() other reports into.
func MakeReport() Report {
	return Report{
		Endpoint: MakeTopology(),

		Process: MakeTopology().
			WithShape(Square).
			WithLabel("process", "processes"),

		Container: MakeTopology().
			WithShape(Hexagon).
			WithLabel("container", "containers"),

		ContainerImage: MakeTopology().
			WithShape(Hexagon).
			WithLabel("image", "images"),

		Host: MakeTopology().
			WithShape(Circle).
			WithLabel("host", "hosts"),

		Pod: MakeTopology().
			WithShape(Heptagon).
			WithLabel("pod", "pods"),

		Service: MakeTopology().
			WithShape(Heptagon).
			WithLabel("service", "services"),

		Deployment: MakeTopology().
			WithShape(Heptagon).
			WithLabel("deployment", "deployments"),

		ReplicaSet: MakeTopology().
			WithShape(Triangle).
			WithLabel("replica set", "replica sets"),

		DaemonSet: MakeTopology().
			WithShape(Pentagon).
			WithLabel("daemonset", "daemonsets"),

		StatefulSet: MakeTopology().
			WithShape(Octagon).
			WithLabel("stateful set", "stateful sets"),

		CronJob: MakeTopology().
			WithShape(Triangle).
			WithLabel("cron job", "cron jobs"),

		Namespace: MakeTopology(),

		Overlay: MakeTopology().
			WithShape(Circle).
			WithLabel("peer", "peers"),

		ECSTask: MakeTopology().
			WithShape(Heptagon).
			WithLabel("task", "tasks"),

		ECSService: MakeTopology().
			WithShape(Heptagon).
			WithLabel("service", "services"),

		SwarmService: MakeTopology().
			WithShape(Heptagon).
			WithLabel("service", "services"),

		PersistentVolume: MakeTopology().
			WithShape(Cylinder).
			WithLabel("persistent volume", "persistent volumes"),

		PersistentVolumeClaim: MakeTopology().
			WithShape(DottedCylinder).
			WithLabel("persistent volume claim", "persistent volume claims"),

		StorageClass: MakeTopology().
			WithShape(StorageSheet).
			WithLabel("storage class", "storage classes"),

		VolumeSnapshot: MakeTopology().
			WithShape(DottedCylinder).
			WithTag(Camera).
			WithLabel("volume snapshot", "volume snapshots"),

		VolumeSnapshotData: MakeTopology().
			WithShape(Cylinder).
			WithTag(Camera).
			WithLabel("volume snapshot data", "volume snapshot data"),

		Job: MakeTopology().
			WithShape(DottedTriangle).
			WithLabel("job", "jobs"),

		DNS: DNSRecords{},

		Sampling: Sampling{},
		Window:   0,
		Plugins:  xfer.MakePluginSpecs(),
		ID:       fmt.Sprintf("%d", rand.Int63()),
	}
}
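
// Illustrative usage sketch (not part of the original source): a fresh report
// is already well-formed, so callers can validate it or merge into it right away.
//
//	r := MakeReport()
//	if err := r.Validate(); err != nil {
//		panic(err) // should never happen for an empty report
//	}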

// Copy returns a value copy of the report.
func (r Report) Copy() Report {
	newReport := Report{
		TS:       r.TS,
		DNS:      r.DNS.Copy(),
		Sampling: r.Sampling,
		Window:   r.Window,
		Shortcut: r.Shortcut,
		Plugins:  r.Plugins.Copy(),
		ID:       fmt.Sprintf("%d", rand.Int63()),
	}
	newReport.WalkPairedTopologies(&r, func(newTopology, oldTopology *Topology) {
		*newTopology = oldTopology.Copy()
	})
	return newReport
}
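
// Illustrative usage sketch (not part of the original source): Copy has value
// semantics, so mutating the copy leaves the original untouched; only the ID
// is deliberately regenerated.
//
//	c := r.Copy()
//	c.Window = 0 // r.Window is unchanged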

// UnsafeMerge merges another Report into the receiver. The original is modified.
func (r *Report) UnsafeMerge(other Report) {
	// The merged report gets the earliest non-zero timestamp.
	if !other.TS.IsZero() && (r.TS.IsZero() || other.TS.Before(r.TS)) {
		r.TS = other.TS
	}
	r.DNS = r.DNS.Merge(other.DNS)
	r.Sampling = r.Sampling.Merge(other.Sampling)
	r.Window = r.Window + other.Window
	r.Plugins = r.Plugins.Merge(other.Plugins)
	r.WalkPairedTopologies(&other, func(ourTopology, theirTopology *Topology) {
		ourTopology.UnsafeMerge(*theirTopology)
	})
}
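
// Illustrative usage sketch (not part of the original source): merging two
// reports covering disjoint 15s periods sums their windows and unions their
// topologies. UnsafeMerge mutates the receiver, so Copy first when the
// original must survive.
//
//	a, b := MakeReport(), MakeReport()
//	a.Window, b.Window = 15*time.Second, 15*time.Second
//	merged := a.Copy()
//	merged.UnsafeMerge(b)
//	// merged.Window == 30*time.Second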

// UnsafeUnMerge removes any information from r that would be added by merging other.
// The original is modified.
func (r *Report) UnsafeUnMerge(other Report) {
	// TODO: DNS, Sampling, Plugins
	r.Window = r.Window - other.Window
	r.WalkPairedTopologies(&other, func(ourTopology, theirTopology *Topology) {
		ourTopology.UnsafeUnMerge(*theirTopology)
	})
}

// UnsafeRemovePartMergedNodes removes nodes that have not fully re-merged.
// E.g. if a node is removed from the source between two full reports, then
// we might only have a delta of its last state. Remove that from the set.
// The original is modified.
func (r *Report) UnsafeRemovePartMergedNodes(ctx context.Context) {
	dropped := map[string]int{}
	r.WalkNamedTopologies(func(name string, t *Topology) {
		for k, v := range t.Nodes {
			if v.isPartMerged() {
				delete(t.Nodes, k)
				dropped[name]++
			}
		}
	})
	if span := opentracing.SpanFromContext(ctx); span != nil && len(dropped) > 0 {
		msg := ""
		for name, count := range dropped {
			msg += fmt.Sprintf("%s: %d, ", name, count)
		}
		span.LogKV("dropped-part-merged", msg)
	}
}

// WalkTopologies iterates through the Topologies of the report,
// potentially modifying them.
func (r *Report) WalkTopologies(f func(*Topology)) {
	for _, name := range topologyNames {
		f(r.topology(name))
	}
}

// WalkNamedTopologies iterates through the Topologies of the report,
// potentially modifying them.
func (r *Report) WalkNamedTopologies(f func(string, *Topology)) {
	for _, name := range topologyNames {
		f(name, r.topology(name))
	}
}

// WalkPairedTopologies iterates through the Topologies of this and another report,
// potentially modifying one or both.
func (r *Report) WalkPairedTopologies(o *Report, f func(*Topology, *Topology)) {
	for _, name := range topologyNames {
		f(r.topology(name), o.topology(name))
	}
}
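
// Illustrative usage sketch (not part of the original source): the walkers
// are the standard way to apply an operation uniformly, e.g. counting every
// node in a report.
//
//	total := 0
//	r.WalkNamedTopologies(func(name string, t *Topology) {
//		total += len(t.Nodes)
//	})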

// topology returns a reference to one of the report's topologies,
// selected by name.
func (r *Report) topology(name string) *Topology {
	switch name {
	case Endpoint:
		return &r.Endpoint
	case Process:
		return &r.Process
	case Container:
		return &r.Container
	case ContainerImage:
		return &r.ContainerImage
	case Pod:
		return &r.Pod
	case Service:
		return &r.Service
	case Deployment:
		return &r.Deployment
	case ReplicaSet:
		return &r.ReplicaSet
	case DaemonSet:
		return &r.DaemonSet
	case StatefulSet:
		return &r.StatefulSet
	case CronJob:
		return &r.CronJob
	case Namespace:
		return &r.Namespace
	case Host:
		return &r.Host
	case Overlay:
		return &r.Overlay
	case ECSTask:
		return &r.ECSTask
	case ECSService:
		return &r.ECSService
	case SwarmService:
		return &r.SwarmService
	case PersistentVolume:
		return &r.PersistentVolume
	case PersistentVolumeClaim:
		return &r.PersistentVolumeClaim
	case StorageClass:
		return &r.StorageClass
	case VolumeSnapshot:
		return &r.VolumeSnapshot
	case VolumeSnapshotData:
		return &r.VolumeSnapshotData
	case Job:
		return &r.Job
	}
	return nil
}

// Topology returns one of the report's topologies, selected by name.
func (r Report) Topology(name string) (Topology, bool) {
	if t := r.topology(name); t != nil {
		return *t, true
	}
	return Topology{}, false
}
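
// Illustrative usage sketch (not part of the original source): the boolean
// result distinguishes an unknown topology name from a known-but-empty one.
//
//	if t, ok := r.Topology(Pod); ok {
//		fmt.Printf("%d pod nodes\n", len(t.Nodes))
//	}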

// Validate checks the report for various inconsistencies.
func (r Report) Validate() error {
	var errs []string
	for _, name := range topologyNames {
		if err := r.topology(name).Validate(); err != nil {
			errs = append(errs, err.Error())
		}
	}
	if r.Sampling.Count > r.Sampling.Total {
		errs = append(errs, fmt.Sprintf("sampling count (%d) bigger than total (%d)", r.Sampling.Count, r.Sampling.Total))
	}
	if len(errs) > 0 {
		return fmt.Errorf("%d error(s): %s", len(errs), strings.Join(errs, "; "))
	}
	return nil
}

// DropTopologiesOver drops topologies whose node counts exceed the given
// limit, as a protection against overloading the app server. In practice we
// only see this with runaway numbers of zombie processes.
func (r Report) DropTopologiesOver(limit int) (Report, []string) {
	dropped := []string{}
	r.WalkNamedTopologies(func(name string, topology *Topology) {
		if topology != nil && len(topology.Nodes) > limit {
			topology.Nodes = Nodes{}
			dropped = append(dropped, name)
		}
	})
	return r, dropped
}
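
// Illustrative usage sketch (not part of the original source); the limit of
// 10000 is an arbitrary example value.
//
//	r, dropped := r.DropTopologiesOver(10000)
//	for _, name := range dropped {
//		log.Printf("dropped oversized topology %q", name)
//	}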

// Summary returns a human-readable string summarising the contents, for diagnostic purposes.
func (r Report) Summary() string {
	ret := ""
	if len(r.Host.Nodes) == 1 {
		for k := range r.Host.Nodes {
			ret = k + ": "
		}
	}
	count := 0
	r.WalkNamedTopologies(func(n string, t *Topology) {
		if len(t.Nodes) > 0 {
			count++
			if count > 1 {
				ret = ret + ", "
			}
			ret = ret + fmt.Sprintf("%s:%d", n, len(t.Nodes))
		}
	})
	return ret
}
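
// Illustrative usage sketch (not part of the original source); the exact
// output depends on the node IDs and populated topologies, but takes the
// shape "<host node ID>: endpoint:12, process:42".
//
//	fmt.Println(r.Summary())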

// Sampling describes how the packet data sources for this report were
// sampled. It can be used to calculate effective sample rates. We can't
// just put the rate here, because that can't be accurately merged. Counts
// in e.g. edge metadata structures have already been adjusted to
// compensate for the sample rate.
type Sampling struct {
	Count uint64 // observed and processed
	Total uint64 // observed overall
}

// Rate returns the effective sampling rate.
func (s Sampling) Rate() float64 {
	if s.Total == 0 { // Total is unsigned, so zero is the only degenerate case
		return 1.0
	}
	return float64(s.Count) / float64(s.Total)
}
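
// Illustrative usage sketch (not part of the original source): Rate stays
// meaningful across merges because counts, not rates, are accumulated.
//
//	s := Sampling{Count: 250, Total: 1000}.Merge(Sampling{Count: 500, Total: 1000})
//	rate := s.Rate() // 750/2000 == 0.375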

// Merge combines two sampling structures via simple addition and returns the
// result. The original is not modified.
func (s Sampling) Merge(other Sampling) Sampling {
	return Sampling{
		Count: s.Count + other.Count,
		Total: s.Total + other.Total,
	}
}

const (
	// HostNodeID is a metadata foreign key, linking a node in any topology to
	// a node in the host topology. That host node is the origin host, where
	// the node was originally detected.
	HostNodeID = "host_node_id"
	// ControlProbeID is the random ID of the probe which controls the specific node.
	ControlProbeID = "control_probe_id"
)