From 6e715d26977a91a4870a65941dbb49e356207c81 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 13 Aug 2019 16:16:16 +0000 Subject: [PATCH 1/2] fix(probe): Loosen ebpf parameters to reduce restarts Delay kernel events by up to 0.2ms, to reduce the chance the ebpf reporter sends them out-of-order, and allow out-of-order events to happen up to once a minute without giving up on the ebpf reporter. --- probe/endpoint/connection_tracker.go | 4 ++-- probe/endpoint/ebpf.go | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/probe/endpoint/connection_tracker.go b/probe/endpoint/connection_tracker.go index e65c4d3c5..db9764b4e 100644 --- a/probe/endpoint/connection_tracker.go +++ b/probe/endpoint/connection_tracker.go @@ -87,8 +87,8 @@ func (t *connectionTracker) ReportConnections(rpt *report.Report) { ebpfLastFailureTime := t.ebpfLastFailureTime t.ebpfLastFailureTime = time.Now() - if ebpfLastFailureTime.After(time.Now().Add(-5 * time.Minute)) { - // Multiple failures in the last 5 minutes, fall back to proc parsing + if ebpfLastFailureTime.After(time.Now().Add(-1 * time.Minute)) { + // Multiple failures in the last minute, fall back to proc parsing log.Warnf("ebpf tracker died again, gently falling back to proc scanning") t.useProcfs() } else { diff --git a/probe/endpoint/ebpf.go b/probe/endpoint/ebpf.go index 2a1e1aefe..aca292851 100644 --- a/probe/endpoint/ebpf.go +++ b/probe/endpoint/ebpf.go @@ -129,6 +129,7 @@ func newEbpfTracker() (*EbpfTracker, error) { debugBPF = true } + tracer.TimestampOffset = 200000 // Delay events by 0.2ms to avoid out-of-order reporting tracker := &EbpfTracker{ debugBPF: debugBPF, } From eba9f31f3f691f3dfbc32d9844dc59959dfa0142 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 13 Aug 2019 16:28:32 +0000 Subject: [PATCH 2/2] fix(probe): restart conntrack handler periodically to clear out data We observe a slow increase in connections reported, and are unable to find the root cause, so clear down the data every six hours and start from a clean sheet. --- probe/endpoint/conntrack.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/probe/endpoint/conntrack.go b/probe/endpoint/conntrack.go index 92678878e..0e2ae17ba 100644 --- a/probe/endpoint/conntrack.go +++ b/probe/endpoint/conntrack.go @@ -135,16 +135,20 @@ func (c *conntrackWalker) run() { return } - defer log.Infof("conntrack exiting") - + periodicRestart := time.After(6 * time.Hour) // Handle conntrack events from netlink socket for { select { + case <-periodicRestart: + log.Debugf("conntrack periodic restart") + return case <-c.quit: + log.Infof("conntrack quit signal - exiting") stop() return case f, ok := <-events: if !ok { + log.Errorf("conntrack events read failed - exiting") return } if f.Err != nil {