Verify hub/front pods are ready by conditions (#1864 )

* Verify hub/front pods are ready by conditions * log waiting for readiness * proper sync --------- Co-authored-by: Alon Girmonsky <1990761+alongir@users.noreply.github.com>
2026-03-23 11:57:17 +00:00 · 2026-03-21 17:33:48 -07:00
2 changed files with 60 additions and 117 deletions
--- a/cmd/tapRunner.go
+++ b/cmd/tapRunner.go
@@ -40,9 +40,11 @@ type Readiness struct {
 }

 var ready *Readiness
+var proxyOnce sync.Once

 func tap() {
 	ready = &Readiness{}
+	proxyOnce = sync.Once{}
 	state.startTime = time.Now()
 	log.Info().Str("registry", config.Config.Tap.Docker.Registry).Str("tag", config.Config.Tap.Docker.Tag).Msg("Using Docker:")

@@ -147,11 +149,21 @@ func printNoPodsFoundSuggestion(targetNamespaces []string) {
 	log.Warn().Msg(fmt.Sprintf("Did not find any currently running pods that match the regex argument, %s will automatically target matching pods if any are created later%s", misc.Software, suggestionStr))
 }

+func isPodReady(pod *core.Pod) bool {
+	for _, condition := range pod.Status.Conditions {
+		if condition.Type == core.PodReady {
+			return condition.Status == core.ConditionTrue
+		}
+	}
+	return false
+}
+
 func watchHubPod(ctx context.Context, kubernetesProvider *kubernetes.Provider, cancel context.CancelFunc) {
 	podExactRegex := regexp.MustCompile(fmt.Sprintf("^%s", kubernetes.HubPodName))
 	podWatchHelper := kubernetes.NewPodWatchHelper(kubernetesProvider, podExactRegex)
 	eventChan, errorChan := kubernetes.FilteredWatch(ctx, podWatchHelper, []string{config.Config.Tap.Release.Namespace}, podWatchHelper)
-	isPodReady := false
+	podReady := false
+	podRunning := false

 	timeAfter := time.After(120 * time.Second)
 	for {
@@ -183,26 +195,30 @@ func watchHubPod(ctx context.Context, kubernetesProvider *kubernetes.Provider, c
 					Interface("containers-statuses", modifiedPod.Status.ContainerStatuses).
 					Msg("Watching pod.")

-				if modifiedPod.Status.Phase == core.PodRunning && !isPodReady {
-					isPodReady = true
+				if isPodReady(modifiedPod) && !podReady {
+					podReady = true

 					ready.Lock()
 					ready.Hub = true
 					ready.Unlock()
 					log.Info().Str("pod", kubernetes.HubPodName).Msg("Ready.")
+				} else if modifiedPod.Status.Phase == core.PodRunning && !podRunning {
+					podRunning = true
+					log.Info().Str("pod", kubernetes.HubPodName).Msg("Waiting for readiness...")
 				}

 				ready.Lock()
-				proxyDone := ready.Proxy
 				hubPodReady := ready.Hub
 				frontPodReady := ready.Front
 				ready.Unlock()

-				if !proxyDone && hubPodReady && frontPodReady {
-					ready.Lock()
-					ready.Proxy = true
-					ready.Unlock()
-					postFrontStarted(ctx, kubernetesProvider, cancel)
+				if hubPodReady && frontPodReady {
+					proxyOnce.Do(func() {
+						ready.Lock()
+						ready.Proxy = true
+						ready.Unlock()
+						postFrontStarted(ctx, kubernetesProvider, cancel)
+					})
 				}
 			case kubernetes.EventBookmark:
 				break
@@ -223,7 +239,7 @@ func watchHubPod(ctx context.Context, kubernetesProvider *kubernetes.Provider, c
 			cancel()

 		case <-timeAfter:
-			if !isPodReady {
+			if !podReady {
 				log.Error().
 					Str("pod", kubernetes.HubPodName).
 					Msg("Pod was not ready in time.")
@@ -242,7 +258,8 @@ func watchFrontPod(ctx context.Context, kubernetesProvider *kubernetes.Provider,
 	podExactRegex := regexp.MustCompile(fmt.Sprintf("^%s", kubernetes.FrontPodName))
 	podWatchHelper := kubernetes.NewPodWatchHelper(kubernetesProvider, podExactRegex)
 	eventChan, errorChan := kubernetes.FilteredWatch(ctx, podWatchHelper, []string{config.Config.Tap.Release.Namespace}, podWatchHelper)
-	isPodReady := false
+	podReady := false
+	podRunning := false

 	timeAfter := time.After(120 * time.Second)
 	for {
@@ -274,25 +291,29 @@ func watchFrontPod(ctx context.Context, kubernetesProvider *kubernetes.Provider,
 					Interface("containers-statuses", modifiedPod.Status.ContainerStatuses).
 					Msg("Watching pod.")

-				if modifiedPod.Status.Phase == core.PodRunning && !isPodReady {
-					isPodReady = true
+				if isPodReady(modifiedPod) && !podReady {
+					podReady = true
 					ready.Lock()
 					ready.Front = true
 					ready.Unlock()
 					log.Info().Str("pod", kubernetes.FrontPodName).Msg("Ready.")
+				} else if modifiedPod.Status.Phase == core.PodRunning && !podRunning {
+					podRunning = true
+					log.Info().Str("pod", kubernetes.FrontPodName).Msg("Waiting for readiness...")
 				}

 				ready.Lock()
-				proxyDone := ready.Proxy
 				hubPodReady := ready.Hub
 				frontPodReady := ready.Front
 				ready.Unlock()

-				if !proxyDone && hubPodReady && frontPodReady {
-					ready.Lock()
-					ready.Proxy = true
-					ready.Unlock()
-					postFrontStarted(ctx, kubernetesProvider, cancel)
+				if hubPodReady && frontPodReady {
+					proxyOnce.Do(func() {
+						ready.Lock()
+						ready.Proxy = true
+						ready.Unlock()
+						postFrontStarted(ctx, kubernetesProvider, cancel)
+					})
 				}
 			case kubernetes.EventBookmark:
 				break
@@ -312,7 +333,7 @@ func watchFrontPod(ctx context.Context, kubernetesProvider *kubernetes.Provider,
 				Msg("Failed creating pod.")

 		case <-timeAfter:
-			if !isPodReady {
+			if !podReady {
 				log.Error().
 					Str("pod", kubernetes.FrontPodName).
 					Msg("Pod was not ready in time.")
@@ -429,9 +450,6 @@ func postFrontStarted(ctx context.Context, kubernetesProvider *kubernetes.Provid
 		watchScripts(ctx, kubernetesProvider, false)
 	}

-	if config.Config.Scripting.Console {
-		go runConsoleWithoutProxy()
-	}
 }

 func updateConfig(kubernetesProvider *kubernetes.Provider) {
--- a/skills/network-rca/SKILL.md
+++ b/skills/network-rca/SKILL.md
@@ -29,31 +29,6 @@ Unlike real-time monitoring, retrospective analysis lets you go back in time:
 reconstruct what happened, compare against known-good baselines, and pinpoint
 root causes with full L4/L7 visibility.

-## Timezone Handling
-
-All timestamps presented to the user **must use the local timezone** of the environment
-where the agent is running. Users think in local time ("this happened around 3pm"), and
-UTC-only output adds friction during incident response when speed matters.
-
-### Rules
-
-1. **Detect the local timezone** at the start of every investigation. Use the system
-   clock or environment (e.g., `date +%Z` or equivalent) to determine the timezone.
-2. **Present local time as the primary reference** in all output — summaries, event
-   correlations, time-range references, and tables.
-3. **Show UTC in parentheses** for clarity, e.g., `15:03:22 IST (12:03:22 UTC)`.
-4. **Convert tool responses** — Kubeshark MCP tools return timestamps in UTC. Always
-   convert these to local time before presenting to the user.
-5. **Use local time in natural language** — when describing events, say "the spike at
-   3:23 PM" not "the spike at 12:23 UTC".
-
-### Snapshot Creation
-
-When creating snapshots, Kubeshark MCP tools accept UTC timestamps. Convert the user's
-local time references to UTC before passing them to tools like `create_snapshot` or
-`export_snapshot_pcap`. Confirm the converted window with the user if there's any
-ambiguity.
-
 ## Prerequisites

 Before starting any analysis, verify the environment is ready.
@@ -128,11 +103,6 @@ Both routes are valid and complementary. Use PCAP when you need raw packets
 for human analysis or compliance. Use Dissection when you want an AI agent
 to search and analyze traffic programmatically.

-**Default to Dissection.** Unless the user explicitly asks for a PCAP file or
-Wireshark export, assume Dissection is needed. Any question about workloads,
-APIs, services, pods, error rates, latency, or traffic patterns requires
-dissected data.
-
 ## Snapshot Operations

 Both routes start here. A snapshot is an immutable freeze of all cluster traffic
@@ -146,19 +116,19 @@ Check what raw capture data exists across the cluster. You can only create
 snapshots within these boundaries — data outside the window has been rotated
 out of the FIFO buffer.

-**Example response** (raw tool output is in UTC — convert to local time before presenting):
+**Example response**:
 ```
 Cluster-wide:
-  Oldest: 2026-03-14 18:12:34 IST (16:12:34 UTC)
-  Newest: 2026-03-14 20:05:20 IST (18:05:20 UTC)
+  Oldest: 2026-03-14 16:12:34 UTC
+  Newest: 2026-03-14 18:05:20 UTC

 Per node:
-  ┌─────────────────────────────┬───────────────────────────────┬───────────────────────────────┐
-  │            Node             │            Oldest             │            Newest             │
-  ├─────────────────────────────┼───────────────────────────────┼───────────────────────────────┤
-  │ ip-10-0-25-170.ec2.internal │ 18:12:34 IST (16:12:34 UTC)  │ 20:03:39 IST (18:03:39 UTC)  │
-  │ ip-10-0-32-115.ec2.internal │ 18:13:45 IST (16:13:45 UTC)  │ 20:05:20 IST (18:05:20 UTC)  │
-  └─────────────────────────────┴───────────────────────────────┴───────────────────────────────┘
+  ┌─────────────────────────────┬──────────┬──────────┐
+  │            Node             │  Oldest  │  Newest  │
+  ├─────────────────────────────┼──────────┼──────────┤
+  │ ip-10-0-25-170.ec2.internal │ 16:12:34 │ 18:03:39 │
+  │ ip-10-0-32-115.ec2.internal │ 16:13:45 │ 18:05:20 │
+  └─────────────────────────────┴──────────┴──────────┘
 ```

 If the incident falls outside the available window, the data has been rotated
@@ -262,30 +232,7 @@ KFL field names differ from what you might expect (e.g., `status_code` not
 `response.status`, `src.pod.namespace` not `src.namespace`). Using incorrect
 fields produces wrong results without warning.

-### Dissection Is Required — Do Not Skip This
-
-**Any question about workloads, Kubernetes resources, services, pods, namespaces,
-or API calls requires dissection.** Only the PCAP route works without it. If the
-user asks anything about traffic content, API behavior, error rates, latency,
-or service-to-service communication, you **must** ensure dissection is active
-before attempting to answer.
-
-**Do not wait for dissection to complete on its own — it will not start by itself.**
-
-Follow this sequence every time before using `list_api_calls`, `get_api_call`,
-or `get_api_stats`:
-
-1. **Check status**: Call `get_snapshot_dissection_status` (or `list_snapshot_dissections`)
-   to see if a dissection already exists for this snapshot.
-2. **If dissection exists and is completed** — proceed with your query. No further
-   action needed.
-3. **If dissection is in progress** — wait for it to complete, then proceed.
-4. **If no dissection exists** — you **must** call `start_snapshot_dissection` to
-   trigger it. Then monitor progress with `get_snapshot_dissection_status` until
-   it completes.
-
-Never assume dissection is running. Never wait for a dissection that was not started.
-The agent is responsible for triggering dissection when it is missing.
+### Activate Dissection

 **Tool**: `start_snapshot_dissection`

@@ -296,27 +243,6 @@ become available:
 - `get_api_call` — Drill into a specific call (headers, body, timing, payload)
 - `get_api_stats` — Aggregated statistics (throughput, error rates, latency)

-### Every Question Is a Query
-
-**Every user prompt that involves APIs, workloads, services, pods, namespaces,
-or Kubernetes semantics should translate into a `list_api_calls` call with an
-appropriate KFL filter.** Do not answer from memory or prior results — always
-run a fresh query that matches what the user is asking.
-
-Examples of user prompts and the queries they should trigger:
-
-| User says | Action |
-|---|---|
-| "Show me all 500 errors" | `list_api_calls` with KFL: `http && status_code == 500` |
-| "What's hitting the payment service?" | `list_api_calls` with KFL: `dst.service.name == "payment-service"` |
-| "Any DNS failures?" | `list_api_calls` with KFL: `dns && status_code != 0` |
-| "Show traffic from namespace prod to staging" | `list_api_calls` with KFL: `src.pod.namespace == "prod" && dst.pod.namespace == "staging"` |
-| "What are the slowest API calls?" | `list_api_calls` with KFL: `http && elapsed_time > 5000000` |
-
-The user's natural language maps to KFL. Your job is to translate intent into
-the right filter and run the query — don't summarize old results or speculate
-without fresh data.
-
 ### Investigation Strategy

 Start broad, then narrow:
@@ -329,17 +255,16 @@ Start broad, then narrow:
   full payload to understand what went wrong.
 4. Use KFL filters to slice by namespace, service, protocol, or any combination.

-**Example `list_api_calls` response** (filtered to `http && status_code >= 500`,
-timestamps converted from UTC to local):
+**Example `list_api_calls` response** (filtered to `http && status_code >= 500`):
 ```
-┌──────────────────────────────────────────┬────────┬──────────────────────────┬────────┬───────────┐
-│                Timestamp                 │ Method │           URL            │ Status │  Elapsed  │
-├──────────────────────────────────────────┼────────┼──────────────────────────┼────────┼───────────┤
-│ 2026-03-14 19:23:45 IST (17:23:45 UTC)  │ POST   │ /api/v1/orders/charge    │ 503    │ 12,340 ms │
-│ 2026-03-14 19:23:46 IST (17:23:46 UTC)  │ POST   │ /api/v1/orders/charge    │ 503    │ 11,890 ms │
-│ 2026-03-14 19:23:48 IST (17:23:48 UTC)  │ GET    │ /api/v1/inventory/check  │ 500    │  8,210 ms │
-│ 2026-03-14 19:24:01 IST (17:24:01 UTC)  │ POST   │ /api/v1/payments/process │ 502    │ 30,000 ms │
-└──────────────────────────────────────────┴────────┴──────────────────────────┴────────┴───────────┘
+┌──────────────────────┬────────┬──────────────────────────┬────────┬───────────┐
+│      Timestamp       │ Method │           URL            │ Status │  Elapsed  │
+├──────────────────────┼────────┼──────────────────────────┼────────┼───────────┤
+│ 2026-03-14 17:23:45  │ POST   │ /api/v1/orders/charge    │ 503    │ 12,340 ms │
+│ 2026-03-14 17:23:46  │ POST   │ /api/v1/orders/charge    │ 503    │ 11,890 ms │
+│ 2026-03-14 17:23:48  │ GET    │ /api/v1/inventory/check  │ 500    │  8,210 ms │
+│ 2026-03-14 17:24:01  │ POST   │ /api/v1/payments/process │ 502    │ 30,000 ms │
+└──────────────────────┴────────┴──────────────────────────┴────────┴───────────┘
 Src: api-gateway (prod)  →  Dst: payment-service (prod)
 ```