mirror of
https://github.com/replicatedhq/troubleshoot.git
synced 2026-04-15 07:16:34 +00:00
* created roadmap and yaml claude agent
* Update roadmap.md
* feat: Clean advanced analysis implementation - core agents, engine, artifacts
* Remove unrelated files - keep only advanced analysis implementation
* fix: Fix goroutine leak in hosted agent rate limiter
- Added stop channel and stopped flag to RateLimiter struct
- Modified replenishTokens to listen for stop signal and exit cleanly
- Added Stop() method to gracefully shutdown rate limiter
- Added Stop() method to HostedAgent to cleanup rate limiter on shutdown
Fixes cursor bot issue: Rate Limiter Goroutine Leak
* fix: Fix analyzer config and model validation bugs
Bug 1: Analyzer Config Missing File Path
- Added filePath to DeploymentStatus analyzer config in convertAnalyzerToSpec
- Sets namespace-specific path (cluster-resources/deployments/{namespace}.json)
- Falls back to generic path (cluster-resources/deployments.json) if no namespace
- Fixes LocalAgent.analyzeDeploymentStatus backward compatibility
Bug 2: HealthCheck Fails Model Validation
- Changed Ollama model validation from prefix match to exact match
- Prevents false positives where llama2:13b would match request for llama2:7b
- Ensures agent only reports healthy when exact model is available
Both fixes address cursor bot reported issues and maintain backward compatibility.
* fixing lint errors
* fixing lint errors
* adding CLI flags
* fix: resolve linting errors for CI
- Remove unnecessary nil check in host_kernel_configs.go (len() for nil slices is zero)
- Remove unnecessary fmt.Sprintf() calls in ceph.go for static strings
- Apply go fmt formatting fixes
Fixes failing lint CI check
* fix: resolve CI failures in build-test workflow and Ollama tests
1. Fix GitHub Actions workflow logic error:
- Replace problematic contains() expression with explicit job result checks
- Properly handle failure and cancelled states for each job
- Prevents false positive failures in success summary job
2. Fix Ollama agent parseLLMResponse panics:
- Add proper error handling for malformed JSON in LLM responses
- Return error when JSON is found but invalid (instead of silent fallback)
- Add error when no meaningful content can be parsed from response
- Prevents nil pointer dereference in test assertions
Fixes failing build-test/success and build-test/test CI checks
* fix: resolve all CI failures and cursor bot issues
1. Fix disable-ollama flag logic bug:
- Remove disable-ollama from advanced analysis trigger condition
- Prevents unintended advanced analysis mode when no agents registered
- Allows proper fallback to legacy analysis
2. Fix diff test consistency:
- Update test expectations to match function behavior (lines with newlines)
- Ensures consistency between streaming and non-streaming diff paths
3. Fix Ollama agent error handling:
- Add proper error return for malformed JSON in LLM responses
- Add meaningful content validation for markdown parsing
- Prevents nil pointer panics in test assertions
4. Fix analysis engine mock agent:
- Mock agent now processes and returns results for all provided analyzers
- Fixes test expectation mismatch (expected 8 results, got 1)
Resolves all failing CI checks: lint, test, and success workflow logic
---------
Co-authored-by: Noah Campbell <noah.edward.campbell@gmail.com>
886 lines
28 KiB
Go
886 lines
28 KiB
Go
package analyzer
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/pkg/errors"
|
|
troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
|
|
"github.com/replicatedhq/troubleshoot/pkg/constants"
|
|
"go.opentelemetry.io/otel"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
"go.opentelemetry.io/otel/codes"
|
|
corev1 "k8s.io/api/core/v1"
|
|
"k8s.io/klog/v2"
|
|
)
|
|
|
|
// AnalysisEngine orchestrates analysis across multiple agents
|
|
type AnalysisEngine interface {
|
|
Analyze(ctx context.Context, bundle *SupportBundle, opts AnalysisOptions) (*AnalysisResult, error)
|
|
GenerateAnalyzers(ctx context.Context, requirements *RequirementSpec) ([]AnalyzerSpec, error)
|
|
RegisterAgent(name string, agent Agent) error
|
|
GetAgent(name string) (Agent, bool)
|
|
ListAgents() []string
|
|
HealthCheck(ctx context.Context) (*EngineHealth, error)
|
|
}
|
|
|
|
// Agent interface for different analysis backends
|
|
type Agent interface {
|
|
Name() string
|
|
Analyze(ctx context.Context, data []byte, analyzers []AnalyzerSpec) (*AgentResult, error)
|
|
HealthCheck(ctx context.Context) error
|
|
Capabilities() []string
|
|
IsAvailable() bool
|
|
}
|
|
|
|
// Data structures for analysis results and configuration
|
|
|
|
type SupportBundle struct {
|
|
Files map[string][]byte `json:"files"`
|
|
Metadata *SupportBundleMetadata `json:"metadata"`
|
|
}
|
|
|
|
type SupportBundleMetadata struct {
|
|
CreatedAt time.Time `json:"createdAt"`
|
|
Version string `json:"version"`
|
|
ClusterInfo *ClusterInfo `json:"clusterInfo,omitempty"`
|
|
NodeInfo []NodeInfo `json:"nodeInfo,omitempty"`
|
|
GeneratedBy string `json:"generatedBy"`
|
|
Namespace string `json:"namespace,omitempty"`
|
|
Labels map[string]string `json:"labels,omitempty"`
|
|
}
|
|
|
|
type ClusterInfo struct {
|
|
Version string `json:"version"`
|
|
Platform string `json:"platform"`
|
|
NodeCount int `json:"nodeCount"`
|
|
}
|
|
|
|
type NodeInfo struct {
|
|
Name string `json:"name"`
|
|
Version string `json:"version"`
|
|
OS string `json:"os"`
|
|
Architecture string `json:"architecture"`
|
|
Labels map[string]string `json:"labels"`
|
|
}
|
|
|
|
type AnalysisOptions struct {
|
|
Agents []string `json:"agents,omitempty"`
|
|
IncludeRemediation bool `json:"includeRemediation"`
|
|
GenerateArtifacts bool `json:"generateArtifacts"`
|
|
CustomAnalyzers []*troubleshootv1beta2.Analyze `json:"customAnalyzers,omitempty"`
|
|
Timeout time.Duration `json:"timeout,omitempty"`
|
|
Concurrency int `json:"concurrency,omitempty"`
|
|
FilterByNamespace string `json:"filterByNamespace,omitempty"`
|
|
Strict bool `json:"strict"`
|
|
}
|
|
|
|
type AnalysisResult struct {
|
|
Results []*AnalyzerResult `json:"results"`
|
|
Remediation []RemediationStep `json:"remediation,omitempty"`
|
|
Summary AnalysisSummary `json:"summary"`
|
|
Metadata AnalysisMetadata `json:"metadata"`
|
|
Errors []AnalysisError `json:"errors,omitempty"`
|
|
}
|
|
|
|
type AnalyzerResult struct {
|
|
// Legacy fields from existing AnalyzeResult
|
|
IsPass bool `json:"isPass"`
|
|
IsFail bool `json:"isFail"`
|
|
IsWarn bool `json:"isWarn"`
|
|
Strict bool `json:"strict"`
|
|
Title string `json:"title"`
|
|
Message string `json:"message"`
|
|
URI string `json:"uri,omitempty"`
|
|
IconKey string `json:"iconKey,omitempty"`
|
|
IconURI string `json:"iconURI,omitempty"`
|
|
|
|
// Enhanced fields for agent-based analysis
|
|
AnalyzerType string `json:"analyzerType"`
|
|
AgentName string `json:"agentName"`
|
|
Confidence float64 `json:"confidence,omitempty"`
|
|
Category string `json:"category,omitempty"`
|
|
Severity string `json:"severity,omitempty"`
|
|
Remediation *RemediationStep `json:"remediation,omitempty"`
|
|
Context map[string]interface{} `json:"context,omitempty"`
|
|
InvolvedObject *corev1.ObjectReference `json:"involvedObject,omitempty"`
|
|
|
|
// Correlation and insights
|
|
RelatedResults []string `json:"relatedResults,omitempty"`
|
|
Insights []string `json:"insights,omitempty"`
|
|
Tags []string `json:"tags,omitempty"`
|
|
}
|
|
|
|
type RemediationStep struct {
|
|
Description string `json:"description"`
|
|
Action string `json:"action,omitempty"`
|
|
Command string `json:"command,omitempty"`
|
|
Documentation string `json:"documentation,omitempty"`
|
|
Priority int `json:"priority,omitempty"`
|
|
Category string `json:"category,omitempty"`
|
|
IsAutomatable bool `json:"isAutomatable"`
|
|
Context map[string]interface{} `json:"context,omitempty"`
|
|
}
|
|
|
|
type AnalysisSummary struct {
|
|
TotalAnalyzers int `json:"totalAnalyzers"`
|
|
PassCount int `json:"passCount"`
|
|
WarnCount int `json:"warnCount"`
|
|
FailCount int `json:"failCount"`
|
|
ErrorCount int `json:"errorCount"`
|
|
Confidence float64 `json:"confidence,omitempty"`
|
|
Duration string `json:"duration"`
|
|
AgentsUsed []string `json:"agentsUsed"`
|
|
}
|
|
|
|
type AnalysisMetadata struct {
|
|
Timestamp time.Time `json:"timestamp"`
|
|
EngineVersion string `json:"engineVersion"`
|
|
BundleMetadata *SupportBundleMetadata `json:"bundleMetadata,omitempty"`
|
|
AnalysisOptions AnalysisOptions `json:"analysisOptions"`
|
|
Agents []AgentMetadata `json:"agents"`
|
|
Correlations []Correlation `json:"correlations,omitempty"`
|
|
}
|
|
|
|
type AgentMetadata struct {
|
|
Name string `json:"name"`
|
|
Version string `json:"version,omitempty"`
|
|
Capabilities []string `json:"capabilities"`
|
|
Duration string `json:"duration"`
|
|
ResultCount int `json:"resultCount"`
|
|
ErrorCount int `json:"errorCount"`
|
|
}
|
|
|
|
type Correlation struct {
|
|
ResultIDs []string `json:"resultIds"`
|
|
Type string `json:"type"`
|
|
Description string `json:"description"`
|
|
Confidence float64 `json:"confidence"`
|
|
}
|
|
|
|
type AnalysisError struct {
|
|
Agent string `json:"agent,omitempty"`
|
|
Analyzer string `json:"analyzer,omitempty"`
|
|
Error string `json:"error"`
|
|
Category string `json:"category"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
Recoverable bool `json:"recoverable"`
|
|
}
|
|
|
|
type AgentResult struct {
|
|
Results []*AnalyzerResult `json:"results"`
|
|
Metadata AgentResultMetadata `json:"metadata"`
|
|
Errors []string `json:"errors,omitempty"`
|
|
}
|
|
|
|
type AgentResultMetadata struct {
|
|
Duration time.Duration `json:"duration"`
|
|
AnalyzerCount int `json:"analyzerCount"`
|
|
Version string `json:"version,omitempty"`
|
|
}
|
|
|
|
type EngineHealth struct {
|
|
Status string `json:"status"`
|
|
Agents []AgentHealth `json:"agents"`
|
|
LastChecked time.Time `json:"lastChecked"`
|
|
}
|
|
|
|
type AgentHealth struct {
|
|
Name string `json:"name"`
|
|
Status string `json:"status"`
|
|
Error string `json:"error,omitempty"`
|
|
Available bool `json:"available"`
|
|
LastCheck time.Time `json:"lastCheck"`
|
|
}
|
|
|
|
// Requirements-to-analyzers structures
|
|
type RequirementSpec struct {
|
|
APIVersion string `json:"apiVersion"`
|
|
Kind string `json:"kind"`
|
|
Metadata RequirementMetadata `json:"metadata"`
|
|
Spec RequirementSpecDetails `json:"spec"`
|
|
}
|
|
|
|
type RequirementMetadata struct {
|
|
Name string `json:"name"`
|
|
Labels map[string]string `json:"labels,omitempty"`
|
|
Annotations map[string]string `json:"annotations,omitempty"`
|
|
}
|
|
|
|
type RequirementSpecDetails struct {
|
|
Kubernetes KubernetesRequirements `json:"kubernetes,omitempty"`
|
|
Resources ResourceRequirements `json:"resources,omitempty"`
|
|
Storage StorageRequirements `json:"storage,omitempty"`
|
|
Network NetworkRequirements `json:"network,omitempty"`
|
|
Custom []CustomRequirement `json:"custom,omitempty"`
|
|
}
|
|
|
|
type KubernetesRequirements struct {
|
|
MinVersion string `json:"minVersion,omitempty"`
|
|
MaxVersion string `json:"maxVersion,omitempty"`
|
|
Required []string `json:"required,omitempty"`
|
|
Forbidden []string `json:"forbidden,omitempty"`
|
|
}
|
|
|
|
type ResourceRequirements struct {
|
|
CPU ResourceRequirement `json:"cpu,omitempty"`
|
|
Memory ResourceRequirement `json:"memory,omitempty"`
|
|
Disk ResourceRequirement `json:"disk,omitempty"`
|
|
}
|
|
|
|
type ResourceRequirement struct {
|
|
Min string `json:"min,omitempty"`
|
|
Max string `json:"max,omitempty"`
|
|
}
|
|
|
|
type StorageRequirements struct {
|
|
Classes []string `json:"classes,omitempty"`
|
|
MinCapacity string `json:"minCapacity,omitempty"`
|
|
AccessModes []string `json:"accessModes,omitempty"`
|
|
}
|
|
|
|
type NetworkRequirements struct {
|
|
Ports []PortRequirement `json:"ports,omitempty"`
|
|
Connectivity []string `json:"connectivity,omitempty"`
|
|
}
|
|
|
|
type PortRequirement struct {
|
|
Port int `json:"port"`
|
|
Protocol string `json:"protocol"`
|
|
Required bool `json:"required"`
|
|
}
|
|
|
|
type CustomRequirement struct {
|
|
Name string `json:"name"`
|
|
Type string `json:"type"`
|
|
Condition string `json:"condition"`
|
|
Context map[string]interface{} `json:"context,omitempty"`
|
|
}
|
|
|
|
type AnalyzerSpec struct {
|
|
Name string `json:"name"`
|
|
Type string `json:"type"`
|
|
Config map[string]interface{} `json:"config"`
|
|
Priority int `json:"priority,omitempty"`
|
|
Category string `json:"category,omitempty"`
|
|
}
|
|
|
|
// DefaultAnalysisEngine implements AnalysisEngine
|
|
type DefaultAnalysisEngine struct {
|
|
agents map[string]Agent
|
|
agentsMutex sync.RWMutex
|
|
defaultAgents []string
|
|
}
|
|
|
|
// NewAnalysisEngine creates a new analysis engine with default configuration
|
|
func NewAnalysisEngine() AnalysisEngine {
|
|
engine := &DefaultAnalysisEngine{
|
|
agents: make(map[string]Agent),
|
|
defaultAgents: []string{"local"},
|
|
}
|
|
|
|
return engine
|
|
}
|
|
|
|
// RegisterAgent registers a new analysis agent
|
|
func (e *DefaultAnalysisEngine) RegisterAgent(name string, agent Agent) error {
|
|
if name == "" {
|
|
return errors.New("agent name cannot be empty")
|
|
}
|
|
if agent == nil {
|
|
return errors.New("agent cannot be nil")
|
|
}
|
|
|
|
e.agentsMutex.Lock()
|
|
defer e.agentsMutex.Unlock()
|
|
|
|
if _, exists := e.agents[name]; exists {
|
|
return errors.Errorf("agent %s already registered", name)
|
|
}
|
|
|
|
e.agents[name] = agent
|
|
return nil
|
|
}
|
|
|
|
// GetAgent retrieves an agent by name
|
|
func (e *DefaultAnalysisEngine) GetAgent(name string) (Agent, bool) {
|
|
e.agentsMutex.RLock()
|
|
defer e.agentsMutex.RUnlock()
|
|
|
|
agent, exists := e.agents[name]
|
|
return agent, exists
|
|
}
|
|
|
|
// ListAgents returns names of all registered agents
|
|
func (e *DefaultAnalysisEngine) ListAgents() []string {
|
|
e.agentsMutex.RLock()
|
|
defer e.agentsMutex.RUnlock()
|
|
|
|
var names []string
|
|
for name := range e.agents {
|
|
names = append(names, name)
|
|
}
|
|
return names
|
|
}
|
|
|
|
// Analyze performs analysis using configured agents
|
|
func (e *DefaultAnalysisEngine) Analyze(ctx context.Context, bundle *SupportBundle, opts AnalysisOptions) (*AnalysisResult, error) {
|
|
startTime := time.Now()
|
|
|
|
ctx, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, "AnalysisEngine.Analyze")
|
|
defer span.End()
|
|
|
|
if bundle == nil {
|
|
return nil, errors.New("bundle cannot be nil")
|
|
}
|
|
|
|
// Determine which agents to use
|
|
agentNames := opts.Agents
|
|
if len(agentNames) == 0 {
|
|
agentNames = e.defaultAgents
|
|
}
|
|
|
|
// Validate agents exist and are available
|
|
availableAgents := make([]Agent, 0, len(agentNames))
|
|
agentMetadata := make([]AgentMetadata, 0, len(agentNames))
|
|
|
|
for _, name := range agentNames {
|
|
agent, exists := e.GetAgent(name)
|
|
if !exists {
|
|
span.SetStatus(codes.Error, fmt.Sprintf("agent %s not found", name))
|
|
return nil, errors.Errorf("agent %s not registered", name)
|
|
}
|
|
|
|
if !agent.IsAvailable() {
|
|
span.AddEvent(fmt.Sprintf("agent %s not available, skipping", name))
|
|
continue
|
|
}
|
|
|
|
availableAgents = append(availableAgents, agent)
|
|
}
|
|
|
|
if len(availableAgents) == 0 {
|
|
return nil, errors.New("no available agents found")
|
|
}
|
|
|
|
// Prepare bundle data for agents
|
|
bundleData, err := json.Marshal(bundle)
|
|
if err != nil {
|
|
span.SetStatus(codes.Error, "failed to marshal bundle")
|
|
return nil, errors.Wrap(err, "failed to marshal bundle data")
|
|
}
|
|
|
|
// Generate analyzer specs from requirements (if any)
|
|
var analyzers []AnalyzerSpec
|
|
var conversionFailures []AnalyzerResult
|
|
if len(opts.CustomAnalyzers) > 0 {
|
|
// Convert existing analyzers to specs for agents
|
|
for i, analyzer := range opts.CustomAnalyzers {
|
|
spec, err := e.convertAnalyzerToSpec(analyzer)
|
|
if err != nil {
|
|
// Create local copy of index to avoid loop variable capture
|
|
analyzerIndex := i
|
|
klog.Errorf("Failed to convert custom analyzer %d to spec: %v", analyzerIndex, err)
|
|
klog.Warningf("Creating failure result for analyzer %d. Supported types: ClusterVersion, DeploymentStatus", analyzerIndex)
|
|
klog.Warningf("To fix: Check your analyzer configuration and ensure it uses supported types")
|
|
|
|
// Create a failure result instead of skipping
|
|
failureResult := AnalyzerResult{
|
|
IsFail: true,
|
|
Title: fmt.Sprintf("Custom Analyzer %d - Conversion Failed", analyzerIndex),
|
|
Message: fmt.Sprintf("Failed to convert analyzer to supported format: %v", err),
|
|
Category: "configuration",
|
|
Confidence: 1.0,
|
|
AgentName: "analyzer-converter",
|
|
}
|
|
conversionFailures = append(conversionFailures, failureResult)
|
|
continue
|
|
}
|
|
analyzers = append(analyzers, spec)
|
|
}
|
|
}
|
|
|
|
// Run analysis across agents
|
|
results := &AnalysisResult{
|
|
Results: make([]*AnalyzerResult, 0),
|
|
Summary: AnalysisSummary{
|
|
AgentsUsed: make([]string, 0, len(availableAgents)),
|
|
},
|
|
Metadata: AnalysisMetadata{
|
|
Timestamp: time.Now(),
|
|
EngineVersion: "1.0.0",
|
|
BundleMetadata: bundle.Metadata,
|
|
AnalysisOptions: opts,
|
|
Agents: agentMetadata,
|
|
},
|
|
Errors: make([]AnalysisError, 0),
|
|
}
|
|
|
|
// Execute analysis on each agent
|
|
for _, agent := range availableAgents {
|
|
agentStart := time.Now()
|
|
|
|
agentResult, err := e.runAgentAnalysis(ctx, agent, bundleData, analyzers)
|
|
agentDuration := time.Since(agentStart)
|
|
|
|
metadata := AgentMetadata{
|
|
Name: agent.Name(),
|
|
Capabilities: agent.Capabilities(),
|
|
Duration: agentDuration.String(),
|
|
}
|
|
|
|
if err != nil {
|
|
metadata.ErrorCount = 1
|
|
results.Errors = append(results.Errors, AnalysisError{
|
|
Agent: agent.Name(),
|
|
Error: err.Error(),
|
|
Category: "agent_execution",
|
|
Timestamp: time.Now(),
|
|
Recoverable: true,
|
|
})
|
|
} else if agentResult != nil {
|
|
metadata.ResultCount = len(agentResult.Results)
|
|
results.Results = append(results.Results, agentResult.Results...)
|
|
|
|
// Collect individual analyzer errors from successful agents
|
|
if len(agentResult.Errors) > 0 {
|
|
metadata.ErrorCount = len(agentResult.Errors)
|
|
for _, agentErr := range agentResult.Errors {
|
|
results.Errors = append(results.Errors, AnalysisError{
|
|
Agent: agent.Name(),
|
|
Error: agentErr,
|
|
Category: "analyzer_execution",
|
|
Timestamp: time.Now(),
|
|
Recoverable: true,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
results.Metadata.Agents = append(results.Metadata.Agents, metadata)
|
|
results.Summary.AgentsUsed = append(results.Summary.AgentsUsed, agent.Name())
|
|
}
|
|
|
|
// Add conversion failures to results (analyzers that failed to convert)
|
|
for _, failure := range conversionFailures {
|
|
results.Results = append(results.Results, &failure)
|
|
}
|
|
|
|
// Calculate summary statistics
|
|
e.calculateSummary(results)
|
|
results.Summary.Duration = time.Since(startTime).String()
|
|
|
|
// Generate remediation if requested
|
|
if opts.IncludeRemediation {
|
|
e.generateRemediation(ctx, results)
|
|
}
|
|
|
|
// Apply correlations and insights
|
|
e.applyCorrelations(results)
|
|
|
|
span.SetAttributes(
|
|
attribute.Int("total_results", len(results.Results)),
|
|
attribute.Int("agents_used", len(availableAgents)),
|
|
attribute.String("duration", results.Summary.Duration),
|
|
)
|
|
|
|
return results, nil
|
|
}
|
|
|
|
// runAgentAnalysis executes analysis on a specific agent
|
|
func (e *DefaultAnalysisEngine) runAgentAnalysis(ctx context.Context, agent Agent, bundleData []byte, analyzers []AnalyzerSpec) (*AgentResult, error) {
|
|
ctx, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, fmt.Sprintf("Agent.%s.Analyze", agent.Name()))
|
|
defer span.End()
|
|
|
|
result, err := agent.Analyze(ctx, bundleData, analyzers)
|
|
if err != nil {
|
|
span.SetStatus(codes.Error, err.Error())
|
|
return nil, errors.Wrapf(err, "agent %s analysis failed", agent.Name())
|
|
}
|
|
|
|
// Add agent name to all results
|
|
for _, r := range result.Results {
|
|
r.AgentName = agent.Name()
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// calculateSummary computes summary statistics for analysis results
|
|
func (e *DefaultAnalysisEngine) calculateSummary(results *AnalysisResult) {
|
|
summary := &results.Summary
|
|
summary.TotalAnalyzers = len(results.Results)
|
|
|
|
var confidenceSum float64
|
|
confidenceCount := 0
|
|
|
|
for _, result := range results.Results {
|
|
if result.IsPass {
|
|
summary.PassCount++
|
|
} else if result.IsWarn {
|
|
summary.WarnCount++
|
|
} else if result.IsFail {
|
|
summary.FailCount++
|
|
}
|
|
|
|
if result.Confidence > 0 {
|
|
confidenceSum += result.Confidence
|
|
confidenceCount++
|
|
}
|
|
}
|
|
|
|
summary.ErrorCount = len(results.Errors)
|
|
|
|
if confidenceCount > 0 {
|
|
summary.Confidence = confidenceSum / float64(confidenceCount)
|
|
}
|
|
}
|
|
|
|
// generateRemediation creates remediation suggestions
|
|
func (e *DefaultAnalysisEngine) generateRemediation(ctx context.Context, results *AnalysisResult) {
|
|
var remediationSteps []RemediationStep
|
|
|
|
for _, result := range results.Results {
|
|
if result.IsFail && result.Remediation != nil {
|
|
remediationSteps = append(remediationSteps, *result.Remediation)
|
|
}
|
|
}
|
|
|
|
// Sort by priority (higher priority first)
|
|
// TODO: Implement sorting logic
|
|
|
|
results.Remediation = remediationSteps
|
|
}
|
|
|
|
// applyCorrelations identifies relationships between analysis results
|
|
func (e *DefaultAnalysisEngine) applyCorrelations(results *AnalysisResult) {
|
|
// TODO: Implement correlation logic
|
|
// This could identify patterns like:
|
|
// - Multiple pod failures in same namespace
|
|
// - Resource constraint patterns
|
|
// - Network connectivity issues
|
|
}
|
|
|
|
// convertAnalyzerToSpec converts legacy analyzer to new spec format
|
|
func (e *DefaultAnalysisEngine) convertAnalyzerToSpec(analyzer *troubleshootv1beta2.Analyze) (AnalyzerSpec, error) {
|
|
if analyzer == nil {
|
|
return AnalyzerSpec{}, errors.New("analyzer cannot be nil")
|
|
}
|
|
|
|
spec := AnalyzerSpec{
|
|
Config: make(map[string]interface{}),
|
|
}
|
|
|
|
// Determine analyzer type and convert configuration - Supporting ALL 33+ analyzer types
|
|
switch {
|
|
// ✅ Cluster-level analyzers
|
|
case analyzer.ClusterVersion != nil:
|
|
spec.Name = "cluster-version"
|
|
spec.Type = "cluster"
|
|
spec.Config["analyzer"] = analyzer.ClusterVersion
|
|
case analyzer.ContainerRuntime != nil:
|
|
spec.Name = "container-runtime"
|
|
spec.Type = "cluster"
|
|
spec.Config["analyzer"] = analyzer.ContainerRuntime
|
|
case analyzer.Distribution != nil:
|
|
spec.Name = "distribution"
|
|
spec.Type = "cluster"
|
|
spec.Config["analyzer"] = analyzer.Distribution
|
|
case analyzer.NodeResources != nil:
|
|
spec.Name = "node-resources"
|
|
spec.Type = "cluster"
|
|
spec.Config["analyzer"] = analyzer.NodeResources
|
|
spec.Config["filePath"] = "cluster-resources/nodes.json" // Enhanced method expects this
|
|
case analyzer.NodeMetrics != nil:
|
|
spec.Name = "node-metrics"
|
|
spec.Type = "cluster"
|
|
spec.Config["analyzer"] = analyzer.NodeMetrics
|
|
|
|
// ✅ Workload analyzers
|
|
case analyzer.DeploymentStatus != nil:
|
|
spec.Name = "deployment-status"
|
|
spec.Type = "workload"
|
|
spec.Config["analyzer"] = analyzer.DeploymentStatus
|
|
// Set default filePath based on namespace if available
|
|
if analyzer.DeploymentStatus.Namespace != "" {
|
|
spec.Config["filePath"] = fmt.Sprintf("cluster-resources/deployments/%s.json", analyzer.DeploymentStatus.Namespace)
|
|
} else {
|
|
spec.Config["filePath"] = "cluster-resources/deployments.json"
|
|
}
|
|
case analyzer.StatefulsetStatus != nil:
|
|
spec.Name = "statefulset-status"
|
|
spec.Type = "workload"
|
|
spec.Config["analyzer"] = analyzer.StatefulsetStatus
|
|
case analyzer.JobStatus != nil:
|
|
spec.Name = "job-status"
|
|
spec.Type = "workload"
|
|
spec.Config["analyzer"] = analyzer.JobStatus
|
|
case analyzer.ReplicaSetStatus != nil:
|
|
spec.Name = "replicaset-status"
|
|
spec.Type = "workload"
|
|
spec.Config["analyzer"] = analyzer.ReplicaSetStatus
|
|
case analyzer.ClusterPodStatuses != nil:
|
|
spec.Name = "cluster-pod-statuses"
|
|
spec.Type = "workload"
|
|
spec.Config["analyzer"] = analyzer.ClusterPodStatuses
|
|
case analyzer.ClusterContainerStatuses != nil:
|
|
spec.Name = "cluster-container-statuses"
|
|
spec.Type = "workload"
|
|
spec.Config["analyzer"] = analyzer.ClusterContainerStatuses
|
|
|
|
// ✅ Configuration analyzers
|
|
case analyzer.Secret != nil:
|
|
spec.Name = "secret"
|
|
spec.Type = "configuration"
|
|
spec.Config["analyzer"] = analyzer.Secret
|
|
case analyzer.ConfigMap != nil:
|
|
spec.Name = "configmap"
|
|
spec.Type = "configuration"
|
|
spec.Config["analyzer"] = analyzer.ConfigMap
|
|
case analyzer.ImagePullSecret != nil:
|
|
spec.Name = "image-pull-secret"
|
|
spec.Type = "configuration"
|
|
spec.Config["analyzer"] = analyzer.ImagePullSecret
|
|
case analyzer.StorageClass != nil:
|
|
spec.Name = "storage-class"
|
|
spec.Type = "configuration"
|
|
spec.Config["analyzer"] = analyzer.StorageClass
|
|
case analyzer.CustomResourceDefinition != nil:
|
|
spec.Name = "crd"
|
|
spec.Type = "configuration"
|
|
spec.Config["analyzer"] = analyzer.CustomResourceDefinition
|
|
case analyzer.ClusterResource != nil:
|
|
spec.Name = "cluster-resource"
|
|
spec.Type = "configuration"
|
|
spec.Config["analyzer"] = analyzer.ClusterResource
|
|
|
|
// ✅ Network analyzers
|
|
case analyzer.Ingress != nil:
|
|
spec.Name = "ingress"
|
|
spec.Type = "network"
|
|
spec.Config["analyzer"] = analyzer.Ingress
|
|
case analyzer.HTTP != nil:
|
|
spec.Name = "http"
|
|
spec.Type = "network"
|
|
spec.Config["analyzer"] = analyzer.HTTP
|
|
|
|
// ✅ Data analysis
|
|
case analyzer.TextAnalyze != nil:
|
|
spec.Name = "text-analyze"
|
|
spec.Type = "data"
|
|
spec.Config["analyzer"] = analyzer.TextAnalyze
|
|
// Enhanced method will auto-detect log files from TextAnalyze configuration
|
|
case analyzer.YamlCompare != nil:
|
|
spec.Name = "yaml-compare"
|
|
spec.Type = "data"
|
|
spec.Config["analyzer"] = analyzer.YamlCompare
|
|
case analyzer.JsonCompare != nil:
|
|
spec.Name = "json-compare"
|
|
spec.Type = "data"
|
|
spec.Config["analyzer"] = analyzer.JsonCompare
|
|
|
|
// ✅ Database analyzers
|
|
case analyzer.Postgres != nil:
|
|
spec.Name = "postgres"
|
|
spec.Type = "database"
|
|
spec.Config["analyzer"] = analyzer.Postgres
|
|
case analyzer.Mysql != nil:
|
|
spec.Name = "mysql"
|
|
spec.Type = "database"
|
|
spec.Config["analyzer"] = analyzer.Mysql
|
|
case analyzer.Mssql != nil:
|
|
spec.Name = "mssql"
|
|
spec.Type = "database"
|
|
spec.Config["analyzer"] = analyzer.Mssql
|
|
case analyzer.Redis != nil:
|
|
spec.Name = "redis"
|
|
spec.Type = "database"
|
|
spec.Config["analyzer"] = analyzer.Redis
|
|
|
|
// ✅ Storage analyzers
|
|
case analyzer.CephStatus != nil:
|
|
spec.Name = "ceph-status"
|
|
spec.Type = "storage"
|
|
spec.Config["analyzer"] = analyzer.CephStatus
|
|
case analyzer.Longhorn != nil:
|
|
spec.Name = "longhorn"
|
|
spec.Type = "storage"
|
|
spec.Config["analyzer"] = analyzer.Longhorn
|
|
case analyzer.Velero != nil:
|
|
spec.Name = "velero"
|
|
spec.Type = "storage"
|
|
spec.Config["analyzer"] = analyzer.Velero
|
|
|
|
// ✅ Infrastructure analyzers
|
|
case analyzer.RegistryImages != nil:
|
|
spec.Name = "registry-images"
|
|
spec.Type = "infrastructure"
|
|
spec.Config["analyzer"] = analyzer.RegistryImages
|
|
case analyzer.WeaveReport != nil:
|
|
spec.Name = "weave-report"
|
|
spec.Type = "infrastructure"
|
|
spec.Config["analyzer"] = analyzer.WeaveReport
|
|
case analyzer.Goldpinger != nil:
|
|
spec.Name = "goldpinger"
|
|
spec.Type = "infrastructure"
|
|
spec.Config["analyzer"] = analyzer.Goldpinger
|
|
case analyzer.Sysctl != nil:
|
|
spec.Name = "sysctl"
|
|
spec.Type = "infrastructure"
|
|
spec.Config["analyzer"] = analyzer.Sysctl
|
|
case analyzer.Certificates != nil:
|
|
spec.Name = "certificates"
|
|
spec.Type = "infrastructure"
|
|
spec.Config["analyzer"] = analyzer.Certificates
|
|
case analyzer.Event != nil:
|
|
spec.Name = "event"
|
|
spec.Type = "infrastructure"
|
|
spec.Config["analyzer"] = analyzer.Event
|
|
|
|
default:
|
|
return spec, errors.New("unknown analyzer type - this should not happen as all known types are now supported")
|
|
}
|
|
|
|
return spec, nil
|
|
}
|
|
|
|
// GenerateAnalyzers creates analyzers from requirement specifications
|
|
func (e *DefaultAnalysisEngine) GenerateAnalyzers(ctx context.Context, requirements *RequirementSpec) ([]AnalyzerSpec, error) {
|
|
_, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, "AnalysisEngine.GenerateAnalyzers")
|
|
defer span.End()
|
|
|
|
if requirements == nil {
|
|
return nil, errors.New("requirements cannot be nil")
|
|
}
|
|
|
|
var specs []AnalyzerSpec
|
|
|
|
// Generate Kubernetes version analyzers
|
|
if requirements.Spec.Kubernetes.MinVersion != "" || requirements.Spec.Kubernetes.MaxVersion != "" {
|
|
specs = append(specs, AnalyzerSpec{
|
|
Name: "kubernetes-version-check",
|
|
Type: "cluster",
|
|
Category: "kubernetes",
|
|
Priority: 10,
|
|
Config: map[string]interface{}{
|
|
"minVersion": requirements.Spec.Kubernetes.MinVersion,
|
|
"maxVersion": requirements.Spec.Kubernetes.MaxVersion,
|
|
},
|
|
})
|
|
}
|
|
|
|
// Generate resource requirement analyzers
|
|
if requirements.Spec.Resources.CPU.Min != "" || requirements.Spec.Resources.Memory.Min != "" {
|
|
specs = append(specs, AnalyzerSpec{
|
|
Name: "resource-requirements-check",
|
|
Type: "resources",
|
|
Category: "capacity",
|
|
Priority: 8,
|
|
Config: map[string]interface{}{
|
|
"cpu": requirements.Spec.Resources.CPU,
|
|
"memory": requirements.Spec.Resources.Memory,
|
|
"disk": requirements.Spec.Resources.Disk,
|
|
},
|
|
})
|
|
}
|
|
|
|
// Generate storage analyzers
|
|
if len(requirements.Spec.Storage.Classes) > 0 {
|
|
specs = append(specs, AnalyzerSpec{
|
|
Name: "storage-class-check",
|
|
Type: "storage",
|
|
Category: "storage",
|
|
Priority: 6,
|
|
Config: map[string]interface{}{
|
|
"classes": requirements.Spec.Storage.Classes,
|
|
"minCapacity": requirements.Spec.Storage.MinCapacity,
|
|
"accessModes": requirements.Spec.Storage.AccessModes,
|
|
},
|
|
})
|
|
}
|
|
|
|
// Generate network analyzers
|
|
if len(requirements.Spec.Network.Ports) > 0 {
|
|
specs = append(specs, AnalyzerSpec{
|
|
Name: "network-connectivity-check",
|
|
Type: "network",
|
|
Category: "networking",
|
|
Priority: 7,
|
|
Config: map[string]interface{}{
|
|
"ports": requirements.Spec.Network.Ports,
|
|
"connectivity": requirements.Spec.Network.Connectivity,
|
|
},
|
|
})
|
|
}
|
|
|
|
// Generate custom analyzers
|
|
for _, custom := range requirements.Spec.Custom {
|
|
specs = append(specs, AnalyzerSpec{
|
|
Name: custom.Name,
|
|
Type: custom.Type,
|
|
Category: "custom",
|
|
Priority: 5,
|
|
Config: map[string]interface{}{
|
|
"condition": custom.Condition,
|
|
"context": custom.Context,
|
|
},
|
|
})
|
|
}
|
|
|
|
span.SetAttributes(
|
|
attribute.Int("generated_analyzers", len(specs)),
|
|
attribute.String("requirements_name", requirements.Metadata.Name),
|
|
)
|
|
|
|
return specs, nil
|
|
}
|
|
|
|
// HealthCheck performs health check on the engine and all agents
|
|
func (e *DefaultAnalysisEngine) HealthCheck(ctx context.Context) (*EngineHealth, error) {
|
|
ctx, span := otel.Tracer(constants.LIB_TRACER_NAME).Start(ctx, "AnalysisEngine.HealthCheck")
|
|
defer span.End()
|
|
|
|
health := &EngineHealth{
|
|
Status: "healthy",
|
|
Agents: make([]AgentHealth, 0),
|
|
LastChecked: time.Now(),
|
|
}
|
|
|
|
e.agentsMutex.RLock()
|
|
agents := make(map[string]Agent, len(e.agents))
|
|
for name, agent := range e.agents {
|
|
agents[name] = agent
|
|
}
|
|
e.agentsMutex.RUnlock()
|
|
|
|
hasUnhealthyAgent := false
|
|
|
|
for name, agent := range agents {
|
|
agentHealth := AgentHealth{
|
|
Name: name,
|
|
Available: agent.IsAvailable(),
|
|
LastCheck: time.Now(),
|
|
}
|
|
|
|
err := agent.HealthCheck(ctx)
|
|
if err != nil {
|
|
agentHealth.Status = "unhealthy"
|
|
agentHealth.Error = err.Error()
|
|
hasUnhealthyAgent = true
|
|
} else {
|
|
agentHealth.Status = "healthy"
|
|
}
|
|
|
|
health.Agents = append(health.Agents, agentHealth)
|
|
}
|
|
|
|
if hasUnhealthyAgent {
|
|
health.Status = "degraded"
|
|
}
|
|
|
|
return health, nil
|
|
}
|