feat(api): generate alert grid per label value

This allows generating multiple independent alert group lists, one per unique value of a label specified by the user. This way we can have a separate grid for each value of a label such as severity or cluster.
2026-02-13 20:59:53 +00:00 · 2020-03-29 16:43:45 +01:00
parent 636af261ca
commit cff62dda2f
5 changed files with 228 additions and 52 deletions
--- a/cmd/karma/alerts.go
+++ b/cmd/karma/alerts.go
@@ -168,9 +168,7 @@ func sortByStartsAt(i, j int, groups []models.APIAlertGroup, sortReverse bool) b
 	return groups[i].LatestStartsAt.Before(groups[j].LatestStartsAt)
 }

-func sortAlertGroups(c *gin.Context, groupsMap map[string]models.APIAlertGroup) []models.APIAlertGroup {
-	groups := make([]models.APIAlertGroup, 0, len(groupsMap))
-
+func getSortOptions(c *gin.Context) (string, string, string) {
 	sortOrder, found := c.GetQuery("sortOrder")
 	if !found || sortOrder == "" {
 		sortOrder = config.Config.Grid.Sorting.Order
@@ -190,9 +188,11 @@ func sortAlertGroups(c *gin.Context, groupsMap map[string]models.APIAlertGroup)
 		sortLabel = config.Config.Grid.Sorting.Label
 	}

-	for _, g := range groupsMap {
-		groups = append(groups, g)
-	}
+	return sortOrder, sortReverse, sortLabel
+}
+
+func sortAlertGroups(c *gin.Context, groups []models.APIAlertGroup) []models.APIAlertGroup {
+	sortOrder, sortReverse, sortLabel := getSortOptions(c)

 	switch sortOrder {
 	case "startsAt":
@@ -239,3 +239,33 @@ func sortAlertGroups(c *gin.Context, groupsMap map[string]models.APIAlertGroup)

 	return groups
 }
+
+func sortGrids(c *gin.Context, gridLabel string, gridsMap map[string]models.APIGrid, gridSortReverse bool) []models.APIGrid {
+	grids := make([]models.APIGrid, 0, len(gridsMap))
+
+	for _, g := range gridsMap {
+		g.AlertGroups = sortAlertGroups(c, g.AlertGroups)
+		grids = append(grids, g)
+	}
+
+	sort.Slice(grids, func(i, j int) bool {
+		vi := resolveLabelValue(gridLabel, grids[i].LabelValue)
+		vj := resolveLabelValue(gridLabel, grids[j].LabelValue)
+
+		if vi == "" {
+			// first label is missing
+			return gridSortReverse
+		}
+		if vj == "" {
+			// second label is missing
+			return !gridSortReverse
+		}
+		// finally return grids sorted by label value
+		if gridSortReverse {
+			return !sortorder.NaturalLess(vi, vj)
+		}
+		return sortorder.NaturalLess(vi, vj)
+	})
+
+	return grids
+}
--- a/cmd/karma/api_test.go
+++ b/cmd/karma/api_test.go
@@ -1005,13 +1005,13 @@ func TestVerifyAllGroups(t *testing.T) {
 			t.Errorf("Failed to unmarshal response: %s", err)
 		}

-		if len(ur.AlertGroups) != len(groupTests) {
+		if len(ur.Grids[0].AlertGroups) != len(groupTests) {
 			t.Errorf("[%s] Got %d alert(s) in response, expected %d",
-				version, len(ur.AlertGroups), len(groupTests))
+				version, len(ur.Grids[0].AlertGroups), len(groupTests))
 		}
 		for _, testCase := range groupTests {
 			groupFound := false
-			for _, group := range ur.AlertGroups {
+			for _, group := range ur.Grids[0].AlertGroups {
 				if compareAlertGroups(testCase, group) {
 					groupFound = true
 					testAlertGroup(version, t, testCase, group)
@@ -1221,7 +1221,7 @@ func TestSortOrder(t *testing.T) {
 			}

 			values := []string{}
-			for _, ag := range ur.AlertGroups {
+			for _, ag := range ur.Grids[0].AlertGroups {
 				v := ag.Labels[testCase.expectedLabel]
 				if v == "" {
 					v = ag.Shared.Labels[testCase.expectedLabel]
--- a/cmd/karma/views.go
+++ b/cmd/karma/views.go
@@ -203,11 +203,11 @@ func alerts(c *gin.Context) {
 		return
 	}

-	// get filters
+	gridLabel, _ := c.GetQuery("gridLabel")
+
 	matchFilters, validFilters := getFiltersFromQuery(c.QueryArray("q"))

-	// set pointers for data store objects, need a lock until end of view is reached
-	alerts := map[string]models.APIAlertGroup{}
+	grids := map[string]models.APIGrid{}
 	colors := models.LabelsColorMap{}
 	counters := map[string]map[string]int{}

@@ -227,18 +227,7 @@ func alerts(c *gin.Context) {

 	var matches int
 	for _, ag := range dedupedAlerts {
-		agCopy := models.AlertGroup{
-			ID:                ag.ID,
-			Receiver:          ag.Receiver,
-			Labels:            ag.Labels,
-			LatestStartsAt:    ag.LatestStartsAt,
-			Alerts:            []models.Alert{},
-			AlertmanagerCount: map[string]int{},
-			StateCount:        map[string]int{},
-		}
-		for _, s := range models.AlertStateList {
-			agCopy.StateCount[s] = 0
-		}
+		perGridAlertGroup := map[string]*models.AlertGroup{}

 		for _, alert := range ag.Alerts {
 			alert := alert // scopelint pin
@@ -258,6 +247,25 @@ func alerts(c *gin.Context) {
 				// we update it here rather than in dedup since here we can apply it
 				// only for alerts left after filtering
 				alert.UpdateFingerprints()
+
+				alertGridLabelValue := alert.Labels[gridLabel]
+				agCopy, found := perGridAlertGroup[alertGridLabelValue]
+				if !found {
+					agCopy = &models.AlertGroup{
+						ID:                ag.ID,
+						Receiver:          ag.Receiver,
+						Labels:            ag.Labels,
+						LatestStartsAt:    ag.LatestStartsAt,
+						Alerts:            []models.Alert{},
+						AlertmanagerCount: map[string]int{},
+						StateCount:        map[string]int{},
+					}
+					for _, s := range models.AlertStateList {
+						agCopy.StateCount[s] = 0
+					}
+					perGridAlertGroup[alertGridLabelValue] = agCopy
+				}
+
 				agCopy.Alerts = append(agCopy.Alerts, alert)

 				countLabel(counters, "@state", alert.State)
@@ -307,32 +315,44 @@ func alerts(c *gin.Context) {
 			}
 		}

-		if len(agCopy.Alerts) > 0 {
-			for i, alert := range agCopy.Alerts {
-				if alert.IsSilenced() {
-					for j, am := range alert.Alertmanager {
-						key := amNameToCluster[am.Name]
-						// cluster might be wrong when collecting (races between fetches)
-						// update is with current cluster discovery state
-						agCopy.Alerts[i].Alertmanager[j].Cluster = key
-						for _, silence := range am.Silences {
-							_, found := silences[key][silence.ID]
-							if !found {
-								silences[key][silence.ID] = *silence
+		for gridLabelValue, ag := range perGridAlertGroup {
+			if len(ag.Alerts) > 0 {
+				for i, alert := range ag.Alerts {
+					if alert.IsSilenced() {
+						for j, am := range alert.Alertmanager {
+							key := amNameToCluster[am.Name]
+							// cluster might be wrong when collecting (races between fetches)
+							// update it with the current cluster discovery state
+							ag.Alerts[i].Alertmanager[j].Cluster = key
+							for _, silence := range am.Silences {
+								_, found := silences[key][silence.ID]
+								if !found {
+									silences[key][silence.ID] = *silence
+								}
 							}
 						}
 					}
 				}
-			}
-			sort.Sort(agCopy.Alerts)
-			agCopy.LatestStartsAt = agCopy.FindLatestStartsAt()
-			agCopy.Hash = agCopy.ContentFingerprint()
-			apiAG := models.APIAlertGroup{AlertGroup: agCopy}
-			apiAG.DedupSharedMaps()
-			alerts[agCopy.ID] = apiAG
-			resp.TotalAlerts += len(agCopy.Alerts)
-		}
+				sort.Sort(ag.Alerts)
+				ag.LatestStartsAt = ag.FindLatestStartsAt()
+				ag.Hash = ag.ContentFingerprint()
+				apiAG := models.APIAlertGroup{AlertGroup: *ag}
+				apiAG.DedupSharedMaps()
+				resp.TotalAlerts += len(ag.Alerts)

+				grid, found := grids[gridLabelValue]
+				if !found {
+					grid = models.APIGrid{
+						LabelName:   gridLabel,
+						LabelValue:  gridLabelValue,
+						AlertGroups: []models.APIAlertGroup{},
+					}
+					grids[gridLabelValue] = grid
+				}
+				grid.AlertGroups = append(grid.AlertGroups, apiAG)
+				grids[gridLabelValue] = grid
+			}
+		}
 	}

 	for _, filter := range matchFilters {
@@ -341,7 +361,11 @@ func alerts(c *gin.Context) {
 		}
 	}

-	resp.AlertGroups = sortAlertGroups(c, alerts)
+	// sortGrids orders the grids by label value and sorts the alert groups inside each grid
+	v, _ := c.GetQuery("gridSortReverse")
+	gridSortReverse := v == "1"
+
+	resp.Grids = sortGrids(c, gridLabel, grids, gridSortReverse)
 	resp.Silences = silences
 	resp.Colors = colors
 	resp.Counters = countersToLabelStats(counters)
--- a/cmd/karma/views_test.go
+++ b/cmd/karma/views_test.go
@@ -145,8 +145,8 @@ func TestAlerts(t *testing.T) {
 			if len(ur.Colors) != 1 {
 				t.Errorf("[%s] Got %d color(s) in response, expected %d", version, len(ur.Colors), 1)
 			}
-			if len(ur.AlertGroups) != 1 {
-				t.Errorf("[%s] Got %d alert(s) in response, expected %d", version, len(ur.AlertGroups), 1)
+			if len(ur.Grids[0].AlertGroups) != 1 {
+				t.Errorf("[%s] Got %d alert group(s) in response, expected %d", version, len(ur.Grids[0].AlertGroups), 1)
 			}
 			if ur.Version == "" {
 				t.Errorf("[%s] Empty version in response", version)
@@ -172,7 +172,7 @@ func TestAlerts(t *testing.T) {
 			if len(ur.Counters) != 6 {
 				t.Errorf("[%s] Invalid number of counters in response (%d): %v", version, len(ur.Counters), ur.Counters)
 			}
-			for _, ag := range ur.AlertGroups {
+			for _, ag := range ur.Grids[0].AlertGroups {
 				for _, a := range ag.Alerts {
 					linkCount := 0
 					for _, annotation := range a.Annotations {
@@ -192,6 +192,122 @@ func TestAlerts(t *testing.T) {
 	}
 }

+func TestGrids(t *testing.T) {
+	type testCaseGridT struct {
+		labelValue      string
+		alertGroupCount int
+	}
+	type testCaseT struct {
+		gridLabel    string
+		requestQuery string
+		grids        []testCaseGridT
+	}
+	testCases := []testCaseT{
+		{
+			gridLabel:    "cluster",
+			requestQuery: "",
+			grids: []testCaseGridT{
+				{labelValue: "dev", alertGroupCount: 4},
+				{labelValue: "prod", alertGroupCount: 4},
+				{labelValue: "staging", alertGroupCount: 4},
+			},
+		},
+		{
+			gridLabel:    "cluster",
+			requestQuery: "&gridSortReverse=1",
+			grids: []testCaseGridT{
+				{labelValue: "staging", alertGroupCount: 4},
+				{labelValue: "prod", alertGroupCount: 4},
+				{labelValue: "dev", alertGroupCount: 4},
+			},
+		},
+		{
+			gridLabel:    "foo",
+			requestQuery: "",
+			grids: []testCaseGridT{
+				{labelValue: "", alertGroupCount: 10},
+			},
+		},
+		{
+			gridLabel:    "",
+			requestQuery: "",
+			grids: []testCaseGridT{
+				{labelValue: "", alertGroupCount: 10},
+			},
+		},
+		{
+			gridLabel:    "",
+			requestQuery: "&q=foo=bar",
+			grids:        []testCaseGridT{},
+		},
+		{
+			gridLabel:    "disk",
+			requestQuery: "",
+			grids: []testCaseGridT{
+				{labelValue: "sda", alertGroupCount: 2},
+				{labelValue: "", alertGroupCount: 8},
+			},
+		},
+		{
+			gridLabel:    "disk",
+			requestQuery: "&gridSortReverse=1",
+			grids: []testCaseGridT{
+				{labelValue: "", alertGroupCount: 8},
+				{labelValue: "sda", alertGroupCount: 2},
+			},
+		},
+		{
+			gridLabel:    "disk",
+			requestQuery: "&q=alertname=Free_Disk_Space_Too_Low",
+			grids: []testCaseGridT{
+				{labelValue: "sda", alertGroupCount: 2},
+			},
+		},
+	}
+
+	mockConfig()
+	for _, version := range mock.ListAllMocks() {
+		for _, testCase := range testCases {
+			t.Run(fmt.Sprintf("version=%q gridLabel=%q query=%q", version, testCase.gridLabel, testCase.requestQuery), func(t *testing.T) {
+				mockAlerts(version)
+				r := ginTestEngine()
+				// re-run a few times to test the cache
+				for i := 1; i <= 3; i++ {
+					req := httptest.NewRequest("GET", "/alerts.json?gridLabel="+testCase.gridLabel+testCase.requestQuery, nil)
+					resp := httptest.NewRecorder()
+					r.ServeHTTP(resp, req)
+					if resp.Code != http.StatusOK {
+						t.Errorf("GET /alerts.json returned status %d", resp.Code)
+					}
+
+					ur := models.AlertsResponse{}
+					err := json.Unmarshal(resp.Body.Bytes(), &ur)
+					if err != nil {
+						t.Errorf("Failed to unmarshal response: %s", err)
+					}
+
+					if len(ur.Grids) != len(testCase.grids) {
+						t.Errorf("Expected %d grids, got %d", len(testCase.grids), len(ur.Grids))
+					} else {
+						for index, expectedGrid := range testCase.grids {
+							grid := ur.Grids[index]
+							if grid.LabelName != testCase.gridLabel {
+								t.Errorf("Got wrong labelName for grid %d: %q, expected %q", index, grid.LabelName, testCase.gridLabel)
+							}
+							if grid.LabelValue != expectedGrid.labelValue {
+								t.Errorf("Got wrong labelValue for grid %d: %q, expected %q", index, grid.LabelValue, expectedGrid.labelValue)
+							}
+							if len(grid.AlertGroups) != expectedGrid.alertGroupCount {
+								t.Errorf("Got wrong alert group count for grid %d: %d, expected %d", index, len(grid.AlertGroups), expectedGrid.alertGroupCount)
+							}
+						}
+					}
+				}
+			})
+		}
+	}
+}
+
 func TestValidateAllAlerts(t *testing.T) {
 	mockConfig()
 	for _, version := range mock.ListAllMocks() {
@@ -212,7 +328,7 @@ func TestValidateAllAlerts(t *testing.T) {
 			if err != nil {
 				t.Errorf("Failed to unmarshal response: %s", err)
 			}
-			for _, ag := range ur.AlertGroups {
+			for _, ag := range ur.Grids[0].AlertGroups {
 				for _, a := range ag.Alerts {
 					if !slices.StringInSlice(models.AlertStateList, a.State) {
 						t.Errorf("Invalid alert status '%s', not in %v", a.State, models.AlertStateList)
--- a/internal/models/api.go
+++ b/internal/models/api.go
@@ -300,6 +300,12 @@ type AuthenticationInfo struct {
 	Username string `json:"username"`
 }

+type APIGrid struct {
+	LabelName   string          `json:"labelName"`
+	LabelValue  string          `json:"labelValue"`
+	AlertGroups []APIAlertGroup `json:"alertGroups"`
+}
+
 // AlertsResponse is the structure of JSON response UI will use to get alert data
 type AlertsResponse struct {
 	Status         string                        `json:"status"`
@@ -307,7 +313,7 @@ type AlertsResponse struct {
 	Version        string                        `json:"version"`
 	Upstreams      AlertmanagerAPISummary        `json:"upstreams"`
 	Silences       map[string]map[string]Silence `json:"silences"`
-	AlertGroups    []APIAlertGroup               `json:"groups"`
+	Grids          []APIGrid                     `json:"grids"`
 	TotalAlerts    int                           `json:"totalAlerts"`
 	Colors         LabelsColorMap                `json:"colors"`
 	Filters        []Filter                      `json:"filters"`