diff --git a/Makefile b/Makefile index 7f9280d2..08fa7c0d 100644 --- a/Makefile +++ b/Makefile @@ -96,6 +96,7 @@ helm-test-exec: ct helm-controller-version ko-build-all $(MAKE) e2e-load-image CLUSTER_NAME=capsule-charts IMAGE=$(CAPSULE_IMG) VERSION=v0.0.0 $(MAKE) e2e-load-image CLUSTER_NAME=capsule-charts IMAGE=$(CAPSULE_IMG) VERSION=tracing @$(KUBECTL) create ns capsule-system || true + @$(KUBECTL) apply --force-conflicts --server-side=true -f https://github.com/grafana/grafana-operator/releases/download/v5.18.0/crds.yaml @$(KUBECTL) apply --force-conflicts --server-side=true -f https://github.com/cert-manager/cert-manager/releases/download/v1.9.1/cert-manager.crds.yaml @$(KUBECTL) apply --force-conflicts --server-side=true -f https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.58.0/bundle.yaml @$(CT) install --config $(SRC_ROOT)/.github/configs/ct.yaml --namespace=capsule-system --all --debug diff --git a/api/v1beta2/resourcepool_status.go b/api/v1beta2/resourcepool_status.go index 9a6763fd..e722a243 100644 --- a/api/v1beta2/resourcepool_status.go +++ b/api/v1beta2/resourcepool_status.go @@ -24,6 +24,8 @@ type ResourcePoolStatus struct { Claims ResourcePoolNamespaceClaimsStatus `json:"claims,omitempty"` // Tracks the Usage from Claimed against what has been granted from the pool Allocation ResourcePoolQuotaStatus `json:"allocation,omitempty"` + // Exhaustions from claims associated with the pool + Exhaustions map[string]api.PoolExhaustionResource `json:"exhaustions,omitempty"` } type ResourcePoolNamespaceClaimsStatus map[string]ResourcePoolClaimsList diff --git a/api/v1beta2/zz_generated.deepcopy.go b/api/v1beta2/zz_generated.deepcopy.go index 2b3b9115..44c9dd6b 100644 --- a/api/v1beta2/zz_generated.deepcopy.go +++ b/api/v1beta2/zz_generated.deepcopy.go @@ -887,6 +887,13 @@ func (in *ResourcePoolStatus) DeepCopyInto(out *ResourcePoolStatus) { } } in.Allocation.DeepCopyInto(&out.Allocation) + if in.Exhaustions != nil { + in, out := 
&in.Exhaustions, &out.Exhaustions + *out = make(map[string]api.PoolExhaustionResource, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourcePoolStatus. diff --git a/charts/capsule/README.md b/charts/capsule/README.md index b4c16799..4c0c7564 100644 --- a/charts/capsule/README.md +++ b/charts/capsule/README.md @@ -178,20 +178,29 @@ Here the values you can override: | manager.volumes | list | `[]` | Set the additional volumes needed for the Capsule manager container | | manager.webhookPort | int | `9443` | Set an alternative to the default container port. Useful for use in some kubernetes clusters (such as GKE Private) with aggregator routing turned on, because pod ports have to be opened manually on the firewall side | -### ServiceMonitor Parameters +### Monitoring Parameters | Key | Type | Default | Description | |-----|------|---------|-------------| -| serviceMonitor.annotations | object | `{}` | Assign additional Annotations | -| serviceMonitor.enabled | bool | `false` | Enable ServiceMonitor | -| serviceMonitor.endpoint.interval | string | `"15s"` | Set the scrape interval for the endpoint of the serviceMonitor | -| serviceMonitor.endpoint.metricRelabelings | list | `[]` | Set metricRelabelings for the endpoint of the serviceMonitor | -| serviceMonitor.endpoint.relabelings | list | `[]` | Set relabelings for the endpoint of the serviceMonitor | -| serviceMonitor.endpoint.scrapeTimeout | string | `""` | Set the scrape timeout for the endpoint of the serviceMonitor | -| serviceMonitor.labels | object | `{}` | Assign additional labels according to Prometheus' serviceMonitorSelector matching labels | -| serviceMonitor.matchLabels | object | `{}` | Change matching labels | -| serviceMonitor.namespace | string | `""` | Install the ServiceMonitor into a different Namespace, as the monitoring stack one (default: the release one) | -| 
serviceMonitor.targetLabels | list | `[]` | Set targetLabels for the serviceMonitor | +| monitoring.dashboards.annotations | object | `{}` | Annotations for dashboard configmaps | +| monitoring.dashboards.enabled | bool | `false` | Enable Dashboards to be deployed | +| monitoring.dashboards.labels | object | `{}` | Labels for dashboard configmaps | +| monitoring.dashboards.namespace | string | `""` | Custom namespace for dashboard configmaps | +| monitoring.dashboards.operator.allowCrossNamespaceImport | bool | `true` | Allow the Operator to match this resource with Grafanas outside the current namespace | +| monitoring.dashboards.operator.enabled | bool | `true` | Enable Operator Resources (GrafanaDashboard) | +| monitoring.dashboards.operator.folder | string | `""` | folder assignment for dashboard | +| monitoring.dashboards.operator.instanceSelector | object | `{}` | Selects Grafana instances for import | +| monitoring.dashboards.operator.resyncPeriod | string | `"10m"` | How often the resource is synced, defaults to 10m0s if not set | +| monitoring.serviceMonitor.annotations | object | `{}` | Assign additional Annotations | +| monitoring.serviceMonitor.enabled | bool | `false` | Enable ServiceMonitor | +| monitoring.serviceMonitor.endpoint.interval | string | `"15s"` | Set the scrape interval for the endpoint of the serviceMonitor | +| monitoring.serviceMonitor.endpoint.metricRelabelings | list | `[]` | Set metricRelabelings for the endpoint of the serviceMonitor | +| monitoring.serviceMonitor.endpoint.relabelings | list | `[]` | Set relabelings for the endpoint of the serviceMonitor | +| monitoring.serviceMonitor.endpoint.scrapeTimeout | string | `""` | Set the scrape timeout for the endpoint of the serviceMonitor | +| monitoring.serviceMonitor.labels | object | `{}` | Assign additional labels according to Prometheus' serviceMonitorSelector matching labels | +| monitoring.serviceMonitor.matchLabels | object | `{}` | Change matching labels | +| 
monitoring.serviceMonitor.namespace | string | `""` | Install the ServiceMonitor into a different Namespace, as the monitoring stack one (default: the release one) | +| monitoring.serviceMonitor.targetLabels | list | `[]` | Set targetLabels for the serviceMonitor | ### Webhooks Parameters diff --git a/charts/capsule/README.md.gotmpl b/charts/capsule/README.md.gotmpl index e3b46784..e1f00aa7 100644 --- a/charts/capsule/README.md.gotmpl +++ b/charts/capsule/README.md.gotmpl @@ -112,7 +112,7 @@ Here the values you can override: | Key | Type | Default | Description | |-----|------|---------|-------------| {{- range .Values }} - {{- if not (or (hasPrefix "global" .Key) (hasPrefix "manager" .Key) (hasPrefix "crds" .Key) (hasPrefix "serviceMonitor" .Key) (hasPrefix "webhook" .Key) (hasPrefix "capsule-proxy" .Key) ) }} + {{- if not (or (hasPrefix "global" .Key) (hasPrefix "manager" .Key) (hasPrefix "crds" .Key) (hasPrefix "monitoring" .Key) (hasPrefix "webhook" .Key) (hasPrefix "capsule-proxy" .Key) ) }} | {{ .Key }} | {{ .Type }} | {{ if .Default }}{{ .Default }}{{ else }}{{ .AutoDefault }}{{ end }} | {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} | {{- end }} {{- end }} @@ -127,12 +127,12 @@ Here the values you can override: {{- end }} {{- end }} -### ServiceMonitor Parameters +### Monitoring Parameters | Key | Type | Default | Description | |-----|------|---------|-------------| {{- range .Values }} - {{- if hasPrefix "serviceMonitor" .Key }} + {{- if hasPrefix "monitoring" .Key }} | {{ .Key }} | {{ .Type }} | {{ if .Default }}{{ .Default }}{{ else }}{{ .AutoDefault }}{{ end }} | {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} | {{- end }} {{- end }} diff --git a/charts/capsule/ci/monitoring-values.yaml b/charts/capsule/ci/monitoring-values.yaml new file mode 100644 index 00000000..8b6c629c --- /dev/null +++ b/charts/capsule/ci/monitoring-values.yaml @@ -0,0 +1,9 @@ +monitoring: + dashboards: + enabled: 
true + annotations: + k8s-sidecar-target-directory: /tmp/dashboards/Capsule + labels: + grafana_dashboard: "1" + operator: + enabled: true diff --git a/charts/capsule/crds/capsule.clastix.io_resourcepools.yaml b/charts/capsule/crds/capsule.clastix.io_resourcepools.yaml index a1ceaf73..617f4c2d 100644 --- a/charts/capsule/crds/capsule.clastix.io_resourcepools.yaml +++ b/charts/capsule/crds/capsule.clastix.io_resourcepools.yaml @@ -291,6 +291,26 @@ spec: type: array description: Tracks the quotas for the Resource. type: object + exhaustions: + additionalProperties: + properties: + available: + anyOf: + - type: integer + - type: string + description: Available Resources to be claimed + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + requesting: + anyOf: + - type: integer + - type: string + description: Requesting Resources + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + description: Exhaustions from claims associated with the pool + type: object namespaceCount: default: 0 description: How many namespaces are considered diff --git a/charts/capsule/dashboards/resourcepools-dashboard.json b/charts/capsule/dashboards/resourcepools-dashboard.json new file mode 100644 index 00000000..52fcd265 --- /dev/null +++ b/charts/capsule/dashboards/resourcepools-dashboard.json @@ -0,0 +1,1590 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Administrative Overview for ResourcePools", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 59, + "links": [], + "panels": [ + { + "gridPos": { + 
"h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 20, + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Amount of ResourcesPools providing the given resources", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(count by (pool) (capsule_pool_resource{pool=~\"$resourcepool\", resource=~\"$resource\"}))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "ResourcePools", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Overview of claims", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": true + }, + "mappings": [ + { + "options": { + "1": { + "color": "blue", + "index": 0, + "text": "Ready" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Ready" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + 
"options": { + "0": { + "color": "red", + "index": 0, + "text": "NotReady" + }, + "1": { + "color": "blue", + "index": 1, + "text": "Ready" + } + }, + "type": "value" + } + ] + }, + { + "id": "custom.cellOptions", + "value": { + "applyToRow": false, + "type": "color-background", + "wrapText": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 20, + "x": 4, + "y": 1 + }, + "id": 57, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": true, + "fields": "", + "reducer": [ + "count" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "Ready" + } + ] + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "capsule_claim_condition{pool=~\"$resourcepool\"}", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Claim Overview", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Value": false, + "__name__": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "condition": true, + "container": true, + "container 1": true, + "container 2": true, + "container 3": true, + "endpoint": true, + "endpoint 1": true, + "endpoint 2": true, + "endpoint 3": true, + "instance": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "job": true, + "job 1": true, + "job 2": true, + "job 3": true, + "namespace": true, + "namespace 1": true, + "namespace 2": true, + "namespace 3": true, + "pod": true, + "pod 1": true, + "pod 2": true, + "pod 3": true, + "pool": false, + "pool 1": true, + "pool 2": true, + "pool 3": true, + "service": 
true, + "service 1": true, + "service 2": true, + "service 3": true, + "status": true + }, + "includeByName": {}, + "indexByName": { + "Time": 0, + "Value": 15, + "__name__": 1, + "condition": 4, + "container": 5, + "endpoint": 6, + "instance": 7, + "job": 8, + "name": 2, + "namespace": 9, + "pod": 10, + "pool": 11, + "reason": 12, + "service": 13, + "status": 14, + "target_namespace": 3 + }, + "renameByName": { + "Value": "Ready", + "Value #A": "Limit", + "Value #B": "Used", + "Value #C": "Available", + "condition": "Condition", + "name": "Claim", + "reason": "Reason", + "resource": "Resource", + "target_namespace": "Namespace" + } + } + } + ], + "transparent": true, + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Amount of ResourcesPoolsClaims", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 4 + }, + "id": 22, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(count by (name, namespace) (capsule_claim_condition))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total Claims", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Amount of ResourcesPoolsClaims 
successfully bound to ResourcePools", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 7 + }, + "id": 43, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(count by (name, namespace) (\n capsule_claim_condition{condition=\"Bound\"} == 1\n))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Bound Claims", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Amount of ResourcesPoolsClaims with no ResourcePool associated", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "red", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 10 + }, + "id": 44, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(count by (name, namespace) (\n capsule_claim_condition{pool=\"\"}\n))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Unassigned Claims", + "transparent": true, + "type": "stat" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 4, + "panels": [], + "repeat": "resourcepool", + "repeatDirection": "h", + "title": "Pool $resourcepool", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 75 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 1, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "capsule_pool_usage_percentage{pool=~\"$resourcepool\", resource=~\"$resource\"}", + "format": "time_series", + "instant": true, + "legendFormat": "{{resource}}", + "range": false, + "refId": "A" + } + ], + "title": "Resource Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + 
"tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 75 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 18, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by (resource) ( capsule_pool_usage_percentage{pool=~\"$resourcepool\", resource=~\"$resource\"} )", + "format": "time_series", + "instant": false, + "legendFormat": "{{resource}}", + "range": true, + "refId": "A" + } + ], + "title": "Resource Usage", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Amount of ResourcesPoolsClaims", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 24 + }, + "id": 45, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + 
"type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(count by (name, namespace) (capsule_claim_condition{pool=~\"$resourcepool\"}))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total Claims", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Overview of claims", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": true + }, + "mappings": [ + { + "options": { + "1": { + "color": "blue", + "index": 0, + "text": "Ready" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Ready" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "color": "red", + "index": 0, + "text": "NotReady" + }, + "1": { + "color": "blue", + "index": 1, + "text": "Ready" + } + }, + "type": "value" + } + ] + }, + { + "id": "custom.cellOptions", + "value": { + "applyToRow": false, + "type": "color-background", + "wrapText": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 20, + "x": 4, + "y": 24 + }, + "id": 3, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": true, + "fields": "", + "reducer": [ + "count" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "Ready" + } + ] + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "capsule_claim_condition{pool=~\"$resourcepool\"}", + "format": "table", + 
"fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Claim Overview", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Value": false, + "__name__": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "container": true, + "container 1": true, + "container 2": true, + "container 3": true, + "endpoint": true, + "endpoint 1": true, + "endpoint 2": true, + "endpoint 3": true, + "instance": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "job": true, + "job 1": true, + "job 2": true, + "job 3": true, + "namespace": true, + "namespace 1": true, + "namespace 2": true, + "namespace 3": true, + "pod": true, + "pod 1": true, + "pod 2": true, + "pod 3": true, + "pool": true, + "pool 1": true, + "pool 2": true, + "pool 3": true, + "service": true, + "service 1": true, + "service 2": true, + "service 3": true, + "status": true + }, + "includeByName": {}, + "indexByName": { + "Time": 0, + "Value": 15, + "__name__": 1, + "condition": 4, + "container": 5, + "endpoint": 6, + "instance": 7, + "job": 8, + "name": 2, + "namespace": 9, + "pod": 10, + "pool": 11, + "reason": 12, + "service": 13, + "status": 14, + "target_namespace": 3 + }, + "renameByName": { + "Value": "Ready", + "Value #A": "Limit", + "Value #B": "Used", + "Value #C": "Available", + "condition": "Condition", + "name": "Claim", + "reason": "Reason", + "resource": "Resource", + "target_namespace": "Namespace" + } + } + } + ], + "transparent": true, + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Amount of ResourcesPoolsClaims successfully bound", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 28 + }, + "id": 46, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(count by (name, namespace) (\n capsule_claim_condition{pool=~\"$resourcepool\",condition=\"Bound\"} == 1\n))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Bound Claims", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Amount of ResourcePoolClaims queued (not yet bound)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "orange", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 32 + }, + "id": 47, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(count by (name, 
namespace) (\n capsule_claim_condition{pool=~\"$resourcepool\",condition=\"Bound\"} == 0\n))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Queued Claims", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Pool Exhaustions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "noValue": "🌈", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "orange", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Requesting" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "applyToRow": false, + "mode": "gradient", + "type": "color-background" + } + }, + { + "id": "unit", + "value": "bytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Available" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 52, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by (resource) (\n capsule_pool_exhaustion{pool=~\"$resourcepool\", resource=~\"$resource\"}\n)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by (resource) (\n capsule_pool_available{pool=~\"$resourcepool\", resource=~\"$resource\"}\n)", + "format": "table", + 
"hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "B" + } + ], + "title": "Exhaustions/Overprovisioning", + "transformations": [ + { + "id": "joinByField", + "options": { + "byField": "resource", + "mode": "inner" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "__name__ 1": true, + "__name__ 2": true, + "container 1": true, + "container 2": true, + "endpoint 1": true, + "endpoint 2": true, + "instance 1": true, + "instance 2": true, + "job 1": true, + "namespace 1": true, + "pod 1": true, + "pool 1": true, + "resource": false, + "service 1": true + }, + "includeByName": {}, + "indexByName": { + "Time 1": 1, + "Time 2": 4, + "Value #A": 3, + "Value #B": 2, + "resource": 0 + }, + "renameByName": { + "Time 2": "", + "Value #A": "Requesting", + "Value #B": "Available", + "resource": "Resource" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Pool Exhaustions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 13, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "🌈", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "orange", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", 
+ "options": "Requesting" + }, + "properties": [] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 62, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "capsule_pool_exhaustion_percentage{pool=~\"$resourcepool\", resource=~\"$resource\"}", + "format": "time_series", + "instant": false, + "legendFormat": "{{resource}}", + "range": true, + "refId": "A" + } + ], + "title": "Exhaustions/Overprovisioning (%)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Allocation distribution amongst namespaces", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "fieldMinMax": false, + "mappings": [], + "noValue": "🤷‍♀️", + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 4.8, + "x": 0, + "y": 44 + }, + "id": 19, + "maxPerRow": 12, + "options": { + "displayLabels": [ + "name", + "value" + ], + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "repeat": "resource", + "repeatDirection": "h", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "capsule_pool_namespace_usage_percentage{pool=~\"$resourcepool\", resource=~\"$resource\"}", + "format": "time_series", + "instant": true, + "legendFormat": "{{target_namespace}}", 
+ "range": false, + "refId": "A" + } + ], + "title": "$resource (Namespaces)", + "transparent": true, + "type": "piechart" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(capsule_pool_resource,resource)", + "description": "Query only specific resources within the given resource pools", + "hide": 0, + "includeAll": true, + "label": "Resource", + "multi": true, + "name": "resource", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(capsule_pool_resource,resource)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(capsule_pool_resource{resource=~\"$resource\"},pool)", + "description": "Select relevant Resourcepools", + "hide": 0, + "includeAll": true, + "label": "ResourcePool", + "multi": true, + "name": "resourcepool", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(capsule_pool_resource{resource=~\"$resource\"},pool)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prom" + }, + "filters": [], + "hide": 0, + "name": 
"Filters", + "skipUrlSync": false, + "type": "adhoc" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Capsule / ResourcePools", + "uid": "capsule-resourcepools", + "version": 18, + "weekStart": "" +} diff --git a/charts/capsule/templates/dashboards.yaml b/charts/capsule/templates/dashboards.yaml new file mode 100644 index 00000000..14d76532 --- /dev/null +++ b/charts/capsule/templates/dashboards.yaml @@ -0,0 +1,51 @@ + +{{- if $.Values.monitoring.dashboards.enabled }} + {{ range $path, $_ := .Files.Glob "dashboards/**-dashboard.json" }} + {{- with $ }} + {{- $content := (.Files.Get $path) }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "capsule.fullname" . }}-{{ $path | base | trimSuffix "-dashboard.json" | regexFind "[^_]+$" }}-dashboard + namespace: {{ default $.Release.Namespace $.Values.monitoring.dashboards.namespace | quote }} + annotations: + {{- with $.Values.monitoring.dashboards.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} + labels: + {{- include "capsule.labels" . | nindent 4 }} + {{- with $.Values.monitoring.dashboards.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +data: + {{ base $path }}: |- + {{- $content | nindent 4 }} + + {{- if $.Values.monitoring.dashboards.operator.enabled }} +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: {{ include "capsule.fullname" . }}-{{ $path | base | trimSuffix "-dashboard.json" | regexFind "[^_]+$" }} + namespace: {{ default $.Release.Namespace $.Values.monitoring.dashboards.namespace | quote }} + annotations: + {{- with $.Values.monitoring.dashboards.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} + labels: + {{- include "capsule.labels" . | nindent 4 }} + {{- with $.Values.monitoring.dashboards.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + configMapRef: + name: {{ include "capsule.fullname" . 
}}-{{ $path | base | trimSuffix "-dashboard.json" | regexFind "[^_]+$" }}-dashboard + key: {{ base $path }} + {{- with (omit $.Values.monitoring.dashboards.operator "enabled") }} + {{- toYaml . | nindent 2 }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/charts/capsule/templates/servicemonitor.yaml b/charts/capsule/templates/servicemonitor.yaml index ef4c656a..b485d8a3 100644 --- a/charts/capsule/templates/servicemonitor.yaml +++ b/charts/capsule/templates/servicemonitor.yaml @@ -1,22 +1,23 @@ {{- if not $.Values.crds.exclusive }} - {{- if .Values.serviceMonitor.enabled }} + {{- with (mergeOverwrite .Values.monitoring.serviceMonitor (default dict .Values.serviceMonitor)) -}} + {{- if .enabled }} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: {{ include "capsule.fullname" . }}-monitor - namespace: {{ .Values.serviceMonitor.namespace | default .Release.Namespace }} + name: {{ include "capsule.fullname" $ }} + namespace: {{ .namespace | default $.Release.Namespace }} labels: - {{- include "capsule.labels" . | nindent 4 }} + {{- include "capsule.labels" $ | nindent 4 }} - {{- with .Values.serviceMonitor.labels }} + {{- with .labels }} {{- toYaml . | nindent 4 }} {{- end }} - {{- with .Values.serviceMonitor.annotations }} + {{- with .annotations }} annotations: {{- toYaml . | nindent 4 }} {{- end }} spec: endpoints: - {{- with .Values.serviceMonitor.endpoint }} + {{- with .endpoint }} - interval: {{ .interval }} port: metrics path: /metrics @@ -31,18 +32,19 @@ spec: {{- end }} {{- end }} jobLabel: app.kubernetes.io/name - {{- with .Values.serviceMonitor.targetLabels }} + {{- with .targetLabels }} targetLabels: {{- toYaml . | nindent 4 }} {{- end }} selector: matchLabels: - {{- if .Values.serviceMonitor.matchLabels }} - {{- toYaml .Values.serviceMonitor.matchLabels | nindent 6 }} + {{- if .matchLabels }} + {{- toYaml .matchLabels | nindent 6 }} {{- else }} - {{- include "capsule.labels" .
| nindent 6 }} + {{- include "capsule.selectorLabels" $ | nindent 6 }} {{- end }} namespaceSelector: matchNames: - - {{ .Release.Namespace }} + - {{ $.Release.Namespace }} + {{- end }} {{- end }} {{- end }} diff --git a/charts/capsule/values.schema.json b/charts/capsule/values.schema.json index 06ddcbd8..9617622d 100644 --- a/charts/capsule/values.schema.json +++ b/charts/capsule/values.schema.json @@ -331,6 +331,94 @@ }, "type": "object" }, + "monitoring": { + "properties": { + "dashboards": { + "properties": { + "annotations": { + "properties": {}, + "type": "object" + }, + "enabled": { + "type": "boolean" + }, + "labels": { + "properties": {}, + "type": "object" + }, + "namespace": { + "type": "string" + }, + "operator": { + "properties": { + "allowCrossNamespaceImport": { + "type": "boolean" + }, + "enabled": { + "type": "boolean" + }, + "folder": { + "type": "string" + }, + "instanceSelector": { + "properties": {}, + "type": "object" + }, + "resyncPeriod": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + }, + "serviceMonitor": { + "properties": { + "annotations": { + "properties": {}, + "type": "object" + }, + "enabled": { + "type": "boolean" + }, + "endpoint": { + "properties": { + "interval": { + "type": "string" + }, + "metricRelabelings": { + "type": "array" + }, + "relabelings": { + "type": "array" + }, + "scrapeTimeout": { + "type": "string" + } + }, + "type": "object" + }, + "labels": { + "properties": {}, + "type": "object" + }, + "matchLabels": { + "properties": {}, + "type": "object" + }, + "namespace": { + "type": "string" + }, + "targetLabels": { + "type": "array" + } + }, + "type": "object" + } + }, + "type": "object" + }, "nodeSelector": { "properties": {}, "type": "object" @@ -452,49 +540,6 @@ }, "type": "object" }, - "serviceMonitor": { - "properties": { - "annotations": { - "properties": {}, - "type": "object" - }, - "enabled": { - "type": "boolean" - }, - "endpoint": { - "properties": { - "interval": { - "type": 
"string" - }, - "metricRelabelings": { - "type": "array" - }, - "relabelings": { - "type": "array" - }, - "scrapeTimeout": { - "type": "string" - } - }, - "type": "object" - }, - "labels": { - "properties": {}, - "type": "object" - }, - "matchLabels": { - "properties": {}, - "type": "object" - }, - "namespace": { - "type": "string" - }, - "targetLabels": { - "type": "array" - } - }, - "type": "object" - }, "tls": { "properties": { "create": { diff --git a/charts/capsule/values.yaml b/charts/capsule/values.yaml index c78539a5..b664d462 100644 --- a/charts/capsule/values.yaml +++ b/charts/capsule/values.yaml @@ -377,27 +377,52 @@ webhooks: - key: capsule.clastix.io/tenant operator: Exists +# Monitoring Settings +monitoring: -# ServiceMonitor -serviceMonitor: - # -- Enable ServiceMonitor - enabled: false - # -- Install the ServiceMonitor into a different Namespace, as the monitoring stack one (default: the release one) - namespace: '' - # -- Assign additional labels according to Prometheus' serviceMonitorSelector matching labels - labels: {} - # -- Assign additional Annotations - annotations: {} - # -- Change matching labels - matchLabels: {} - # -- Set targetLabels for the serviceMonitor - targetLabels: [] - endpoint: - # -- Set the scrape interval for the endpoint of the serviceMonitor - interval: "15s" - # -- Set the scrape timeout for the endpoint of the serviceMonitor - scrapeTimeout: "" - # -- Set metricRelabelings for the endpoint of the serviceMonitor - metricRelabelings: [] - # -- Set relabelings for the endpoint of the serviceMonitor - relabelings: [] + dashboards: + # -- Enable Dashboards to be deployed + enabled: false + # -- Annotations for dashboard configmaps + annotations: {} + # -- Labels for dashboard configmaps + labels: {} + # grafana_dashboard: "1" + # -- Custom namespace for dashboard configmaps + namespace: "" + # Grafana Operator + operator: + # -- Enable Operator Resources (GrafanaDashboard) + enabled: true + # -- Allow the Operator to match 
this resource with Grafanas outside the current namespace + allowCrossNamespaceImport: true + # -- How often the resource is synced, defaults to 10m0s if not set + resyncPeriod: "10m" + # -- Selects Grafana instances for import + instanceSelector: {} + # -- folder assignment for dashboard + folder: "" + + # ServiceMonitor + serviceMonitor: + # -- Enable ServiceMonitor + enabled: false + # -- Install the ServiceMonitor into a different Namespace, as the monitoring stack one (default: the release one) + namespace: '' + # -- Assign additional labels according to Prometheus' serviceMonitorSelector matching labels + labels: {} + # -- Assign additional Annotations + annotations: {} + # -- Change matching labels + matchLabels: {} + # -- Set targetLabels for the serviceMonitor + targetLabels: [] + endpoint: + # -- Set the scrape interval for the endpoint of the serviceMonitor + interval: "15s" + # -- Set the scrape timeout for the endpoint of the serviceMonitor + scrapeTimeout: "" + # -- Set metricRelabelings for the endpoint of the serviceMonitor + metricRelabelings: [] + # -- Set relabelings for the endpoint of the serviceMonitor + relabelings: [] diff --git a/controllers/resourcepools/claim_controller.go b/controllers/resourcepools/claim_controller.go index b817af3c..4e102deb 100644 --- a/controllers/resourcepools/claim_controller.go +++ b/controllers/resourcepools/claim_controller.go @@ -51,9 +51,9 @@ func (r resourceClaimController) Reconcile(ctx context.Context, request ctrl.Req instance := &capsulev1beta2.ResourcePoolClaim{} if err = r.Get(ctx, request.NamespacedName, instance); err != nil { if apierrors.IsNotFound(err) { - log.Info("Request object not found, could have been deleted after reconcile request") + log.V(5).Info("Request object not found, could have been deleted after reconcile request") - r.metrics.DeleteClaimMetric(request.Name) + r.metrics.DeleteClaimMetric(request.Name, request.Namespace) return reconcile.Result{}, nil } diff --git 
a/controllers/resourcepools/pool_controller.go b/controllers/resourcepools/pool_controller.go index 43e7f79a..6fa0fca5 100644 --- a/controllers/resourcepools/pool_controller.go +++ b/controllers/resourcepools/pool_controller.go @@ -13,7 +13,6 @@ import ( "golang.org/x/sync/errgroup" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/types" @@ -76,7 +75,7 @@ func (r resourcePoolController) Reconcile(ctx context.Context, request ctrl.Requ instance := &capsulev1beta2.ResourcePool{} if err = r.Get(ctx, request.NamespacedName, instance); err != nil { if apierrors.IsNotFound(err) { - log.Info("Request object not found, could have been deleted after reconcile request") + log.V(5).Info("Request object not found, could have been deleted after reconcile request") r.metrics.DeleteResourcePoolMetric(request.Name) @@ -198,18 +197,23 @@ func (r *resourcePoolController) reconcile( // Keeps track of resources which are exhausted by previous resource // This is only required when Ordered is active - queuedResourcesMap := make(map[string]resource.Quantity) + exhaustions := make(map[string]api.PoolExhaustionResource) // You can now iterate over `allClaims` in order for _, claim := range claims { - log.Info("Found claim", "name", claim.Name, "namespace", claim.Namespace, "created", claim.CreationTimestamp) + log.V(5).Info("Found claim", "name", claim.Name, "namespace", claim.Namespace, "created", claim.CreationTimestamp) - err = r.reconcileResourceClaim(ctx, log.WithValues("Claim", claim.Name), pool, &claim, queuedResourcesMap) + err = r.reconcileResourceClaim(ctx, log.WithValues("Claim", claim.Name), pool, &claim, exhaustions) if err != nil { log.Error(err, "Failed to reconcile ResourceQuotaClaim", "claim", claim.Name) } } + log.V(7).Info("finalized reconciling claims", "exhaustions", exhaustions) + + 
r.metrics.CalculateExhaustions(pool, exhaustions) + pool.Status.Exhaustions = exhaustions + pool.CalculateClaimedResources() pool.AssignClaims() @@ -222,7 +226,7 @@ func (r *resourcePoolController) reconcileResourceClaim( log logr.Logger, pool *capsulev1beta2.ResourcePool, claim *capsulev1beta2.ResourcePoolClaim, - exhaustion map[string]resource.Quantity, + exhaustion map[string]api.PoolExhaustionResource, ) (err error) { t := pool.GetClaimFromStatus(claim) if t != nil { @@ -257,7 +261,6 @@ func (r *resourcePoolController) reconcileResourceClaim( return r.handleClaimResourceExhaustion( ctx, - pool, claim, exhaustions, exhaustion, @@ -271,14 +274,14 @@ func (r *resourcePoolController) canClaimWithinNamespace( log logr.Logger, pool *capsulev1beta2.ResourcePool, claim *capsulev1beta2.ResourcePoolClaim, -) (res map[string]PoolExhaustionResource) { +) (res map[string]api.PoolExhaustionResource) { claimable := pool.GetAvailableClaimableResources() log.V(5).Info("claimable resources", "claimable", claimable) _, namespaceClaimed := pool.GetNamespaceClaims(claim.Namespace) log.V(5).Info("namespace claimed resources", "claimed", namespaceClaimed) - res = make(map[string]PoolExhaustionResource) + res = make(map[string]api.PoolExhaustionResource) for resourceName, req := range claim.Spec.ResourceClaims { // Verify if total Quota is available @@ -286,10 +289,9 @@ func (r *resourcePoolController) canClaimWithinNamespace( if !exists || available.IsZero() || available.Cmp(req) < 0 { log.V(5).Info("not enough resources available", "available", available, "requesting", req) - res[resourceName.String()] = PoolExhaustionResource{ + res[resourceName.String()] = api.PoolExhaustionResource{ Available: available, Requesting: req, - Namespace: false, } continue @@ -303,12 +305,12 @@ func (r *resourcePoolController) canClaimWithinNamespace( func (r *resourcePoolController) handleClaimOrderedExhaustion( ctx context.Context, claim *capsulev1beta2.ResourcePoolClaim, - exhaustion 
map[string]resource.Quantity, + exhaustions map[string]api.PoolExhaustionResource, ) (queued bool, err error) { status := make([]string, 0) for resourceName, qt := range claim.Spec.ResourceClaims { - req, ok := exhaustion[resourceName.String()] + req, ok := exhaustions[resourceName.String()] if !ok { continue } @@ -318,7 +320,7 @@ func (r *resourcePoolController) handleClaimOrderedExhaustion( resourceName, qt.String(), resourceName, - req.String(), + req.Requesting.String(), ) status = append(status, line) } @@ -339,32 +341,28 @@ func (r *resourcePoolController) handleClaimOrderedExhaustion( func (r *resourcePoolController) handleClaimResourceExhaustion( ctx context.Context, - pool *capsulev1beta2.ResourcePool, claim *capsulev1beta2.ResourcePoolClaim, - exhaustions map[string]PoolExhaustionResource, - exhaustion map[string]resource.Quantity, + currentExhaustions map[string]api.PoolExhaustionResource, + exhaustions map[string]api.PoolExhaustionResource, ) (err error) { status := make([]string, 0) resourceNames := make([]string, 0) - for resourceName := range exhaustions { + for resourceName := range currentExhaustions { resourceNames = append(resourceNames, resourceName) } sort.Strings(resourceNames) for _, resourceName := range resourceNames { - ex := exhaustions[resourceName] + ex := currentExhaustions[resourceName] - if *pool.Spec.Config.OrderedQueue { - ext, ok := exhaustion[resourceName] - if ok { - ext.Add(ex.Requesting) - } else { - ext = ex.Requesting - } - - exhaustion[resourceName] = ext + ext, ok := exhaustions[resourceName] + if ok { + ext.Requesting.Add(ex.Requesting) + exhaustions[resourceName] = ext + } else { + exhaustions[resourceName] = ex } line := fmt.Sprintf( @@ -465,7 +463,7 @@ func (r *resourcePoolController) handleClaimDisassociation( return nil }) if err != nil { - log.Info("Removing owner reference failed", "claim", current.Name, "pool", pool.Name, "error", err) + log.V(3).Info("Removing owner reference failed", "claim", current.Name, 
"pool", pool.Name, "error", err) return err } diff --git a/controllers/resourcepools/types.go b/controllers/resourcepools/types.go deleted file mode 100644 index 21cd3638..00000000 --- a/controllers/resourcepools/types.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2020-2023 Project Capsule Authors. -// SPDX-License-Identifier: Apache-2.0 - -package resourcepools - -import ( - "k8s.io/apimachinery/pkg/api/resource" -) - -type PoolExhaustion map[string]PoolExhaustionResource - -type PoolExhaustionResource struct { - Namespace bool - Available resource.Quantity - Requesting resource.Quantity -} diff --git a/pkg/api/exhaustion.go b/pkg/api/exhaustion.go new file mode 100644 index 00000000..61d99183 --- /dev/null +++ b/pkg/api/exhaustion.go @@ -0,0 +1,16 @@ +// Copyright 2020-2023 Project Capsule Authors. +// SPDX-License-Identifier: Apache-2.0 + +package api + +import ( + "k8s.io/apimachinery/pkg/api/resource" +) + +// +kubebuilder:object:generate=true +type PoolExhaustionResource struct { + // Available Resources to be claimed + Available resource.Quantity `json:"available,omitempty"` + // Requesting Resources + Requesting resource.Quantity `json:"requesting,omitempty"` +} diff --git a/pkg/api/zz_generated.deepcopy.go b/pkg/api/zz_generated.deepcopy.go index 0c7ec15e..759b3d4c 100644 --- a/pkg/api/zz_generated.deepcopy.go +++ b/pkg/api/zz_generated.deepcopy.go @@ -287,6 +287,23 @@ func (in *PodOptions) DeepCopy() *PodOptions { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PoolExhaustionResource) DeepCopyInto(out *PoolExhaustionResource) { + *out = *in + out.Available = in.Available.DeepCopy() + out.Requesting = in.Requesting.DeepCopy() +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PoolExhaustionResource. 
+func (in *PoolExhaustionResource) DeepCopy() *PoolExhaustionResource { + if in == nil { + return nil + } + out := new(PoolExhaustionResource) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ResourceQuotaSpec) DeepCopyInto(out *ResourceQuotaSpec) { *out = *in diff --git a/pkg/metrics/claim_recorder.go b/pkg/metrics/claim_recorder.go index bd0cabdc..a981bc45 100644 --- a/pkg/metrics/claim_recorder.go +++ b/pkg/metrics/claim_recorder.go @@ -5,10 +5,10 @@ package metrics import ( "github.com/prometheus/client_golang/prometheus" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" crtlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" capsulev1beta2 "github.com/projectcapsule/capsule/api/v1beta2" - "github.com/projectcapsule/capsule/pkg/meta" ) type ClaimRecorder struct { @@ -31,7 +31,7 @@ func NewClaimRecorder() *ClaimRecorder { Name: "claim_condition", Help: "The current condition status of a claim.", }, - []string{"name", "target_namespace", "condition", "status", "reason", "pool"}, + []string{"name", "target_namespace", "condition", "reason", "pool"}, ), claimResourcesGauge: prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -47,26 +47,29 @@ func NewClaimRecorder() *ClaimRecorder { func (r *ClaimRecorder) Collectors() []prometheus.Collector { return []prometheus.Collector{ r.claimConditionGauge, + r.claimResourcesGauge, } } // RecordCondition records the condition as given for the ref. 
func (r *ClaimRecorder) RecordClaimCondition(claim *capsulev1beta2.ResourcePoolClaim) { - for _, status := range []string{meta.AssignedCondition, meta.BoundCondition} { - var value float64 - if status == claim.Status.Condition.Type { - value = 1 - } + // Remove all Condition Metrics to avoid duplicates + r.claimConditionGauge.DeletePartialMatch(map[string]string{ + "name": claim.Name, + "target_namespace": claim.Namespace, + }) - r.claimConditionGauge.WithLabelValues( - claim.Name, - claim.Namespace, - status, - string(claim.Status.Condition.Status), - claim.Status.Condition.Reason, - claim.Status.Pool.Name.String(), - ).Set(value) + value := 0 + if claim.Status.Condition.Status == metav1.ConditionTrue { + value = 1 } + r.claimConditionGauge.WithLabelValues( + claim.Name, + claim.Namespace, + claim.Status.Condition.Type, + claim.Status.Condition.Reason, + claim.Status.Pool.Name.String(), + ).Set(float64(value)) for resourceName, qt := range claim.Spec.ResourceClaims { r.claimResourcesGauge.WithLabelValues( @@ -78,8 +81,13 @@ func (r *ClaimRecorder) RecordClaimCondition(claim *capsulev1beta2.ResourcePoolC } // DeleteCondition deletes the condition metrics for the ref.
-func (r *ClaimRecorder) DeleteClaimMetric(claim string) { - for _, status := range []string{meta.ReadyCondition, meta.NotReadyCondition} { - r.claimConditionGauge.DeleteLabelValues(claim, status) - } +func (r *ClaimRecorder) DeleteClaimMetric(claim string, namespace string) { + r.claimConditionGauge.DeletePartialMatch(map[string]string{ + "name": claim, + "target_namespace": namespace, + }) + r.claimResourcesGauge.DeletePartialMatch(map[string]string{ + "name": claim, + "namespace": namespace, + }) } diff --git a/pkg/metrics/pool_recorder.go b/pkg/metrics/pool_recorder.go index a4e60890..f9526a77 100644 --- a/pkg/metrics/pool_recorder.go +++ b/pkg/metrics/pool_recorder.go @@ -8,15 +8,19 @@ import ( crtlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" capsulev1beta2 "github.com/projectcapsule/capsule/api/v1beta2" + "github.com/projectcapsule/capsule/pkg/api" ) type ResourcePoolRecorder struct { - poolResource *prometheus.GaugeVec - poolResourceLimit *prometheus.GaugeVec - poolResourceAvailable *prometheus.GaugeVec - poolResourceUsage *prometheus.GaugeVec - poolResourceExhaustion *prometheus.GaugeVec - poolNamespaceResourceUsage *prometheus.GaugeVec + poolResource *prometheus.GaugeVec + poolResourceLimit *prometheus.GaugeVec + poolResourceAvailable *prometheus.GaugeVec + poolResourceUsage *prometheus.GaugeVec + poolResourceUsagePercentage *prometheus.GaugeVec + poolResourceExhaustion *prometheus.GaugeVec + poolResourceExhaustionPercentage *prometheus.GaugeVec + poolNamespaceResourceUsage *prometheus.GaugeVec + poolNamespaceResourceUsagePercentage *prometheus.GaugeVec } func MustMakeResourcePoolRecorder() *ResourcePoolRecorder { @@ -36,6 +40,14 @@ func NewResourcePoolRecorder() *ResourcePoolRecorder { }, []string{"pool", "resource"}, ), + poolResourceExhaustionPercentage: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricsPrefix, + Name: "pool_exhaustion_percentage", + Help: "Resources become exhausted, when there's not enough available for all claims
and the claims get queued (Percentage)", + }, + []string{"pool", "resource"}, + ), poolResource: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: metricsPrefix, @@ -60,7 +72,14 @@ func NewResourcePoolRecorder() *ResourcePoolRecorder { }, []string{"pool", "resource"}, ), - + poolResourceUsagePercentage: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricsPrefix, + Name: "pool_usage_percentage", + Help: "Current resource usage for a given resource in a resource pool (percentage)", + }, + []string{"pool", "resource"}, + ), poolResourceAvailable: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: metricsPrefix, @@ -77,6 +96,14 @@ func NewResourcePoolRecorder() *ResourcePoolRecorder { }, []string{"pool", "target_namespace", "resource"}, ), + poolNamespaceResourceUsagePercentage: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricsPrefix, + Name: "pool_namespace_usage_percentage", + Help: "Current resources claimed on namespace basis for a given resource in a resource pool for a specific namespace (percentage)", + }, + []string{"pool", "target_namespace", "resource"}, + ), } } @@ -85,9 +112,12 @@ func (r *ResourcePoolRecorder) Collectors() []prometheus.Collector { r.poolResource, r.poolResourceLimit, r.poolResourceUsage, + r.poolResourceUsagePercentage, r.poolResourceAvailable, r.poolResourceExhaustion, + r.poolResourceExhaustionPercentage, r.poolNamespaceResourceUsage, + r.poolNamespaceResourceUsagePercentage, } } @@ -124,11 +154,57 @@ func (r *ResourcePoolRecorder) ResourceUsageMetrics(pool *capsulev1beta2.Resourc pool.Name, resourceName.String(), ).Set(float64(available.MilliValue()) / 1000) + + usagePercentage := float64(0) + if quantity.MilliValue() > 0 { + usagePercentage = (float64(claimed.MilliValue()) / float64(quantity.MilliValue())) * 100 + } + + r.poolResourceUsagePercentage.WithLabelValues( + pool.Name, + resourceName.String(), + ).Set(usagePercentage) } r.resourceUsageMetricsByNamespace(pool) } +// Emit 
exhaustion metrics +func (r *ResourcePoolRecorder) CalculateExhaustions( + pool *capsulev1beta2.ResourcePool, + current map[string]api.PoolExhaustionResource, +) { + for resource := range pool.Status.Exhaustions { + if _, ok := current[resource]; ok { + continue + } + + r.poolResourceExhaustion.DeleteLabelValues(pool.Name, resource) + r.poolResourceExhaustionPercentage.DeleteLabelValues(pool.Name, resource) + } + + for resource, ex := range current { + available := float64(ex.Available.MilliValue()) / 1000 + requesting := float64(ex.Requesting.MilliValue()) / 1000 + + r.poolResourceExhaustion.WithLabelValues( + pool.Name, + resource, + ).Set(float64(ex.Requesting.MilliValue()) / 1000) + + // Calculate and expose overprovisioning percentage + if available > 0 && requesting > available { + percent := ((requesting - available) / available) * 100 + r.poolResourceExhaustionPercentage.WithLabelValues( + pool.Name, + resource, + ).Set(percent) + } else { + r.poolResourceExhaustionPercentage.DeleteLabelValues(pool.Name, resource) + } + } +} + // Delete all metrics for a namespace in a resource pool. 
func (r *ResourcePoolRecorder) DeleteResourcePoolNamespaceMetric(pool string, namespace string) { - r.poolNamespaceResourceUsage.DeletePartialMatch(map[string]string{"pool": pool, "namespace": namespace}) + r.poolNamespaceResourceUsage.DeletePartialMatch(map[string]string{"pool": pool, "target_namespace": namespace}) @@ -147,7 +223,10 @@ func (r *ResourcePoolRecorder) cleanupAllMetricForLabels(labels map[string]strin r.poolResourceLimit.DeletePartialMatch(labels) r.poolResourceAvailable.DeletePartialMatch(labels) r.poolResourceUsage.DeletePartialMatch(labels) + r.poolResourceUsagePercentage.DeletePartialMatch(labels) r.poolNamespaceResourceUsage.DeletePartialMatch(labels) + r.poolNamespaceResourceUsagePercentage.DeletePartialMatch(labels) r.poolResource.DeletePartialMatch(labels) r.poolResourceExhaustion.DeletePartialMatch(labels) + r.poolResourceExhaustionPercentage.DeletePartialMatch(labels) } @@ -163,6 +242,17 @@ func (r *ResourcePoolRecorder) resourceUsageMetricsByNamespace(pool *capsulev1be namespace, resourceName.String(), ).Set(float64(quantity.MilliValue()) / 1000) + + available, ok := pool.Status.Allocation.Hard[resourceName] + if !ok || available.IsZero() { + continue + } + + r.poolNamespaceResourceUsagePercentage.WithLabelValues( + pool.Name, + namespace, + resourceName.String(), + ).Set((float64(quantity.MilliValue()) / float64(available.MilliValue())) * 100) } } }