feat: improve resourcepool monitoring (#1488)

* feat(resourcepools): add improved metrics

Signed-off-by: Oliver Bähler <oliverbaehler@hotmail.com>

* feat(helm): add resourcepool dashboard

Signed-off-by: Oliver Bähler <oliverbaehler@hotmail.com>

---------

Signed-off-by: Oliver Bähler <oliverbaehler@hotmail.com>
Oliver Bähler
2025-06-03 14:10:42 +02:00
committed by GitHub
parent d3b435c353
commit c8377d51f1
19 changed files with 2037 additions and 164 deletions

View File

@@ -96,6 +96,7 @@ helm-test-exec: ct helm-controller-version ko-build-all
$(MAKE) e2e-load-image CLUSTER_NAME=capsule-charts IMAGE=$(CAPSULE_IMG) VERSION=v0.0.0
$(MAKE) e2e-load-image CLUSTER_NAME=capsule-charts IMAGE=$(CAPSULE_IMG) VERSION=tracing
@$(KUBECTL) create ns capsule-system || true
@$(KUBECTL) apply --force-conflicts --server-side=true -f https://github.com/grafana/grafana-operator/releases/download/v5.18.0/crds.yaml
@$(KUBECTL) apply --force-conflicts --server-side=true -f https://github.com/cert-manager/cert-manager/releases/download/v1.9.1/cert-manager.crds.yaml
@$(KUBECTL) apply --force-conflicts --server-side=true -f https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.58.0/bundle.yaml
@$(CT) install --config $(SRC_ROOT)/.github/configs/ct.yaml --namespace=capsule-system --all --debug

View File

@@ -24,6 +24,8 @@ type ResourcePoolStatus struct {
Claims ResourcePoolNamespaceClaimsStatus `json:"claims,omitempty"`
// Tracks the Usage from Claimed against what has been granted from the pool
Allocation ResourcePoolQuotaStatus `json:"allocation,omitempty"`
// Exhaustions from claims associated with the pool
Exhaustions map[string]api.PoolExhaustionResource `json:"exhaustions,omitempty"`
}
type ResourcePoolNamespaceClaimsStatus map[string]ResourcePoolClaimsList
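
For orientation, the new `Exhaustions` map records, per resource name, what claims were requesting against what was still available when they could not be satisfied. A minimal sketch of reading it from a pool's status (the `reportExhaustions` helper and the package name are hypothetical; the field and types follow the diff above):

```go
package resourcepools // package name is illustrative

import (
	"fmt"

	capsulev1beta2 "github.com/projectcapsule/capsule/api/v1beta2"
)

// reportExhaustions is a hypothetical helper: it walks the new status field and
// reports, per resource name, what was requested versus what was still available.
func reportExhaustions(pool *capsulev1beta2.ResourcePool) {
	for name, ex := range pool.Status.Exhaustions {
		fmt.Printf("%s: requesting %s, available %s\n",
			name, ex.Requesting.String(), ex.Available.String())
	}
}
```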

View File

@@ -887,6 +887,13 @@ func (in *ResourcePoolStatus) DeepCopyInto(out *ResourcePoolStatus) {
}
}
in.Allocation.DeepCopyInto(&out.Allocation)
if in.Exhaustions != nil {
in, out := &in.Exhaustions, &out.Exhaustions
*out = make(map[string]api.PoolExhaustionResource, len(*in))
for key, val := range *in {
(*out)[key] = *val.DeepCopy()
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourcePoolStatus.

View File

@@ -178,20 +178,29 @@ Here the values you can override:
| manager.volumes | list | `[]` | Set the additional volumes needed for the Capsule manager container |
| manager.webhookPort | int | `9443` | Set an alternative to the default container port. Useful for use in some kubernetes clusters (such as GKE Private) with aggregator routing turned on, because pod ports have to be opened manually on the firewall side |
### ServiceMonitor Parameters
### Monitoring Parameters
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| serviceMonitor.annotations | object | `{}` | Assign additional Annotations |
| serviceMonitor.enabled | bool | `false` | Enable ServiceMonitor |
| serviceMonitor.endpoint.interval | string | `"15s"` | Set the scrape interval for the endpoint of the serviceMonitor |
| serviceMonitor.endpoint.metricRelabelings | list | `[]` | Set metricRelabelings for the endpoint of the serviceMonitor |
| serviceMonitor.endpoint.relabelings | list | `[]` | Set relabelings for the endpoint of the serviceMonitor |
| serviceMonitor.endpoint.scrapeTimeout | string | `""` | Set the scrape timeout for the endpoint of the serviceMonitor |
| serviceMonitor.labels | object | `{}` | Assign additional labels according to Prometheus' serviceMonitorSelector matching labels |
| serviceMonitor.matchLabels | object | `{}` | Change matching labels |
| serviceMonitor.namespace | string | `""` | Install the ServiceMonitor into a different Namespace, as the monitoring stack one (default: the release one) |
| serviceMonitor.targetLabels | list | `[]` | Set targetLabels for the serviceMonitor |
| monitoring.dashboards.annotations | object | `{}` | Annotations for dashboard configmaps |
| monitoring.dashboards.enabled | bool | `false` | Enable Dashboards to be deployed |
| monitoring.dashboards.labels | object | `{}` | Labels for dashboard configmaps |
| monitoring.dashboards.namespace | string | `""` | Custom namespace for dashboard configmaps |
| monitoring.dashboards.operator.allowCrossNamespaceImport | bool | `true` | Allow the Operator to match this resource with Grafana instances outside the current namespace |
| monitoring.dashboards.operator.enabled | bool | `true` | Enable Operator Resources (GrafanaDashboard) |
| monitoring.dashboards.operator.folder | string | `""` | Folder assignment for the dashboard |
| monitoring.dashboards.operator.instanceSelector | object | `{}` | Selects Grafana instances for import |
| monitoring.dashboards.operator.resyncPeriod | string | `"10m"` | How often the resource is synced, defaults to 10m0s if not set |
| monitoring.serviceMonitor.annotations | object | `{}` | Assign additional Annotations |
| monitoring.serviceMonitor.enabled | bool | `false` | Enable ServiceMonitor |
| monitoring.serviceMonitor.endpoint.interval | string | `"15s"` | Set the scrape interval for the endpoint of the serviceMonitor |
| monitoring.serviceMonitor.endpoint.metricRelabelings | list | `[]` | Set metricRelabelings for the endpoint of the serviceMonitor |
| monitoring.serviceMonitor.endpoint.relabelings | list | `[]` | Set relabelings for the endpoint of the serviceMonitor |
| monitoring.serviceMonitor.endpoint.scrapeTimeout | string | `""` | Set the scrape timeout for the endpoint of the serviceMonitor |
| monitoring.serviceMonitor.labels | object | `{}` | Assign additional labels according to Prometheus' serviceMonitorSelector matching labels |
| monitoring.serviceMonitor.matchLabels | object | `{}` | Change matching labels |
| monitoring.serviceMonitor.namespace | string | `""` | Install the ServiceMonitor into a different namespace, e.g. the monitoring stack's one (default: the release namespace) |
| monitoring.serviceMonitor.targetLabels | list | `[]` | Set targetLabels for the serviceMonitor |
### Webhooks Parameters

View File

@@ -112,7 +112,7 @@ Here the values you can override:
| Key | Type | Default | Description |
|-----|------|---------|-------------|
{{- range .Values }}
{{- if not (or (hasPrefix "global" .Key) (hasPrefix "manager" .Key) (hasPrefix "crds" .Key) (hasPrefix "serviceMonitor" .Key) (hasPrefix "webhook" .Key) (hasPrefix "capsule-proxy" .Key) ) }}
{{- if not (or (hasPrefix "global" .Key) (hasPrefix "manager" .Key) (hasPrefix "crds" .Key) (hasPrefix "monitoring" .Key) (hasPrefix "webhook" .Key) (hasPrefix "capsule-proxy" .Key) ) }}
| {{ .Key }} | {{ .Type }} | {{ if .Default }}{{ .Default }}{{ else }}{{ .AutoDefault }}{{ end }} | {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} |
{{- end }}
{{- end }}
@@ -127,12 +127,12 @@ Here the values you can override:
{{- end }}
{{- end }}
### ServiceMonitor Parameters
### Monitoring Parameters
| Key | Type | Default | Description |
|-----|------|---------|-------------|
{{- range .Values }}
{{- if hasPrefix "serviceMonitor" .Key }}
{{- if hasPrefix "monitoring" .Key }}
| {{ .Key }} | {{ .Type }} | {{ if .Default }}{{ .Default }}{{ else }}{{ .AutoDefault }}{{ end }} | {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} |
{{- end }}
{{- end }}

View File

@@ -0,0 +1,9 @@
monitoring:
dashboards:
enabled: true
annotations:
k8s-sidecar-target-directory: /tmp/dashboards/Capsule
labels:
grafana_dashboard: "1"
operator:
enabled: true

View File

@@ -291,6 +291,26 @@ spec:
type: array
description: Tracks the quotas for the Resource.
type: object
exhaustions:
additionalProperties:
properties:
available:
anyOf:
- type: integer
- type: string
description: Available Resources to be claimed
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
requesting:
anyOf:
- type: integer
- type: string
description: Requesting Resources
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
description: Exhaustions from claims associated with the pool
type: object
namespaceCount:
default: 0
description: How many namespaces are considered
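
The `available` and `requesting` properties reuse the standard Kubernetes quantity pattern (int-or-string), so values such as `500m`, `2Gi`, or plain integers validate. A small sketch, assuming only `k8s.io/apimachinery`, showing that the same values parse with `resource.ParseQuantity`:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Each of these satisfies the int-or-string quantity pattern used by the schema.
	for _, v := range []string{"500m", "2Gi", "4"} {
		q, err := resource.ParseQuantity(v)
		fmt.Println(v, "=>", q.String(), err)
	}
}
```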

File diff suppressed because it is too large

View File

@@ -0,0 +1,51 @@
{{- if $.Values.monitoring.dashboards.enabled }}
{{ range $path, $_ := .Files.Glob "dashboards/**-dashboard.json" }}
{{- with $ }}
{{- $content := (.Files.Get $path) }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "capsule.fullname" . }}-{{ $path | base | trimSuffix "-dashboard.json" | regexFind "[^_]+$" }}-dashboard
namespace: {{ default $.Release.Namespace $.Values.monitoring.dashboards.namespace | quote }}
annotations:
{{- with $.Values.monitoring.dashboards.annotations }}
{{- toYaml . | nindent 4 }}
{{- end }}
labels:
{{- include "capsule.labels" . | nindent 4 }}
{{- with $.Values.monitoring.dashboards.labels }}
{{- toYaml . | nindent 4 }}
{{- end }}
data:
{{ base $path }}: |-
{{- $content | nindent 4 }}
{{- if $.Values.monitoring.dashboards.operator.enabled }}
---
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: {{ include "capsule.fullname" . }}-{{ $path | base | trimSuffix "-dashboard.json" | regexFind "[^_]+$" }}
namespace: {{ default $.Release.Namespace $.Values.monitoring.dashboards.namespace | quote }}
annotations:
{{- with $.Values.monitoring.dashboards.annotations }}
{{- toYaml . | nindent 4 }}
{{- end }}
labels:
{{- include "capsule.labels" . | nindent 4 }}
{{- with $.Values.monitoring.dashboards.labels }}
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
configMapRef:
name: {{ include "capsule.fullname" . }}-{{ $path | base | trimSuffix "-dashboard.json" | regexFind "[^_]+$" }}-dashboard
key: {{ base $path }}
{{- with (omit $.Values.monitoring.dashboards.operator "enabled") }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}

View File

@@ -1,22 +1,23 @@
{{- if not $.Values.crds.exclusive }}
{{- if .Values.serviceMonitor.enabled }}
{{- with (mergeOverwrite .Values.monitoring.serviceMonitor (default dict .Values.serviceMonitor)) -}}
{{- if .enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ include "capsule.fullname" . }}-monitor
namespace: {{ .Values.serviceMonitor.namespace | default .Release.Namespace }}
name: {{ include "capsule.fullname" $ }}
namespace: {{ .namespace | default $.Release.Namespace }}
labels:
{{- include "capsule.labels" . | nindent 4 }}
{{- with .Values.serviceMonitor.labels }}
{{- with .labels }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- with .Values.serviceMonitor.annotations }}
{{- with .annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
endpoints:
{{- with .Values.serviceMonitor.endpoint }}
{{- with .endpoint }}
- interval: {{ .interval }}
port: metrics
path: /metrics
@@ -31,18 +32,19 @@ spec:
{{- end }}
{{- end }}
jobLabel: app.kubernetes.io/name
{{- with .Values.serviceMonitor.targetLabels }}
{{- with .targetLabels }}
targetLabels: {{- toYaml . | nindent 4 }}
{{- end }}
selector:
matchLabels:
{{- if .Values.serviceMonitor.matchLabels }}
{{- toYaml .Values.serviceMonitor.matchLabels | nindent 6 }}
{{- if .matchLabels }}
{{- toYaml .matchLabels | nindent 6 }}
{{- else }}
{{- include "capsule.labels" . | nindent 6 }}
{{- include "capsule.selectorLabels" $ | nindent 6 }}
{{- end }}
namespaceSelector:
matchNames:
- {{ .Release.Namespace }}
- {{ $.Release.Namespace }}
{{- end }}
{{- end }}
{{- end }}

View File

@@ -331,6 +331,94 @@
},
"type": "object"
},
"monitoring": {
"properties": {
"dashboards": {
"properties": {
"annotations": {
"properties": {},
"type": "object"
},
"enabled": {
"type": "boolean"
},
"labels": {
"properties": {},
"type": "object"
},
"namespace": {
"type": "string"
},
"operator": {
"properties": {
"allowCrossNamespaceImport": {
"type": "boolean"
},
"enabled": {
"type": "boolean"
},
"folder": {
"type": "string"
},
"instanceSelector": {
"properties": {},
"type": "object"
},
"resyncPeriod": {
"type": "string"
}
},
"type": "object"
}
},
"type": "object"
},
"serviceMonitor": {
"properties": {
"annotations": {
"properties": {},
"type": "object"
},
"enabled": {
"type": "boolean"
},
"endpoint": {
"properties": {
"interval": {
"type": "string"
},
"metricRelabelings": {
"type": "array"
},
"relabelings": {
"type": "array"
},
"scrapeTimeout": {
"type": "string"
}
},
"type": "object"
},
"labels": {
"properties": {},
"type": "object"
},
"matchLabels": {
"properties": {},
"type": "object"
},
"namespace": {
"type": "string"
},
"targetLabels": {
"type": "array"
}
},
"type": "object"
}
},
"type": "object"
},
"nodeSelector": {
"properties": {},
"type": "object"
@@ -452,49 +540,6 @@
},
"type": "object"
},
"serviceMonitor": {
"properties": {
"annotations": {
"properties": {},
"type": "object"
},
"enabled": {
"type": "boolean"
},
"endpoint": {
"properties": {
"interval": {
"type": "string"
},
"metricRelabelings": {
"type": "array"
},
"relabelings": {
"type": "array"
},
"scrapeTimeout": {
"type": "string"
}
},
"type": "object"
},
"labels": {
"properties": {},
"type": "object"
},
"matchLabels": {
"properties": {},
"type": "object"
},
"namespace": {
"type": "string"
},
"targetLabels": {
"type": "array"
}
},
"type": "object"
},
"tls": {
"properties": {
"create": {

View File

@@ -377,27 +377,52 @@ webhooks:
- key: capsule.clastix.io/tenant
operator: Exists
# Monitoring Settings
monitoring:
# ServiceMonitor
serviceMonitor:
# -- Enable ServiceMonitor
enabled: false
# -- Install the ServiceMonitor into a different Namespace, as the monitoring stack one (default: the release one)
namespace: ''
# -- Assign additional labels according to Prometheus' serviceMonitorSelector matching labels
labels: {}
# -- Assign additional Annotations
annotations: {}
# -- Change matching labels
matchLabels: {}
# -- Set targetLabels for the serviceMonitor
targetLabels: []
endpoint:
# -- Set the scrape interval for the endpoint of the serviceMonitor
interval: "15s"
# -- Set the scrape timeout for the endpoint of the serviceMonitor
scrapeTimeout: ""
# -- Set metricRelabelings for the endpoint of the serviceMonitor
metricRelabelings: []
# -- Set relabelings for the endpoint of the serviceMonitor
relabelings: []
dashboards:
# -- Enable Dashboards to be deployed
enabled: false
# -- Annotations for dashboard configmaps
annotations: {}
# -- Labels for dashboard configmaps
labels: {}
# grafana_dashboard: "1"
# -- Custom namespace for dashboard configmaps
namespace: ""
# Grafana Operator
operator:
# -- Enable Operator Resources (GrafanaDashboard)
enabled: true
# -- Allow the Operator to match this resource with Grafana instances outside the current namespace
allowCrossNamespaceImport: true
# -- How often the resource is synced, defaults to 10m0s if not set
resyncPeriod: "10m"
# -- Selects Grafana instances for import
instanceSelector: {}
# -- Folder assignment for the dashboard
folder: ""
# ServiceMonitor
serviceMonitor:
# -- Enable ServiceMonitor
enabled: false
# -- Install the ServiceMonitor into a different namespace, e.g. the monitoring stack's one (default: the release namespace)
namespace: ''
# -- Assign additional labels according to Prometheus' serviceMonitorSelector matching labels
labels: {}
# -- Assign additional Annotations
annotations: {}
# -- Change matching labels
matchLabels: {}
# -- Set targetLabels for the serviceMonitor
targetLabels: []
endpoint:
# -- Set the scrape interval for the endpoint of the serviceMonitor
interval: "15s"
# -- Set the scrape timeout for the endpoint of the serviceMonitor
scrapeTimeout: ""
# -- Set metricRelabelings for the endpoint of the serviceMonitor
metricRelabelings: []
# -- Set relabelings for the endpoint of the serviceMonitor
relabelings: []

View File

@@ -51,9 +51,9 @@ func (r resourceClaimController) Reconcile(ctx context.Context, request ctrl.Req
instance := &capsulev1beta2.ResourcePoolClaim{}
if err = r.Get(ctx, request.NamespacedName, instance); err != nil {
if apierrors.IsNotFound(err) {
log.Info("Request object not found, could have been deleted after reconcile request")
log.V(5).Info("Request object not found, could have been deleted after reconcile request")
r.metrics.DeleteClaimMetric(request.Name)
r.metrics.DeleteClaimMetric(request.Name, request.Namespace)
return reconcile.Result{}, nil
}

View File

@@ -13,7 +13,6 @@ import (
"golang.org/x/sync/errgroup"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/types"
@@ -76,7 +75,7 @@ func (r resourcePoolController) Reconcile(ctx context.Context, request ctrl.Requ
instance := &capsulev1beta2.ResourcePool{}
if err = r.Get(ctx, request.NamespacedName, instance); err != nil {
if apierrors.IsNotFound(err) {
log.Info("Request object not found, could have been deleted after reconcile request")
log.V(5).Info("Request object not found, could have been deleted after reconcile request")
r.metrics.DeleteResourcePoolMetric(request.Name)
@@ -198,18 +197,23 @@ func (r *resourcePoolController) reconcile(
// Keeps track of resources which have been exhausted by previous claims.
// This is only required when ordered queueing is active.
queuedResourcesMap := make(map[string]resource.Quantity)
exhaustions := make(map[string]api.PoolExhaustionResource)
// Iterate over the claims in order
for _, claim := range claims {
log.Info("Found claim", "name", claim.Name, "namespace", claim.Namespace, "created", claim.CreationTimestamp)
log.V(5).Info("Found claim", "name", claim.Name, "namespace", claim.Namespace, "created", claim.CreationTimestamp)
err = r.reconcileResourceClaim(ctx, log.WithValues("Claim", claim.Name), pool, &claim, queuedResourcesMap)
err = r.reconcileResourceClaim(ctx, log.WithValues("Claim", claim.Name), pool, &claim, exhaustions)
if err != nil {
log.Error(err, "Failed to reconcile ResourceQuotaClaim", "claim", claim.Name)
}
}
log.V(7).Info("finalized reconciling claims", "exhaustions", exhaustions)
r.metrics.CalculateExhaustions(pool, exhaustions)
pool.Status.Exhaustions = exhaustions
pool.CalculateClaimedResources()
pool.AssignClaims()
@@ -222,7 +226,7 @@ func (r *resourcePoolController) reconcileResourceClaim(
log logr.Logger,
pool *capsulev1beta2.ResourcePool,
claim *capsulev1beta2.ResourcePoolClaim,
exhaustion map[string]resource.Quantity,
exhaustion map[string]api.PoolExhaustionResource,
) (err error) {
t := pool.GetClaimFromStatus(claim)
if t != nil {
@@ -257,7 +261,6 @@ func (r *resourcePoolController) reconcileResourceClaim(
return r.handleClaimResourceExhaustion(
ctx,
pool,
claim,
exhaustions,
exhaustion,
@@ -271,14 +274,14 @@ func (r *resourcePoolController) canClaimWithinNamespace(
log logr.Logger,
pool *capsulev1beta2.ResourcePool,
claim *capsulev1beta2.ResourcePoolClaim,
) (res map[string]PoolExhaustionResource) {
) (res map[string]api.PoolExhaustionResource) {
claimable := pool.GetAvailableClaimableResources()
log.V(5).Info("claimable resources", "claimable", claimable)
_, namespaceClaimed := pool.GetNamespaceClaims(claim.Namespace)
log.V(5).Info("namespace claimed resources", "claimed", namespaceClaimed)
res = make(map[string]PoolExhaustionResource)
res = make(map[string]api.PoolExhaustionResource)
for resourceName, req := range claim.Spec.ResourceClaims {
// Verify if total Quota is available
@@ -286,10 +289,9 @@ func (r *resourcePoolController) canClaimWithinNamespace(
if !exists || available.IsZero() || available.Cmp(req) < 0 {
log.V(5).Info("not enough resources available", "available", available, "requesting", req)
res[resourceName.String()] = PoolExhaustionResource{
res[resourceName.String()] = api.PoolExhaustionResource{
Available: available,
Requesting: req,
Namespace: false,
}
continue
@@ -303,12 +305,12 @@ func (r *resourcePoolController) canClaimWithinNamespace(
func (r *resourcePoolController) handleClaimOrderedExhaustion(
ctx context.Context,
claim *capsulev1beta2.ResourcePoolClaim,
exhaustion map[string]resource.Quantity,
exhaustions map[string]api.PoolExhaustionResource,
) (queued bool, err error) {
status := make([]string, 0)
for resourceName, qt := range claim.Spec.ResourceClaims {
req, ok := exhaustion[resourceName.String()]
req, ok := exhaustions[resourceName.String()]
if !ok {
continue
}
@@ -318,7 +320,7 @@ func (r *resourcePoolController) handleClaimOrderedExhaustion(
resourceName,
qt.String(),
resourceName,
req.String(),
req.Requesting.String(),
)
status = append(status, line)
}
@@ -339,32 +341,28 @@ func (r *resourcePoolController) handleClaimOrderedExhaustion(
func (r *resourcePoolController) handleClaimResourceExhaustion(
ctx context.Context,
pool *capsulev1beta2.ResourcePool,
claim *capsulev1beta2.ResourcePoolClaim,
exhaustions map[string]PoolExhaustionResource,
exhaustion map[string]resource.Quantity,
currentExhaustions map[string]api.PoolExhaustionResource,
exhaustions map[string]api.PoolExhaustionResource,
) (err error) {
status := make([]string, 0)
resourceNames := make([]string, 0)
for resourceName := range exhaustions {
for resourceName := range currentExhaustions {
resourceNames = append(resourceNames, resourceName)
}
sort.Strings(resourceNames)
for _, resourceName := range resourceNames {
ex := exhaustions[resourceName]
ex := currentExhaustions[resourceName]
if *pool.Spec.Config.OrderedQueue {
ext, ok := exhaustion[resourceName]
if ok {
ext.Add(ex.Requesting)
} else {
ext = ex.Requesting
}
exhaustion[resourceName] = ext
ext, ok := exhaustions[resourceName]
if ok {
ext.Requesting.Add(ex.Requesting)
exhaustions[resourceName] = ext
} else {
exhaustions[resourceName] = ex
}
line := fmt.Sprintf(
@@ -465,7 +463,7 @@ func (r *resourcePoolController) handleClaimDisassociation(
return nil
})
if err != nil {
log.Info("Removing owner reference failed", "claim", current.Name, "pool", pool.Name, "error", err)
log.V(3).Info("Removing owner reference failed", "claim", current.Name, "pool", pool.Name, "error", err)
return err
}

View File

@@ -1,16 +0,0 @@
// Copyright 2020-2023 Project Capsule Authors.
// SPDX-License-Identifier: Apache-2.0
package resourcepools
import (
"k8s.io/apimachinery/pkg/api/resource"
)
type PoolExhaustion map[string]PoolExhaustionResource
type PoolExhaustionResource struct {
Namespace bool
Available resource.Quantity
Requesting resource.Quantity
}

pkg/api/exhaustion.go (new file, +16)
View File

@@ -0,0 +1,16 @@
// Copyright 2020-2023 Project Capsule Authors.
// SPDX-License-Identifier: Apache-2.0
package api
import (
"k8s.io/apimachinery/pkg/api/resource"
)
// +kubebuilder:object:generate=true
type PoolExhaustionResource struct {
// Available Resources to be claimed
Available resource.Quantity `json:"available,omitempty"`
// Requesting Resources
Requesting resource.Quantity `json:"requesting,omitempty"`
}
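
Since `resource.Quantity` serializes as a string, an `exhaustions` entry in the pool status renders as quoted quantities. A minimal, illustrative sketch of that serialization (the resource name and values are made up):

```go
package main

import (
	"encoding/json"
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"

	"github.com/projectcapsule/capsule/pkg/api"
)

func main() {
	// Made-up entry: a claim asks for 6 CPUs while only 4 are still claimable.
	exhaustions := map[string]api.PoolExhaustionResource{
		"limits.cpu": {
			Available:  resource.MustParse("4"),
			Requesting: resource.MustParse("6"),
		},
	}
	out, _ := json.MarshalIndent(exhaustions, "", "  ")
	fmt.Println(string(out)) // quantities render as strings, e.g. "4" and "6"
}
```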

View File

@@ -287,6 +287,23 @@ func (in *PodOptions) DeepCopy() *PodOptions {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PoolExhaustionResource) DeepCopyInto(out *PoolExhaustionResource) {
*out = *in
out.Available = in.Available.DeepCopy()
out.Requesting = in.Requesting.DeepCopy()
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PoolExhaustionResource.
func (in *PoolExhaustionResource) DeepCopy() *PoolExhaustionResource {
if in == nil {
return nil
}
out := new(PoolExhaustionResource)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ResourceQuotaSpec) DeepCopyInto(out *ResourceQuotaSpec) {
*out = *in

View File

@@ -5,10 +5,10 @@ package metrics
import (
"github.com/prometheus/client_golang/prometheus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
crtlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
capsulev1beta2 "github.com/projectcapsule/capsule/api/v1beta2"
"github.com/projectcapsule/capsule/pkg/meta"
)
type ClaimRecorder struct {
@@ -31,7 +31,7 @@ func NewClaimRecorder() *ClaimRecorder {
Name: "claim_condition",
Help: "The current condition status of a claim.",
},
[]string{"name", "target_namespace", "condition", "status", "reason", "pool"},
[]string{"name", "target_namespace", "condition", "reason", "pool"},
),
claimResourcesGauge: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
@@ -47,26 +47,29 @@ func NewClaimRecorder() *ClaimRecorder {
func (r *ClaimRecorder) Collectors() []prometheus.Collector {
return []prometheus.Collector{
r.claimConditionGauge,
r.claimResourcesGauge,
}
}
// RecordClaimCondition records the current condition of the given claim.
func (r *ClaimRecorder) RecordClaimCondition(claim *capsulev1beta2.ResourcePoolClaim) {
for _, status := range []string{meta.AssignedCondition, meta.BoundCondition} {
var value float64
if status == claim.Status.Condition.Type {
value = 1
}
// Remove all Condition Metrics to avoid duplicates
r.claimConditionGauge.DeletePartialMatch(map[string]string{
"name": claim.Name,
"namespace": claim.Namespace,
})
r.claimConditionGauge.WithLabelValues(
claim.Name,
claim.Namespace,
status,
string(claim.Status.Condition.Status),
claim.Status.Condition.Reason,
claim.Status.Pool.Name.String(),
).Set(value)
value := 0
if claim.Status.Condition.Status == metav1.ConditionTrue {
value = 1
}
r.claimConditionGauge.WithLabelValues(
claim.Name,
claim.Namespace,
claim.Status.Condition.Type,
claim.Status.Condition.Reason,
claim.Status.Pool.Name.String(),
).Set(float64(value))
for resourceName, qt := range claim.Spec.ResourceClaims {
r.claimResourcesGauge.WithLabelValues(
@@ -78,8 +81,13 @@ func (r *ClaimRecorder) RecordClaimCondition(claim *capsulev1beta2.ResourcePoolC
}
// DeleteClaimMetric deletes the condition and resource metrics for the given claim.
func (r *ClaimRecorder) DeleteClaimMetric(claim string) {
for _, status := range []string{meta.ReadyCondition, meta.NotReadyCondition} {
r.claimConditionGauge.DeleteLabelValues(claim, status)
}
func (r *ClaimRecorder) DeleteClaimMetric(claim string, namespace string) {
r.claimConditionGauge.DeletePartialMatch(map[string]string{
"name": claim,
"namespace": namespace,
})
r.claimResourcesGauge.DeletePartialMatch(map[string]string{
"name": claim,
"namespace": namespace,
})
}
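
The reworked recorder keeps one `claim_condition` series per claim (value 1 when the condition status is true) and cleans series up by claim name and namespace. A minimal sketch of how such a recorder's collectors can be hooked into the controller-runtime registry; the registration helper is hypothetical, and the chart's actual wiring is not shown in this diff:

```go
package metrics

import (
	crtlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

// mustRegisterClaimRecorder is a hypothetical helper: it registers the recorder's
// collectors with the controller-runtime Prometheus registry so the claim metrics
// are exposed on the manager's /metrics endpoint.
func mustRegisterClaimRecorder(r *ClaimRecorder) {
	crtlmetrics.Registry.MustRegister(r.Collectors()...)
}
```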

View File

@@ -8,15 +8,19 @@ import (
crtlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
capsulev1beta2 "github.com/projectcapsule/capsule/api/v1beta2"
"github.com/projectcapsule/capsule/pkg/api"
)
type ResourcePoolRecorder struct {
poolResource *prometheus.GaugeVec
poolResourceLimit *prometheus.GaugeVec
poolResourceAvailable *prometheus.GaugeVec
poolResourceUsage *prometheus.GaugeVec
poolResourceExhaustion *prometheus.GaugeVec
poolNamespaceResourceUsage *prometheus.GaugeVec
poolResource *prometheus.GaugeVec
poolResourceLimit *prometheus.GaugeVec
poolResourceAvailable *prometheus.GaugeVec
poolResourceUsage *prometheus.GaugeVec
poolResourceUsagePercentage *prometheus.GaugeVec
poolResourceExhaustion *prometheus.GaugeVec
poolResourceExhaustionPercentage *prometheus.GaugeVec
poolNamespaceResourceUsage *prometheus.GaugeVec
poolNamespaceResourceUsagePercentage *prometheus.GaugeVec
}
func MustMakeResourcePoolRecorder() *ResourcePoolRecorder {
@@ -36,6 +40,14 @@ func NewResourcePoolRecorder() *ResourcePoolRecorder {
},
[]string{"pool", "resource"},
),
poolResourceExhaustionPercentage: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricsPrefix,
Name: "pool_exhaustion_percentage",
Help: "Resources become exhausted, when there's not enough available for all claims and the claims get queued (Percentage)",
},
[]string{"pool", "resource"},
),
poolResource: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricsPrefix,
@@ -60,7 +72,14 @@ func NewResourcePoolRecorder() *ResourcePoolRecorder {
},
[]string{"pool", "resource"},
),
poolResourceUsagePercentage: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricsPrefix,
Name: "pool_usage_percentage",
Help: "Current resource usage for a given resource in a resource pool (percentage)",
},
[]string{"pool", "resource"},
),
poolResourceAvailable: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricsPrefix,
@@ -77,6 +96,14 @@ func NewResourcePoolRecorder() *ResourcePoolRecorder {
},
[]string{"pool", "target_namespace", "resource"},
),
poolNamespaceResourceUsagePercentage: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricsPrefix,
Name: "pool_namespace_usage_percentage",
Help: "Current resources claimed on namespace basis for a given resource in a resource pool for a specific namespace (percentage)",
},
[]string{"pool", "target_namespace", "resource"},
),
}
}
@@ -85,9 +112,12 @@ func (r *ResourcePoolRecorder) Collectors() []prometheus.Collector {
r.poolResource,
r.poolResourceLimit,
r.poolResourceUsage,
r.poolResourceUsagePercentage,
r.poolResourceAvailable,
r.poolResourceExhaustion,
r.poolResourceExhaustionPercentage,
r.poolNamespaceResourceUsage,
r.poolNamespaceResourceUsagePercentage,
}
}
@@ -124,11 +154,57 @@ func (r *ResourcePoolRecorder) ResourceUsageMetrics(pool *capsulev1beta2.Resourc
pool.Name,
resourceName.String(),
).Set(float64(available.MilliValue()) / 1000)
usagePercentage := float64(0)
if quantity.MilliValue() > 0 {
usagePercentage = (float64(claimed.MilliValue()) / float64(quantity.MilliValue())) * 100
}
r.poolResourceUsagePercentage.WithLabelValues(
pool.Name,
resourceName.String(),
).Set(usagePercentage)
}
r.resourceUsageMetricsByNamespace(pool)
}
// CalculateExhaustions emits exhaustion metrics for a pool and removes series for resources that are no longer exhausted.
func (r *ResourcePoolRecorder) CalculateExhaustions(
pool *capsulev1beta2.ResourcePool,
current map[string]api.PoolExhaustionResource,
) {
for resource := range pool.Status.Exhaustions {
if _, ok := current[resource]; ok {
continue
}
r.poolResourceExhaustion.DeleteLabelValues(pool.Name, resource)
r.poolResourceExhaustionPercentage.DeleteLabelValues(pool.Name, resource)
}
for resource, ex := range current {
available := float64(ex.Available.MilliValue()) / 1000
requesting := float64(ex.Requesting.MilliValue()) / 1000
r.poolResourceExhaustion.WithLabelValues(
pool.Name,
resource,
).Set(requesting)
// Calculate and expose overprovisioning percentage
if available > 0 && requesting > available {
percent := ((requesting - available) / available) * 100
r.poolResourceExhaustionPercentage.WithLabelValues(
pool.Name,
resource,
).Set(percent)
} else {
r.poolResourceExhaustionPercentage.DeleteLabelValues(pool.Name, resource)
}
}
}
// Delete all metrics for a namespace in a resource pool.
func (r *ResourcePoolRecorder) DeleteResourcePoolNamespaceMetric(pool string, namespace string) {
r.poolNamespaceResourceUsage.DeletePartialMatch(map[string]string{"pool": pool, "target_namespace": namespace})
@@ -147,7 +223,9 @@ func (r *ResourcePoolRecorder) cleanupAllMetricForLabels(labels map[string]strin
r.poolResourceLimit.DeletePartialMatch(labels)
r.poolResourceAvailable.DeletePartialMatch(labels)
r.poolResourceUsage.DeletePartialMatch(labels)
r.poolResourceUsagePercentage.DeletePartialMatch(labels)
r.poolNamespaceResourceUsage.DeletePartialMatch(labels)
r.poolNamespaceResourceUsagePercentage.DeletePartialMatch(labels)
r.poolResource.DeletePartialMatch(labels)
r.poolResourceExhaustion.DeletePartialMatch(labels)
}
@@ -163,6 +241,17 @@ func (r *ResourcePoolRecorder) resourceUsageMetricsByNamespace(pool *capsulev1be
namespace,
resourceName.String(),
).Set(float64(quantity.MilliValue()) / 1000)
available, ok := pool.Status.Allocation.Hard[resourceName]
if !ok || available.IsZero() {
continue
}
r.poolNamespaceResourceUsagePercentage.WithLabelValues(
pool.Name,
namespace,
resourceName.String(),
).Set((float64(quantity.MilliValue()) / float64(available.MilliValue())) * 100)
}
}
}
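
The two new percentage gauges boil down to simple arithmetic: usage is claimed over the hard quota, and the exhaustion percentage expresses how far queued requests exceed what is still claimable. A standalone sketch of the math with illustrative numbers:

```go
package main

import "fmt"

func main() {
	// Usage percentage: claimed / hard * 100, guarded against a zero quota.
	hard, claimed := 8.0, 6.0
	if hard > 0 {
		fmt.Println("usage %:", claimed/hard*100) // 75
	}

	// Exhaustion percentage: how far queued requests exceed what is still available.
	available, requesting := 4.0, 6.0
	if available > 0 && requesting > available {
		fmt.Println("exhaustion %:", (requesting-available)/available*100) // 50
	}
}
```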