From dd39e1a6d5ede6523f7e68e15fe919cdad23a5a0 Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Thu, 4 Dec 2025 09:57:45 +0200 Subject: [PATCH] feat(dra): support dra device classes (#1759) * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov * feat(dra): support dra device classes Signed-off-by: Hristo Hristov --------- Signed-off-by: Hristo Hristov --- api/v1beta2/tenant_status.go | 2 + api/v1beta2/tenant_types.go | 2 + api/v1beta2/zz_generated.deepcopy.go | 10 + charts/capsule/README.md | 7 + .../crds/capsule.clastix.io_tenants.yaml | 60 ++ .../validatingwebhookconfiguration.yaml | 38 ++ charts/capsule/values.schema.json | 49 ++ charts/capsule/values.yaml | 19 +- cmd/main.go | 2 + e2e/device_class_test.go | 535 ++++++++++++++++++ go.mod | 1 + go.sum | 2 + internal/controllers/tenant/manager.go | 10 + internal/controllers/tenant/status.go | 111 ++++ internal/webhook/dra/errors.go | 42 ++ 
internal/webhook/dra/validate.go | 109 ++++ internal/webhook/route/deviceclass.go | 24 + internal/webhook/utils/error.go | 4 + internal/webhook/utils/resources.go | 11 + 19 files changed, 1037 insertions(+), 1 deletion(-) create mode 100644 e2e/device_class_test.go create mode 100644 internal/webhook/dra/errors.go create mode 100644 internal/webhook/dra/validate.go create mode 100644 internal/webhook/route/deviceclass.go diff --git a/api/v1beta2/tenant_status.go b/api/v1beta2/tenant_status.go index 39900211..848d22ce 100644 --- a/api/v1beta2/tenant_status.go +++ b/api/v1beta2/tenant_status.go @@ -70,6 +70,8 @@ type TenantAvailableClassesStatus struct { RuntimeClasses []string `json:"runtime,omitempty"` // Available GatewayClasses GatewayClasses []string `json:"gateway,omitempty"` + // Available DeviceClasses + DeviceClasses []string `json:"device,omitempty"` } func (ms *TenantStatus) GetInstance(stat *TenantStatusNamespaceItem) *TenantStatusNamespaceItem { diff --git a/api/v1beta2/tenant_types.go b/api/v1beta2/tenant_types.go index f1434c80..1037dd6c 100644 --- a/api/v1beta2/tenant_types.go +++ b/api/v1beta2/tenant_types.go @@ -58,6 +58,8 @@ type TenantSpec struct { // A default value can be specified, and all the Pod resources created will inherit the declared class. // Optional. PriorityClasses *api.DefaultAllowedListSpec `json:"priorityClasses,omitempty"` + // Specifies options for the DeviceClass resources. + DeviceClasses *api.SelectorAllowedListSpec `json:"deviceClasses,omitempty"` // Specifies options for the GatewayClass resources. GatewayOptions GatewayOptions `json:"gatewayOptions,omitempty"` // Toggling the Tenant resources cordoning, when enable resources cannot be deleted. 
diff --git a/api/v1beta2/zz_generated.deepcopy.go b/api/v1beta2/zz_generated.deepcopy.go index 29eb6de7..c990987c 100644 --- a/api/v1beta2/zz_generated.deepcopy.go +++ b/api/v1beta2/zz_generated.deepcopy.go @@ -949,6 +949,11 @@ func (in *TenantAvailableClassesStatus) DeepCopyInto(out *TenantAvailableClasses *out = make([]string, len(*in)) copy(*out, *in) } + if in.DeviceClasses != nil { + in, out := &in.DeviceClasses, &out.DeviceClasses + *out = make([]string, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TenantAvailableClassesStatus. @@ -1275,6 +1280,11 @@ func (in *TenantSpec) DeepCopyInto(out *TenantSpec) { *out = new(api.DefaultAllowedListSpec) (*in).DeepCopyInto(*out) } + if in.DeviceClasses != nil { + in, out := &in.DeviceClasses, &out.DeviceClasses + *out = new(api.SelectorAllowedListSpec) + (*in).DeepCopyInto(*out) + } in.GatewayOptions.DeepCopyInto(&out.GatewayOptions) if in.ForceTenantPrefix != nil { in, out := &in.ForceTenantPrefix, &out.ForceTenantPrefix diff --git a/charts/capsule/README.md b/charts/capsule/README.md index 3e51462d..c7e9efac 100644 --- a/charts/capsule/README.md +++ b/charts/capsule/README.md @@ -182,6 +182,13 @@ The following Values have changed key or Value: | webhooks.hooks.defaults.ingress | object | `{}` | Deprecated, use webhooks.hooks.ingresses instead | | webhooks.hooks.defaults.pods | object | `{}` | Deprecated, use webhooks.hooks.pods instead | | webhooks.hooks.defaults.pvc | object | `{}` | Deprecated, use webhooks.hooks.persistentvolumeclaims instead | +| webhooks.hooks.devices.enabled | bool | `true` | Enable the Hook | +| webhooks.hooks.devices.failurePolicy | string | `"Fail"` | [FailurePolicy](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#failure-policy) | +| webhooks.hooks.devices.matchConditions | list | `[]` | 
[MatchConditions](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-matchpolicy) | +| webhooks.hooks.devices.matchPolicy | string | `"Equivalent"` | [MatchPolicy](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-matchpolicy) | +| webhooks.hooks.devices.namespaceSelector | object | `{"matchExpressions":[{"key":"capsule.clastix.io/tenant","operator":"Exists"}]}` | [NamespaceSelector](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-namespaceselector) | +| webhooks.hooks.devices.objectSelector | object | `{}` | [ObjectSelector](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-objectselector) | +| webhooks.hooks.devices.reinvocationPolicy | string | `"Never"` | [ReinvocationPolicy](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#reinvocation-policy) | | webhooks.hooks.gateways.enabled | bool | `true` | Enable the Hook | | webhooks.hooks.gateways.failurePolicy | string | `"Fail"` | [FailurePolicy](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#failure-policy) | | webhooks.hooks.gateways.matchConditions | list | `[]` | [MatchConditions](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-matchpolicy) | diff --git a/charts/capsule/crds/capsule.clastix.io_tenants.yaml b/charts/capsule/crds/capsule.clastix.io_tenants.yaml index 8e1a0164..127f54e6 100644 --- a/charts/capsule/crds/capsule.clastix.io_tenants.yaml +++ b/charts/capsule/crds/capsule.clastix.io_tenants.yaml @@ -1193,6 +1193,61 @@ spec: description: Toggling the Tenant resources cordoning, when enable resources cannot be deleted. type: boolean + deviceClasses: + description: Specifies options for the DeviceClass resources. 
+ properties: + allowed: + description: Match exact elements which are allowed as class names + within this tenant + items: + type: string + type: array + allowedRegex: + description: Match elements by regex (DEPRECATED) + type: string + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic forceTenantPrefix: description: |- Use this if you want to disable/enable the Tenant name prefix to specific Tenants, overriding global forceTenantPrefix in CapsuleConfiguration. 
@@ -2571,6 +2626,11 @@ spec: classes: description: Available Class Types within Tenant properties: + device: + description: Available DeviceClasses + items: + type: string + type: array gateway: description: Available GatewayClasses items: diff --git a/charts/capsule/templates/validatingwebhookconfiguration.yaml b/charts/capsule/templates/validatingwebhookconfiguration.yaml index 78d1408e..a192b618 100644 --- a/charts/capsule/templates/validatingwebhookconfiguration.yaml +++ b/charts/capsule/templates/validatingwebhookconfiguration.yaml @@ -44,6 +44,44 @@ webhooks: timeoutSeconds: {{ $.Values.webhooks.validatingWebhooksTimeoutSeconds }} {{- end }} {{- end }} +{{- with .Values.webhooks.hooks.devices }} + {{- if .enabled }} +- name: devices.projectcapsule.dev + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + {{- include "capsule.webhooks.service" (dict "path" "/devices" "ctx" $) | nindent 4 }} + failurePolicy: {{ .failurePolicy }} + matchPolicy: {{ .matchPolicy }} + {{- with .namespaceSelector }} + namespaceSelector: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .objectSelector }} + objectSelector: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .matchConditions }} + matchConditions: + {{- toYaml . 
| nindent 4 }} + {{- end }} + rules: + - apiGroups: + - resource.k8s.io + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - resourceclaimtemplates + - resourceclaims + scope: Namespaced + sideEffects: None + timeoutSeconds: {{ $.Values.webhooks.validatingWebhooksTimeoutSeconds }} + {{- end }} +{{- end }} {{- with .Values.webhooks.hooks.gateways }} {{- if .enabled }} - name: gateway.projectcapsule.dev diff --git a/charts/capsule/values.schema.json b/charts/capsule/values.schema.json index 03456a15..2725227d 100644 --- a/charts/capsule/values.schema.json +++ b/charts/capsule/values.schema.json @@ -882,6 +882,55 @@ } } }, + "devices": { + "type": "object", + "properties": { + "enabled": { + "description": "Enable the Hook", + "type": "boolean" + }, + "failurePolicy": { + "description": "[FailurePolicy](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#failure-policy)", + "type": "string" + }, + "matchConditions": { + "description": "[MatchConditions](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-matchpolicy)", + "type": "array" + }, + "matchPolicy": { + "description": "[MatchPolicy](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-matchpolicy)", + "type": "string" + }, + "namespaceSelector": { + "description": "[NamespaceSelector](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-namespaceselector)", + "type": "object", + "properties": { + "matchExpressions": { + "type": "array", + "items": { + "type": "object", + "properties": { + "key": { + "type": "string" + }, + "operator": { + "type": "string" + } + } + } + } + } + }, + "objectSelector": { + "description": "[ObjectSelector](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-objectselector)", + "type": "object" + }, + 
"reinvocationPolicy": { + "description": "[ReinvocationPolicy](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#reinvocation-policy)", + "type": "string" + } + } + }, "gateways": { "type": "object", "properties": { diff --git a/charts/capsule/values.yaml b/charts/capsule/values.yaml index be0daa0b..6cf51092 100644 --- a/charts/capsule/values.yaml +++ b/charts/capsule/values.yaml @@ -565,7 +565,24 @@ webhooks: matchConditions: [] # -- [ReinvocationPolicy](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#reinvocation-policy) reinvocationPolicy: Never - + devices: + # -- Enable the Hook + enabled: true + # -- [FailurePolicy](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#failure-policy) + failurePolicy: Fail + # -- [MatchPolicy](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-matchpolicy) + matchPolicy: Equivalent + # -- [ObjectSelector](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-objectselector) + objectSelector: {} + # -- [NamespaceSelector](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-namespaceselector) + namespaceSelector: + matchExpressions: + - key: capsule.clastix.io/tenant + operator: Exists + # -- [MatchConditions](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#matching-requests-matchpolicy) + matchConditions: [] + # -- [ReinvocationPolicy](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/#reinvocation-policy) + reinvocationPolicy: Never networkpolicies: # -- Enable the Hook enabled: true diff --git a/cmd/main.go b/cmd/main.go index 96897c21..4e4c638a 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -44,6 +44,7 @@ import ( "github.com/projectcapsule/capsule/internal/metrics" 
"github.com/projectcapsule/capsule/internal/webhook" "github.com/projectcapsule/capsule/internal/webhook/defaults" + "github.com/projectcapsule/capsule/internal/webhook/dra" "github.com/projectcapsule/capsule/internal/webhook/gateway" "github.com/projectcapsule/capsule/internal/webhook/ingress" "github.com/projectcapsule/capsule/internal/webhook/misc" @@ -267,6 +268,7 @@ func main() { ), route.CustomResources(tenantvalidation.ResourceCounterHandler(manager.GetClient())), route.Gateway(gateway.Class(cfg)), + route.DeviceClass(dra.DeviceClass()), route.Defaults(defaults.Handler(cfg, kubeVersion)), route.TenantMutation( tenantmutation.MetaHandler(), diff --git a/e2e/device_class_test.go b/e2e/device_class_test.go new file mode 100644 index 00000000..ffd6839d --- /dev/null +++ b/e2e/device_class_test.go @@ -0,0 +1,535 @@ +// Copyright 2020-2023 Project Capsule Authors. +// SPDX-License-Identifier: Apache-2.0 + +package e2e + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + capsulev1beta2 "github.com/projectcapsule/capsule/api/v1beta2" + "github.com/projectcapsule/capsule/pkg/api" + resources "k8s.io/api/resource/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/selection" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var _ = Describe("when Tenant handles Device classes", Label("tenant", "classes", "device"), func() { + erm := "nvidia.com/gpu" + authorized := &resources.DeviceClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu.example.com", + Labels: map[string]string{ + "env": "authorized", + }, + }, + Spec: resources.DeviceClassSpec{ + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + ExtendedResourceName: &erm, + }, + } + authorized2 
:= &resources.DeviceClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu2.example.com", + Labels: map[string]string{ + "env": "authorized", + }, + }, + Spec: resources.DeviceClassSpec{ + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + ExtendedResourceName: &erm, + }, + } + unauthorized := &resources.DeviceClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: "gpu3.example.com", + Labels: map[string]string{ + "env": "unauthorized", + }, + }, + Spec: resources.DeviceClassSpec{ + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + ExtendedResourceName: &erm, + }, + } + + tntWithAuthorized := &capsulev1beta2.Tenant{ + ObjectMeta: metav1.ObjectMeta{ + Name: "e2e-authorized-deviceclass", + }, + Spec: capsulev1beta2.TenantSpec{ + Owners: []api.OwnerSpec{ + { + CoreOwnerSpec: api.CoreOwnerSpec{ + UserSpec: api.UserSpec{ + Name: "authorized-deviceclass", + Kind: "User", + }, + }, + }, + }, + DeviceClasses: &api.SelectorAllowedListSpec{ + LabelSelector: v1.LabelSelector{ + MatchLabels: map[string]string{ + "env": "authorized", + }, + }, + }, + }, + } + tntWithUnauthorized := &capsulev1beta2.Tenant{ + ObjectMeta: metav1.ObjectMeta{ + Name: "e2e-unauthorized-deviceclass", + }, + Spec: capsulev1beta2.TenantSpec{ + Owners: []api.OwnerSpec{ + { + CoreOwnerSpec: api.CoreOwnerSpec{ + UserSpec: api.UserSpec{ + Name: "unauthorized-deviceclass", + Kind: "User", + }, + }, + }, + }, + DeviceClasses: &api.SelectorAllowedListSpec{ + LabelSelector: v1.LabelSelector{ + MatchLabels: map[string]string{ + "env": "production", + }, + }, + }, + }, + } + + JustBeforeEach(func() { + for _, tnt := range []*capsulev1beta2.Tenant{tntWithAuthorized, tntWithUnauthorized} { + tnt.ResourceVersion = "" + 
EventuallyCreation(func() error { + return k8sClient.Create(context.TODO(), tnt) + }).Should(Succeed()) + } + for _, crd := range []*resources.DeviceClass{authorized, authorized2, unauthorized} { + crd.ResourceVersion = "" + EventuallyCreation(func() error { + return k8sClient.Create(context.TODO(), crd) + }).Should(Succeed()) + } + }) + JustAfterEach(func() { + for _, tnt := range []*capsulev1beta2.Tenant{tntWithAuthorized, tntWithUnauthorized} { + EventuallyCreation(func() error { + return ignoreNotFound(k8sClient.Delete(context.TODO(), tnt)) + }).Should(Succeed()) + } + + Eventually(func() (err error) { + req, _ := labels.NewRequirement("env", selection.Exists, nil) + + return k8sClient.DeleteAllOf(context.TODO(), &resources.DeviceClass{}, &client.DeleteAllOfOptions{ + ListOptions: client.ListOptions{ + LabelSelector: labels.NewSelector().Add(*req), + }, + }) + }, defaultTimeoutInterval, defaultPollInterval).Should(Succeed()) + }) + It("ResourceClaims", func() { + By("Verify Status (Creation)", func() { + Eventually(func() ([]string, error) { + t := &capsulev1beta2.Tenant{} + if err := k8sClient.Get( + context.TODO(), + types.NamespacedName{Name: tntWithAuthorized.GetName()}, + t, + ); err != nil { + return nil, err + } + + return t.Status.Classes.DeviceClasses, nil + }, defaultTimeoutInterval, defaultPollInterval). 
+ Should(ConsistOf(authorized.GetName(), authorized2.GetName())) + }) + + ns := NewNamespace("") + NamespaceCreation(ns, tntWithAuthorized.Spec.Owners[0].UserSpec, defaultTimeoutInterval).Should(Succeed()) + TenantNamespaceList(tntWithAuthorized, defaultTimeoutInterval).Should(ContainElement(ns.GetName())) + + By("providing authorized device class", func() { + for _, class := range []*resources.DeviceClass{authorized} { + Eventually(func() (err error) { + g := &resources.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: class.GetName() + "-resource-claim", + Namespace: ns.GetName(), + }, + Spec: resources.ResourceClaimSpec{ + Devices: resources.DeviceClaim{ + Requests: []resources.DeviceRequest{ + { + Name: "authorized-device-class-resource-claim", + Exactly: &resources.ExactDeviceRequest{ + DeviceClassName: "gpu.example.com", + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + }, + }, + }, + }, + }, + } + + err = k8sClient.Create(context.TODO(), g) + return + }, defaultTimeoutInterval, defaultPollInterval).Should(Succeed()) + } + }) + + By("providing unauthorized device class", func() { + for _, class := range []*resources.DeviceClass{unauthorized} { + Eventually(func() (err error) { + g := &resources.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: class.GetName() + "-resource-claim", + Namespace: ns.GetName(), + }, + Spec: resources.ResourceClaimSpec{ + Devices: resources.DeviceClaim{ + Requests: []resources.DeviceRequest{ + { + Name: "unauthorized-device-class-resource-claim", + Exactly: &resources.ExactDeviceRequest{ + DeviceClassName: "gpu3.example.com", + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + }, + }, + }, + }, + }, + } + + err = 
k8sClient.Create(context.TODO(), g) + return + }, defaultTimeoutInterval, defaultPollInterval).ShouldNot(Succeed()) + } + }) + + By("providing non-existent device class", func() { + for _, class := range []*resources.DeviceClass{unauthorized} { + Eventually(func() (err error) { + g := &resources.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: class.GetName() + "-resource-claim", + Namespace: ns.GetName(), + }, + Spec: resources.ResourceClaimSpec{ + Devices: resources.DeviceClaim{ + Requests: []resources.DeviceRequest{ + { + Name: "missing-device-class-resource-claim", + Exactly: &resources.ExactDeviceRequest{ + DeviceClassName: "gpu53.example.com", + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + }, + }, + }, + }, + }, + } + + err = k8sClient.Create(context.TODO(), g) + return + }, defaultTimeoutInterval, defaultPollInterval).ShouldNot(Succeed()) + } + }) + + By("Verify Status (Deletion)", func() { + for _, class := range []*resources.DeviceClass{authorized} { + Expect(ignoreNotFound(k8sClient.Delete(context.TODO(), class))).To(Succeed()) + } + + Eventually(func() ([]string, error) { + t := &capsulev1beta2.Tenant{} + if err := k8sClient.Get( + context.TODO(), + types.NamespacedName{Name: tntWithAuthorized.GetName()}, + t, + ); err != nil { + return nil, err + } + + return t.Status.Classes.DeviceClasses, nil + }, defaultTimeoutInterval, defaultPollInterval). 
+ ShouldNot(ConsistOf(authorized.GetName(), authorized2.GetName())) + }) + }) + It("ResourceClaimTemplates", func() { + + ns := NewNamespace("") + NamespaceCreation(ns, tntWithAuthorized.Spec.Owners[0].UserSpec, defaultTimeoutInterval).Should(Succeed()) + TenantNamespaceList(tntWithAuthorized, defaultTimeoutInterval).Should(ContainElement(ns.GetName())) + + By("providing authorized device class", func() { + for _, class := range []*resources.DeviceClass{authorized} { + Eventually(func() (err error) { + g := &resources.ResourceClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: class.GetName() + "-resource-claim", + Namespace: ns.GetName(), + }, + Spec: resources.ResourceClaimTemplateSpec{ + Spec: resources.ResourceClaimSpec{ + Devices: resources.DeviceClaim{ + Requests: []resources.DeviceRequest{ + { + Name: "authorized-device-class-resource-claim", + Exactly: &resources.ExactDeviceRequest{ + DeviceClassName: "gpu.example.com", + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + err = k8sClient.Create(context.TODO(), g) + return + }, defaultTimeoutInterval, defaultPollInterval).Should(Succeed()) + } + }) + + By("providing unauthorized device class", func() { + for _, class := range []*resources.DeviceClass{unauthorized} { + Eventually(func() (err error) { + g := &resources.ResourceClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: class.GetName() + "-resource-claim", + Namespace: ns.GetName(), + }, + Spec: resources.ResourceClaimTemplateSpec{ + Spec: resources.ResourceClaimSpec{ + Devices: resources.DeviceClaim{ + Requests: []resources.DeviceRequest{ + { + Name: "unauthorized-device-class-resource-claim", + Exactly: &resources.ExactDeviceRequest{ + DeviceClassName: "gpu3.example.com", + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + 
Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + err = k8sClient.Create(context.TODO(), g) + return + }, defaultTimeoutInterval, defaultPollInterval).ShouldNot(Succeed()) + } + }) + + By("providing both authorized and unauthorized device classes", func() { + for _, class := range []*resources.DeviceClass{unauthorized} { + Eventually(func() (err error) { + g := &resources.ResourceClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: class.GetName() + "-resource-claim", + Namespace: ns.GetName(), + }, + Spec: resources.ResourceClaimTemplateSpec{ + Spec: resources.ResourceClaimSpec{ + Devices: resources.DeviceClaim{ + Requests: []resources.DeviceRequest{ + { + Name: "unauthorized-device-class-resource-claim", + Exactly: &resources.ExactDeviceRequest{ + DeviceClassName: "gpu3.example.com", + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + }, + }, + { + Name: "authorized-device-class-resource-claim", + Exactly: &resources.ExactDeviceRequest{ + DeviceClassName: "gpu.example.com", + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + err = k8sClient.Create(context.TODO(), g) + return + }, defaultTimeoutInterval, defaultPollInterval).ShouldNot(Succeed()) + } + }) + + By("providing authorized and missing device classes", func() { + for _, class := range []*resources.DeviceClass{unauthorized} { + Eventually(func() (err error) { + g := &resources.ResourceClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: class.GetName() + "-resource-claim", + Namespace: ns.GetName(), + }, + Spec: resources.ResourceClaimTemplateSpec{ + Spec: 
resources.ResourceClaimSpec{ + Devices: resources.DeviceClaim{ + Requests: []resources.DeviceRequest{ + { + Name: "missing-device-class-resource-claim", + Exactly: &resources.ExactDeviceRequest{ + DeviceClassName: "gpu63.example.com", + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + }, + }, + { + Name: "authorized-device-class-resource-claim", + Exactly: &resources.ExactDeviceRequest{ + DeviceClassName: "gpu.example.com", + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + err = k8sClient.Create(context.TODO(), g) + return + }, defaultTimeoutInterval, defaultPollInterval).ShouldNot(Succeed()) + } + }) + + By("providing two authorized device classes", func() { + for _, class := range []*resources.DeviceClass{unauthorized} { + Eventually(func() (err error) { + g := &resources.ResourceClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: class.GetName() + "-resource-claim", + Namespace: ns.GetName(), + }, + Spec: resources.ResourceClaimTemplateSpec{ + Spec: resources.ResourceClaimSpec{ + Devices: resources.DeviceClaim{ + Requests: []resources.DeviceRequest{ + { + Name: "unauthorized-device-class-resource-claim", + Exactly: &resources.ExactDeviceRequest{ + DeviceClassName: "gpu2.example.com", + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: "device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + }, + }, + { + Name: "authorized-device-class-resource-claim", + Exactly: &resources.ExactDeviceRequest{ + DeviceClassName: "gpu.example.com", + Selectors: []resources.DeviceSelector{ + { + CEL: &resources.CELDeviceSelector{ + Expression: 
"device.driver == 'gpu.example.com' && device.attributes['gpu.example.com'].type == 'gpu'", + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + err = k8sClient.Create(context.TODO(), g) + return + }, defaultTimeoutInterval, defaultPollInterval).ShouldNot(Succeed()) + } + }) + }) +}) diff --git a/go.mod b/go.mod index c35e155b..f5284a09 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,7 @@ require ( k8s.io/apimachinery v0.34.2 k8s.io/apiserver v0.34.2 k8s.io/client-go v0.34.2 + k8s.io/dynamic-resource-allocation v0.34.2 k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 sigs.k8s.io/cluster-api v1.11.3 sigs.k8s.io/controller-runtime v0.22.4 diff --git a/go.sum b/go.sum index 6e61dee2..83528aa3 100644 --- a/go.sum +++ b/go.sum @@ -319,6 +319,8 @@ k8s.io/cluster-bootstrap v0.33.3 h1:u2NTxJ5CFSBFXaDxLQoOWMly8eni31psVso+caq6uwI= k8s.io/cluster-bootstrap v0.33.3/go.mod h1:p970f8u8jf273zyQ5raD8WUu2XyAl0SAWOY82o7i/ds= k8s.io/component-base v0.34.2 h1:HQRqK9x2sSAsd8+R4xxRirlTjowsg6fWCPwWYeSvogQ= k8s.io/component-base v0.34.2/go.mod h1:9xw2FHJavUHBFpiGkZoKuYZ5pdtLKe97DEByaA+hHbM= +k8s.io/dynamic-resource-allocation v0.34.2 h1:SjlRGSWl6CZXoJwQNL+Y0wRfdH8PkJ4mHRNK6MMj0bY= +k8s.io/dynamic-resource-allocation v0.34.2/go.mod h1:ul6I+gfrCmC+OCuVdN0/iykyB2sPrIqh2WyKQ3RQPCU= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= diff --git a/internal/controllers/tenant/manager.go b/internal/controllers/tenant/manager.go index 4285c967..1f1e3e9c 100644 --- a/internal/controllers/tenant/manager.go +++ b/internal/controllers/tenant/manager.go @@ -12,6 +12,7 @@ import ( networkingv1 "k8s.io/api/networking/v1" nodev1 "k8s.io/api/node/v1" rbacv1 "k8s.io/api/rbac/v1" + resources "k8s.io/api/resource/v1" schedulingv1 "k8s.io/api/scheduling/v1" storagev1 "k8s.io/api/storage/v1" apierrors 
"k8s.io/apimachinery/pkg/api/errors" @@ -72,6 +73,15 @@ func (r *Manager) SetupWithManager(mgr ctrl.Manager, ctrlConfig utils.Controller &corev1.Namespace{}, handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &capsulev1beta2.Tenant{}), ). + Watches( + &resources.DeviceClass{}, + r.statusOnlyHandlerClasses( + r.reconcileClassStatus, + r.collectAvailableDeviceClasses, + "cannot collect device classes", + ), + builder.WithPredicates(utils.UpdatedMetadataPredicate), + ). Watches( &storagev1.StorageClass{}, r.statusOnlyHandlerClasses( diff --git a/internal/controllers/tenant/status.go b/internal/controllers/tenant/status.go index ac595a65..dcdbe623 100644 --- a/internal/controllers/tenant/status.go +++ b/internal/controllers/tenant/status.go @@ -9,6 +9,7 @@ import ( "sort" nodev1 "k8s.io/api/node/v1" + resources "k8s.io/api/resource/v1" schedulingv1 "k8s.io/api/scheduling/v1" storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/api/meta" @@ -72,6 +73,14 @@ func (r Manager) reconcileClassStatus( func (r *Manager) collectAvailableResources(ctx context.Context, tnt *capsulev1beta2.Tenant) (err error) { log := log.FromContext(ctx) + log.V(5).Info("collecting available deviceclasses") + + if err = r.collectAvailableDeviceClasses(ctx, tnt); err != nil { + return err + } + + log.V(5).Info("collected available deviceclasses", "size", len(tnt.Status.Classes.DeviceClasses)) + log.V(5).Info("collecting available storageclasses") if err = r.collectAvailableStorageClasses(ctx, tnt); err != nil { @@ -101,6 +110,19 @@ func (r *Manager) collectAvailableResources(ctx context.Context, tnt *capsulev1b return nil } +func (r *Manager) collectAvailableDeviceClasses(ctx context.Context, tnt *capsulev1beta2.Tenant) (err error) { + if tnt.Status.Classes.DeviceClasses, err = listObjectNamesBySelector2( + ctx, + r.Client, + tnt.Spec.DeviceClasses, + &resources.DeviceClassList{}, + ); err != nil { + return err + } + + return nil +} + func (r *Manager) 
collectAvailableStorageClasses(ctx context.Context, tnt *capsulev1beta2.Tenant) (err error) {
 	if tnt.Status.Classes.StorageClasses, err = listObjectNamesBySelector(
 		ctx,
@@ -246,3 +268,92 @@ func listObjectNamesBySelector(
 
 	return objects, nil
 }
+
+// listObjectNamesBySelector2 lists objects into list and returns, sorted, the names that the
+// allowed spec admits. A nil spec, or one with neither a label selector nor exact names,
+// admits every listed object. Otherwise the result is the union of the objects matching the
+// label selector and the exact names that are present in the listing.
+func listObjectNamesBySelector2(
+	ctx context.Context,
+	c client.Client,
+	allowed *api.SelectorAllowedListSpec,
+	list client.ObjectList,
+	opts ...client.ListOption,
+) ([]string, error) {
+	if err := c.List(ctx, list, opts...); err != nil {
+		return nil, err
+	}
+
+	items, err := meta.ExtractList(list)
+	if err != nil {
+		return nil, err
+	}
+
+	names := make([]string, 0)
+
+	useSelector := allowed != nil &&
+		(len(allowed.MatchLabels) > 0 || len(allowed.MatchExpressions) > 0)
+
+	// Nothing to filter on: report every listed object.
+	if allowed == nil || (!useSelector && len(allowed.Exact) == 0) {
+		for _, item := range items {
+			acc, accErr := meta.Accessor(item)
+			if accErr != nil {
+				return nil, accErr
+			}
+
+			names = append(names, acc.GetName())
+		}
+
+		sort.Strings(names)
+
+		return names, nil
+	}
+
+	// The selector is only built when the spec actually carries one.
+	var selector labels.Selector
+	if useSelector {
+		if selector, err = metav1.LabelSelectorAsSelector(&allowed.LabelSelector); err != nil {
+			return nil, err
+		}
+	}
+
+	// Exact names only count when an object with that name was listed.
+	wanted := make(map[string]struct{}, len(allowed.Exact))
+	for _, name := range allowed.Exact {
+		wanted[name] = struct{}{}
+	}
+
+	picked := make(map[string]struct{})
+
+	for _, item := range items {
+		acc, accErr := meta.Accessor(item)
+		if accErr != nil {
+			return nil, accErr
+		}
+
+		name := acc.GetName()
+
+		_, exact := wanted[name]
+		if exact || (useSelector && selector.Matches(labels.Set(acc.GetLabels()))) {
+			picked[name] = struct{}{}
+		}
+	}
+
+	for name := range picked {
+		names = append(names, name)
+	}
+
+	sort.Strings(names)
+
+	return names, nil
+}
diff --git a/internal/webhook/dra/errors.go b/internal/webhook/dra/errors.go
new file
mode 100644
index 00000000..226589aa
--- /dev/null
+++ b/internal/webhook/dra/errors.go
@@ -0,0 +1,42 @@
+// Copyright 2020-2025 Project Capsule Authors
+// SPDX-License-Identifier: Apache-2.0
+
+package dra
+
+import (
+	"fmt"
+
+	"github.com/projectcapsule/capsule/internal/webhook/utils"
+	"github.com/projectcapsule/capsule/pkg/api"
+)
+
+// deviceClassForbiddenError carries the name of a rejected DeviceClass together with the
+// Tenant's allow-list spec used to render the error message.
+type deviceClassForbiddenError struct {
+	deviceClassName string
+	spec            api.SelectorAllowedListSpec
+}
+
+// Error names the rejected DeviceClass and lets utils.AllowedValuesErrorMessage complete the text.
+func (e deviceClassForbiddenError) Error() string {
+	prefix := fmt.Sprintf("Device Class %s is forbidden for the current Tenant: ", e.deviceClassName)
+
+	return utils.AllowedValuesErrorMessage(e.spec, prefix)
+}
+
+// NewDeviceClassForbidden builds the error for a DeviceClass named class that spec does not allow.
+func NewDeviceClassForbidden(class string, spec api.SelectorAllowedListSpec) error {
+	return &deviceClassForbiddenError{
+		deviceClassName: class,
+		spec:            spec,
+	}
+}
+
+// deviceClassUndefinedError carries the Tenant's allow-list spec for the case where the
+// selected DeviceClass is forbidden or does not exist.
+type deviceClassUndefinedError struct {
+	spec api.SelectorAllowedListSpec
+}
+
+// NewDeviceClassUndefined builds the error for a forbidden or non-existent DeviceClass.
+func NewDeviceClassUndefined(spec api.SelectorAllowedListSpec) error {
+	return &deviceClassUndefinedError{
+		spec: spec,
+	}
+}
+
+func (i deviceClassUndefinedError) Error() string {
+	return utils.AllowedValuesErrorMessage(i.spec, "Selected DeviceClass is forbidden for the current Tenant or does not exist.
Specify a device Class which is allowed by ")
+}
diff --git a/internal/webhook/dra/validate.go b/internal/webhook/dra/validate.go
new file mode 100644
index 00000000..e034b277
--- /dev/null
+++ b/internal/webhook/dra/validate.go
@@ -0,0 +1,109 @@
+// Copyright 2020-2025 Project Capsule Authors
+// SPDX-License-Identifier: Apache-2.0
+
+package dra
+
+import (
+	"context"
+	"net/http"
+
+	corev1 "k8s.io/api/core/v1"
+	resources "k8s.io/api/resource/v1"
+	k8serrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/client-go/tools/record"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
+
+	capsulewebhook "github.com/projectcapsule/capsule/internal/webhook"
+	"github.com/projectcapsule/capsule/internal/webhook/utils"
+	"github.com/projectcapsule/capsule/pkg/utils/tenant"
+)
+
+// deviceClass is the stateless admission handler for device requests.
+type deviceClass struct{}
+
+// DeviceClass returns the handler as a capsulewebhook.Handler.
+func DeviceClass() capsulewebhook.Handler {
+	return new(deviceClass)
+}
+
+// OnCreate decodes a ResourceClaim or ResourceClaimTemplate and hands its device requests to
+// validateResourceRequest. Any other kind is passed through with a nil response.
+func (h *deviceClass) OnCreate(c client.Client, decoder admission.Decoder, recorder record.EventRecorder) capsulewebhook.Func {
+	return func(ctx context.Context, req admission.Request) *admission.Response {
+		var (
+			namespace string
+			requests  []resources.DeviceRequest
+		)
+
+		switch req.Kind.Kind {
+		case "ResourceClaim":
+			claim := &resources.ResourceClaim{}
+			if err := decoder.Decode(req, claim); err != nil {
+				return utils.ErroredResponse(err)
+			}
+
+			namespace, requests = claim.Namespace, claim.Spec.Devices.Requests
+		case "ResourceClaimTemplate":
+			template := &resources.ResourceClaimTemplate{}
+			if err := decoder.Decode(req, template); err != nil {
+				return utils.ErroredResponse(err)
+			}
+
+			namespace, requests = template.Namespace, template.Spec.Spec.Devices.Requests
+		default:
+			return nil
+		}
+
+		return h.validateResourceRequest(ctx, c, decoder, recorder, req, namespace, requests)
+	}
+}
+
+// OnDelete performs no validation: the returned function always yields a nil response.
+func (h *deviceClass) OnDelete(client.Client, admission.Decoder, record.EventRecorder) capsulewebhook.Func {
+	noop := func(context.Context, admission.Request) *admission.Response {
+		return nil
+	}
+
+	return noop
+}
+
+func (h *deviceClass)
OnUpdate(client.Client, admission.Decoder, record.EventRecorder) capsulewebhook.Func {
+	return func(context.Context, admission.Request) *admission.Response {
+		return nil
+	}
+}
+
+// validateResourceRequest checks every DeviceClass referenced by requests against the
+// allow-list of the Tenant owning namespace.
+//
+// It returns nil (no objection) when the namespace belongs to no Tenant, when the Tenant has
+// no spec.deviceClasses, or when every referenced class is allowed by exact name or selector.
+// It returns a denied response, and records a warning event on the Tenant, for the first
+// class that does not exist or is not allowed. Lookup failures other than NotFound produce an
+// errored response.
+func (h *deviceClass) validateResourceRequest(ctx context.Context, c client.Client, _ admission.Decoder, recorder record.EventRecorder, req admission.Request, namespace string, requests []resources.DeviceRequest) *admission.Response {
+	tnt, err := tenant.TenantByStatusNamespace(ctx, c, namespace)
+	if err != nil {
+		return utils.ErroredResponse(err)
+	}
+
+	if tnt == nil {
+		return nil
+	}
+
+	allowed := tnt.Spec.DeviceClasses
+	if allowed == nil {
+		return nil
+	}
+
+	for _, dr := range requests {
+		for _, className := range requestedDeviceClassNames(dr) {
+			dc, err := utils.GetDeviceClassByName(ctx, c, className)
+			if err != nil && !k8serrors.IsNotFound(err) {
+				response := admission.Errored(http.StatusInternalServerError, err)
+
+				return &response
+			}
+
+			if dc == nil {
+				recorder.Eventf(tnt, corev1.EventTypeWarning, "MissingDeviceClass", "%s %s/%s is missing DeviceClass", req.Kind.Kind, req.Namespace, req.Name)
+
+				response := admission.Denied(NewDeviceClassUndefined(*allowed).Error())
+
+				return &response
+			}
+
+			// An allowed class must not end the validation: keep checking the
+			// remaining classes so a later forbidden one is still rejected.
+			if allowed.Match(dc.Name) || allowed.SelectorMatch(dc) {
+				continue
+			}
+
+			recorder.Eventf(tnt, corev1.EventTypeWarning, "ForbiddenDeviceClass", "%s %s/%s DeviceClass %s is forbidden for the current Tenant", req.Kind.Kind, req.Namespace, req.Name, dc.Name)
+
+			response := admission.Denied(NewDeviceClassForbidden(dc.Name, *allowed).Error())
+
+			return &response
+		}
+	}
+
+	return nil
+}
+
+// requestedDeviceClassNames returns every DeviceClass name a request refers to: the one in
+// Exactly when that pointer is set, plus one per FirstAvailable sub-request.
+func requestedDeviceClassNames(dr resources.DeviceRequest) []string {
+	names := make([]string, 0, 1+len(dr.FirstAvailable))
+
+	// Exactly is nil for requests expressed through FirstAvailable.
+	if dr.Exactly != nil {
+		names = append(names, dr.Exactly.DeviceClassName)
+	}
+
+	for i := range dr.FirstAvailable {
+		names = append(names, dr.FirstAvailable[i].DeviceClassName)
+	}
+
+	return names
+}
diff --git a/internal/webhook/route/deviceclass.go b/internal/webhook/route/deviceclass.go
new file mode 100644
index 00000000..eb90b679
--- /dev/null
+++ b/internal/webhook/route/deviceclass.go
@@ -0,0 +1,24 @@
+// Copyright 2020-2025 Project Capsule Authors
+// SPDX-License-Identifier: Apache-2.0
+
+package route
+
+import (
+	capsulewebhook
"github.com/projectcapsule/capsule/internal/webhook" +) + +type deviceClass struct { + handlers []capsulewebhook.Handler +} + +func DeviceClass(handler ...capsulewebhook.Handler) capsulewebhook.Webhook { + return &deviceClass{handlers: handler} +} + +func (w *deviceClass) GetHandlers() []capsulewebhook.Handler { + return w.handlers +} + +func (w *deviceClass) GetPath() string { + return "/devices" +} diff --git a/internal/webhook/utils/error.go b/internal/webhook/utils/error.go index 5c8b5f79..f10f38c1 100644 --- a/internal/webhook/utils/error.go +++ b/internal/webhook/utils/error.go @@ -20,6 +20,10 @@ func ErroredResponse(err error) *admission.Response { } func DefaultAllowedValuesErrorMessage(allowed api.DefaultAllowedListSpec, err string) string { + return AllowedValuesErrorMessage(allowed.SelectorAllowedListSpec, err) +} + +func AllowedValuesErrorMessage(allowed api.SelectorAllowedListSpec, err string) string { var extra []string if len(allowed.Exact) > 0 { extra = append(extra, fmt.Sprintf("use one from the following list (%s)", strings.Join(allowed.Exact, ", "))) diff --git a/internal/webhook/utils/resources.go b/internal/webhook/utils/resources.go index 974acc7b..b9897075 100644 --- a/internal/webhook/utils/resources.go +++ b/internal/webhook/utils/resources.go @@ -9,6 +9,7 @@ import ( networkingv1 "k8s.io/api/networking/v1" networkingv1beta1 "k8s.io/api/networking/v1beta1" + resources "k8s.io/api/resource/v1" schedulev1 "k8s.io/api/scheduling/v1" storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/types" @@ -77,6 +78,16 @@ func GetGatewayClassClassByObjectName(ctx context.Context, c client.Client, gate return gatewayClass, nil } +// Get DeviceClass by name (Does not return error if not found). 
+func GetDeviceClassByName(ctx context.Context, c client.Client, name string) (*resources.DeviceClass, error) {
+	class := &resources.DeviceClass{}
+	if err := c.Get(ctx, types.NamespacedName{Name: name}, class); err != nil {
+		// Honor the documented contract: a missing DeviceClass yields (nil, nil),
+		// while any other lookup failure is returned unchanged.
+		return nil, client.IgnoreNotFound(err)
+	}
+
+	return class, nil
+}
+
 // IsDefaultPriorityClass checks if the given PriorityClass is cluster default.
 func IsDefaultPriorityClass(class *schedulev1.PriorityClass) bool {
 	if class != nil {