# Setup MultiKueue with Open Cluster Management

This guide demonstrates how to use the external OCM [Kueue Admission Check Controller](https://kueue.sigs.k8s.io/docs/concepts/admission_check/), which integrates OCM `Placement` results with [MultiKueue](https://kueue.sigs.k8s.io/docs/concepts/multikueue/) for intelligent multi-cluster job scheduling.

The controller reads OCM `Placement` decisions and generates the corresponding `MultiKueueConfig` and `MultiKueueCluster` resources, streamlining the setup of the [MultiKueue](https://kueue.sigs.k8s.io/docs/concepts/multikueue/) environment and enabling users to select clusters based on custom criteria.

REF: [Setup a MultiKueue environment](https://kueue.sigs.k8s.io/docs/tasks/manage/setup_multikueue/)

1. A Kubernetes environment with OCM installed on a hub cluster and at least three managed clusters.
2. [Kueue](https://kueue.sigs.k8s.io/docs/installation/) deployed across all clusters.
3. [Managed-serviceaccount](https://github.com/open-cluster-management-io/managed-serviceaccount), [cluster-permission](https://github.com/open-cluster-management-io/cluster-permission) and [resource-usage-collect-addon](https://github.com/open-cluster-management-io/addon-contrib/tree/main/resource-usage-collect-addon) installed on managed clusters.
4. [Kueue-addon](https://github.com/open-cluster-management-io/addon-contrib/tree/main/kueue-addon) installed on managed clusters.

You can set up all of the above by running the following command (ensure [clusteradm](https://github.com/open-cluster-management-io/clusteradm) is already installed):

```bash
./setup-env.sh
```

After that, you can verify your setup.

- Check the managed clusters.

```bash
kubectl get mcl
NAME            HUB ACCEPTED   MANAGED CLUSTER URLS                       JOINED   AVAILABLE   AGE
cluster1        true           https://cluster1-control-plane:6443        True     True        11m
cluster2        true           https://cluster2-control-plane:6443        True     True        10m
cluster3        true           https://cluster3-control-plane:6443        True     True        10m
local-cluster   true           https://local-cluster-control-plane:6443   True     True        11m
```

- Verify the installed addons.

```bash
kubectl get mca -A
NAMESPACE       NAME                     AVAILABLE   DEGRADED   PROGRESSING
cluster1        kueue-addon              True                   False
cluster1        managed-serviceaccount   True                   False
cluster1        resource-usage-collect   True                   False
cluster2        kueue-addon              True                   False
cluster2        managed-serviceaccount   True                   False
cluster2        resource-usage-collect   True                   False
cluster3        kueue-addon              True                   False
cluster3        managed-serviceaccount   True                   False
cluster3        resource-usage-collect   True                   False
local-cluster   managed-serviceaccount   True                   False
local-cluster   resource-usage-collect   True                   False
```

- Confirm Kueue is running on the clusters.

```bash
kubectl get pods -n kueue-system --context kind-local-cluster # Same for managed clusters.
NAME                                       READY   STATUS    RESTARTS   AGE
kueue-controller-manager-87bd7888b-gqk4g   2/2     Running   0          69s
```

- On the hub cluster, check that secrets containing the `kubeconfig` for each managed cluster have been created under the `kueue-system` namespace.

```bash
kubectl get secret -n kueue-system
NAME                        TYPE     DATA   AGE
kueue-webhook-server-cert   Opaque   4      5m12s
multikueue-cluster1         Opaque   1      3m38s
multikueue-cluster2         Opaque   1      3m38s
multikueue-cluster3         Opaque   1      3m38s
multikueue-local-cluster    Opaque   1      3m38s
```
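
As a quick sanity check you can decode one of these generated secrets. This is a sketch, assuming (as the MultiKueue convention used later in `multikueue-setup-demo1.yaml`) that the kubeconfig is stored under the `kubeconfig` key:

```bash
# Print the first lines of the kubeconfig generated for cluster1.
kubectl get secret multikueue-cluster1 -n kueue-system \
  -o jsonpath='{.data.kubeconfig}' | base64 -d | head -n 5
```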

## User Stories

#### Story 1

As an admin, I want to automate [MultiKueue](https://kueue.sigs.k8s.io/docs/concepts/multikueue/) configuration across multiple clusters, so that I can streamline the setup process without manual intervention.

- With the secrets under `kueue-system` automatically created, we can easily set up the MultiKueue environment.

```bash
kubectl apply -f ./multikueue-setup-demo1.yaml
```

- After that, check the status of the `MultiKueueCluster`, `AdmissionCheck`, and `ClusterQueue` resources.

```bash
kubectl get multikueueclusters -A -ojson | jq '.items[] | .metadata.name, .status.conditions'
kubectl get admissionchecks -ojson | jq '.items[] | .metadata.name, .status.conditions'
kubectl get clusterqueues -ojson | jq '.items[] | .metadata.name, .status.conditions'
```

Success is indicated when "status": "True" and reasons like "Active" or "Ready" are present in the conditions.

```bash
"multikueue-config-demo1-cluster1"
[
  {
    "lastTransitionTime": "2025-05-29T11:23:17Z",
    "message": "Connected",
    "observedGeneration": 1,
    "reason": "Active",
    "status": "True",
    "type": "Active"
  }
]
"multikueue-config-demo1-cluster2"
[
  {
    "lastTransitionTime": "2025-05-29T11:23:17Z",
    "message": "Connected",
    "observedGeneration": 1,
    "reason": "Active",
    "status": "True",
    "type": "Active"
  }
]
"multikueue-demo1"
[
  {
    "lastTransitionTime": "2025-05-29T11:23:17Z",
    "message": "The admission check is active",
    "observedGeneration": 1,
    "reason": "Active",
    "status": "True",
    "type": "Active"
  },
  {
    "lastTransitionTime": "2024-08-31T20:41:41Z",
    "message": "only one multikueue managed admission check can be used in one ClusterQueue",
    "observedGeneration": 1,
    "reason": "MultiKueue",
    "status": "True",
    "type": "SingleInstanceInClusterQueue"
  },
  {
    "lastTransitionTime": "2024-08-31T20:41:41Z",
    "message": "admission check cannot be applied at ResourceFlavor level",
    "observedGeneration": 1,
    "reason": "MultiKueue",
    "status": "True",
    "type": "FlavorIndependent"
  }
]
"cluster-queue"
[
  {
    "lastTransitionTime": "2025-05-29T11:23:17Z",
    "message": "Can admit new workloads",
    "observedGeneration": 1,
    "reason": "Ready",
    "status": "True",
    "type": "Active"
  }
]
```
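
If you prefer a single pass/fail signal over eyeballing the JSON, a small `jq` filter can flag anything that is not ready; this is a sketch, not part of the original walkthrough:

```bash
# Print the name of any ClusterQueue whose Active condition is not True.
# Empty output means every ClusterQueue can admit workloads.
kubectl get clusterqueues -ojson | jq -r '
  .items[]
  | select(([.status.conditions[]? | select(.type=="Active" and .status=="True")] | length) == 0)
  | .metadata.name'
```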

- Deploy a job to the MultiKueue.

```bash
kubectl create -f ./job-demo1.yaml
```

- Check the workload on the managed clusters. Here, when the job's Workload receives a QuotaReservation in the manager cluster, a copy of the Workload is created in all configured worker clusters. Once `kind-cluster1` admits the workload, the manager removes the corresponding workloads from the other clusters (e.g., `kind-cluster2`).

```bash
kubectl get workload --context kind-cluster1
NAME                       QUEUE        RESERVED IN     ADMITTED   AGE
job-demo1-jobnktc6-6c5f3   user-queue   cluster-queue   True       5s

kubectl get workload --context kind-cluster2
No resources found in default namespace. # After cluster1 admitted the workload, no workload should show up here.
```
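
For reference, `job-demo1.yaml` is an ordinary batch Job pointed at the LocalQueue via a label. The metadata below matches the manifest in this solution; the container spec is illustrative only:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  generateName: demo1-job
  namespace: default
  labels:
    kueue.x-k8s.io/queue-name: "user-queue"  # routes the Job through Kueue
spec:
  parallelism: 1
  completions: 1
  template:
    spec:
      restartPolicy: Never
      containers:
      - name: demo               # illustrative; the real manifest may use a different image/command
        image: busybox:1.36
        command: ["sleep", "30"]
        resources:
          requests:
            cpu: "1"
```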

#### Story 2

As an admin, I want to use OCM `Placement` results for scheduling, so that clusters with specific attributes, like those with the `nvidia-t4` GPU accelerator label, are automatically selected and converted into a [MultiKueue](https://kueue.sigs.k8s.io/docs/concepts/multikueue/) for targeted workload deployment.

- Clean up the resources from demo1.

```bash
kubectl delete -f ./multikueue-setup-demo1.yaml
```

- If your environment was set up by `setup-env.sh`, cluster2 and cluster3 already carry the label `accelerator=nvidia-tesla-t4` and 3 fake GPU resources.

```bash
kubectl get mcl -l accelerator=nvidia-tesla-t4
NAME       HUB ACCEPTED   MANAGED CLUSTER URLS                  JOINED   AVAILABLE   AGE
cluster2   true           https://cluster2-control-plane:6443   True     True        37m
cluster3   true           https://cluster3-control-plane:6443   True     True        37m

kubectl get node -ojson --context kind-cluster2 | jq '.items[] | .status.capacity, .status.allocatable' | grep gpu
  "nvidia.com/gpu": "3",
  "nvidia.com/gpu": "3",
kubectl get node -ojson --context kind-cluster3 | jq '.items[] | .status.capacity, .status.allocatable' | grep gpu
  "nvidia.com/gpu": "3",
  "nvidia.com/gpu": "3",
```

- Bind the cluster set to the Kueue namespace and verify the bindings.

```bash
clusteradm clusterset bind global --namespace kueue-system
clusteradm get clustersets
<ManagedClusterSet>
└── <default>
│   ├── <Status> 4 ManagedClusters selected
│   ├── <Clusters> [cluster1 cluster2 cluster3 local-cluster]
│   ├── <BoundNamespace>
└── <global>
    └── <BoundNamespace> kueue-system,open-cluster-management-addon
    └── <Status> 4 ManagedClusters selected
    └── <Clusters> [cluster1 cluster2 cluster3 local-cluster]
```
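
For the curious, `clusteradm clusterset bind` is shorthand for creating a `ManagedClusterSetBinding`; an equivalent manifest (a sketch using the standard OCM API) looks like this:

```yaml
apiVersion: cluster.open-cluster-management.io/v1beta2
kind: ManagedClusterSetBinding
metadata:
  name: global           # the binding is named after the cluster set
  namespace: kueue-system
spec:
  clusterSet: global
```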

- The `placement-demo2-1.yaml` selects clusters with the `nvidia-tesla-t4` accelerator label.

```yaml
apiVersion: cluster.open-cluster-management.io/v1beta1
kind: Placement
metadata:
  name: multikueue-config-demo2
  namespace: kueue-system
spec:
  ...
  predicates:
    - requiredClusterSelector:
        labelSelector:
          matchLabels:
            accelerator: nvidia-tesla-t4
```

- Apply the placement and the MultiKueue setup for demo2.

```bash
kubectl apply -f placement-demo2-1.yaml
kubectl apply -f ./multikueue-setup-demo2.yaml
```
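
The setup manifest just applied wires two admission checks into the ClusterQueue: the regular MultiKueue check and the OCM placement check. Abridged from `multikueue-setup-demo2.yaml`:

```yaml
apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
  name: multikueue-demo2
spec:
  controllerName: kueue.x-k8s.io/multikueue
  parameters:
    apiGroup: kueue.x-k8s.io
    kind: MultiKueueConfig
    name: multikueue-config-demo2   # generated by the OCM controller
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
  name: multikueue-config-demo2
spec:
  controllerName: open-cluster-management.io/placement
  parameters:
    apiGroup: cluster.open-cluster-management.io
    kind: Placement
    name: multikueue-config-demo2   # the Placement applied above
```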

- Check the `MultiKueueConfig` and `MultiKueueClusters`.

```bash
kubectl get multikueueconfig
NAME                      AGE
multikueue-config-demo2   10s

kubectl get multikueueclusters
NAME                               AGE
multikueue-config-demo2-cluster2   19s
multikueue-config-demo2-cluster3   19s
```

- After that, check the status of the `MultiKueueCluster`, `AdmissionCheck`, and `ClusterQueue` resources.

```bash
kubectl get multikueueclusters -A -ojson | jq '.items[] | .metadata.name, .status.conditions'
kubectl get admissionchecks -ojson | jq '.items[] | .metadata.name, .status.conditions'
kubectl get clusterqueues -ojson | jq '.items[] | .metadata.name, .status.conditions'
```

If successful, the conditions should show "status": "True" with reasons like "Active" or "Ready".

```bash
"multikueue-config-demo2-cluster2"
[
  {
    "lastTransitionTime": "2025-05-29T11:28:34Z",
    "message": "Connected",
    "observedGeneration": 1,
    "reason": "Active",
    "status": "True",
    "type": "Active"
  }
]
"multikueue-config-demo2-cluster3"
[
  {
    "lastTransitionTime": "2025-05-29T11:28:34Z",
    "message": "Connected",
    "observedGeneration": 1,
    "reason": "Active",
    "status": "True",
    "type": "Active"
  }
]
"multikueue-config-demo2" # The status of the admissioncheck `multikueue-config-demo2`
[
  {
    "lastTransitionTime": "2025-05-29T11:28:34Z",
    "message": "MultiKueueConfig multikueue-config-demo2 and MultiKueueClusters are generated successfully",
    "reason": "Active",
    "status": "True",
    "type": "Active"
  }
]
"multikueue-demo2" # The status of the admissioncheck `multikueue-demo2`
[
  {
    "lastTransitionTime": "2025-05-29T11:28:34Z",
    "message": "The admission check is active",
    "observedGeneration": 1,
    "reason": "Active",
    "status": "True",
    "type": "Active"
  },
  {
    "lastTransitionTime": "2024-08-31T22:03:16Z",
    "message": "only one multikueue managed admission check can be used in one ClusterQueue",
    "observedGeneration": 1,
    "reason": "MultiKueue",
    "status": "True",
    "type": "SingleInstanceInClusterQueue"
  },
  {
    "lastTransitionTime": "2024-08-31T22:03:16Z",
    "message": "admission check cannot be applied at ResourceFlavor level",
    "observedGeneration": 1,
    "reason": "MultiKueue",
    "status": "True",
    "type": "FlavorIndependent"
  }
]
"cluster-queue"
[
  {
    "lastTransitionTime": "2025-05-29T11:28:34Z",
    "message": "Can admit new workloads",
    "observedGeneration": 1,
    "reason": "Ready",
    "status": "True",
    "type": "Active"
  }
]
```

- Create a job requesting GPU resources on the MultiKueue.

```bash
kubectl create -f ./job-demo2.yaml
```

- Check the workload on managed clusters. As explained in Story 1, once one cluster (here `kind-cluster3`) has admitted the workload, the manager removes the corresponding workloads from the other clusters (here `kind-cluster2`).

```bash
kubectl get workload --context kind-cluster2
No resources found in default namespace.

kubectl get workload --context kind-cluster3
NAME                       QUEUE        RESERVED IN     ADMITTED   FINISHED   AGE
job-demo2-jobfpf8q-58705   user-queue   cluster-queue   True                  5m24s
```
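
You can also confirm the job actually ran on the admitted cluster; the pod name prefix follows the Job's `generateName`, so the exact names will vary:

```bash
# The demo job's pods should appear only on the cluster that admitted the workload.
kubectl get pods -n default --context kind-cluster3
```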

#### Story 3

As an admin, I want to leverage OCM's `AddonPlacementScore` for dynamic workload scheduling, so that clusters with higher GPU scores, indicating more available GPU resources, are selected and converted into a [MultiKueue](https://kueue.sigs.k8s.io/docs/concepts/multikueue/) that automatically adjusts by adding or removing clusters as scores change.

- In this environment, cluster1 has no GPUs, while cluster2 and cluster3 each have 3 GPUs. Check `AddonPlacementScore`: the score ranges from -100 to 100, with clusters that have more resources available receiving higher scores. Here, cluster1, which has no GPUs, should have a score of -100, and the cluster currently running the workload (from Story 2, `kind-cluster3`) will have a lower score.

```bash
kubectl get addonplacementscore -A -ojson | jq '.items[] | .metadata.name, .status.scores[5]'
"resource-usage-score"
{
  ...
}
```
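
The `[5]` index above picks a single entry out of the score list. To dump every score per cluster (a sketch; the field names follow the `AddonPlacementScore` status):

```bash
kubectl get addonplacementscore -A -ojson \
  | jq '.items[] | {cluster: .metadata.namespace, scores: .status.scores}'
```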

- The `placement-demo2-2.yaml` selects clusters with the `nvidia-tesla-t4` accelerator label and, among those, picks the one cluster with the highest GPU score, i.e. the most available GPU resources.

```yaml
apiVersion: cluster.open-cluster-management.io/v1beta1
kind: Placement
metadata:
  name: multikueue-config-demo2
  namespace: kueue-system
spec:
  ...
  predicates:
    - requiredClusterSelector:
        labelSelector:
          matchLabels:
            accelerator: nvidia-tesla-t4
  numberOfClusters: 1
  prioritizerPolicy:
    mode: Exact
    configurations:
      - scoreCoordinate:
          type: AddOn
          addOn:
            resourceName: resource-usage-score
            scoreName: gpuClusterAvailable
        weight: 1
```

- Apply the changes in the `Placement` to update MultiKueue dynamically.

```bash
kubectl apply -f ./placement-demo2-2.yaml
```

- Review the update in the `MultiKueueConfig`.

```bash
kubectl get multikueueconfig multikueue-config-demo2 -oyaml
apiVersion: kueue.x-k8s.io/v1beta1
kind: MultiKueueConfig
metadata:
  creationTimestamp: "2025-05-29T11:28:34Z"
  generation: 7
  name: multikueue-config-demo2
  resourceVersion: "11913"
  uid: da363d4c-c0e8-43b4-a335-a52dc5a3cabf
spec:
  clusters:
  - multikueue-config-demo2-cluster2 # cluster2 has a higher GPU score, so it got selected by the placement decision.
```

- Create a job for the updated MultiKueue and check the workload. This time the workload is admitted by `kind-cluster2`; on `kind-cluster3` you can only find the old workload from Story 2.

```bash
kubectl create -f ./job-demo2.yaml
kubectl get workload --context kind-cluster2
NAME                       QUEUE        RESERVED IN     ADMITTED   FINISHED   AGE
job-demo2-jobfxmh7-f4c34   user-queue   cluster-queue   True                  8s

kubectl get workload --context kind-cluster3
NAME                       QUEUE        RESERVED IN     ADMITTED   FINISHED   AGE
job-demo2-jobfpf8q-58705   user-queue   cluster-queue   True                  5m24s
```
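
When you are done experimenting, the demo2 resources can be removed the same way demo1 was cleaned up (a sketch mirroring the earlier cleanup step):

```bash
kubectl delete -f ./multikueue-setup-demo2.yaml
kubectl delete -f ./placement-demo2-2.yaml
```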

## Design Details and Workflow

### OCM Admission Check Controller

The OCM Admission Check Controller integrates OCM `Placement` results into MultiKueue by reading `Placement` decisions and generating the necessary `MultiKueueConfig` and `MultiKueueCluster` resources.

- `controllerName`: Identifies the controller that processes the Admission Check; currently set to `open-cluster-management.io/placement`.
- `parameters`: Identifies a configuration with additional parameters for the check; here we reference the existing OCM `Placement`. Clusters specified in the `Placement` will be bound to the `kueue-system` namespace.

Example OCM Admission Check Controller design:

```yaml
# OCM implements an admissioncheck controller to automate the MultiKueue setup process.
# MultiKueueConfigs and MultiKueueClusters are generated dynamically based on OCM placement decisions.
apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
  name: placement-demo2
spec:
  controllerName: open-cluster-management.io/placement
  parameters:
    apiGroup: cluster.open-cluster-management.io
    kind: Placement
    name: placement-demo2
    # Leverages OCM's placement mechanism to select clusters based on specific criteria.
    # For example, `placement-demo2-1` selects clusters with the `nvidia-tesla-t4` accelerator label.
```

### Changes in the Configuration Process with OCM Admission Check Controller

Using the OCM Admission Check Controller significantly simplifies the configuration process for system administrators by automating several manual tasks.

#### Before Using OCM Admission Check Controller

In the traditional setup, administrators must manually configure both `MultiKueueConfig` and `MultiKueueCluster` resources:

- **MultiKueueConfig**: Defines which clusters are part of the [MultiKueue](https://kueue.sigs.k8s.io/docs/concepts/multikueue/) environment. Admins need to specify each cluster manually.
- **MultiKueueCluster**: Each cluster requires a `MultiKueueCluster` resource, which includes a kubeconfig secret that administrators must create manually for secure communication.

```yaml
apiVersion: kueue.x-k8s.io/v1alpha1
kind: MultiKueueConfig
metadata:
  name: multikueue-config
spec:
  clusters:
  - multikueue-cluster1
  - multikueue-cluster2
---
apiVersion: kueue.x-k8s.io/v1alpha1
kind: MultiKueueCluster
metadata:
  name: multikueue-cluster1
spec:
  kubeConfig:
    locationType: Secret
    location: kueue-admin-cluster1-kubeconfig
---
apiVersion: kueue.x-k8s.io/v1alpha1
kind: MultiKueueCluster
metadata:
  name: multikueue-cluster2
spec:
  kubeConfig:
    locationType: Secret
    location: kueue-admin-cluster2-kubeconfig
```
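
Each kubeconfig secret referenced above also had to be created by hand, along these lines (the kubeconfig file path is a placeholder):

```bash
# Manually create the secret a MultiKueueCluster points at.
kubectl create secret generic kueue-admin-cluster1-kubeconfig \
  -n kueue-system --from-file=kubeconfig=./cluster1.kubeconfig
```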

#### After Using OCM Admission Check Controller

With the OCM Admission Check Controller, manual configuration of `MultiKueueConfig` and `MultiKueueCluster` is no longer needed. Instead, the administrator only needs to configure two additional admission checks in the `ClusterQueue` resource, `multikueue-demo2` and `placement-demo2` (see `multikueue-setup-demo2.yaml`), which leverage OCM's placement mechanism to select clusters based on specific criteria and automate the creation of `MultiKueueConfig` and `MultiKueueCluster`.

```yaml
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: "cluster-queue-demo2"
spec:
  namespaceSelector: {} # match all.
  resourceGroups:
  - coveredResources: ["cpu", "memory","nvidia.com/gpu"]
    flavors:
    - name: "default-flavor-demo2"
      resources:
      - name: "cpu"
        nominalQuota: 9
      - name: "memory"
        nominalQuota: 36Gi
      - name: "nvidia.com/gpu"
        nominalQuota: 3
  admissionChecks:
  - multikueue-demo2
  - placement-demo2
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
  name: multikueue-demo2
spec:
  controllerName: kueue.x-k8s.io/multikueue
  parameters:
    apiGroup: kueue.x-k8s.io
    kind: MultiKueueConfig
    name: placement-demo2
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
  name: placement-demo2
spec:
  controllerName: open-cluster-management.io/placement
  parameters:
    apiGroup: cluster.open-cluster-management.io
    kind: Placement
    name: placement-demo2
```

#### OCM Admission Check Controller Workflow

- The OCM Admission Check Controller retrieves the OCM `Placement` associated with an AdmissionCheck in the `kueue-system` namespace.
- It uses a `PlacementDecisionTracker` to gather the selected clusters and retrieves their `ClusterProfile` for `credentials`.
- The controller creates or updates `MultiKueueCluster` resources with the kubeconfig details for each cluster, and then lists these clusters in a `MultiKueueConfig` resource.
- Finally, it updates the AdmissionCheck condition to true, indicating successful generation of the `MultiKueueConfig` and `MultiKueueCluster` resources and readying the [MultiKueue](https://kueue.sigs.k8s.io/docs/concepts/multikueue/) environment for job scheduling.
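
Putting the workflow together, for the demo2 placement the controller materializes resources equivalent to the following (a sketch; the names follow the demo2 outputs above):

```yaml
# One MultiKueueConfig per AdmissionCheck, listing one MultiKueueCluster per selected cluster.
apiVersion: kueue.x-k8s.io/v1beta1
kind: MultiKueueConfig
metadata:
  name: multikueue-config-demo2
spec:
  clusters:
  - multikueue-config-demo2-cluster2
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: MultiKueueCluster
metadata:
  name: multikueue-config-demo2-cluster2
spec:
  kubeConfig:
    locationType: Secret
    location: multikueue-cluster2   # kubeconfig secret in kueue-system (assumed from the listing above)
```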

For more detailed design and workflow information, please refer to the [kueue-addon](https://github.com/open-cluster-management-io/addon-contrib/blob/main/kueue-addon/README.md) README.

@@ -1,144 +0,0 @@

```yaml
# the permission is copied from https://kueue.sigs.k8s.io/docs/tasks/manage/setup_multikueue/
apiVersion: rbac.open-cluster-management.io/v1alpha1
kind: ClusterPermission
metadata:
  name: kueue-admin-CLUSTER_NAME
  namespace: CLUSTER_NAME
spec:
  clusterRole:
    rules:
    - apiGroups:
      - batch
      resources:
      - jobs
      verbs:
      - create
      - delete
      - get
      - list
      - watch
    - apiGroups:
      - batch
      resources:
      - jobs/status
      verbs:
      - get
    - apiGroups:
      - jobset.x-k8s.io
      resources:
      - jobsets
      verbs:
      - create
      - delete
      - get
      - list
      - watch
    - apiGroups:
      - jobset.x-k8s.io
      resources:
      - jobsets/status
      verbs:
      - get
    - apiGroups:
      - kueue.x-k8s.io
      resources:
      - workloads
      verbs:
      - create
      - delete
      - get
      - list
      - watch
    - apiGroups:
      - kueue.x-k8s.io
      resources:
      - workloads/status
      verbs:
      - get
      - patch
      - update
    - apiGroups:
      - kubeflow.org
      resources:
      - tfjobs
      verbs:
      - create
      - delete
      - get
      - list
      - watch
    - apiGroups:
      - kubeflow.org
      resources:
      - tfjobs/status
      verbs:
      - get
    - apiGroups:
      - kubeflow.org
      resources:
      - paddlejobs
      verbs:
      - create
      - delete
      - get
      - list
      - watch
    - apiGroups:
      - kubeflow.org
      resources:
      - paddlejobs/status
      verbs:
      - get
    - apiGroups:
      - kubeflow.org
      resources:
      - pytorchjobs
      verbs:
      - create
      - delete
      - get
      - list
      - watch
    - apiGroups:
      - kubeflow.org
      resources:
      - pytorchjobs/status
      verbs:
      - get
    - apiGroups:
      - kubeflow.org
      resources:
      - xgboostjobs
      verbs:
      - create
      - delete
      - get
      - list
      - watch
    - apiGroups:
      - kubeflow.org
      resources:
      - xgboostjobs/status
      verbs:
      - get
    - apiGroups:
      - kubeflow.org
      resources:
      - mpijobs
      verbs:
      - create
      - delete
      - get
      - list
      - watch
    - apiGroups:
      - kubeflow.org
      resources:
      - mpijobs/status
      verbs:
      - get
  clusterRoleBinding:
    subject:
      kind: ServiceAccount
      name: kueue-admin-CLUSTER_NAME
      namespace: open-cluster-management-agent-addon
```

solutions/kueue-admission-check/env/msa.yaml
@@ -1,7 +0,0 @@

```yaml
apiVersion: authentication.open-cluster-management.io/v1beta1
kind: ManagedServiceAccount
metadata:
  name: kueue-admin-CLUSTER_NAME
  namespace: CLUSTER_NAME
spec:
  rotation: {}
```

@@ -1,219 +0,0 @@

```yaml
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  annotations:
    controller-gen.kubebuilder.io/version: v0.14.0
  name: clusterprofiles.multicluster.x-k8s.io
spec:
  group: multicluster.x-k8s.io
  names:
    kind: ClusterProfile
    listKind: ClusterProfileList
    plural: clusterprofiles
    singular: clusterprofile
  scope: Namespaced
  versions:
  - name: v1alpha1
    schema:
      openAPIV3Schema:
        description: ClusterProfile represents a single cluster in a multi-cluster deployment.
        properties:
          apiVersion:
            description: |-
              APIVersion defines the versioned schema of this representation of an object.
              Servers should convert recognized schemas to the latest internal value, and
              may reject unrecognized values.
              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
            type: string
          kind:
            description: |-
              Kind is a string value representing the REST resource this object represents.
              Servers may infer this from the endpoint the client submits requests to.
              Cannot be updated.
              In CamelCase.
              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
            type: string
          metadata:
            type: object
          spec:
            description: ClusterProfileSpec defines the desired state of ClusterProfile.
            properties:
              clusterManager:
                description: ClusterManager defines which cluster manager owns this ClusterProfile resource
                properties:
                  name:
                    description: Name defines the name of the cluster manager
                    type: string
                required:
                - name
                type: object
                x-kubernetes-validations:
                - message: ClusterManager is immutable
                  rule: self == oldSelf
              displayName:
                description: DisplayName defines a human-readable name of the ClusterProfile
                type: string
            required:
            - clusterManager
            type: object
          status:
            description: ClusterProfileStatus defines the observed state of ClusterProfile.
            properties:
              conditions:
                description: Conditions contains the different condition statuses for this cluster.
                items:
                  description: |-
                    Condition contains details for one aspect of the current state of this API Resource.
                    This struct is intended for direct use as an array at the field path .status.conditions.
                  properties:
                    lastTransitionTime:
                      description: |-
                        lastTransitionTime is the last time the condition transitioned from one status to another.
                        This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
                      format: date-time
                      type: string
                    message:
                      description: |-
                        message is a human readable message indicating details about the transition.
                        This may be an empty string.
                      maxLength: 32768
                      type: string
                    observedGeneration:
                      description: |-
                        observedGeneration represents the .metadata.generation that the condition was set based upon.
                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
                        with respect to the current state of the instance.
                      format: int64
                      minimum: 0
                      type: integer
                    reason:
                      description: |-
                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
                        Producers of specific condition types may define expected values and meanings for this field,
                        and whether the values are considered a guaranteed API.
                        The value should be a CamelCase string.
                        This field may not be empty.
                      maxLength: 1024
                      minLength: 1
                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
                      type: string
                    status:
                      description: status of the condition, one of True, False, Unknown.
                      enum:
                      - "True"
                      - "False"
                      - Unknown
                      type: string
                    type:
                      description: |-
                        type of condition in CamelCase or in foo.example.com/CamelCase.
                        Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be
                        useful (see .node.status.conditions), the ability to deconflict is important.
                        The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt)
                      maxLength: 316
                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
                      type: string
                  required:
                  - lastTransitionTime
                  - message
                  - reason
                  - status
                  - type
                  type: object
                type: array
              credentials:
                description: |-
                  TokenRequests describes a list of token requests on this cluster and its
                  approval status.
                items:
                  properties:
                    accessRef:
                      description: RequestRef points to a specific AuthTokenRequest object.
                      properties:
                        kind:
                          description: Kind is the kind of the referred token request object.
                          type: string
                        name:
                          description: Name is the name of the referred token request object.
                          type: string
                        namespace:
                          description: Namespace is the namespace of the referred token request object.
                          type: string
                      required:
                      - kind
                      - name
                      - namespace
                      type: object
                    consumer:
                      type: string
                  required:
                  - accessRef
                  - consumer
                  type: object
                type: array
              properties:
                description: |-
                  Properties defines name/value pairs to represent properties of a cluster.
                  It could be a collection of ClusterProperty (KEP-2149) resources,
                  but could also be info based on other implementations.
                  The names of the properties can be predefined names from ClusterProperty resources
                  and is allowed to be customized by different cluster managers.
                items:
                  description: |-
                    Property defines a name/value pair to represent a property of a cluster.
                    It could be a ClusterProperty (KEP-2149) resource,
                    but could also be info based on other implementations.
                    The name of the property can be predefined name from a ClusterProperty resource
                    and is allowed to be customized by different cluster managers.
                    This property can store various configurable details and metrics of a cluster,
                    which may include information such as the number of nodes, total and free CPU,
                    and total and free memory, among other potential attributes.
                  properties:
                    name:
                      description: |-
                        Name is the name of a property resource on cluster. It's a well-known
                        or customized name to identify the property.
                      maxLength: 253
                      minLength: 1
                      type: string
                    value:
                      description: Value is a property-dependent string
                      maxLength: 1024
                      minLength: 1
                      type: string
                  required:
                  - name
                  - value
                  type: object
                type: array
              version:
                description: Version defines the version information of the cluster.
                properties:
                  kubernetes:
                    description: Kubernetes is the kubernetes version of the cluster.
                    type: string
                type: object
            type: object
        required:
        - spec
        type: object
    served: true
    storage: true
    subresources:
      status: {}
```

@@ -1,65 +0,0 @@

```json
[
  {
    "op": "add",
    "path": "/rules/-",
    "value": {
      "apiGroups": ["rbac.open-cluster-management.io"],
      "resources": ["clusterpermissions"],
      "verbs": ["get", "list", "watch", "create", "update", "patch", "delete"]
    }
  },
  {
    "op": "add",
    "path": "/rules/-",
    "value": {
      "apiGroups": ["authentication.open-cluster-management.io"],
      "resources": ["managedserviceaccounts"],
      "verbs": ["get", "list", "watch", "create", "update", "patch", "delete"]
    }
  },
  {
    "op": "add",
    "path": "/rules/-",
    "value": {
      "apiGroups": ["kueue.x-k8s.io"],
      "resources": ["multikueueconfigs"],
      "verbs": ["get", "list", "watch", "create", "update", "patch", "delete"]
    }
  },
  {
    "op": "add",
    "path": "/rules/-",
    "value": {
      "apiGroups": ["kueue.x-k8s.io"],
      "resources": ["multikueueclusters"],
      "verbs": ["get", "list", "watch", "create", "update", "patch", "delete"]
    }
  },
  {
    "op": "add",
    "path": "/rules/-",
    "value": {
      "apiGroups": ["kueue.x-k8s.io"],
      "resources": ["admissionchecks"],
      "verbs": ["get", "list", "watch", "create", "update", "patch", "delete"]
    }
  },
  {
    "op": "add",
    "path": "/rules/-",
    "value": {
      "apiGroups": ["kueue.x-k8s.io"],
      "resources": ["admissionchecks/status"],
      "verbs": ["update", "patch"]
    }
  },
  {
    "op": "add",
    "path": "/rules/-",
    "value": {
      "apiGroups": [""],
      "resources": ["secrets"],
      "verbs": ["get", "list", "watch", "create", "update", "patch", "delete"]
    }
  }
]
```

@@ -1,18 +0,0 @@

```json
[
  {
    "op": "replace",
    "path": "/spec/installStrategy",
    "value": {
      "placements": [
        {
          "name": "placement-spoke",
          "namespace": "default",
          "rolloutStrategy": {
            "type": "All"
          }
        }
      ],
      "type": "Placements"
    }
  }
]
```

@@ -1,14 +0,0 @@

```yaml
# clusteradm clusterset bind global --namespace default
apiVersion: cluster.open-cluster-management.io/v1beta1
kind: Placement
metadata:
  name: placement-spoke
  namespace: default
spec:
  clusterSets:
    - spoke
  tolerations:
    - key: cluster.open-cluster-management.io/unreachable
      operator: Exists
    - key: cluster.open-cluster-management.io/unavailable
      operator: Exists
```

@@ -1,90 +0,0 @@

```yaml
apiVersion: work.open-cluster-management.io/v1alpha1
kind: ManifestWorkReplicaSet
metadata:
  name: single-clusterqueue
  namespace: default
spec:
  placementRefs:
  - name: placement-spoke
  manifestWorkTemplate:
    workload:
      manifests:
      - apiVersion: rbac.authorization.k8s.io/v1
        kind: ClusterRoleBinding
        metadata:
          name: kueue-manager-ocm-rolebinding
        roleRef:
          apiGroup: rbac.authorization.k8s.io
          kind: ClusterRole
          name: kueue-manager-role
        subjects:
        - kind: ServiceAccount
          name: klusterlet-work-sa
          namespace: open-cluster-management-agent
      - apiVersion: rbac.authorization.k8s.io/v1
        kind: ClusterRoleBinding
        metadata:
          name: kueue-batch-admin-ocm-rolebinding
        roleRef:
          apiGroup: rbac.authorization.k8s.io
          kind: ClusterRole
          name: kueue-batch-admin-role
        subjects:
        - kind: ServiceAccount
          name: klusterlet-work-sa
          namespace: open-cluster-management-agent
      - apiVersion: kueue.x-k8s.io/v1beta1
        kind: ResourceFlavor
        metadata:
          name: "default-flavor-demo1"
      - apiVersion: kueue.x-k8s.io/v1beta1
        kind: ClusterQueue
        metadata:
          name: "cluster-queue-demo1"
        spec:
          namespaceSelector: {} # match all.
          resourceGroups:
          - coveredResources: ["cpu", "memory"]
            flavors:
            - name: "default-flavor-demo1"
              resources:
              - name: "cpu"
                nominalQuota: 9
              - name: "memory"
                nominalQuota: 36Gi
      - apiVersion: kueue.x-k8s.io/v1beta1
        kind: LocalQueue
        metadata:
          namespace: "default"
          name: "user-queue-demo1"
        spec:
          clusterQueue: "cluster-queue-demo1"
      - apiVersion: kueue.x-k8s.io/v1beta1
        kind: ResourceFlavor
        metadata:
          name: "default-flavor-demo2"
      - apiVersion: kueue.x-k8s.io/v1beta1
        kind: ClusterQueue
        metadata:
          name: "cluster-queue-demo2"
        spec:
          namespaceSelector: {} # match all.
          resourceGroups:
          - coveredResources: ["cpu", "memory","nvidia.com/gpu"]
            flavors:
            - name: "default-flavor-demo2"
              resources:
              - name: "cpu"
                nominalQuota: 9
              - name: "memory"
                nominalQuota: 36Gi
              - name: "nvidia.com/gpu"
                nominalQuota: 3
      - apiVersion: kueue.x-k8s.io/v1beta1
        kind: LocalQueue
        metadata:
          namespace: "default"
          name: "user-queue-demo2"
        spec:
          clusterQueue: "cluster-queue-demo2"
```

solutions/kueue-admission-check/job-demo1.yaml

```diff
@@ -4,7 +4,7 @@ metadata:
   generateName: demo1-job
   namespace: default
   labels:
-    kueue.x-k8s.io/queue-name: user-queue-demo1
+    kueue.x-k8s.io/queue-name: "user-queue"
 spec:
   parallelism: 1
   completions: 1
```

solutions/kueue-admission-check/job-demo2.yaml

```diff
@@ -4,7 +4,7 @@ metadata:
   generateName: demo2-job
   namespace: default
   labels:
-    kueue.x-k8s.io/queue-name: "user-queue-demo2"
+    kueue.x-k8s.io/queue-name: "user-queue"
 spec:
   parallelism: 1
   completions: 1
```

solutions/kueue-admission-check/multikueue-setup-demo1.yaml

```diff
@@ -1,23 +1,25 @@
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: ResourceFlavor
 metadata:
-  name: "default-flavor-demo1"
+  name: "default-flavor"
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: ClusterQueue
 metadata:
-  name: "cluster-queue-demo1"
+  name: "cluster-queue"
 spec:
   namespaceSelector: {} # match all.
   resourceGroups:
-  - coveredResources: ["cpu", "memory"]
+  - coveredResources: ["cpu", "memory","nvidia.com/gpu"]
     flavors:
-    - name: "default-flavor-demo1"
+    - name: "default-flavor"
       resources:
       - name: "cpu"
         nominalQuota: 9
       - name: "memory"
         nominalQuota: 36Gi
+      - name: "nvidia.com/gpu"
+        nominalQuota: 3
   admissionChecks:
   - multikueue-demo1
 ---
@@ -25,9 +27,9 @@ apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
 metadata:
   namespace: "default"
-  name: "user-queue-demo1"
+  name: "user-queue"
 spec:
-  clusterQueue: "cluster-queue-demo1"
+  clusterQueue: "cluster-queue"
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: AdmissionCheck
@@ -46,26 +48,26 @@ metadata:
   name: multikueue-config-demo1
 spec:
   clusters:
-  - multikueue-demo1-cluster1
-  - multikueue-demo1-cluster2
+  - multikueue-config-demo1-cluster1
+  - multikueue-config-demo1-cluster2
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: MultiKueueCluster
 metadata:
-  name: multikueue-demo1-cluster1
+  name: multikueue-config-demo1-cluster1
 spec:
   kubeConfig:
     locationType: Secret
-    location: kueue-admin-cluster1-kubeconfig
-    # a secret called "kueue-admin-cluster1-kubeconfig" should be created in the namespace the kueue
+    location: multikueue-cluster1
+    # a secret called "multikueue-cluster1" should be created in the namespace the kueue
     # controller manager runs into, holding the kubeConfig needed to connect to the
     # worker cluster in the "kubeconfig" key;
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: MultiKueueCluster
 metadata:
-  name: multikueue-demo1-cluster2
+  name: multikueue-config-demo1-cluster2
 spec:
   kubeConfig:
     locationType: Secret
-    location: kueue-admin-cluster2-kubeconfig
+    location: multikueue-cluster2
```

solutions/kueue-admission-check/multikueue-setup-demo2.yaml

```diff
@@ -1,18 +1,18 @@
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: ResourceFlavor
 metadata:
-  name: "default-flavor-demo2"
+  name: "default-flavor"
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: ClusterQueue
 metadata:
-  name: "cluster-queue-demo2"
+  name: "cluster-queue"
 spec:
   namespaceSelector: {} # match all.
   resourceGroups:
   - coveredResources: ["cpu", "memory","nvidia.com/gpu"]
     flavors:
-    - name: "default-flavor-demo2"
+    - name: "default-flavor"
       resources:
       - name: "cpu"
         nominalQuota: 9
@@ -22,15 +22,15 @@ spec:
         nominalQuota: 3
   admissionChecks:
   - multikueue-demo2
-  - placement-demo2
+  - multikueue-config-demo2
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
 metadata:
   namespace: "default"
-  name: "user-queue-demo2"
+  name: "user-queue"
 spec:
-  clusterQueue: "cluster-queue-demo2"
+  clusterQueue: "cluster-queue"
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: AdmissionCheck
@@ -41,17 +41,17 @@ spec:
   parameters:
     apiGroup: kueue.x-k8s.io
     kind: MultiKueueConfig
-    name: placement-demo2
+    name: multikueue-config-demo2
 ---
 # OCM implements an admissioncheck controller to automate the MultiKueue setup process.
 # MultiKueueConfigs and MultiKueueClusters are generated dynamically based on OCM placement decisions.
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: AdmissionCheck
 metadata:
-  name: placement-demo2
+  name: multikueue-config-demo2
 spec:
   controllerName: open-cluster-management.io/placement
   parameters:
     apiGroup: cluster.open-cluster-management.io
     kind: Placement
-    name: placement-demo2
+    name: multikueue-config-demo2
```

solutions/kueue-admission-check/placement-demo2-1.yaml

```diff
@@ -1,11 +1,11 @@
 apiVersion: cluster.open-cluster-management.io/v1beta1
 kind: Placement
 metadata:
-  name: placement-demo2
+  name: multikueue-config-demo2
   namespace: kueue-system
 spec:
   clusterSets:
-    - spoke
+    - global
   tolerations:
     - key: cluster.open-cluster-management.io/unreachable
       operator: Exists
```

solutions/kueue-admission-check/placement-demo2-2.yaml

```diff
@@ -1,11 +1,11 @@
 apiVersion: cluster.open-cluster-management.io/v1beta1
 kind: Placement
 metadata:
-  name: placement-demo2
+  name: multikueue-config-demo2
   namespace: kueue-system
 spec:
   clusterSets:
-    - spoke
+    - global
   tolerations:
     - key: cluster.open-cluster-management.io/unreachable
       operator: Exists
```
@@ -2,12 +2,28 @@
|
||||
|
||||
cd $(dirname ${BASH_SOURCE})
|
||||
|
||||
set -e
|
||||
set -euo pipefail
|
||||
|
||||
hub=${CLUSTER1:-hub}
|
||||
# Parse command line arguments
|
||||
FORCE=false
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--force)
|
||||
FORCE=true
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1"
|
||||
echo "Usage: $0 [--force]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
hub=${HUB:-local-cluster}
|
||||
c1=${CLUSTER1:-cluster1}
|
||||
c2=${CLUSTER2:-cluster2}
|
||||
c3=${CLUSTER2:-cluster3}
|
||||
c3=${CLUSTER3:-cluster3}
|
||||
|
||||
hubctx="kind-${hub}"
|
||||
c1ctx="kind-${c1}"
|
||||
@@ -24,113 +40,132 @@ jobset_manifest="https://github.com/kubernetes-sigs/jobset/releases/download/v0.
 mpi_operator_manifest="https://github.com/kubeflow/mpi-operator/releases/download/v0.6.0/mpi-operator.yaml"
 training_operator_kustomize="github.com/kubeflow/training-operator.git/manifests/overlays/standalone?ref=v1.8.1"

-# ocm setup
-echo "Parepare kind clusters"
-for cluster in "${all_clusters[@]}"; do
-  kind create cluster --name "$cluster" --image kindest/node:v1.29.0
-done
+# Function to create kind clusters
+create_clusters() {
+  if [[ "$FORCE" == "true" ]]; then
+    echo "Deleting existing clusters due to --force flag..."
+    for cluster in "${all_clusters[@]}"; do
+      kind delete cluster --name "$cluster" || true
+    done
+  fi

-echo "Initialize the ocm hub cluster with ClusterProfile enabled"
-clusteradm init --feature-gates="ManifestWorkReplicaSet=true,ManagedClusterAutoApproval=true,ClusterProfile=true" --bundle-version="v0.15.0" --wait --context ${hubctx}
-joincmd=$(clusteradm get token --context ${hubctx} | grep clusteradm)
+  echo "Prepare kind clusters"
+  for cluster in "${all_clusters[@]}"; do
+    kind create cluster --name "$cluster" --image kindest/node:v1.29.0 || true
+  done
+}
echo "Join clusters to hub"
|
||||
$(echo ${joincmd} --force-internal-endpoint-lookup --wait --context ${c1ctx} | sed "s/<cluster_name>/$c1/g")
|
||||
$(echo ${joincmd} --force-internal-endpoint-lookup --wait --context ${c2ctx} | sed "s/<cluster_name>/$c2/g")
|
||||
$(echo ${joincmd} --force-internal-endpoint-lookup --wait --context ${c3ctx} | sed "s/<cluster_name>/$c3/g")
|
||||
# Function to setup OCM
|
||||
setup_ocm() {
|
||||
echo "Initialize the ocm hub cluster"
|
||||
clusteradm init --wait --context ${hubctx}
|
||||
joincmd=$(clusteradm get token --context ${hubctx} | grep clusteradm)
|
||||
|
||||
echo "Accept join of clusters"
|
||||
clusteradm accept --context ${hubctx} --clusters ${c1},${c2},${c3} --wait
|
||||
echo "Join clusters to hub"
|
||||
eval "${joincmd//<cluster_name>/${hub}} --force-internal-endpoint-lookup --wait --context ${hubctx}"
|
||||
eval "${joincmd//<cluster_name>/${c1}} --force-internal-endpoint-lookup --wait --context ${c1ctx}"
|
||||
eval "${joincmd//<cluster_name>/${c2}} --force-internal-endpoint-lookup --wait --context ${c2ctx}"
|
||||
eval "${joincmd//<cluster_name>/${c3}} --force-internal-endpoint-lookup --wait --context ${c3ctx}"
|
||||
|
||||
kubectl get managedclusters --all-namespaces --context ${hubctx}
|
||||
echo "Accept join of clusters"
|
||||
clusteradm accept --context ${hubctx} --clusters ${hub},${c1},${c2},${c3} --wait
|
||||
|
||||
# install kueue, jobset, workflow
|
||||
for ctx in "${all_ctx[@]}"; do
|
||||
echo "Install Kueue, Jobset on $ctx"
|
||||
kubectl apply --server-side -f "$kueue_manifest" --context "$ctx"
|
||||
echo "waiting for kueue-system pods to be ready"
|
||||
kubectl wait --for=condition=Ready pods --all -n kueue-system --timeout=300s --context "$ctx"
|
||||
kubectl apply --server-side -f "$jobset_manifest" --context "$ctx"
|
||||
done
|
||||
# label local-cluster
|
||||
kubectl label managedclusters ${hub} local-cluster=true --context ${hubctx}
|
||||
kubectl get managedclusters --all-namespaces --context ${hubctx}
|
||||
}
|
||||
|
||||
for ctx in "${spoke_ctx[@]}"; do
|
||||
echo "Install Kubeflow MPI Operator, Training Operator on $ctx"
|
||||
kubectl apply --server-side -f "$mpi_operator_manifest" --context "$ctx" || true
|
||||
kubectl apply --server-side -k "$training_operator_kustomize" --context "$ctx" || true
|
||||
done
|
||||
# Function to install Kueue, jobset, workflow
|
||||
install_kueue() {
|
||||
for ctx in "${all_ctx[@]}"; do
|
||||
echo "Install Kueue, Jobset on $ctx"
|
||||
kubectl apply --server-side -f "$kueue_manifest" --context "$ctx"
|
||||
echo "waiting for kueue-system pods to be ready"
|
||||
kubectl wait --for=condition=Ready pods --all -n kueue-system --timeout=300s --context "$ctx"
|
||||
kubectl apply --server-side -f "$jobset_manifest" --context "$ctx"
|
||||
done
|
||||
|
||||
kubectl config use-context ${hubctx}
|
||||
# patch some ocm resoures and images
|
||||
echo "Patch permission"
|
||||
kubectl patch clusterrole cluster-manager --type='json' -p "$(cat env/patch-clusterrole.json)"
|
||||
for ctx in "${spoke_ctx[@]}"; do
|
||||
echo "Install Kubeflow MPI Operator, Training Operator on $ctx"
|
||||
kubectl apply --server-side -f "$mpi_operator_manifest" --context "$ctx" || true
|
||||
kubectl apply --server-side -k "$training_operator_kustomize" --context "$ctx" || true
|
||||
done
|
||||
}
|
||||
|
||||
echo "Patch image"
|
||||
# quay.io/haoqing/registration-operator:kueue-v0.9.1 grants more permission for registration and placement.
|
||||
# quay.io/haoqing/registration-operator:kueue-v0.9.1 creates worker’s kubeconfig secret for multikueue.
|
||||
# quay.io/haoqing/placement:kueue-v0.9.1 implements the admission check controller.
|
||||
# The source code is in repo https://github.com/haoqing0110/OCM/tree/br_ocm-v0.15.1-kueue-v0.9.1.
|
||||
kubectl patch deployment cluster-manager -n open-cluster-management --type=json -p='[
|
||||
{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "quay.io/haoqing/registration-operator:kueue-v0.9.1"},
|
||||
{"op": "replace", "path": "/spec/template/spec/containers/0/imagePullPolicy", "value": "Always"}
|
||||
]'
|
||||
kubectl patch clustermanager cluster-manager --type=json -p='[
|
||||
{"op": "replace", "path": "/spec/registrationImagePullSpec", "value": "quay.io/haoqing/registration:kueue-v0.9.1"},
|
||||
{"op": "replace", "path": "/spec/placementImagePullSpec", "value": "quay.io/haoqing/placement:kueue-v0.9.1"}
|
||||
]'
|
||||
# Function to install OCM addons
|
||||
install_ocm_addons() {
|
||||
kubectl config use-context ${hubctx}
|
||||
|
||||
-# install addons
-echo "Install managed-serviceaccount"
-helm repo add ocm https://open-cluster-management.io/helm-charts/
-helm repo update
-helm uninstall -n open-cluster-management-addon managed-serviceaccount || true
-helm install \
-  -n open-cluster-management-addon --create-namespace \
-  managed-serviceaccount ocm/managed-serviceaccount \
-  --set tag=latest \
-  --set featureGates.ephemeralIdentity=true \
-  --set enableAddOnDeploymentConfig=true \
-  --set hubDeployMode=AddOnTemplate
+  echo "Add ocm helm repo"
+  helm repo add ocm https://open-cluster-management.io/helm-charts/
+  helm repo update

+  echo "Install managed-serviceaccount mca"
+  clusteradm create clusterset spoke
+  clusteradm clusterset set spoke --clusters ${c1},${c2},${c3}
+  clusteradm clusterset bind spoke --namespace default
+  kubectl apply -f env/placement.yaml || true
+  kubectl patch clustermanagementaddon managed-serviceaccount --type='json' -p="$(cat env/patch-mg-sa-cma.json)" || true

+  echo "Install managed-serviceaccount"
+  helm upgrade --install \
+    -n open-cluster-management-addon --create-namespace \
+    managed-serviceaccount ocm/managed-serviceaccount \
+    --set featureGates.ephemeralIdentity=true \
+    --set enableAddOnDeploymentConfig=true \
+    --set hubDeployMode=AddOnTemplate

-echo "Install cluster-permission"
-git clone git@github.com:open-cluster-management-io/cluster-permission.git || true
-cd cluster-permission
-helm uninstall -n open-cluster-management cluster-permission || true
-helm install cluster-permission chart/ \
-  --namespace open-cluster-management \
-  --create-namespace \
-  --set global.imageOverrides.cluster_permission=quay.io/open-cluster-management/cluster-permission:latest \
-  --set global.pullPolicy=Always
-cd -
-rm -rf cluster-permission
+  echo "Install cluster-permission"
+  helm upgrade --install \
+    -n open-cluster-management --create-namespace \
+    cluster-permission ocm/cluster-permission \
+    --set global.imageOverrides.cluster_permission=quay.io/open-cluster-management/cluster-permission:latest

-echo "Install resource-usage-collect-addon"
-git clone git@github.com:open-cluster-management-io/addon-contrib.git || true
-cd addon-contrib/resource-usage-collect-addon
-make deploy
-cd -
-rm -rf addon-contrib
+  echo "Install kueue-addon"
+  helm upgrade --install \
+    -n open-cluster-management-addon --create-namespace \
+    kueue-addon ocm/kueue-addon \
+    --set skipClusterSetBinding=true

-# prepare credentials for multikueue
-echo "Setup queue on the spoke"
-kubectl apply -f env/single-clusterqueue-setup-mwrs.yaml
+  echo "Install resource-usage-collect-addon"
+  git clone https://github.com/open-cluster-management-io/addon-contrib.git || true
+  cd addon-contrib/resource-usage-collect-addon
+  helm install resource-usage-collect-addon chart/ \
+    -n open-cluster-management-addon --create-namespace \
+    --set skipClusterSetBinding=true \
+    --set global.image.repository=quay.io/haoqing/resource-usage-collect-addon
+  cd -

-echo "Setup credentials for clusterprofile"
-for CLUSTER in "${spoke_clusters[@]}"; do
-  sed "s/CLUSTER_NAME/$CLUSTER/g" env/clusterpermission.yaml | kubectl apply -f -
-  sed "s/CLUSTER_NAME/$CLUSTER/g" env/msa.yaml | kubectl apply -f -
-done
+  rm -rf addon-contrib
+}
echo "Setup faked GPU on the spoke"
|
||||
kubectl label managedcluster cluster2 accelerator=nvidia-tesla-t4
|
||||
kubectl label managedcluster cluster3 accelerator=nvidia-tesla-t4
|
||||
# Function to setup fake GPU
|
||||
setup_fake_gpu() {
|
||||
echo "Setup fake GPU on the spoke clusters"
|
||||
kubectl label managedcluster cluster2 accelerator=nvidia-tesla-t4 --context ${hubctx}
|
||||
kubectl label managedcluster cluster3 accelerator=nvidia-tesla-t4 --context ${hubctx}
|
||||
|
||||
echo "IMPORTANT: RUN BELOW COMMAND MANUALLY on cluster2 and cluster3 !!!"
|
||||
echo "kubectl edit-status node cluster2-control-plane --context ${c2ctx}" with nvidia.com/gpu: "3"
|
||||
echo "kubectl edit-status node cluster3-control-plane --context ${c3ctx}" with nvidia.com/gpu: "3"
|
||||
kubectl patch node cluster2-control-plane --subresource=status --type='merge' --patch='{
|
||||
"status": {
|
||||
"capacity": {
|
||||
"nvidia.com/gpu": "3"
|
||||
},
|
||||
"allocatable": {
|
||||
"nvidia.com/gpu": "3"
|
||||
}
|
||||
}
|
||||
}' --context ${c2ctx}
|
||||
|
||||
kubectl patch node cluster3-control-plane --subresource=status --type='merge' --patch='{
|
||||
"status": {
|
||||
"capacity": {
|
||||
"nvidia.com/gpu": "3"
|
||||
},
|
||||
"allocatable": {
|
||||
"nvidia.com/gpu": "3"
|
||||
}
|
||||
}
|
||||
}' --context ${c3ctx}
|
||||
|
||||
echo "Fake GPU resources added successfully to cluster2 and cluster3!"
|
||||
}
|
||||
|
||||
# Main execution
|
||||
create_clusters
|
||||
setup_ocm
|
||||
install_kueue
|
||||
install_ocm_addons
|
||||
setup_fake_gpu
|
||||
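Since the GPU capacity is written straight into node status rather than coming from a real device plugin, it is worth confirming the fake resources stuck before scheduling GPU jobs; a kubelet restart can revert such status patches. A small check, assuming the kind contexts created above:

```bash
for ctx in kind-cluster2 kind-cluster3; do
  node="${ctx#kind-}-control-plane"
  echo -n "${node} allocatable nvidia.com/gpu: "
  kubectl get node "${node}" --context "${ctx}" \
    -o jsonpath='{.status.allocatable.nvidia\.com/gpu}'
  echo
done
```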