upgrade supported kueue version to v0.9.1 (#748)

Signed-off-by: Qing Hao <qhao@redhat.com>
This commit is contained in:
Qing Hao
2024-12-09 12:50:48 +08:00
committed by GitHub
parent a967216d4d
commit 29bf254745
11 changed files with 195 additions and 281 deletions

View File

@@ -34,8 +34,6 @@ REF: [Setup a MultiKueue environment](https://kueue.sigs.k8s.io/docs/tasks/manag
2. [Kueue](https://kueue.sigs.k8s.io/docs/installation/) deployed across all clusters.
3. [Managed-serviceaccount](https://github.com/open-cluster-management-io/managed-serviceaccount), [cluster-permission](https://github.com/open-cluster-management-io/cluster-permission) and [resource-usage-collect-addon](https://github.com/open-cluster-management-io/addon-contrib/tree/main/resource-usage-collect-addon) installed on managed clusters.
- You can set up all of the above by running the following command:
```bash
./setup-env.sh
@@ -624,18 +622,4 @@ spec:
- Finally, it updates the AdmissionCheck condition to true, indicating successful generation of the `MultiKueueConfig` and `MultiKueueCluster`, readying the [MultiKueue](https://kueue.sigs.k8s.io/docs/concepts/multikueue/) environment for job scheduling.
## TODO
- In the future, the `AdmissionCheckcontroller` may be added to `featureGates` as a user-enabled feature or possibly developed into an individual component running as a pod on the `hub`.
- Users may also need to enable the `ClusterProfile` feature in the `featureGates` to utilize the OCM Admission Check. This can be done by configuring the `ClusterManager` in `hub`.
```yaml
apiVersion: operator.open-cluster-management.io/v1
kind: ClusterManager
metadata:
name: cluster-manager
spec:
registrationConfiguration:
featureGates:
- feature: ClusterProfile
mode: Enable
...
```
- In the future, the `AdmissionCheckcontroller` may be added to `featureGates` as a user-enabled feature or possibly developed into an individual component running as a pod on the `hub`.

View File

@@ -0,0 +1,144 @@
# ClusterPermission template describing the cluster role rules and the role
# binding for the kueue-admin service account of one cluster.
# The permission rules are copied from https://kueue.sigs.k8s.io/docs/tasks/manage/setup_multikueue/
# CLUSTER_NAME is a placeholder, not a literal value; presumably it is
# substituted per cluster (e.g. with sed) before the manifest is applied.
# NOTE(review): confirm against the setup script that consumes this file.
apiVersion: rbac.open-cluster-management.io/v1alpha1
kind: ClusterPermission
metadata:
name: kueue-admin-CLUSTER_NAME
namespace: CLUSTER_NAME
spec:
clusterRole:
rules:
# batch Jobs: create/delete plus read access (get, list, watch).
- apiGroups:
- batch
resources:
- jobs
verbs:
- create
- delete
- get
- list
- watch
# batch Job status: read-only.
- apiGroups:
- batch
resources:
- jobs/status
verbs:
- get
# JobSets: create/delete plus read access (get, list, watch).
- apiGroups:
- jobset.x-k8s.io
resources:
- jobsets
verbs:
- create
- delete
- get
- list
- watch
# JobSet status: read-only.
- apiGroups:
- jobset.x-k8s.io
resources:
- jobsets/status
verbs:
- get
# Kueue Workloads: create/delete plus read access (get, list, watch).
- apiGroups:
- kueue.x-k8s.io
resources:
- workloads
verbs:
- create
- delete
- get
- list
- watch
# Kueue Workload status: the only status subresource in this file that is
# also writable (patch, update) in addition to get.
- apiGroups:
- kueue.x-k8s.io
resources:
- workloads/status
verbs:
- get
- patch
- update
# kubeflow.org job kinds follow. Each kind gets the same pair of rules:
# create/delete/get/list/watch on the resource and get on its status.
# TFJobs.
- apiGroups:
- kubeflow.org
resources:
- tfjobs
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- kubeflow.org
resources:
- tfjobs/status
verbs:
- get
# PaddleJobs.
- apiGroups:
- kubeflow.org
resources:
- paddlejobs
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- kubeflow.org
resources:
- paddlejobs/status
verbs:
- get
# PyTorchJobs.
- apiGroups:
- kubeflow.org
resources:
- pytorchjobs
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- kubeflow.org
resources:
- pytorchjobs/status
verbs:
- get
# XGBoostJobs.
- apiGroups:
- kubeflow.org
resources:
- xgboostjobs
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- kubeflow.org
resources:
- xgboostjobs/status
verbs:
- get
# MPIJobs.
- apiGroups:
- kubeflow.org
resources:
- mpijobs
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- kubeflow.org
resources:
- mpijobs/status
verbs:
- get
# Binding subject: the kueue-admin-CLUSTER_NAME ServiceAccount in the
# open-cluster-management-agent-addon namespace.
# NOTE(review): presumably this is the account created by the
# ManagedServiceAccount of the same name — confirm.
clusterRoleBinding:
subject:
kind: ServiceAccount
name: kueue-admin-CLUSTER_NAME
namespace: open-cluster-management-agent-addon

View File

@@ -1,63 +0,0 @@
apiVersion: rbac.open-cluster-management.io/v1alpha1
kind: ClusterPermission
metadata:
name: kueue-admin-cluster1
namespace: cluster1
spec:
clusterRole:
rules:
- apiGroups:
- batch
resources:
- jobs
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- batch
resources:
- jobs/status
verbs:
- get
- apiGroups:
- jobset.x-k8s.io
resources:
- jobsets
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- jobset.x-k8s.io
resources:
- jobsets/status
verbs:
- get
- apiGroups:
- kueue.x-k8s.io
resources:
- workloads
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- kueue.x-k8s.io
resources:
- workloads/status
verbs:
- get
- patch
- update
clusterRoleBinding:
subject:
kind: ServiceAccount
name: kueue-admin-cluster1
namespace: open-cluster-management-agent-addon

View File

@@ -1,63 +0,0 @@
apiVersion: rbac.open-cluster-management.io/v1alpha1
kind: ClusterPermission
metadata:
name: kueue-admin-cluster2
namespace: cluster2
spec:
clusterRole:
rules:
- apiGroups:
- batch
resources:
- jobs
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- batch
resources:
- jobs/status
verbs:
- get
- apiGroups:
- jobset.x-k8s.io
resources:
- jobsets
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- jobset.x-k8s.io
resources:
- jobsets/status
verbs:
- get
- apiGroups:
- kueue.x-k8s.io
resources:
- workloads
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- kueue.x-k8s.io
resources:
- workloads/status
verbs:
- get
- patch
- update
clusterRoleBinding:
subject:
kind: ServiceAccount
name: kueue-admin-cluster2
namespace: open-cluster-management-agent-addon

View File

@@ -1,63 +0,0 @@
apiVersion: rbac.open-cluster-management.io/v1alpha1
kind: ClusterPermission
metadata:
name: kueue-admin-cluster3
namespace: cluster3
spec:
clusterRole:
rules:
- apiGroups:
- batch
resources:
- jobs
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- batch
resources:
- jobs/status
verbs:
- get
- apiGroups:
- jobset.x-k8s.io
resources:
- jobsets
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- jobset.x-k8s.io
resources:
- jobsets/status
verbs:
- get
- apiGroups:
- kueue.x-k8s.io
resources:
- workloads
verbs:
- create
- delete
- get
- list
- watch
- apiGroups:
- kueue.x-k8s.io
resources:
- workloads/status
verbs:
- get
- patch
- update
clusterRoleBinding:
subject:
kind: ServiceAccount
name: kueue-admin-cluster3
namespace: open-cluster-management-agent-addon

View File

@@ -1,7 +0,0 @@
apiVersion: authentication.open-cluster-management.io/v1beta1
kind: ManagedServiceAccount
metadata:
name: kueue-admin-cluster1
namespace: cluster1
spec:
rotation: {}

View File

@@ -1,7 +0,0 @@
apiVersion: authentication.open-cluster-management.io/v1beta1
kind: ManagedServiceAccount
metadata:
name: kueue-admin-cluster3
namespace: cluster3
spec:
rotation: {}

View File

@@ -1,7 +1,7 @@
apiVersion: authentication.open-cluster-management.io/v1beta1
kind: ManagedServiceAccount
metadata:
name: kueue-admin-cluster2
namespace: cluster2
name: kueue-admin-CLUSTER_NAME
namespace: CLUSTER_NAME
spec:
rotation: {}

View File

@@ -1,22 +1,4 @@
[
{
"op": "add",
"path": "/rules/-",
"value": {
"apiGroups": ["multicluster.x-k8s.io"],
"resources": ["clusterprofiles"],
"verbs": ["get", "list", "watch", "create", "update", "patch", "delete"]
}
},
{
"op": "add",
"path": "/rules/-",
"value": {
"apiGroups": ["multicluster.x-k8s.io"],
"resources": ["clusterprofiles/status"],
"verbs": ["update", "patch"]
}
},
{
"op": "add",
"path": "/rules/-",

View File

@@ -40,7 +40,7 @@ spec:
kind: MultiKueueConfig
name: multikueue-config-demo1
---
apiVersion: kueue.x-k8s.io/v1alpha1
apiVersion: kueue.x-k8s.io/v1beta1
kind: MultiKueueConfig
metadata:
name: multikueue-config-demo1
@@ -49,7 +49,7 @@ spec:
- multikueue-demo1-cluster1
- multikueue-demo1-cluster2
---
apiVersion: kueue.x-k8s.io/v1alpha1
apiVersion: kueue.x-k8s.io/v1beta1
kind: MultiKueueCluster
metadata:
name: multikueue-demo1-cluster1
@@ -61,7 +61,7 @@ spec:
# controller manager runs into, holding the kubeConfig needed to connect to the
# worker cluster in the "kubeconfig" key;
---
apiVersion: kueue.x-k8s.io/v1alpha1
apiVersion: kueue.x-k8s.io/v1beta1
kind: MultiKueueCluster
metadata:
name: multikueue-demo1-cluster2

View File

@@ -14,58 +14,69 @@ c1ctx="kind-${c1}"
c2ctx="kind-${c2}"
c3ctx="kind-${c3}"
kind create cluster --name "${hub}" --image kindest/node:v1.29.0@sha256:eaa1450915475849a73a9227b8f201df25e55e268e5d619312131292e324d570
kind create cluster --name "${c1}" --image kindest/node:v1.29.0@sha256:eaa1450915475849a73a9227b8f201df25e55e268e5d619312131292e324d570
kind create cluster --name "${c2}" --image kindest/node:v1.29.0@sha256:eaa1450915475849a73a9227b8f201df25e55e268e5d619312131292e324d570
kind create cluster --name "${c3}" --image kindest/node:v1.29.0@sha256:eaa1450915475849a73a9227b8f201df25e55e268e5d619312131292e324d570
spoke_clusters=(${c1} ${c2} ${c3})
all_clusters=(${hub} ${spoke_clusters[@]})
spoke_ctx=(${c1ctx} ${c2ctx} ${c3ctx})
all_ctx=(${hubctx} ${spoke_ctx[@]})
echo "Initialize the ocm hub cluster"
kueue_manifest="https://github.com/kubernetes-sigs/kueue/releases/download/v0.9.1/manifests.yaml"
jobset_manifest="https://github.com/kubernetes-sigs/jobset/releases/download/v0.7.1/manifests.yaml"
mpi_operator_manifest="https://github.com/kubeflow/mpi-operator/releases/download/v0.6.0/mpi-operator.yaml"
training_operator_kustomize="github.com/kubeflow/training-operator.git/manifests/overlays/standalone?ref=v1.8.1"
clusteradm init --feature-gates="ManifestWorkReplicaSet=true,ManagedClusterAutoApproval=true" --bundle-version="latest" --wait --context ${hubctx}
# ocm setup
# Create one kind cluster for every name in all_clusters, all using the same
# node image.
echo "Prepare kind clusters"
for cluster in "${all_clusters[@]}"; do
  kind create cluster --name "$cluster" --image kindest/node:v1.29.0
done
echo "Initialize the ocm hub cluster with ClusterProfile enabled"
clusteradm init --feature-gates="ManifestWorkReplicaSet=true,ManagedClusterAutoApproval=true,ClusterProfile=true" --bundle-version="v0.15.0" --wait --context ${hubctx}
joincmd=$(clusteradm get token --context ${hubctx} | grep clusteradm)
echo "Join cluster1 to hub"
echo "Join clusters to hub"
$(echo ${joincmd} --force-internal-endpoint-lookup --wait --context ${c1ctx} | sed "s/<cluster_name>/$c1/g")
echo "Join cluster2 to hub"
$(echo ${joincmd} --force-internal-endpoint-lookup --wait --context ${c2ctx} | sed "s/<cluster_name>/$c2/g")
echo "Join cluster3 to hub"
$(echo ${joincmd} --force-internal-endpoint-lookup --wait --context ${c3ctx} | sed "s/<cluster_name>/$c3/g")
echo "Accept join of cluster1 and cluster2"
echo "Accept join of clusters"
clusteradm accept --context ${hubctx} --clusters ${c1},${c2},${c3} --wait
kubectl get managedclusters --all-namespaces --context ${hubctx}
echo "Install Kueue (this can be replaced with OCM Manifestwork in the future)"
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${hubctx}
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${c1ctx}
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${c2ctx}
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${c3ctx}
# install kueue, jobset, workflow
for ctx in "${all_ctx[@]}"; do
echo "Install Kueue, Jobset on $ctx"
kubectl apply --server-side -f "$kueue_manifest" --context "$ctx"
kubectl apply --server-side -f "$jobset_manifest" --context "$ctx"
done
echo "Install Jobset for MultiKueue (this can be replaced with OCM Manifestwork in the future)"
kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml --context ${hubctx}
kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml --context ${c1ctx}
kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml --context ${c2ctx}
kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml --context ${c3ctx}
for ctx in "${spoke_ctx[@]}"; do
echo "Install Kubeflow MPI Operator, Training Operator on $ctx"
kubectl apply --server-side -f "$mpi_operator_manifest" --context "$ctx" || true
kubectl apply --server-side -k "$training_operator_kustomize" --context "$ctx" || true
done
kubectl config use-context ${hubctx}
# patch some ocm resources and images
echo "Patch permission"
kubectl patch clusterrole cluster-manager --type='json' -p "$(cat env/patch-clusterrole.json)"
echo "Patch image"
# quay.io/haoqing/registration-operator:kueue-v0.9.1 grants more permission for registration and placement.
# quay.io/haoqing/registration:kueue-v0.9.1 creates workers kubeconfig secret for multikueue.
# quay.io/haoqing/placement:kueue-v0.9.1 implements the admission check controller.
# The source code is in repo https://github.com/haoqing0110/OCM/tree/br_ocm-v0.15.1-kueue-v0.9.1.
kubectl patch deployment cluster-manager -n open-cluster-management --type=json -p='[
{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "quay.io/haoqing/registration-operator:latest"},
{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "quay.io/haoqing/registration-operator:kueue-v0.9.1"},
{"op": "replace", "path": "/spec/template/spec/containers/0/imagePullPolicy", "value": "Always"}
]'
kubectl patch clustermanager cluster-manager --type=json -p='[{"op": "replace", "path": "/spec/registrationImagePullSpec", "value": "quay.io/haoqing/registration:latest"}]'
kubectl patch clustermanager cluster-manager --type=json -p='[{"op": "replace", "path": "/spec/placementImagePullSpec", "value": "quay.io/haoqing/placement:latest"}]'
echo "Install CRDs"
kubectl create -f env/multicluster.x-k8s.io_clusterprofiles.yaml
kubectl patch clustermanager cluster-manager --type=json -p='[
{"op": "replace", "path": "/spec/registrationImagePullSpec", "value": "quay.io/haoqing/registration:kueue-v0.9.1"},
{"op": "replace", "path": "/spec/placementImagePullSpec", "value": "quay.io/haoqing/placement:kueue-v0.9.1"}
]'
# install addons
echo "Install managed-serviceaccount"
git clone git@github.com:open-cluster-management-io/managed-serviceaccount.git || true
cd managed-serviceaccount
@@ -103,19 +114,15 @@ make deploy
cd -
rm -rf addon-contrib
echo "Enable MultiKueue on the hub"
kubectl patch deployment kueue-controller-manager -n kueue-system --type='json' -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args", "value": ["--config=/controller_manager_config.yaml", "--zap-log-level=2", "--feature-gates=MultiKueue=true"]}]'
# prepare credentials for multikueue
echo "Setup queue on the spoke"
kubectl apply -f env/single-clusterqueue-setup-mwrs.yaml
echo "Setup credentials for clusterprofile"
kubectl apply -f env/cp-c1.yaml
kubectl apply -f env/cp-c2.yaml
kubectl apply -f env/cp-c3.yaml
kubectl apply -f env/msa-c1.yaml
kubectl apply -f env/msa-c2.yaml
kubectl apply -f env/msa-c3.yaml
for CLUSTER in "${spoke_clusters[@]}"; do
sed "s/CLUSTER_NAME/$CLUSTER/g" env/clusterpermission.yaml | kubectl apply -f -
sed "s/CLUSTER_NAME/$CLUSTER/g" env/msa.yaml | kubectl apply -f -
done
echo "Setup faked GPU on the spoke"
kubectl label managedcluster cluster2 accelerator=nvidia-tesla-t4