Mirror of https://github.com/krkn-chaos/krkn.git, synced 2026-02-14 18:10:00 +00:00
Added VMware Node Scenarios (#285)

* Added VMware node scenarios
* Made vmware plugin independent of Krkn
* Revert changes made to node status watch
* Fixed minor documentation changes
committed by GitHub
parent f4bc30d2a1
commit 08deae63dd
@@ -14,6 +14,7 @@ kraken:
    - plugin_scenarios:
        - scenarios/openshift/etcd.yml
        - scenarios/openshift/regex_openshift_pod_kill.yml
        - scenarios/openshift/vmware_node_scenarios.yml
    - node_scenarios:   # List of chaos node scenarios to load
        - scenarios/openshift/node_scenarios_example.yml
    - plugin_scenarios:
@@ -5,6 +5,7 @@ Supported Cloud Providers:
* [Openstack](#openstack)
* [Azure](#azure)
* [Alibaba](#alibaba)
* [VMware](#vmware)

## AWS

@@ -53,3 +54,15 @@ See the [Installation guide](https://www.alibabacloud.com/help/en/alibaba-cloud-
Refer to the [region and zone page](https://www.alibabacloud.com/help/en/elastic-compute-service/latest/regions-and-zones#concept-2459516) to get the region ID for the region you are running on.

Set cloud_type to either alibaba or alicloud in your node scenario yaml file.

## VMware

Set the following environment variables:

1. ```export VSPHERE_IP=<vSphere_client_IP_address>```

2. ```export VSPHERE_USERNAME=<vSphere_client_username>```

3. ```export VSPHERE_PASSWORD=<vSphere_client_password>```

These are the credentials that you would normally use to access the vSphere client.
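For reference, a minimal sketch of how the plugin consumes these variables: it reads them from the environment and hands them to `create_vsphere_client` from the vSphere Automation SDK. The unverified session shown here is only needed when vCenter uses a self-signed certificate; the final `print` call is just an illustrative smoke test.

```python
from os import environ

import requests
from vmware.vapi.vsphere.client import create_vsphere_client

# Credentials exported above; the plugin raises an exception if any of them is missing.
server = environ.get("VSPHERE_IP")
username = environ.get("VSPHERE_USERNAME")
password = environ.get("VSPHERE_PASSWORD")

# Optional: skip TLS verification when vCenter uses a self-signed certificate.
session = requests.session()
session.verify = False

client = create_vsphere_client(
    server=server, username=username, password=password, session=session
)
print([vm.name for vm in client.vcenter.VM.list()])
```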
@@ -64,6 +64,10 @@ How to set up Alibaba cli to run node scenarios is defined [here](cloud_setup.md
Releasing a node is a 2-step process: stopping the node and then releasing it.

#### VMware
How to set up VMware vSphere to run node scenarios is defined [here](cloud_setup.md#vmware).

#### General

**NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenarios are supported independently of the cloud platform.
@@ -579,7 +579,7 @@ def watch_node_status(node, status, timeout, resource_version):
         cli.list_node,
         field_selector=f"metadata.name={node}",
         timeout_seconds=timeout,
-        resource_version=f"{resource_version}",
+        resource_version=f"{resource_version}"
     ):
         conditions = [status for status in event["object"].status.conditions if status.type == "Ready"]
         if conditions[0].status == status:
@@ -5,7 +5,7 @@ from os.path import abspath
from typing import List, Dict

from arcaflow_plugin_sdk import schema, serialization, jsonschema

import kraken.plugins.vmware.vmware_plugin as vmware_plugin
from kraken.plugins.pod_plugin import kill_pods, wait_for_pods
from kraken.plugins.run_python_plugin import run_python_file
@@ -153,6 +153,30 @@ PLUGINS = Plugins(
            [
                "error"
            ]
        ),
        PluginStep(
            vmware_plugin.node_start,
            [
                "error"
            ]
        ),
        PluginStep(
            vmware_plugin.node_stop,
            [
                "error"
            ]
        ),
        PluginStep(
            vmware_plugin.node_reboot,
            [
                "error"
            ]
        ),
        PluginStep(
            vmware_plugin.node_terminate,
            [
                "error"
            ]
        )
    ]
)
kraken/plugins/vmware/kubernetes_functions.py (new file, 179 lines)
@@ -0,0 +1,179 @@
from kubernetes import config, client
from kubernetes.client.rest import ApiException
import logging
import random
from enum import Enum


class Actions(Enum):
    """
    This enumeration indicates different kinds of node operations
    """

    START = "Start"
    STOP = "Stop"
    TERMINATE = "Terminate"
    REBOOT = "Reboot"


def setup_kubernetes(kubeconfig_path):
    """
    Sets up the Kubernetes client
    """

    if kubeconfig_path is None:
        kubeconfig_path = config.KUBE_CONFIG_DEFAULT_LOCATION
    kubeconfig = config.kube_config.KubeConfigMerger(kubeconfig_path)

    if kubeconfig.config is None:
        raise Exception(
            "Invalid kube-config file: %s. " "No configuration found." % kubeconfig_path
        )
    loader = config.kube_config.KubeConfigLoader(
        config_dict=kubeconfig.config,
    )
    client_config = client.Configuration()
    loader.load_and_set(client_config)
    return client.ApiClient(configuration=client_config)


def list_killable_nodes(core_v1, label_selector=None):
    """
    Returns a list of nodes that can be stopped/reset/released
    """

    nodes = []
    try:
        if label_selector:
            ret = core_v1.list_node(pretty=True, label_selector=label_selector)
        else:
            ret = core_v1.list_node(pretty=True)
    except ApiException as e:
        logging.error("Exception when calling CoreV1Api->list_node: %s\n" % e)
        raise e
    for node in ret.items:
        for cond in node.status.conditions:
            if str(cond.type) == "Ready" and str(cond.status) == "True":
                nodes.append(node.metadata.name)
    return nodes


def list_startable_nodes(core_v1, label_selector=None):
    """
    Returns a list of nodes that can be started
    """

    nodes = []
    try:
        if label_selector:
            ret = core_v1.list_node(pretty=True, label_selector=label_selector)
        else:
            ret = core_v1.list_node(pretty=True)
    except ApiException as e:
        logging.error("Exception when calling CoreV1Api->list_node: %s\n" % e)
        raise e
    for node in ret.items:
        for cond in node.status.conditions:
            if str(cond.type) == "Ready" and str(cond.status) != "True":
                nodes.append(node.metadata.name)
    return nodes


def get_node_list(cfg, action, core_v1):
    """
    Returns a list of nodes to be used in the node scenarios. The list returned is constructed as follows:
    - If the key 'name' is present in the node scenario config, the value is extracted and split into a list
    - Each node in the list is fed to the get_node function, which checks if the node is killable or
      fetches the node using the label selector
    """

    def get_node(node_name, label_selector, instance_kill_count, action, core_v1):
        list_nodes_func = (
            list_startable_nodes if action == Actions.START else list_killable_nodes
        )
        if node_name in list_nodes_func(core_v1):
            return [node_name]
        elif node_name:
            logging.info(
                "Node with provided node_name does not exist or the node might "
                "be in NotReady state."
            )
        nodes = list_nodes_func(core_v1, label_selector)
        if not nodes:
            raise Exception("Ready nodes with the provided label selector do not exist")
        logging.info(
            "Ready nodes with the label selector %s: %s" % (label_selector, nodes)
        )
        number_of_nodes = len(nodes)
        if instance_kill_count == number_of_nodes:
            return nodes
        nodes_to_return = []
        for i in range(instance_kill_count):
            node_to_add = nodes[random.randint(0, len(nodes) - 1)]
            nodes_to_return.append(node_to_add)
            nodes.remove(node_to_add)
        return nodes_to_return

    if cfg.name:
        input_nodes = cfg.name.split(",")
    else:
        input_nodes = [""]
    scenario_nodes = set()

    if cfg.skip_openshift_checks:
        scenario_nodes = input_nodes
    else:
        for node in input_nodes:
            nodes = get_node(
                node, cfg.label_selector, cfg.instance_count, action, core_v1
            )
            scenario_nodes.update(nodes)

    return list(scenario_nodes)


def watch_node_status(node, status, timeout, watch_resource, core_v1):
    """
    Monitor the status of a node for change
    """
    count = timeout
    for event in watch_resource.stream(
        core_v1.list_node,
        field_selector=f"metadata.name={node}",
        timeout_seconds=timeout,
    ):
        conditions = [
            status
            for status in event["object"].status.conditions
            if status.type == "Ready"
        ]
        if conditions[0].status == status:
            watch_resource.stop()
            break
        else:
            count -= 1
            logging.info("Status of node " + node + ": " + str(conditions[0].status))
        if not count:
            watch_resource.stop()


def wait_for_ready_status(node, timeout, watch_resource, core_v1):
    """
    Wait until the node status becomes Ready
    """
    watch_node_status(node, "True", timeout, watch_resource, core_v1)


def wait_for_not_ready_status(node, timeout, watch_resource, core_v1):
    """
    Wait until the node status becomes Not Ready
    """
    watch_node_status(node, "False", timeout, watch_resource, core_v1)


def wait_for_unknown_status(node, timeout, watch_resource, core_v1):
    """
    Wait until the node status becomes Unknown
    """
    watch_node_status(node, "Unknown", timeout, watch_resource, core_v1)
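As a rough sketch of how these helpers fit together (this mirrors the way the plugin steps below use them; the worker label selector is only an illustrative value, and a reachable cluster with a kubeconfig at the default location is assumed):

```python
from kubernetes import client, watch

from kraken.plugins.vmware import kubernetes_functions as kube_helper

# Build a Kubernetes API client from the default kubeconfig.
cli = kube_helper.setup_kubernetes(None)
core_v1 = client.CoreV1Api(cli)

# List Ready nodes matching a label selector (illustrative selector).
ready_nodes = kube_helper.list_killable_nodes(
    core_v1, label_selector="node-role.kubernetes.io/worker"
)
print("Ready worker nodes:", ready_nodes)

# Block until the first matching node reports Ready again, or until the timeout expires.
if ready_nodes:
    kube_helper.wait_for_ready_status(ready_nodes[0], 300, watch.Watch(), core_v1)
```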
kraken/plugins/vmware/test_vmware_plugin.py (new file, 119 lines)
@@ -0,0 +1,119 @@
import unittest
import os
import logging
from arcaflow_plugin_sdk import plugin
from kraken.plugins.vmware.kubernetes_functions import Actions
from kraken.plugins.vmware import vmware_plugin


class NodeScenariosTest(unittest.TestCase):
    def setUp(self):
        vsphere_env_vars = ["VSPHERE_IP", "VSPHERE_USERNAME", "VSPHERE_PASSWORD"]
        self.credentials_present = all(
            env_var in os.environ for env_var in vsphere_env_vars
        )

    def test_serialization(self):
        plugin.test_object_serialization(
            vmware_plugin.NodeScenarioConfig(name="test", skip_openshift_checks=True),
            self.fail,
        )
        plugin.test_object_serialization(
            vmware_plugin.NodeScenarioSuccessOutput(
                nodes={}, action=Actions.START
            ),
            self.fail,
        )
        plugin.test_object_serialization(
            vmware_plugin.NodeScenarioErrorOutput(
                error="Hello World", action=Actions.START
            ),
            self.fail,
        )

    def test_node_start(self):
        if not self.credentials_present:
            self.skipTest(
                "Check if the environment variables 'VSPHERE_IP', 'VSPHERE_USERNAME', 'VSPHERE_PASSWORD' are set"
            )
        vsphere = vmware_plugin.vSphere(verify=False)
        vm_id, vm_name = vsphere.create_default_vm()
        if vm_id is None:
            self.fail("Could not create test VM")

        output_id, output_data = vmware_plugin.node_start(
            vmware_plugin.NodeScenarioConfig(
                name=vm_name, skip_openshift_checks=True, verify_session=False
            )
        )
        if output_id == "error":
            logging.error(output_data.error)
            self.fail("The VMware VM did not start because an error occurred")
        vsphere.release_instances(vm_name)

    def test_node_stop(self):
        if not self.credentials_present:
            self.skipTest(
                "Check if the environment variables 'VSPHERE_IP', 'VSPHERE_USERNAME', 'VSPHERE_PASSWORD' are set"
            )
        vsphere = vmware_plugin.vSphere(verify=False)
        vm_id, vm_name = vsphere.create_default_vm()
        if vm_id is None:
            self.fail("Could not create test VM")
        vsphere.start_instances(vm_name)

        output_id, output_data = vmware_plugin.node_stop(
            vmware_plugin.NodeScenarioConfig(
                name=vm_name, skip_openshift_checks=True, verify_session=False
            )
        )
        if output_id == "error":
            logging.error(output_data.error)
            self.fail("The VMware VM did not stop because an error occurred")
        vsphere.release_instances(vm_name)

    def test_node_reboot(self):
        if not self.credentials_present:
            self.skipTest(
                "Check if the environment variables 'VSPHERE_IP', 'VSPHERE_USERNAME', 'VSPHERE_PASSWORD' are set"
            )
        vsphere = vmware_plugin.vSphere(verify=False)
        vm_id, vm_name = vsphere.create_default_vm()
        if vm_id is None:
            self.fail("Could not create test VM")
        vsphere.start_instances(vm_name)

        output_id, output_data = vmware_plugin.node_reboot(
            vmware_plugin.NodeScenarioConfig(
                name=vm_name, skip_openshift_checks=True, verify_session=False
            )
        )
        if output_id == "error":
            logging.error(output_data.error)
            self.fail("The VMware VM did not reboot because an error occurred")
        vsphere.release_instances(vm_name)

    def test_node_terminate(self):
        if not self.credentials_present:
            self.skipTest(
                "Check if the environment variables 'VSPHERE_IP', 'VSPHERE_USERNAME', 'VSPHERE_PASSWORD' are set"
            )
        vsphere = vmware_plugin.vSphere(verify=False)
        vm_id, vm_name = vsphere.create_default_vm()
        if vm_id is None:
            self.fail("Could not create test VM")
        vsphere.start_instances(vm_name)

        output_id, output_data = vmware_plugin.node_terminate(
            vmware_plugin.NodeScenarioConfig(
                name=vm_name, skip_openshift_checks=True, verify_session=False
            )
        )
        if output_id == "error":
            logging.error(output_data.error)
            self.fail("The VMware VM did not terminate because an error occurred")


if __name__ == "__main__":
    unittest.main()
kraken/plugins/vmware/vmware_plugin.py (new file, 651 lines)
@@ -0,0 +1,651 @@
#!/usr/bin/env python
import sys
import time
import typing
from os import environ
from dataclasses import dataclass, field
import random
from traceback import format_exc
import logging
from kraken.plugins.vmware import kubernetes_functions as kube_helper
from com.vmware.vcenter_client import ResourcePool
from arcaflow_plugin_sdk import validation, plugin
from kubernetes import client, watch
from vmware.vapi.vsphere.client import create_vsphere_client
from com.vmware.vcenter_client import VM
from com.vmware.vcenter.vm_client import Power
from com.vmware.vapi.std.errors_client import (
    AlreadyInDesiredState,
    NotAllowedInCurrentState,
)
import requests

class vSphere:
    def __init__(self, verify=True):
        """
        Initialize the vSphere client by using the env variables:
        'VSPHERE_IP', 'VSPHERE_USERNAME', 'VSPHERE_PASSWORD'
        """
        self.server = environ.get("VSPHERE_IP")
        self.username = environ.get("VSPHERE_USERNAME")
        self.password = environ.get("VSPHERE_PASSWORD")
        session = self.get_unverified_session() if not verify else None
        self.credentials_present = (
            True if self.server and self.username and self.password else False
        )
        if not self.credentials_present:
            raise Exception(
                "Environment variables 'VSPHERE_IP', 'VSPHERE_USERNAME', 'VSPHERE_PASSWORD' are not set"
            )
        self.client = create_vsphere_client(
            server=self.server,
            username=self.username,
            password=self.password,
            session=session,
        )

    def get_unverified_session(self):
        """
        Returns an unverified session object
        """

        session = requests.session()
        session.verify = False
        requests.packages.urllib3.disable_warnings()
        return session

    def get_vm(self, instance_id):
        """
        Returns the VM ID corresponding to the VM Name (instance_id).
        If there are multiple matches, this only returns the first one
        """

        names = set([instance_id])
        vms = self.client.vcenter.VM.list(VM.FilterSpec(names=names))

        if len(vms) == 0:
            logging.info("VM with name ({}) not found".format(instance_id))
            return None
        vm = vms[0].vm

        return vm

    def release_instances(self, instance_id):
        """
        Deletes the VM whose name is given by 'instance_id'
        """

        vm = self.get_vm(instance_id)
        if not vm:
            raise Exception(
                "VM with the name ({}) does not exist. "
                "Please create the vm first.".format(instance_id)
            )
        state = self.client.vcenter.vm.Power.get(vm)
        if state == Power.Info(state=Power.State.POWERED_ON):
            self.client.vcenter.vm.Power.stop(vm)
        elif state == Power.Info(state=Power.State.SUSPENDED):
            self.client.vcenter.vm.Power.start(vm)
            self.client.vcenter.vm.Power.stop(vm)
        self.client.vcenter.VM.delete(vm)
        logging.info("Deleted VM -- '{}-({})'".format(instance_id, vm))

    def reboot_instances(self, instance_id):
        """
        Reboots the VM whose name is given by 'instance_id'. Returns True if successful, or
        returns False if the VM is not powered on
        """

        vm = self.get_vm(instance_id)
        try:
            self.client.vcenter.vm.Power.reset(vm)
            logging.info("Reset VM -- '{}-({})'".format(instance_id, vm))
            return True
        except NotAllowedInCurrentState:
            logging.info(
                "VM '{}'-'({})' is not Powered On. Cannot reset it".format(
                    instance_id, vm
                )
            )
            return False

    def stop_instances(self, instance_id):
        """
        Stops the VM whose name is given by 'instance_id'. Returns True if successful, or
        returns False if the VM is already powered off
        """

        vm = self.get_vm(instance_id)
        try:
            self.client.vcenter.vm.Power.stop(vm)
            logging.info("Stopped VM -- '{}-({})'".format(instance_id, vm))
            return True
        except AlreadyInDesiredState:
            logging.info(
                "VM '{}'-'({})' is already Powered Off".format(instance_id, vm)
            )
            return False

    def start_instances(self, instance_id):
        """
        Starts the VM whose name is given by 'instance_id'. Returns True if successful, or
        returns False if the VM is already powered on
        """

        vm = self.get_vm(instance_id)
        try:
            self.client.vcenter.vm.Power.start(vm)
            logging.info("Started VM -- '{}-({})'".format(instance_id, vm))
            return True
        except AlreadyInDesiredState:
            logging.info("VM '{}'-'({})' is already Powered On".format(instance_id, vm))
            return False

    def list_instances(self, datacenter):
        """
        Returns a list of VMs present in the datacenter
        """

        datacenter_filter = self.client.vcenter.Datacenter.FilterSpec(
            names=set([datacenter])
        )
        datacenter_summaries = self.client.vcenter.Datacenter.list(datacenter_filter)
        try:
            datacenter_id = datacenter_summaries[0].datacenter
        except IndexError:
            logging.error("Datacenter '{}' doesn't exist".format(datacenter))
            sys.exit(1)

        vm_filter = self.client.vcenter.VM.FilterSpec(datacenters={datacenter_id})
        vm_summaries = self.client.vcenter.VM.list(vm_filter)
        vm_names = []
        for vm in vm_summaries:
            vm_names.append({"vm_name": vm.name, "vm_id": vm.vm})
        return vm_names

    def get_datacenter_list(self):
        """
        Returns a dictionary containing all the datacenter names and IDs
        """

        datacenter_summaries = self.client.vcenter.Datacenter.list()
        datacenter_names = [
            {"datacenter_id": datacenter.datacenter, "datacenter_name": datacenter.name}
            for datacenter in datacenter_summaries
        ]
        return datacenter_names

    def get_datastore_list(self, datacenter=None):
        """
        Returns a dictionary containing all the datastore names and IDs belonging to a specific datacenter
        """

        datastore_filter = self.client.vcenter.Datastore.FilterSpec(
            datacenters={datacenter}
        )
        datastore_summaries = self.client.vcenter.Datastore.list(datastore_filter)
        datastore_names = []
        for datastore in datastore_summaries:
            datastore_names.append(
                {"datastore_name": datastore.name, "datastore_id": datastore.datastore}
            )
        return datastore_names

    def get_folder_list(self, datacenter=None):
        """
        Returns a dictionary containing all the folder names and IDs belonging to a specific datacenter
        """

        folder_filter = self.client.vcenter.Folder.FilterSpec(datacenters={datacenter})
        folder_summaries = self.client.vcenter.Folder.list(folder_filter)
        folder_names = []
        for folder in folder_summaries:
            folder_names.append(
                {"folder_name": folder.name, "folder_id": folder.folder}
            )
        return folder_names

    def get_resource_pool(self, datacenter, resource_pool_name=None):
        """
        Returns the identifier of the resource pool with the given name or the
        first resource pool in the datacenter if the name is not provided.
        """

        names = set([resource_pool_name]) if resource_pool_name else None
        filter_spec = ResourcePool.FilterSpec(
            datacenters=set([datacenter]), names=names
        )
        resource_pool_summaries = self.client.vcenter.ResourcePool.list(filter_spec)
        if len(resource_pool_summaries) > 0:
            resource_pool = resource_pool_summaries[0].resource_pool
            return resource_pool
        else:
            logging.error(
                "ResourcePool not found in Datacenter '{}'".format(datacenter)
            )
            return None

    def create_default_vm(self, guest_os="RHEL_7_64", max_attempts=10):
        """
        Creates a default VM with 2 GB memory, 1 CPU and 16 GB disk space in a
        random datacenter. Accepts the guest OS as a parameter. Since the VM placement
        is random, it might fail due to resource constraints, so this function tries
        for up to 'max_attempts' attempts to create the VM
        """

        def create_vm(vm_name, resource_pool, folder, datastore, guest_os):
            """
            Creates a VM and returns its ID and name. Requires the VM name, resource
            pool name, folder name, datastore and the guest OS
            """

            placement_spec = VM.PlacementSpec(
                folder=folder, resource_pool=resource_pool, datastore=datastore
            )
            vm_create_spec = VM.CreateSpec(
                name=vm_name, guest_os=guest_os, placement=placement_spec
            )

            vm_id = self.client.vcenter.VM.create(vm_create_spec)
            return vm_id

        for _ in range(max_attempts):
            try:
                datacenter_list = self.get_datacenter_list()
                datacenter = random.choice(datacenter_list)
                resource_pool = self.get_resource_pool(datacenter["datacenter_id"])
                folder = random.choice(
                    self.get_folder_list(datacenter["datacenter_id"])
                )["folder_id"]
                datastore = random.choice(
                    self.get_datastore_list(datacenter["datacenter_id"])
                )["datastore_id"]
                vm_name = "Test-" + str(time.time_ns())
                return (
                    create_vm(vm_name, resource_pool, folder, datastore, guest_os),
                    vm_name,
                )
            except Exception as e:
                pass
        logging.error(
            "Default VM could not be created in {} attempts. Check your VMware resources".format(
                max_attempts
            )
        )
        return None, None

    def get_vm_status(self, instance_id):
        """
        Returns the status of the VM whose name is given by 'instance_id'
        """

        try:
            vm = self.get_vm(instance_id)
            state = self.client.vcenter.vm.Power.get(vm).state
            logging.info("Check instance %s status", instance_id)
            return state
        except Exception as e:
            logging.error(
                "Failed to get node instance status %s. Encountered following "
                "exception: %s." % (instance_id, e)
            )
            return None

    def wait_until_released(self, instance_id, timeout):
        """
        Waits until the VM is deleted or until the timeout. Returns True if
        the VM is successfully deleted, else returns False
        """

        time_counter = 0
        vm = self.get_vm(instance_id)
        while vm is not None:
            vm = self.get_vm(instance_id)
            logging.info(
                "VM %s is still being deleted, sleeping for 5 seconds" % instance_id
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
                    "VM %s is still not deleted in allotted time" % instance_id
                )
                return False
        return True

    def wait_until_running(self, instance_id, timeout):
        """
        Waits until the VM switches to POWERED_ON state or until the timeout.
        Returns True if the VM switches to POWERED_ON, else returns False
        """

        time_counter = 0
        status = self.get_vm_status(instance_id)
        while status != Power.State.POWERED_ON:
            status = self.get_vm_status(instance_id)
            logging.info(
                "VM %s is still not running, sleeping for 5 seconds" % instance_id
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info("VM %s is still not ready in allotted time" % instance_id)
                return False
        return True

    def wait_until_stopped(self, instance_id, timeout):
        """
        Waits until the VM switches to POWERED_OFF state or until the timeout.
        Returns True if the VM switches to POWERED_OFF, else returns False
        """

        time_counter = 0
        status = self.get_vm_status(instance_id)
        while status != Power.State.POWERED_OFF:
            status = self.get_vm_status(instance_id)
            logging.info(
                "VM %s has still not stopped, sleeping for 5 seconds" % instance_id
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info("VM %s did not stop in the allotted time" % instance_id)
                return False
        return True

@dataclass
class Node:
    name: str


@dataclass
class NodeScenarioSuccessOutput:

    nodes: typing.Dict[int, Node] = field(
        metadata={
            "name": "Nodes started/stopped/terminated/rebooted",
            "description": """Map between timestamps and the nodes started/stopped/terminated/rebooted.
                              The timestamp is provided in nanoseconds""",
        }
    )
    action: kube_helper.Actions = field(
        metadata={
            "name": "The action performed on the node",
            "description": """The action performed or attempted to be performed on the node. Possible values
                              are: Start, Stop, Terminate, Reboot""",
        }
    )


@dataclass
class NodeScenarioErrorOutput:

    error: str
    action: kube_helper.Actions = field(
        metadata={
            "name": "The action performed on the node",
            "description": """The action attempted to be performed on the node. Possible values are: Start,
                              Stop, Terminate, Reboot""",
        }
    )


@dataclass
class NodeScenarioConfig:

    name: typing.Annotated[
        typing.Optional[str],
        validation.required_if_not("label_selector"),
        validation.required_if("skip_openshift_checks"),
    ] = field(
        default=None,
        metadata={
            "name": "Name",
            "description": "Name(s) for target nodes. Required if label_selector is not set.",
        },
    )

    runs: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
        default=1,
        metadata={
            "name": "Number of runs per node",
            "description": "Number of times to inject each scenario under actions (will perform on the same node each time)",
        },
    )

    label_selector: typing.Annotated[
        typing.Optional[str], validation.min(1), validation.required_if_not("name")
    ] = field(
        default=None,
        metadata={
            "name": "Label selector",
            "description": "Kubernetes label selector for the target nodes. Required if name is not set.\n"
            "See https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ for details.",
        },
    )

    timeout: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
        default=180,
        metadata={
            "name": "Timeout",
            "description": "Timeout, in seconds, to wait for the target node(s) to reach the expected state.",
        },
    )

    instance_count: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
        default=1,
        metadata={
            "name": "Instance Count",
            "description": "Number of nodes to perform the action on, selected from the nodes matching the label selector.",
        },
    )

    skip_openshift_checks: typing.Optional[bool] = field(
        default=False,
        metadata={
            "name": "Skip OpenShift Checks",
            "description": "Skip checking the status of the OpenShift nodes.",
        },
    )

    verify_session: bool = field(
        default=True,
        metadata={
            "name": "Verify API Session",
            "description": "Verifies the vSphere client session. It is enabled by default.",
        },
    )

    kubeconfig_path: typing.Optional[str] = field(
        default=None,
        metadata={
            "name": "Kubeconfig path",
            "description": "Path to your kubeconfig file. Defaults to ~/.kube/config.\n"
            "See https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ for "
            "details.",
        },
    )

@plugin.step(
    id="node-start",
    name="Start the node",
    description="Start the node(s) by starting the VMware VM on which the node is configured",
    outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
)
def node_start(
    cfg: NodeScenarioConfig,
) -> typing.Tuple[
    str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
]:
    with kube_helper.setup_kubernetes(None) as cli:
        vsphere = vSphere(verify=cfg.verify_session)
        core_v1 = client.CoreV1Api(cli)
        watch_resource = watch.Watch()
        node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.START, core_v1)
        nodes_started = {}
        for name in node_list:
            try:
                for _ in range(cfg.runs):
                    logging.info("Starting node_start_scenario injection")
                    logging.info("Starting the node %s " % (name))
                    vm_started = vsphere.start_instances(name)
                    if vm_started:
                        vsphere.wait_until_running(name, cfg.timeout)
                        if not cfg.skip_openshift_checks:
                            kube_helper.wait_for_ready_status(
                                name, cfg.timeout, watch_resource, core_v1
                            )
                    nodes_started[int(time.time_ns())] = Node(name=name)
                    logging.info("Node with instance ID: %s is in running state" % name)
                    logging.info("node_start_scenario has been successfully injected!")
            except Exception as e:
                logging.error("Failed to start node instance. Test Failed")
                logging.error("node_start_scenario injection failed!")
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.START
                )

        return "success", NodeScenarioSuccessOutput(
            nodes_started, kube_helper.Actions.START
        )

@plugin.step(
    id="node-stop",
    name="Stop the node",
    description="Stop the node(s) by stopping the VMware VM on which the node is configured",
    outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
)
def node_stop(
    cfg: NodeScenarioConfig,
) -> typing.Tuple[
    str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
]:
    with kube_helper.setup_kubernetes(None) as cli:
        vsphere = vSphere(verify=cfg.verify_session)
        core_v1 = client.CoreV1Api(cli)
        watch_resource = watch.Watch()
        node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.STOP, core_v1)
        nodes_stopped = {}
        for name in node_list:
            try:
                for _ in range(cfg.runs):
                    logging.info("Starting node_stop_scenario injection")
                    logging.info("Stopping the node %s " % (name))
                    vm_stopped = vsphere.stop_instances(name)
                    if vm_stopped:
                        vsphere.wait_until_stopped(name, cfg.timeout)
                        if not cfg.skip_openshift_checks:
                            kube_helper.wait_for_unknown_status(
                                name, cfg.timeout, watch_resource, core_v1
                            )
                    nodes_stopped[int(time.time_ns())] = Node(name=name)
                    logging.info("Node with instance ID: %s is in stopped state" % name)
                    logging.info("node_stop_scenario has been successfully injected!")
            except Exception as e:
                logging.error("Failed to stop node instance. Test Failed")
                logging.error("node_stop_scenario injection failed!")
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.STOP
                )

        return "success", NodeScenarioSuccessOutput(
            nodes_stopped, kube_helper.Actions.STOP
        )

@plugin.step(
    id="node-reboot",
    name="Reboot VMware VM",
    description="Reboot the node(s) by rebooting the VMware VM on which the node is configured",
    outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
)
def node_reboot(
    cfg: NodeScenarioConfig,
) -> typing.Tuple[
    str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
]:
    with kube_helper.setup_kubernetes(None) as cli:
        vsphere = vSphere(verify=cfg.verify_session)
        core_v1 = client.CoreV1Api(cli)
        watch_resource = watch.Watch()
        node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.REBOOT, core_v1)
        nodes_rebooted = {}
        for name in node_list:
            try:
                for _ in range(cfg.runs):
                    logging.info("Starting node_reboot_scenario injection")
                    logging.info("Rebooting the node %s " % (name))
                    vsphere.reboot_instances(name)
                    if not cfg.skip_openshift_checks:
                        kube_helper.wait_for_unknown_status(
                            name, cfg.timeout, watch_resource, core_v1
                        )
                        kube_helper.wait_for_ready_status(
                            name, cfg.timeout, watch_resource, core_v1
                        )
                    nodes_rebooted[int(time.time_ns())] = Node(name=name)
                    logging.info(
                        "Node with instance ID: %s has rebooted successfully" % name
                    )
                    logging.info("node_reboot_scenario has been successfully injected!")
            except Exception as e:
                logging.error("Failed to reboot node instance. Test Failed")
                logging.error("node_reboot_scenario injection failed!")
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.REBOOT
                )

        return "success", NodeScenarioSuccessOutput(
            nodes_rebooted, kube_helper.Actions.REBOOT
        )

@plugin.step(
    id="node-terminate",
    name="Terminate VMware VM",
    description="Terminate the node(s) by releasing (deleting) the VMware VM on which the node is configured",
    outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
)
def node_terminate(
    cfg: NodeScenarioConfig,
) -> typing.Tuple[
    str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
]:
    with kube_helper.setup_kubernetes(None) as cli:
        vsphere = vSphere(verify=cfg.verify_session)
        core_v1 = client.CoreV1Api(cli)
        node_list = kube_helper.get_node_list(
            cfg, kube_helper.Actions.TERMINATE, core_v1
        )
        nodes_terminated = {}
        for name in node_list:
            try:
                for _ in range(cfg.runs):
                    logging.info(
                        "Starting node_termination_scenario injection by first stopping the node"
                    )
                    vsphere.stop_instances(name)
                    vsphere.wait_until_stopped(name, cfg.timeout)
                    logging.info("Releasing the node with instance ID: %s " % (name))
                    vsphere.release_instances(name)
                    vsphere.wait_until_released(name, cfg.timeout)
                    nodes_terminated[int(time.time_ns())] = Node(name=name)
                    logging.info("Node with instance ID: %s has been released" % name)
                    logging.info(
                        "node_terminate_scenario has been successfully injected!"
                    )
            except Exception as e:
                logging.error("Failed to terminate node instance. Test Failed")
                logging.error("node_terminate_scenario injection failed!")
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.TERMINATE
                )

        return "success", NodeScenarioSuccessOutput(
            nodes_terminated, kube_helper.Actions.TERMINATE
        )
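For orientation, the unit tests above drive these steps directly; a minimal sketch along the same lines (the VM name is a placeholder, the vSphere environment variables from the setup guide must be exported, and a kubeconfig is still required because the step builds a Kubernetes client even when `skip_openshift_checks` is set):

```python
import logging

from kraken.plugins.vmware import vmware_plugin

# Reboot a single VM by name; skip the OpenShift node-status checks so the step
# can also be run against a VM that is not part of a cluster.
output_id, output_data = vmware_plugin.node_reboot(
    vmware_plugin.NodeScenarioConfig(
        name="my-test-vm",            # placeholder VM name
        skip_openshift_checks=True,
        verify_session=False,         # skip TLS verification for self-signed vCenter certs
    )
)

if output_id == "error":
    logging.error(output_data.error)
else:
    logging.info("Rebooted nodes: %s", output_data.nodes)
```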
@@ -12,7 +12,7 @@ oauth2client>=4.1.3
 python-openstackclient
 gitpython
 paramiko
-setuptools
+setuptools==63.4.1
 openshift-client
 python-ipmi
 podman-compose
@@ -23,3 +23,4 @@ werkzeug==2.0.3
 aliyun-python-sdk-core-v3
 aliyun-python-sdk-ecs
 arcaflow-plugin-sdk==0.3.0
+git+https://github.com/vmware/vsphere-automation-sdk-python.git
scenarios/openshift/vmware_node_scenarios.yml (new file, 10 lines)
@@ -0,0 +1,10 @@
# yaml-language-server: $schema=../plugin.schema.json
- id: <node-stop/node-start/node-reboot/node-terminate>
  config:
    name: <node_name> # Node on which the scenario has to be injected; multiple names can be set, separated by commas
    label_selector: <label_selector> # When node_name is not specified, a node with a matching label_selector is selected for node chaos scenario injection
    runs: 1 # Number of times to inject each scenario under actions (will perform on the same node each time)
    instance_count: 1 # Number of nodes to perform the action on that match the label selector
    timeout: 300 # Duration to wait for completion of node scenario injection
    verify_session: True # Set to True if you want to verify the vSphere client session using certificates; else False
    skip_openshift_checks: False # Set to True if you don't want to wait for the status of the nodes to change on OpenShift before passing the scenario
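Each `config` block in this file corresponds field-for-field to the `NodeScenarioConfig` dataclass in `vmware_plugin.py`. A small, hedged sketch of that mapping (the scenario entry below is illustrative, and in krkn the deserialization is handled by the arcaflow plugin SDK schema rather than by constructing the dataclass directly):

```python
import yaml

from kraken.plugins.vmware import vmware_plugin

# Illustrative scenario entry; in practice this is read from the YAML file above.
scenario = yaml.safe_load(
    """
- id: node-reboot
  config:
    label_selector: node-role.kubernetes.io/worker
    runs: 1
    instance_count: 1
    timeout: 300
    verify_session: False
    skip_openshift_checks: False
"""
)[0]

# The keys under 'config' line up with the NodeScenarioConfig field names.
cfg = vmware_plugin.NodeScenarioConfig(**scenario["config"])
print(scenario["id"], cfg)
```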