mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-02-14 09:59:59 +00:00
Baremetal Node Support (#74)
* Support for baremetal node scenarios * Finished baremetal support * Added documentation for baremetal * Clarify limitations of implementation in documentation * Add baremetal support to new run.py file * Allow use on newer machines Some older machines require lanplus instead of lan * Setup to allow per-device user, pass, and bmc address Also set min version for a dependency * Fix linting issues * More linting issue fixes * More linter issues * Account for linter standard non-conformity * Added baremetal warning Co-authored-by: jaredoconnell <jocnnel@redhat.com>
This commit is contained in:
@@ -20,6 +20,18 @@ Following node chaos scenarios are supported:
|
||||
|
||||
How to set up AWS cli to run node scenarios is defined [here](cloud_setup.md#aws)
|
||||
|
||||
#### Baremetal
|
||||
**NOTE**: Baremetal requires setting the IPMI user and password to power on, off, and reboot nodes, using the config options `bmc_user` and `bmc_password`. They can be set either in the root of the entry in the scenarios config, or per machine.
|
||||
|
||||
If no per-machine addresses are specified, kraken attempts to use the BMC value in the BareMetalHost object. To list them, you can do 'oc get bmh -o wide --all-namespaces'. If the BMC values are blank, you must specify them per-machine using the config option 'bmc_addr' as specified below.
|
||||
|
||||
For per-machine settings, add a "bmc_info" section to the entry in the scenarios config. Inside there, add a configuration section using the node name. In that, add per-machine settings. Valid settings are 'bmc_user', 'bmc_password', and 'bmc_addr'.
|
||||
For examples, see the example node scenario or the example below.
|
||||
|
||||
**NOTE**: Baremetal requires oc (openshift client) be installed on the machine running Kraken.
|
||||
|
||||
**NOTE**: Baremetal machines are fragile. Some node actions can occasionally corrupt the filesystem if the node does not shut down properly, and sometimes the kubelet does not start properly afterwards.
|
||||
|
||||
#### GCP
|
||||
How to set up GCP cli to run node scenarios is defined [here](cloud_setup.md#gcp)
|
||||
|
||||
@@ -80,4 +92,20 @@ node_scenarios:
|
||||
- named
|
||||
ssh_private_key: /root/.ssh/id_rsa # ssh key to access the helper node
|
||||
cloud_type: openstack
|
||||
- actions:
|
||||
- node_stop_start_scenario
|
||||
node_name:
|
||||
label_selector: node-role.kubernetes.io/worker
|
||||
instance_kill_count: 1
|
||||
timeout: 120
|
||||
cloud_type: bm
|
||||
bmc_user: defaultuser # For baremetal (bm) cloud type. The default IPMI username. Optional if specified for all machines.
|
||||
bmc_password: defaultpass # For baremetal (bm) cloud type. The default IPMI password. Optional if specified for all machines.
|
||||
bmc_info: # This section is here to specify baremetal per-machine info, so it is optional if there is no per-machine info.
|
||||
node-1: # The node name for the baremetal machine
|
||||
bmc_addr: mgmt-machine1.example.com # Optional. For baremetal nodes with the IPMI BMC address missing from 'oc get bmh'
|
||||
node-2:
|
||||
bmc_addr: mgmt-machine2.example.com
|
||||
bmc_user: user # The baremetal IPMI user. Overrides the default IPMI user specified above. Optional if the default is set.
|
||||
bmc_password: pass # The baremetal IPMI password. Overrides the default IPMI password specified above. Optional if the default is set.
|
||||
```
|
||||
|
||||
178
kraken/node_actions/bm_node_scenarios.py
Normal file
178
kraken/node_actions/bm_node_scenarios.py
Normal file
@@ -0,0 +1,178 @@
|
||||
import kraken.node_actions.common_node_functions as nodeaction
|
||||
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
|
||||
import logging
|
||||
import openshift as oc
|
||||
import pyipmi
|
||||
import pyipmi.interfaces
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
|
||||
|
||||
class BM:
    """IPMI power control for baremetal nodes.

    Holds the default IPMI credentials plus optional per-machine settings
    and uses them to look up a node's BMC address and to power the machine
    on, off, or cycle it.
    """

    # Port used when the BMC address does not carry an explicit one.
    DEFAULT_IPMI_PORT = 623

    def __init__(self, bm_info, user, passwd):
        """Store the per-machine settings and the default credentials.

        Args:
            bm_info: Optional mapping of node name to a mapping of per-machine
                settings ("bmc_addr", "bmc_user", "bmc_password"). May be None.
            user: Default IPMI user, or None when only per-machine users exist.
            passwd: Default IPMI password, or None likewise.
        """
        self.user = user
        self.passwd = passwd
        self.bm_info = bm_info

    def _node_info(self, node_name):
        # Per-machine settings for node_name; {} when nothing is configured
        # (no bm_info at all, node not listed, or the node's entry is empty).
        if self.bm_info is None:
            return {}
        return self.bm_info.get(node_name) or {}

    @staticmethod
    def _bmh_name_from_provider_id(provider_id):
        # The BareMetalHost name is the second-to-last "/"-separated
        # component of the provider id; the last component is the uid.
        uid_separator = provider_id.rfind("/")
        name_start = provider_id.rfind("/", 0, uid_separator) + 1
        return provider_id[name_start:uid_separator]

    @staticmethod
    def _split_bmc_addr(bmc_addr):
        # Split "[scheme://]host[:port]" into (host, port); the port falls
        # back to DEFAULT_IPMI_PORT when it is not given.
        _, scheme_sep, remainder = bmc_addr.partition("://")
        host_and_port = remainder if scheme_sep else bmc_addr
        host, port_sep, port_text = host_and_port.partition(":")
        port = int(port_text) if port_sep else BM.DEFAULT_IPMI_PORT
        return host, port

    def _resolve_credentials(self, node_name):
        # Pick the per-machine user/password when configured, otherwise the
        # defaults; exit the process when either one is still missing.
        node_info = self._node_info(node_name)
        user = node_info.get("bmc_user", self.user)
        passwd = node_info.get("bmc_password", self.passwd)
        if user is None or passwd is None:
            logging.error(
                "Missing IPMI BMI user and/or password for baremetal cloud. "
                "Please specify either a global or per-machine user and pass"
            )
            sys.exit(1)
        return user, passwd

    def get_node_object(self, node_name):
        """Return the node object named ``node_name`` as selected through ``oc``."""
        with oc.project("openshift-machine-api"):
            return oc.selector("node/" + node_name).object()

    # Get the ipmi or other BMC address of the baremetal node
    def get_bmc_addr(self, node_name):
        """Return the BMC address to use for ``node_name``.

        An address given in the scenario config takes priority. Otherwise the
        address is read from the BareMetalHost object that the node's
        providerID points at. Exits the process with status 1 when that
        address is empty.
        """
        # Addresses in the config get higher priority.
        node_info = self._node_info(node_name)
        if "bmc_addr" in node_info:
            return node_info["bmc_addr"]

        # Get the bmc addr from the BareMetalHost object.
        with oc.project("openshift-machine-api"):
            logging.info("Getting node with name: %s", node_name)
            node = self.get_node_object(node_name)
            bmh_name = self._bmh_name_from_provider_id(node.model.spec.providerID)
            bmh_object = oc.selector("baremetalhost.metal3.io/" + bmh_name).object()
            # Validate the very same field that is returned, so the emptiness
            # check and the result can never disagree.
            address = bmh_object.model.spec.bmc.address
            if len(address) == 0:
                logging.error(
                    'BMC addr empty for node "%s". Either fix the BMH object,'
                    " or specify the address in the scenario config",
                    node_name,
                )
                sys.exit(1)
            return address

    def get_ipmi_connection(self, bmc_addr, node_name):
        """Open and return an IPMI connection to the BMC at ``bmc_addr``.

        Args:
            bmc_addr: BMC address, optionally prefixed with "<scheme>://" and
                optionally suffixed with ":<port>".
            node_name: Node name used to look up per-machine credentials.

        Exits the process with status 1 when no user or password is available.
        """
        host, port = self._split_bmc_addr(bmc_addr)

        # Determine correct username and password
        # If specified, uses device-specific user/pass. Else uses the global one.
        user, passwd = self._resolve_credentials(node_name)

        # Establish connection
        interface = pyipmi.interfaces.create_interface("ipmitool", interface_type="lanplus")
        connection = pyipmi.create_connection(interface)
        connection.target = pyipmi.Target(ipmb_address=0x20)
        connection.session.set_session_type_rmcp(host, port)
        connection.session.set_auth_type_user(user, passwd)
        connection.session.establish()
        return connection

    # Start the node instance
    def start_instances(self, bmc_addr, node_name):
        """Send a chassis power-up command to the node's BMC."""
        self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_up()

    # Stop the node instance
    def stop_instances(self, bmc_addr, node_name):
        """Send a chassis power-down command to the node's BMC."""
        self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_down()

    # Reboot the node instance
    def reboot_instances(self, bmc_addr, node_name):
        """Send a chassis power-cycle command to the node's BMC."""
        self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_cycle()

    # Wait until the node instance is running
    def wait_until_running(self, bmc_addr, node_name):
        """Poll the chassis status once per second until power is on."""
        while not self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on:
            time.sleep(1)

    # Wait until the node instance is stopped
    def wait_until_stopped(self, bmc_addr, node_name):
        """Poll the chassis status once per second until power is off."""
        while self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on:
            time.sleep(1)
|
||||
|
||||
|
||||
class bm_node_scenarios(abstract_node_scenarios):
    """Node chaos scenarios for baremetal machines, driven through a BM helper."""

    def __init__(self, bm_info, user, passwd):
        """Create the BM helper from the per-machine info and default credentials."""
        self.bm = BM(bm_info, user, passwd)

    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        """Power the node on and wait for it to report ready.

        The injection is repeated ``instance_kill_count`` times. Any exception
        is logged and the process exits with status 1.
        """
        for _attempt in range(instance_kill_count):
            try:
                logging.info("Starting node_start_scenario injection")
                address = self.bm.get_bmc_addr(node)
                logging.info("Starting the node %s with bmc address: %s ", node, address)
                self.bm.start_instances(address, node)
                self.bm.wait_until_running(address, node)
                nodeaction.wait_for_ready_status(node, timeout)
                logging.info("Node with bmc address: %s is in running state", address)
                logging.info("node_start_scenario has been successfully injected!")
            except Exception as err:
                logging.error(
                    "Failed to start node instance. Encountered following "
                    "exception: %s. Test Failed. Most errors are caused by "
                    "an incorrect ipmi address or login",
                    err,
                )
                logging.error("node_start_scenario injection failed!")
                sys.exit(1)

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
        """Power the node off and wait for its status to become unknown.

        The injection is repeated ``instance_kill_count`` times. Any exception
        is logged and the process exits with status 1.
        """
        for _attempt in range(instance_kill_count):
            try:
                logging.info("Starting node_stop_scenario injection")
                address = self.bm.get_bmc_addr(node)
                logging.info("Stopping the node %s with bmc address: %s ", node, address)
                self.bm.stop_instances(address, node)
                self.bm.wait_until_stopped(address, node)
                logging.info("Node with bmc address: %s is in stopped state", address)
                nodeaction.wait_for_unknown_status(node, timeout)
            except Exception as err:
                logging.error(
                    "Failed to stop node instance. Encountered following exception: %s. "
                    "Test Failed. Most errors are caused by "
                    "an incorrect ipmi address or login",
                    err,
                )
                logging.error("node_stop_scenario injection failed!")
                sys.exit(1)

    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
        """Log that termination is not supported on baremetal; nothing else is done."""
        logging.info("Node termination scenario is not supported on baremetal")

    # Node scenario to reboot the node
    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        """Power-cycle the node, then wait for it to go unknown and become ready again.

        The injection is repeated ``instance_kill_count`` times. Any exception
        is logged together with its traceback and the process exits with
        status 1.
        """
        for _attempt in range(instance_kill_count):
            try:
                logging.info("Starting node_reboot_scenario injection")
                address = self.bm.get_bmc_addr(node)
                logging.info("BMC Addr: %s", address)
                logging.info("Rebooting the node %s with bmc address: %s ", node, address)
                self.bm.reboot_instances(address, node)
                nodeaction.wait_for_unknown_status(node, timeout)
                nodeaction.wait_for_ready_status(node, timeout)
                logging.info("Node with bmc address: %s has been rebooted", address)
                logging.info("node_reboot_scenario has been successfuly injected!")
            except Exception as err:
                logging.error(
                    "Failed to reboot node instance. Encountered following exception:"
                    " %s. Test Failed. Most errors are caused by "
                    "an incorrect ipmi address or login",
                    err,
                )
                traceback.print_exc()
                logging.error("node_reboot_scenario injection failed!")
                sys.exit(1)
|
||||
@@ -34,7 +34,7 @@ def wait_for_unknown_status(node, timeout):
|
||||
for _ in range(timeout):
|
||||
if kubecli.get_node_status(node) == "Unknown":
|
||||
break
|
||||
time.sleep(1)
|
||||
time.sleep(3)
|
||||
if kubecli.get_node_status(node) != "Unknown":
|
||||
raise Exception("Node condition status isn't Unknown")
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ from kraken.node_actions.general_cloud_node_scenarios import general_node_scenar
|
||||
from kraken.node_actions.az_node_scenarios import azure_node_scenarios
|
||||
from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios
|
||||
from kraken.node_actions.openstack_node_scenarios import openstack_node_scenarios
|
||||
from kraken.node_actions.bm_node_scenarios import bm_node_scenarios
|
||||
import kraken.node_actions.common_node_functions as common_node_functions
|
||||
import kraken.cerberus.setup as cerberus
|
||||
|
||||
@@ -28,6 +29,10 @@ def get_node_scenario_object(node_scenario):
|
||||
return openstack_node_scenarios()
|
||||
elif node_scenario["cloud_type"] == "azure" or node_scenario["cloud_type"] == "az":
|
||||
return azure_node_scenarios()
|
||||
elif node_scenario["cloud_type"] == "bm":
|
||||
return bm_node_scenarios(
|
||||
node_scenario.get("bmc_info"), node_scenario.get("bmc_user", None), node_scenario.get("bmc_password", None)
|
||||
)
|
||||
else:
|
||||
logging.error(
|
||||
"Cloud type " + node_scenario["cloud_type"] + " is not currently supported; "
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
datetime
|
||||
pyfiglet
|
||||
PyYAML
|
||||
PyYAML>=5.1
|
||||
git+https://github.com/powerfulseal/powerfulseal.git
|
||||
requests
|
||||
boto3
|
||||
@@ -14,3 +14,5 @@ python-openstackclient
|
||||
gitpython
|
||||
paramiko
|
||||
setuptools
|
||||
openshift-client
|
||||
python-ipmi
|
||||
|
||||
Reference in New Issue
Block a user