Mirror of https://github.com/krkn-chaos/krkn.git (synced 2026-02-15 18:40:09 +00:00)
Compare commits (7):

- 4f305e78aa
- b17e933134
- beea484597
- 0222b0f161
- f7e674d5ad
- 7aea12ce6c
- 625e1e90cf

@@ -42,7 +42,7 @@ telemetry:
     prometheus_backup: True # enables/disables prometheus data collection
     full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
     backup_threads: 5 # number of telemetry download/upload threads
-    archive_path: /tmp # local path where the archive files will be temporarly stored
+    archive_path: /tmp # local path where the archive files will be temporarily stored
     max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
     run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
     archive_size: 10000 # the size of the prometheus data archive size in KB. The lower the size of archive is

@@ -26,7 +26,7 @@ Here is an excerpt:
 ## Maintainer Levels

 ### Contributor
-Contributors contributor to the community. Anyone can become a contributor by participating in discussions, reporting bugs, or contributing code or documentation.
+Contributors contribute to the community. Anyone can become a contributor by participating in discussions, reporting bugs, or contributing code or documentation.

 #### Responsibilities:

@@ -80,4 +80,4 @@ Represent the project in the broader open-source community.


 # Credits
-Sections of this documents have been borrowed from [Kubernetes governance](https://github.com/kubernetes/community/blob/master/governance.md)
+Sections of this document have been borrowed from [Kubernetes governance](https://github.com/kubernetes/community/blob/master/governance.md)

@@ -16,5 +16,5 @@ Following are a list of enhancements that we are planning to work on adding supp
 - [x] [Krknctl - client for running Krkn scenarios with ease](https://github.com/krkn-chaos/krknctl)
 - [x] [AI Chat bot to help get started with Krkn and commands](https://github.com/krkn-chaos/krkn-lightspeed)
 - [ ] [Ability to roll back cluster to original state if chaos fails](https://github.com/krkn-chaos/krkn/issues/804)
-- [ ] Add recovery time metrics to each scenario for each better regression analysis
+- [ ] Add recovery time metrics to each scenario for better regression analysis
 - [ ] [Add resiliency scoring to chaos scenarios ran on cluster](https://github.com/krkn-chaos/krkn/issues/125)

@@ -40,4 +40,4 @@ The security team currently consists of the [Maintainers of Krkn](https://github

 ## Process and Supported Releases

-The Krkn security team will investigate and provide a fix in a timely mannner depending on the severity. The fix will be included in the new release of Krkn and details will be included in the release notes.
+The Krkn security team will investigate and provide a fix in a timely manner depending on the severity. The fix will be included in the new release of Krkn and details will be included in the release notes.

@@ -39,7 +39,7 @@ cerberus:
         Sunday:
     slack_team_alias: # The slack team alias to be tagged while reporting failures in the slack channel when no watcher is assigned

-    custom_checks: # Relative paths of files conataining additional user defined checks
+    custom_checks: # Relative paths of files containing additional user defined checks

     tunings:
         timeout: 3 # Number of seconds before requests fail

@@ -93,7 +93,7 @@ telemetry:
     prometheus_pod_name: "" # name of the prometheus pod (if distribution is kubernetes)
     full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
     backup_threads: 5 # number of telemetry download/upload threads
-    archive_path: /tmp # local path where the archive files will be temporarly stored
+    archive_path: /tmp # local path where the archive files will be temporarily stored
     max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
     run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
     archive_size: 500000

@@ -32,7 +32,7 @@ tunings:

 telemetry:
     enabled: False # enable/disables the telemetry collection feature
-    archive_path: /tmp # local path where the archive files will be temporarly stored
+    archive_path: /tmp # local path where the archive files will be temporarily stored
     events_backup: False # enables/disables cluster events collection
     logs_backup: False

@@ -61,7 +61,7 @@ telemetry:
     prometheus_backup: True # enables/disables prometheus data collection
     full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
     backup_threads: 5 # number of telemetry download/upload threads
-    archive_path: /tmp # local path where the archive files will be temporarly stored
+    archive_path: /tmp # local path where the archive files will be temporarily stored
     max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
     run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
     archive_size: 500000 # the size of the prometheus data archive size in KB. The lower the size of archive is

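All four telemetry hunks above touch the same configuration block, so for reference here is a minimal sketch of the corrected section, assembled only from the fields and defaults visible in these diffs (indentation is assumed; this is illustrative, not the authoritative config file):

```yaml
telemetry:
    enabled: False                # enable/disable the telemetry collection feature
    archive_path: /tmp            # local path where the archive files will be temporarily stored
    prometheus_backup: True       # enable/disable prometheus data collection
    full_prometheus_backup: False # if False, only the /prometheus/wal folder is downloaded
    backup_threads: 5             # number of telemetry download/upload threads
    max_retries: 0                # maximum number of upload retries (0 retries forever)
    run_tag: ''                   # if set, appended to the run folder in the bucket
    archive_size: 10000           # prometheus data archive size in KB
```
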
@@ -214,7 +214,7 @@ def metrics(
                 end_time=datetime.datetime.fromtimestamp(end_time), granularity=30
             )
         else:
-            logging.info('didnt match keys')
+            logging.info("didn't match keys")
             continue

         for returned_metric in metrics_result:

@@ -36,7 +36,7 @@ def get_test_pods(
     - pods matching the label on which network policy
       need to be applied

-    namepsace (string)
+    namespace (string)
     - namespace in which the pod is present

     kubecli (KrknKubernetes)

@@ -11,7 +11,7 @@ def get_node_by_name(node_name_list, kubecli: KrknKubernetes):
     for node_name in node_name_list:
         if node_name not in killable_nodes:
             logging.info(
-                f"Node with provided ${node_name} does not exist or the node might "
+                f"Node with provided {node_name} does not exist or the node might "
                 "be in NotReady state."
             )
             return

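The one-character fix above is worth spelling out: Python f-strings interpolate `{name}` only, so the stray `$` in `${node_name}` (a shell-interpolation habit) would have been printed literally. A quick demonstration:

```python
node_name = "worker-0"

# Shell-style ${...} inside an f-string keeps the literal dollar sign:
print(f"Node with provided ${node_name} does not exist")  # Node with provided $worker-0 does not exist

# The corrected form interpolates cleanly:
print(f"Node with provided {node_name} does not exist")   # Node with provided worker-0 does not exist
```
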
@@ -49,7 +49,11 @@ class VirtChecker:
         for vmi in self.kube_vm_plugin.vmis_list:
             node_name = vmi.get("status",{}).get("nodeName")
             vmi_name = vmi.get("metadata",{}).get("name")
-            ip_address = vmi.get("status",{}).get("interfaces",[])[0].get("ipAddress")
+            interfaces = vmi.get("status",{}).get("interfaces",[])
+            if not interfaces:
+                logging.warning(f"VMI {vmi_name} has no network interfaces, skipping")
+                continue
+            ip_address = interfaces[0].get("ipAddress")
             namespace = vmi.get("metadata",{}).get("namespace")
             # If node_name_list exists, only add if node name is in list

@@ -74,7 +78,8 @@ class VirtChecker:
         else:
             logging.debug(f"Disconnected access for {ip_address} on {worker_name} is failed: {output}")
             vmi = self.kube_vm_plugin.get_vmi(vmi_name,self.namespace)
-            new_ip_address = vmi.get("status",{}).get("interfaces",[])[0].get("ipAddress")
+            interfaces = vmi.get("status",{}).get("interfaces",[])
+            new_ip_address = interfaces[0].get("ipAddress") if interfaces else None
             new_node_name = vmi.get("status",{}).get("nodeName")
             # if vm gets deleted, it'll start up with a new ip address
             if new_ip_address != ip_address:

@@ -102,7 +107,7 @@ class VirtChecker:

     def get_vm_access(self, vm_name: str = '', namespace: str = ''):
         """
-        This method returns True when the VM is access and an error message when it is not, using virtctl protocol
+        This method returns True when the VM is accessible and an error message when it is not, using virtctl protocol
         :param vm_name:
         :param namespace:
         :return: virtctl_status 'True' if successful, or an error message if it fails.

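The first two VirtChecker hunks guard against an empty `interfaces` list before indexing: `vmi.get("status",{}).get("interfaces",[])[0]` raises IndexError whenever a VMI has not reported an interface yet. A self-contained sketch of the pattern, using a hypothetical minimal VMI dict in place of the real object:

```python
import logging

# Hypothetical VMI payload: status reported, but no interfaces yet.
vmi = {"metadata": {"name": "vm-a"}, "status": {"nodeName": "worker-1", "interfaces": []}}

vmi_name = vmi.get("metadata", {}).get("name")
interfaces = vmi.get("status", {}).get("interfaces", [])
if not interfaces:
    # Without this guard, interfaces[0] raises IndexError on an empty list.
    logging.warning(f"VMI {vmi_name} has no network interfaces, skipping")
else:
    ip_address = interfaces[0].get("ipAddress")  # safe: list is non-empty here
    print(ip_address)
```
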
@@ -1,23 +1,23 @@
 aliyun-python-sdk-core==2.13.36
 aliyun-python-sdk-ecs==4.24.25
 arcaflow-plugin-sdk==0.14.0
-boto3==1.28.61
+boto3>=1.34.0 # Updated to support urllib3 2.x
 azure-identity==1.16.1
 azure-keyvault==4.2.0
 azure-mgmt-compute==30.5.0
 azure-mgmt-network==27.0.0
 coverage==7.6.12
 datetime==5.4
-docker>=6.0,<7.0 # docker 7.0+ has breaking changes with Unix sockets
+docker>=6.0,<7.0 # docker 7.0+ has breaking changes; works with requests<2.32
 gitpython==3.1.41
 google-auth==2.37.0
 google-cloud-compute==1.22.0
-ibm_cloud_sdk_core==3.18.0
-ibm_vpc==0.20.0
+ibm_cloud_sdk_core>=3.20.0 # Requires urllib3>=2.1.0 (compatible with updated boto3)
+ibm_vpc==0.26.3 # Requires ibm_cloud_sdk_core
 jinja2==3.1.6
-krkn-lib==6.0.1
 lxml==5.1.0
 kubernetes==34.1.0
+krkn-lib==6.0.2
 numpy==1.26.4
 pandas==2.2.0
 openshift-client==1.0.21

@@ -29,11 +29,13 @@ python-ipmi==0.5.4
 python-openstackclient==6.5.0
+requests<2.32 # requests 2.32+ breaks Unix socket support (http+docker scheme)
+requests-unixsocket>=0.4.0 # Required for Docker Unix socket support
+urllib3>=2.1.0,<2.4.0 # Compatible with all dependencies
 service_identity==24.1.0
 PyYAML==6.0.1
 setuptools==78.1.1
 wheel>=0.44.0
 zope.interface==6.1
 colorlog==6.10.1

 git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
 cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability

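The comments in this hunk encode a compatibility chain: docker<7.0 needs requests<2.32 for `http+docker` Unix-socket URLs, while the newer boto3 and ibm_cloud_sdk_core want urllib3>=2.1, which is why urllib3 is boxed into `>=2.1.0,<2.4.0`. A small diagnostic sketch (not part of the repo) for checking what actually resolved in an installed environment:

```python
from importlib.metadata import version, PackageNotFoundError

# Packages whose interplay the pins above are managing.
for pkg in ("boto3", "docker", "requests", "requests-unixsocket", "urllib3", "ibm-cloud-sdk-core"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```
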
@@ -6,6 +6,7 @@ import sys
 import yaml
 import logging
 import optparse
+from colorlog import ColoredFormatter
 import pyfiglet
 import uuid
 import time

@@ -646,15 +647,23 @@ if __name__ == "__main__":
     # If no command or regular execution, continue with existing logic
     report_file = options.output
     tee_handler = TeeLogHandler()
-    handlers = [
-        logging.FileHandler(report_file, mode="w"),
-        logging.StreamHandler(),
-        tee_handler,
-    ]
+    fmt = "%(asctime)s [%(levelname)s] %(message)s"
+    plain = logging.Formatter(fmt)
+    colored = ColoredFormatter(
+        "%(asctime)s [%(log_color)s%(levelname)s%(reset)s] %(message)s",
+        log_colors={'DEBUG': 'white', 'INFO': 'white', 'WARNING': 'yellow', 'ERROR': 'red', 'CRITICAL': 'bold_red'},
+        reset=True, style='%'
+    )
+    file_handler = logging.FileHandler(report_file, mode="w")
+    file_handler.setFormatter(plain)
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(colored)
+    tee_handler.setFormatter(plain)
+    handlers = [file_handler, stream_handler, tee_handler]

     logging.basicConfig(
         level=logging.DEBUG if options.debug else logging.INFO,
         format="%(asctime)s [%(levelname)s] %(message)s",
         handlers=handlers,
     )
     option_error = False

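The rewritten block stops funnelling every handler through one shared `basicConfig` format and instead assigns per-handler formatters, so only the console gets ANSI colors while the report file and the tee handler stay machine-readable. A minimal self-contained sketch of the same split (the file name and logger calls here are illustrative, not the ones in the diff):

```python
import logging
from colorlog import ColoredFormatter  # third-party: pip install colorlog

plain = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")
colored = ColoredFormatter(
    "%(asctime)s [%(log_color)s%(levelname)s%(reset)s] %(message)s",
    log_colors={"INFO": "white", "WARNING": "yellow", "ERROR": "red", "CRITICAL": "bold_red"},
)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(colored)        # ANSI colors only on the terminal
file_handler = logging.FileHandler("report.log", mode="w")
file_handler.setFormatter(plain)            # no escape codes in the file

# basicConfig leaves handlers that already have a formatter untouched.
logging.basicConfig(level=logging.INFO, handlers=[stream_handler, file_handler])
logging.info("colored on screen, plain in report.log")
```
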
tests/test_vmware_node_scenarios.py (new file, 826 lines)
@@ -0,0 +1,826 @@
#!/usr/bin/env python3

"""
Test suite for VMWare node scenarios

This test suite covers both the VMWare class and vmware_node_scenarios class
using mocks to avoid actual VMWare CLI calls.

Usage:
    python -m coverage run -a -m unittest tests/test_vmware_node_scenarios.py -v

Assisted By: Claude Code
"""


import unittest
from unittest.mock import MagicMock, patch, PropertyMock
from krkn.scenario_plugins.node_actions.vmware_node_scenarios import vmware_node_scenarios, vSphere
from krkn_lib.models.k8s import AffectedNodeStatus
from com.vmware.vcenter.vm_client import Power

class TestVmwareNodeScenarios(unittest.TestCase):

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def setUp(self, mock_vsphere_class):
        # Mock the configuration and dependencies
        self.mock_kubecli = MagicMock()
        self.mock_affected_nodes_status = AffectedNodeStatus()
        self.mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = self.mock_vsphere

        # Initialize the scenario class
        self.vmware_scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=self.mock_affected_nodes_status
        )

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_reboot_node_success(self, mock_vsphere_class):
        """Test successful node reboot."""
        node_name = "test-node-01"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        mock_vsphere.reboot_instances.return_value = True

        # Create a fresh instance with mocked vSphere
        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=AffectedNodeStatus()
        )

        # Execute the reboot scenario
        scenarios.node_reboot_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300
        )

        # Assertions
        mock_vsphere.reboot_instances.assert_called_with(node_name)

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_node_not_found(self, mock_vsphere_class):
        """Test behavior when the VM does not exist in vCenter."""
        node_name = "non-existent-node"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        mock_vsphere.get_vm.return_value = None
        mock_vsphere.reboot_instances.side_effect = Exception(f"VM {node_name} not found")

        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=AffectedNodeStatus()
        )

        # This should handle the exception gracefully (just log it)
        scenarios.node_reboot_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300
        )

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_stop_start_node(self, mock_vsphere_class):
        """Test stopping and then starting a node."""
        node_name = "test-node-02"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        mock_vsphere.stop_instances.return_value = True
        mock_vsphere.start_instances.return_value = True
        mock_vsphere.wait_until_stopped.return_value = True
        mock_vsphere.wait_until_running.return_value = True

        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=AffectedNodeStatus()
        )

        # Test stop scenario
        scenarios.node_stop_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300,
            poll_interval=5
        )
        mock_vsphere.stop_instances.assert_called_with(node_name)

        # Test start scenario
        scenarios.node_start_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300,
            poll_interval=5
        )
        mock_vsphere.start_instances.assert_called_with(node_name)

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_vcenter_connection_failure(self, mock_vsphere_class):
        """Test scenario where connection to vCenter fails."""
        # Force the vSphere init to raise an exception
        mock_vsphere_class.side_effect = Exception("Connection Refused")

        with self.assertRaises(Exception):
            vmware_node_scenarios(
                kubecli=self.mock_kubecli,
                node_action_kube_check=False,
                affected_nodes_status=AffectedNodeStatus()
            )

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_node_terminate_scenario(self, mock_vsphere_class):
        """Test node termination scenario."""
        node_name = "test-node-terminate"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        mock_vsphere.stop_instances.return_value = True
        mock_vsphere.wait_until_stopped.return_value = True
        mock_vsphere.wait_until_released.return_value = True

        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=AffectedNodeStatus()
        )

        # Execute terminate scenario
        scenarios.node_terminate_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300,
            poll_interval=5
        )

        # Verify the sequence of calls
        mock_vsphere.stop_instances.assert_called_with(node_name)
        mock_vsphere.wait_until_stopped.assert_called_once()
        mock_vsphere.release_instances.assert_called_with(node_name)
        mock_vsphere.wait_until_released.assert_called_once()

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_node_already_stopped(self, mock_vsphere_class):
        """Test scenario when node is already in the stopped state."""
        node_name = "already-stopped-node"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        # Return False indicating VM is already stopped
        mock_vsphere.stop_instances.return_value = False

        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=AffectedNodeStatus()
        )

        scenarios.node_stop_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300,
            poll_interval=5
        )

        # Should still call stop_instances but not wait_until_stopped
        mock_vsphere.stop_instances.assert_called_with(node_name)
        mock_vsphere.wait_until_stopped.assert_not_called()

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_node_already_started(self, mock_vsphere_class):
        """Test scenario when node is already in the running state."""
        node_name = "already-running-node"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        # Return False indicating VM is already running
        mock_vsphere.start_instances.return_value = False

        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=AffectedNodeStatus()
        )

        scenarios.node_start_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300,
            poll_interval=5
        )

        # Should still call start_instances but not wait_until_running
        mock_vsphere.start_instances.assert_called_with(node_name)
        mock_vsphere.wait_until_running.assert_not_called()

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.nodeaction')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_reboot_with_kube_check(self, mock_vsphere_class, mock_nodeaction):
        """Test reboot scenario with Kubernetes health checks enabled."""
        node_name = "test-node-kube-check"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        mock_vsphere.reboot_instances.return_value = True

        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=True,  # Enable kube checks
            affected_nodes_status=AffectedNodeStatus()
        )

        scenarios.node_reboot_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300
        )

        # Verify kube health check was called
        mock_nodeaction.wait_for_unknown_status.assert_called_once()

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.nodeaction')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_start_with_kube_check(self, mock_vsphere_class, mock_nodeaction):
        """Test start scenario with Kubernetes health checks enabled."""
        node_name = "test-node-start-kube"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        mock_vsphere.start_instances.return_value = True
        mock_vsphere.wait_until_running.return_value = True

        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=True,
            affected_nodes_status=AffectedNodeStatus()
        )

        scenarios.node_start_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300,
            poll_interval=5
        )

        # Verify both vSphere and kube checks were called
        mock_vsphere.wait_until_running.assert_called_once()
        mock_nodeaction.wait_for_ready_status.assert_called_once()

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_multiple_instance_kill_count(self, mock_vsphere_class):
        """Test scenario with multiple instance kill count (loop)."""
        node_name = "test-node-multiple"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        mock_vsphere.reboot_instances.return_value = True

        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=AffectedNodeStatus()
        )

        # Test with kill count of 3
        scenarios.node_reboot_scenario(
            instance_kill_count=3,
            node=node_name,
            timeout=300
        )

        # Should be called 3 times
        assert mock_vsphere.reboot_instances.call_count == 3

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_stop_failure_exception_handling(self, mock_vsphere_class):
        """Test exception handling during node stop."""
        node_name = "failing-node"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        mock_vsphere.stop_instances.side_effect = Exception("vSphere API Error")

        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=AffectedNodeStatus()
        )

        # Should not raise exception, just log it
        scenarios.node_stop_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300,
            poll_interval=5
        )

        # Verify it attempted to stop
        mock_vsphere.stop_instances.assert_called_with(node_name)

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_terminate_failure_exception_handling(self, mock_vsphere_class):
        """Test exception handling during node termination."""
        node_name = "terminate-failing-node"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        mock_vsphere.stop_instances.return_value = True
        mock_vsphere.wait_until_stopped.return_value = True
        mock_vsphere.release_instances.side_effect = Exception("Cannot delete VM")

        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=AffectedNodeStatus()
        )

        # Should not raise exception
        scenarios.node_terminate_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300,
            poll_interval=5
        )

        # Verify termination was attempted
        mock_vsphere.release_instances.assert_called_with(node_name)

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_affected_nodes_tracking(self, mock_vsphere_class):
        """Test that affected nodes are properly tracked."""
        node_name = "tracked-node"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        mock_vsphere.reboot_instances.return_value = True

        affected_status = AffectedNodeStatus()
        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=affected_status
        )

        # Verify no affected nodes initially
        assert len(affected_status.affected_nodes) == 0

        scenarios.node_reboot_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300
        )

        # Verify affected node was tracked
        assert len(affected_status.affected_nodes) == 1
        assert affected_status.affected_nodes[0].node_name == node_name

    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.vSphere')
    def test_reboot_not_allowed_state(self, mock_vsphere_class):
        """Test reboot when VM is in a state that doesn't allow reboot."""
        node_name = "powered-off-node"
        mock_vsphere = MagicMock()
        mock_vsphere_class.return_value = mock_vsphere
        # Return False indicating reboot failed (VM not powered on)
        mock_vsphere.reboot_instances.return_value = False

        scenarios = vmware_node_scenarios(
            kubecli=self.mock_kubecli,
            node_action_kube_check=False,
            affected_nodes_status=AffectedNodeStatus()
        )

        scenarios.node_reboot_scenario(
            instance_kill_count=1,
            node=node_name,
            timeout=300
        )

        # Should attempt reboot
        mock_vsphere.reboot_instances.assert_called_with(node_name)


class TestVSphereClass(unittest.TestCase):
    """Test suite for the vSphere class."""

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_vsphere_initialization_success(self, mock_session, mock_create_client):
        """Test successful vSphere client initialization."""
        mock_client = MagicMock()
        mock_create_client.return_value = mock_client

        vsphere = vSphere()

        self.assertEqual(vsphere.server, '192.168.1.100')
        self.assertEqual(vsphere.username, 'admin')
        self.assertEqual(vsphere.password, 'password123')
        self.assertTrue(vsphere.credentials_present)
        mock_create_client.assert_called_once()

    @patch.dict('os.environ', {}, clear=True)
    def test_vsphere_initialization_missing_credentials(self):
        """Test vSphere initialization fails when credentials are missing."""
        with self.assertRaises(Exception) as context:
            vSphere()

        self.assertIn("Environmental variables", str(context.exception))

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_get_vm_success(self, mock_session, mock_create_client):
        """Test getting a VM by name."""
        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        vm_id = vsphere.get_vm('test-vm')

        self.assertEqual(vm_id, 'vm-123')
        mock_client.vcenter.VM.list.assert_called_once()

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_get_vm_not_found(self, mock_session, mock_create_client):
        """Test getting a VM that doesn't exist."""
        mock_client = MagicMock()
        mock_client.vcenter.VM.list.return_value = []
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        vm_id = vsphere.get_vm('non-existent-vm')

        self.assertIsNone(vm_id)

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_reboot_instances_success(self, mock_session, mock_create_client):
        """Test successful VM reboot."""
        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        result = vsphere.reboot_instances('test-vm')

        self.assertTrue(result)
        mock_client.vcenter.vm.Power.reset.assert_called_with('vm-123')

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_reboot_instances_not_powered_on(self, mock_session, mock_create_client):
        """Test reboot fails when VM is not powered on."""
        from com.vmware.vapi.std.errors_client import NotAllowedInCurrentState

        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]
        mock_client.vcenter.vm.Power.reset.side_effect = NotAllowedInCurrentState()
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        result = vsphere.reboot_instances('test-vm')

        self.assertFalse(result)

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_stop_instances_success(self, mock_session, mock_create_client):
        """Test successful VM stop."""
        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        result = vsphere.stop_instances('test-vm')

        self.assertTrue(result)
        mock_client.vcenter.vm.Power.stop.assert_called_with('vm-123')

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_stop_instances_already_stopped(self, mock_session, mock_create_client):
        """Test stop when VM is already stopped."""
        from com.vmware.vapi.std.errors_client import AlreadyInDesiredState

        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]
        mock_client.vcenter.vm.Power.stop.side_effect = AlreadyInDesiredState()
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        result = vsphere.stop_instances('test-vm')

        self.assertFalse(result)

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_start_instances_success(self, mock_session, mock_create_client):
        """Test successful VM start."""
        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        result = vsphere.start_instances('test-vm')

        self.assertTrue(result)
        mock_client.vcenter.vm.Power.start.assert_called_with('vm-123')

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_start_instances_already_started(self, mock_session, mock_create_client):
        """Test start when VM is already running."""
        from com.vmware.vapi.std.errors_client import AlreadyInDesiredState

        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]
        mock_client.vcenter.vm.Power.start.side_effect = AlreadyInDesiredState()
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        result = vsphere.start_instances('test-vm')

        self.assertFalse(result)

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_get_vm_status(self, mock_session, mock_create_client):
        """Test getting VM status."""
        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]
        mock_power_state = MagicMock()
        mock_power_state.state = Power.State.POWERED_ON
        mock_client.vcenter.vm.Power.get.return_value = mock_power_state
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        status = vsphere.get_vm_status('test-vm')

        self.assertEqual(status, Power.State.POWERED_ON)

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_get_vm_status_exception(self, mock_session, mock_create_client):
        """Test get_vm_status handles exceptions gracefully."""
        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]
        mock_client.vcenter.vm.Power.get.side_effect = Exception("API Error")
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        status = vsphere.get_vm_status('test-vm')

        self.assertIsNone(status)

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.time.sleep')
    def test_wait_until_running(self, mock_sleep, mock_session, mock_create_client):
        """Test waiting for VM to reach POWERED_ON state."""
        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]

        # Simulate VM transitioning to POWERED_ON after 2 checks
        mock_power_states = [
            MagicMock(state=Power.State.POWERED_OFF),
            MagicMock(state=Power.State.POWERED_ON)
        ]
        mock_client.vcenter.vm.Power.get.side_effect = mock_power_states
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        mock_affected_node = MagicMock()
        result = vsphere.wait_until_running('test-vm', timeout=60, affected_node=mock_affected_node)

        self.assertTrue(result)
        mock_affected_node.set_affected_node_status.assert_called_once()

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.time.sleep')
    def test_wait_until_stopped(self, mock_sleep, mock_session, mock_create_client):
        """Test waiting for VM to reach POWERED_OFF state."""
        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]

        # Simulate VM transitioning to POWERED_OFF
        mock_power_states = [
            MagicMock(state=Power.State.POWERED_ON),
            MagicMock(state=Power.State.POWERED_OFF)
        ]
        mock_client.vcenter.vm.Power.get.side_effect = mock_power_states
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        mock_affected_node = MagicMock()
        result = vsphere.wait_until_stopped('test-vm', timeout=60, affected_node=mock_affected_node)

        self.assertTrue(result)

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.time.sleep')
    def test_wait_until_running_timeout(self, mock_sleep, mock_session, mock_create_client):
        """Test wait_until_running times out."""
        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'
        mock_client.vcenter.VM.list.return_value = [mock_vm_obj]

        # VM is POWERED_OFF initially, then transitions to POWERED_ON after timeout to exit loop
        call_count = [0]
        def get_status_side_effect(vm):
            call_count[0] += 1
            # Return POWERED_OFF for first 2 calls (to exceed timeout=2 with 5 second increments)
            # Then return POWERED_ON to exit the loop
            if call_count[0] <= 2:
                return MagicMock(state=Power.State.POWERED_OFF)
            return MagicMock(state=Power.State.POWERED_ON)

        mock_client.vcenter.vm.Power.get.side_effect = get_status_side_effect
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        mock_affected_node = MagicMock()
        result = vsphere.wait_until_running('test-vm', timeout=2, affected_node=mock_affected_node)

        self.assertFalse(result)

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.time.sleep')
    def test_wait_until_released(self, mock_sleep, mock_session, mock_create_client):
        """Test waiting for VM to be deleted."""
        mock_client = MagicMock()
        mock_vm_obj = MagicMock()
        mock_vm_obj.vm = 'vm-123'

        # VM exists first, then is deleted
        mock_client.vcenter.VM.list.side_effect = [
            [mock_vm_obj],  # VM exists
            []              # VM deleted
        ]
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        mock_affected_node = MagicMock()
        result = vsphere.wait_until_released('test-vm', timeout=60, affected_node=mock_affected_node)

        self.assertTrue(result)
        mock_affected_node.set_affected_node_status.assert_called_once()

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_get_datacenter_list(self, mock_session, mock_create_client):
        """Test getting list of datacenters."""
        mock_client = MagicMock()
        mock_dc1 = MagicMock()
        mock_dc1.datacenter = 'dc-1'
        mock_dc1.name = 'Datacenter1'
        mock_dc2 = MagicMock()
        mock_dc2.datacenter = 'dc-2'
        mock_dc2.name = 'Datacenter2'
        mock_client.vcenter.Datacenter.list.return_value = [mock_dc1, mock_dc2]
        mock_create_client.return_value = mock_client

        vsphere = vSphere()
        datacenters = vsphere.get_datacenter_list()

        self.assertEqual(len(datacenters), 2)
        self.assertEqual(datacenters[0]['datacenter_name'], 'Datacenter1')
        self.assertEqual(datacenters[1]['datacenter_name'], 'Datacenter2')

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_release_instances_vm_not_found(self, mock_session, mock_create_client):
        """Test release_instances raises exception when VM not found."""
        mock_client = MagicMock()
        mock_client.vcenter.VM.list.return_value = []
        mock_create_client.return_value = mock_client

        vsphere = vSphere()

        with self.assertRaises(Exception) as context:
            vsphere.release_instances('non-existent-vm')

        self.assertIn("does not exist", str(context.exception))

    @patch.dict('os.environ', {
        'VSPHERE_IP': '192.168.1.100',
        'VSPHERE_USERNAME': 'admin',
        'VSPHERE_PASSWORD': 'password123'
    })
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.create_vsphere_client')
    @patch('krkn.scenario_plugins.node_actions.vmware_node_scenarios.requests.session')
    def test_get_unverified_session(self, mock_session_class, mock_create_client):
        """Test creating an unverified session."""
        mock_session_instance = MagicMock()
        mock_session_class.return_value = mock_session_instance
        mock_create_client.return_value = MagicMock()

        vsphere = vSphere()
        session = vsphere.get_unverified_session()

        self.assertFalse(session.verify)
        mock_session_class.assert_called()

@@ -1,54 +0,0 @@
# ⚠️ DEPRECATED - This project has moved

> **All development has moved to [github.com/krkn-chaos/krkn-ai](https://github.com/krkn-chaos/krkn-ai)**
>
> This directory is no longer maintained. Please visit the new repository for:
> - Latest features and updates
> - Active development and support
> - Bug fixes and improvements
> - Documentation and examples
>
> See [../README.md](../README.md) for more information.

---

# aichaos
Enhancing Chaos Engineering with AI-assisted fault injection for better resiliency and non-functional testing.

## Generate python package wheel file
```
$ python3.9 generate_wheel_package.py sdist bdist_wheel
$ cp dist/aichaos-0.0.1-py3-none-any.whl docker/
```
This creates a python package file aichaos-0.0.1-py3-none-any.whl in the dist folder.

## Build Image
```
$ cd docker
$ podman build -t aichaos:1.0 .
OR
$ docker build -t aichaos:1.0 .
```

## Run Chaos AI
```
$ podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
OR
$ docker run -v aichaos-config.json:/config/aichaos-config.json --privileged -v /var/run/docker.sock:/var/run/docker.sock --name aichaos -p 5001:5001 aichaos:1.0
```

The output should look like:
```
$ podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
 * Serving Flask app 'swagger_api' (lazy loading)
 * Environment: production
   WARNING: This is a development server. Do not use it in a production deployment.
   Use a production WSGI server instead.
 * Debug mode: on
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://172.17.0.2:5001
```

You can try out the APIs in browser at http://<server-ip>:5001/apidocs (eg. http://127.0.0.1:5001/apidocs). For testing out, you can try “GenerateChaos” api with ‘kubeconfig’ file and application URLs to test.

@@ -1,21 +0,0 @@
FROM bitnami/kubectl:1.20.9 as kubectl
FROM python:3.9
WORKDIR /app
RUN pip3 install --upgrade pip
COPY config config/
COPY requirements.txt .
RUN mkdir -p /app/logs
RUN pip3 install -r requirements.txt

COPY --from=kubectl /opt/bitnami/kubectl/bin/kubectl /usr/local/bin/

COPY swagger_api.py .
ENV PYTHONUNBUFFERED=1

RUN curl -fsSLO https://get.docker.com/builds/Linux/x86_64/docker-17.03.1-ce.tgz && tar --strip-components=1 -xvzf docker-17.03.1-ce.tgz -C /usr/local/bin

RUN apt-get update && apt-get install -y podman

COPY aichaos-0.0.1-py3-none-any.whl .
RUN pip3 install aichaos-0.0.1-py3-none-any.whl
CMD ["python3", "swagger_api.py"]

@@ -1,7 +0,0 @@
{
  "command": "podman",
  "chaosengine": "kraken",
  "faults": "pod-delete",
  "iterations": 1,
  "maxfaults": 5
}

@@ -1,15 +0,0 @@

Get Log from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.

@@ -1,36 +0,0 @@
{
  "apiVersion": "1.0",
  "kind": "ChaosEngine",
  "metadata": {
    "name": "engine-cartns3"
  },
  "spec": {
    "engineState": "active",
    "annotationCheck": "false",
    "appinfo": {
      "appns": "robot-shop",
      "applabel": "service=payment",
      "appkind": "deployment"
    },
    "chaosServiceAccount": "pod-delete-sa",
    "experiments": [
      {
        "name": "pod-delete",
        "spec": {
          "components": {
            "env": [
              {
                "name": "FORCE",
                "value": "true"
              },
              {
                "name": "TOTAL_CHAOS_DURATION",
                "value": "120"
              }
            ]
          }
        }
      }
    ]
  }
}

@@ -1,40 +0,0 @@

Generate chaos on an application deployed on a cluster.
---
tags:
  - ChaosAI API
parameters:
  - name: file
    in: formData
    type: file
    required: true
    description: Kube-config file
  - name: namespace
    in: formData
    type: string
    default: robot-shop
    required: true
    description: Namespace to test
  - name: podlabels
    in: formData
    type: string
    default: service=cart,service=payment
    required: true
    description: Pod labels to test
  - name: nodelabels
    in: formData
    type: string
    required: false
    description: Node labels to test
  - name: urls
    in: formData
    type: string
    default: http://<application-url>:8097/api/cart/health,http://<application-url>:8097/api/payment/health
    required: true
    description: Application URLs to test

responses:
  500:
    description: Error!
  200:
    description: Chaos ID for the initiated chaos.

@@ -1,15 +0,0 @@

Get Episodes from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.

@@ -1,15 +0,0 @@

Get Log from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.

@@ -1,15 +0,0 @@

Get QTable from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.

@@ -1,15 +0,0 @@

Get status of the Constraints ID.
---
tags:
  - ChaosAI API
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Chaos for the given ID.

@@ -1,6 +0,0 @@
numpy
pandas
requests
Flask==2.2.5
Werkzeug==3.1.5
flasgger==0.9.5

@@ -1,186 +0,0 @@
|
||||
import json, os
|
||||
import logging
|
||||
# import numpy as np
|
||||
# import pandas as pd
|
||||
import threading
|
||||
from datetime import datetime
|
||||
from flask import Flask, request
|
||||
from flasgger import Swagger
|
||||
from flasgger.utils import swag_from
|
||||
# import zipfile
|
||||
import sys
|
||||
|
||||
# sys.path.append("..")
|
||||
from src.aichaos_main import AIChaos
|
||||
|
||||
app = Flask(__name__)
|
||||
Swagger(app)
|
||||
flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "app", "logs") + '/'
|
||||
|
||||
|
||||
class AIChaosSwagger:
|
||||
def __init__(self, flaskdir=''):
|
||||
self.flaskdir = flaskdir
|
||||
|
||||
@app.route("/")
|
||||
def empty(params=''):
|
||||
return "AI Chaos Repository!"
|
||||
|
||||
def startchaos(self, kubeconfigfile, file_id, params):
|
||||
print('[StartChaos]', file_id, kubeconfigfile)
|
||||
        dir = flaskdir
        outfile = ''.join([dir, 'out-', file_id])
        initfile = ''.join([dir, 'init-', file_id])
        with open(initfile, 'w'):
            pass
        if os.path.exists(outfile):
            os.remove(outfile)
        # kubeconfigfile = params['file']
        os.environ["KUBECONFIG"] = kubeconfigfile
        os.system("export KUBECONFIG=" + kubeconfigfile)
        os.system("echo $KUBECONFIG")
        print('setting kubeconfig')
        params['command'] = 'podman'
        params['chaosengine'] = 'kraken'
        params['faults'] = 'pod-delete'
        params['iterations'] = 1
        params['maxfaults'] = 5
        if os.path.isfile('/config/aichaos-config.json'):
            with open('/config/aichaos-config.json') as f:
                config_params = json.load(f)
            params['command'] = config_params['command']
            params['chaosengine'] = config_params['chaosengine']
            params['faults'] = config_params['faults']
            params['iterations'] = config_params['iterations']
            params['maxfaults'] = config_params['maxfaults']
        # faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
        faults = []
        for f in params['faults'].split(','):
            if f in ['pod-delete']:
                for p in params['podlabels'].split(','):
                    faults.append(f + ':' + p)
            elif f in ['network-chaos', 'node-memory-hog', 'node-cpu-hog']:
                for p in params['nodelabels'].split(','):
                    faults.append(f + ':' + p)
            else:
                pass

        print('#faults:', len(faults), faults)
        states = {'200': 0, '500': 1, '501': 2, '502': 3, '503': 4, '504': 5,
                  '401': 6, '403': 7, '404': 8, '429': 9,
                  'Timeout': 10, 'Other': 11}
        rewards = {'200': -1, '500': 0.8, '501': 0.8, '502': 0.8, '503': 0.8, '504': 0.8,
                   '401': 1, '403': 1, '404': 1, '429': 1,
                   'Timeout': 1, 'Other': 1}
        logfile = self.flaskdir + 'log_' + str(file_id)
        qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
        efile = self.flaskdir + 'efile_' + str(file_id)
        epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
        # probe_url = params['probeurl']
        cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
                'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
                'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
        aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
                          logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
                          urls=params['urls'].split(','), namespace=params['namespace'],
                          max_faults=int(params['maxfaults']),
                          num_requests=10, timeout=2,
                          chaos_engine=params['chaosengine'],
                          chaos_dir='config/', kubeconfig=kubeconfigfile,
                          loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=int(params['iterations']),
                          command=params['command'])
        print('checking kubeconfig')
        os.system("echo $KUBECONFIG")
        aichaos.start_chaos()

        file = open(outfile, "w")
        file.write('done')
        file.close()
        os.remove(initfile)
        # os.remove(csvfile)
        # ConstraintsInference().remove_temp_files(dir, file_id)
        return 'WRITE'


@app.route('/GenerateChaos/', methods=['POST'])
@swag_from('config/yml/chaosGen.yml')
def chaos_gen():
    dir = flaskdir
    sw = AIChaosSwagger(flaskdir=dir)
    f = request.files['file']
    list = os.listdir(dir)
    for i in range(10000):
        fname = 'kubeconfig-' + str(i)
        if fname not in list:
            break
    kubeconfigfile = ''.join([dir, 'kubeconfig-', str(i)])
    f.save(kubeconfigfile)
    # creating empty file
    open(kubeconfigfile, 'a').close()
    # print('HEADER:', f.headers)
    print('[GenerateChaos] reqs:', request.form.to_dict())
    # print('[GenerateChaos]', f.filename, datetime.now())
    thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
    thread.daemon = True
    print(thread.getName())
    thread.start()
    return 'Chaos ID: ' + str(i)


@app.route('/GetStatus/<chaosid>', methods=['GET'])
@swag_from('config/yml/status.yml')
def get_status(chaosid):
    print('[GetStatus]', chaosid, flaskdir)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        return 'Completed'
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Does not exist'


@app.route('/GetQTable/<chaosid>', methods=['GET'])
@swag_from('config/yml/qtable.yml')
def get_qtable(chaosid):
    print('[GetQTable]', chaosid)
    qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(qfile):
        f = open(qfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


@app.route('/GetEpisodes/<chaosid>', methods=['GET'])
@swag_from('config/yml/episodes.yml')
def get_episodes(chaosid):
    print('[GetEpisodes]', chaosid)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        f = open(epfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


@app.route('/GetLog/<chaosid>', methods=['GET'])
@swag_from('config/yml/log.yml')
def get_log(chaosid):
    print('[GetLog]', chaosid)
    epfile = flaskdir + 'log_' + str(chaosid)
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        f = open(epfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port='5001')
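For context, here is a minimal client sketch for the API above (an illustration, not part of the diff): it assumes the service runs locally on port 5001, and the form fields (urls, namespace, podlabels, nodelabels) and the 'Chaos ID: <n>' response format are taken from the handler code; the label values are placeholders.

# Hypothetical client for the Flask API above; assumes the server is on localhost:5001.
import requests

with open('kubeconfig') as f:
    resp = requests.post('http://localhost:5001/GenerateChaos/',
                         files={'file': f},
                         data={'urls': 'http://app/health',
                               'namespace': 'robot-shop',
                               'podlabels': 'service=cart',
                               'nodelabels': 'kubernetes.io/hostname=worker-0'})
chaos_id = resp.text.split(': ')[1]   # response body is 'Chaos ID: <n>'
print(requests.get('http://localhost:5001/GetStatus/' + chaos_id).text)
print(requests.get('http://localhost:5001/GetQTable/' + chaos_id).text)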
@@ -1,21 +0,0 @@
import setuptools
# from setuptools_cythonize import get_cmdclass


setuptools.setup(
    # cmdclass=get_cmdclass(),
    name="aichaos",
    version="0.0.1",
    author="Sandeep Hans",
    author_email="shans001@in.ibm.com",
    description="Chaos AI",
    long_description="Chaos Engineering using AI",
    long_description_content_type="text/markdown",
    url="",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.9',
)
@@ -1,11 +0,0 @@
numpy
pandas
notebook
jupyterlab
jupyter
seaborn==0.13.2
requests
wheel
Flask==2.2.5
flasgger==0.9.5
pillow==10.3.0
@@ -1,213 +0,0 @@
import json
import os
import random
import sys

import numpy as np
import logging


class AIChaos:
    def __init__(self, states=None, faults=None, rewards=None, pod_names=[], chaos_dir=None,
                 chaos_experiment='experiment.json',
                 chaos_journal='journal.json', iterations=1000, static_run=False):
        self.faults = faults
        self.pod_names = pod_names
        self.states = states
        self.rewards = rewards
        self.episodes = []

        self.chaos_dir = chaos_dir
        self.chaos_experiment = chaos_experiment
        self.chaos_journal = chaos_journal

        self.iterations = iterations
        # Initialize parameters
        self.gamma = 0.75  # Discount factor
        self.alpha = 0.9  # Learning rate

        # Initializing Q-Values
        # self.Q = np.array(np.zeros([9, 9]))
        # self.Q = np.array(np.zeros([len(faults), len(faults)]))
        # currently action is a single fault, later on we will do multiple faults together
        # For multiple faults, the no of cols in q-matrix will be all combinations of faults (infinite)
        # eg. {f1,f2},f3,f4,{f4,f5} - f1,f2 in parallel, then f3, then f4, then f4,f5 in parallel produces end state
        # self.Q = np.array(np.zeros([len(states), len(states)]))
        self.Q = np.array(np.zeros([len(states), len(faults)]))
        self.state_matrix = np.array(np.zeros([len(states), len(states)]))

        # may be Q is a dictionary of dictionaries, for each state there is a dictionary of faults
        # Q = {'500' = {'f1f2f4': 0.3, 'f1': 0.5}, '404' = {'f2': 0.22}}

        self.logger = logging.getLogger()
        # run from old static experiment and journal files
        self.static_run = static_run

    # End state is reached when system is down or return error code like '500','404'
    def get_next_state(self):
        self.logger.info('[GET_NEXT_STATE]')
        f = open(self.chaos_dir + self.chaos_journal)
        data = json.load(f)

        # before the experiment (if before steady state is false, after is null?)
        for probe in data['steady_states']['before']['probes']:
            if not probe['tolerance_met']:
                # start_state = probe['activity']['tolerance']
                # end_state = probe['status']
                start_state, end_state = None, None
                return start_state, end_state

        # after the experiment
        for probe in data['steady_states']['after']['probes']:
            # if probe['output']['status'] == probe['activity']['tolerance']:
            if not probe['tolerance_met']:
                # print(probe)
                start_state = probe['activity']['tolerance']
                end_state = probe['output']['status']
                # end_state = probe['status']
                return start_state, end_state
        # if tolerances for all probes are met
        start_state = probe['activity']['tolerance']
        end_state = probe['activity']['tolerance']
        return start_state, end_state

    def inject_faults(self, fault, pod_name):
        self.logger.info('[INJECT_FAULT] ' + fault)
        f = open(self.chaos_dir + self.chaos_experiment)
        data = json.load(f)
        for m in data['method']:
            if 'provider' in m:
                if fault == 'kill_microservice':
                    m['name'] = 'kill-microservice'
                    m['provider']['module'] = 'chaosk8s.actions'
                    m['provider']['arguments']['name'] = pod_name
                else:
                    m['provider']['arguments']['name_pattern'] = pod_name
                    m['provider']['func'] = fault

                print('[INJECT_FAULT] method:', m)
                # self.logger.info('[INJECT_FAULT] ' + m['provider']['arguments']['name_pattern'])
                # self.logger.info('[INJECT_FAULT] ' + str(m))

        exp_file = self.chaos_dir + 'experiment_' + str(random.randint(1, 10)) + '.json'
        with open(exp_file, 'w') as f:
            json.dump(data, f)
        exp_file = self.chaos_dir + 'experiment.json'
        # execute faults
        # cmd = 'cd ' + self.chaos_dir + ';chaos run ' + self.chaos_experiment
        cmd = 'cd ' + self.chaos_dir + ';chaos run ' + exp_file
        if not self.static_run:
            os.system(cmd)

    def create_episode(self):
        self.logger.info('[CREATE_EPISODE]')
        episode = []
        while True:
            # inject more faults
            # TODO: model - choose faults based on q-learning ...
            fault_pod = random.choice(self.faults)
            fault = fault_pod.split(':')[0]
            pod_name = fault_pod.split(':')[1]
            # fault = random.choice(self.faults)
            # pod_name = random.choice(self.pod_names)
            # fault = lstm_model.get_next_fault(episode)
            # fault = get_max_prob_fault(episode)

            self.inject_faults(fault, pod_name)
            start_state, next_state = self.get_next_state()
            print('[CREATE EPISODE]', start_state, next_state)
            # if before state tolerance is not met
            if start_state is None and next_state is None:
                continue

            episode.append({'fault': fault, 'pod_name': pod_name})
            self.update_q_fault(fault_pod, episode, start_state, next_state)
            # self.update_q_fault(fault, episode, start_state, next_state)
            # if an end_state is reached
            # if next_state is not None:
            if start_state != next_state:
                self.logger.info('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
                self.logger.info('[CREATE_EPISODE] END STATE:' + str(next_state))
                return episode, start_state, next_state

    def update_q_fault(self, fault, episode, start_state, end_state):
        self.logger.info('[UPDATE_Q]')
        print('[UPDATE_Q] ', str(start_state), str(end_state))
        if end_state is None:
            end_state = start_state

        # reward is dependent on the error response (eg. '404') and length of episode
        reward = self.rewards[str(end_state)] / len(episode)
        current_state = self.states[str(start_state)]
        next_state = self.states[str(end_state)]
        fault_index = self.faults.index(fault)

        TD = reward + \
             self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
             self.Q[current_state, fault_index]
        self.Q[current_state, fault_index] += self.alpha * TD

        # update state matrix
        TD_state = reward + \
                   self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
                   self.state_matrix[current_state, next_state]
        self.state_matrix[current_state, next_state] += self.alpha * TD_state

    # def update_q(self, episode, start_state, end_state):
    #     self.logger.info('[UPDATE_Q]')
    #     if end_state is None:
    #         end_state = start_state
    #
    #     # reward is dependent on the error response (eg. '404') and length of episode
    #     reward = self.rewards[str(end_state)] / len(episode)
    #     current_state = self.states[str(start_state)]
    #     next_state = self.states[str(end_state)]
    #     TD = reward + \
    #          self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
    #          self.Q[current_state, next_state]
    #     self.Q[current_state, next_state] += self.alpha * TD

    def start_chaos(self):
        for i in range(self.iterations):
            episode, start_state, end_state = self.create_episode()
            # update Q matrix
            # will do it with each fault injection
            # self.update_q(episode, start_state, end_state)
            print(self.Q)
            print(self.state_matrix)


def test_chaos():
    svc_list = ['cart', 'catalogue', 'dispatch', 'mongodb', 'mysql', 'payment', 'rabbitmq', 'ratings', 'redis',
                'shipping', 'user', 'web']
    # Define faults
    # faults = ['terminate_pods']
    # faults = ['terminate_pods:' + x for x in pod_names]
    faults = ['kill_microservice:' + x for x in svc_list]
    # Define the states
    states = {
        '200': 0,
        '500': 1,
        '404': 2
    }
    # Define rewards, currently not used
    rewards = {
        '200': 0,
        '500': 0.8,
        '404': 1
    }

    # cdir = '/Users/sandeephans/Downloads/chaos/chaostoolkit-samples-master/service-down-not-visible-to-users/'
    cdir = '/Users/sandeephans/Downloads/openshift/'
    cexp = 'experiment.json'
    cjournal = 'journal.json'

    aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
                      chaos_dir=cdir, chaos_experiment=cexp, chaos_journal=cjournal,
                      static_run=False)
    aichaos.start_chaos()


if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    test_chaos()
@@ -1,248 +0,0 @@
import json
import os
import random

import numpy as np
import pandas as pd
import logging

# sys.path.insert(1, os.path.join(sys.path[0], '..'))
import src.utils as utils
from src.kraken_utils import KrakenUtils
from src.qlearning import QLearning
from src.test_application import TestApplication


class AIChaos:
    def __init__(self, namespace='robot-shop', states=None, faults=None, rewards=None, urls=[], max_faults=5,
                 service_weights=None, ctd_subsets=None, pod_names=[], chaos_dir='../config/', kubeconfig='~/.kube/config',
                 chaos_experiment='experiment.json', logfile='log', qfile='qfile.csv', efile='efile', epfile='episodes.json',
                 loglevel=logging.INFO,
                 chaos_journal='journal.json', iterations=10, alpha=0.9, gamma=0.2, epsilon=0.3,
                 num_requests=10, sleep_time=1, timeout=2, chaos_engine='kraken', dstk_probes=None,
                 static_run=False, all_faults=False, command='podman'):
        self.namespace = namespace
        self.faults = faults
        self.unused_faults = faults.copy()
        self.all_faults = all_faults
        self.pod_names = pod_names
        self.states = states
        self.rewards = rewards
        self.urls = urls
        self.max_faults = max_faults
        self.episodes = []
        self.service_weights = service_weights
        self.ctd_subsets = ctd_subsets

        self.kubeconfig = kubeconfig
        self.chaos_dir = chaos_dir
        self.chaos_experiment = chaos_experiment
        self.chaos_journal = chaos_journal
        self.command = command

        if chaos_engine == 'kraken':
            self.chaos_engine = KrakenUtils(namespace, kubeconfig=kubeconfig, chaos_dir=chaos_dir, chaos_experiment=chaos_experiment, command=self.command)
        else:
            self.chaos_engine = None

        self.iterations = iterations
        # Initialize RL parameters
        self.epsilon = epsilon  # epsilon decay policy
        # self.epsdecay = 0

        # log files
        self.logfile = logfile
        self.qfile = qfile
        self.efile = efile
        self.epfile = epfile
        open(efile, 'w+').close()
        open(logfile, 'w+').close()
        open(logfile, 'r+').truncate(0)
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.basicConfig(filename=logfile, filemode='w+', level=loglevel)
        self.logger = logging.getLogger(logfile.replace('/', ''))
        self.logger.addHandler(logging.FileHandler(logfile))

        self.testapp = TestApplication(num_requests, timeout, sleep_time)
        self.ql = QLearning(gamma, alpha, faults, states, rewards, urls)

        # run from old static experiment and journal files
        self.static_run = static_run

    def realistic(self, faults_pods):
        self.logger.debug('[Realistic] ' + str(faults_pods))
        fp = faults_pods.copy()
        for f1 in faults_pods:
            for f2 in faults_pods:
                if f1 == f2:
                    continue
                if f1 in fp and f2 in fp:
                    f1_fault, load_1 = utils.get_load(f1.split(':')[0])
                    f1_pod = f1.split(':')[1]
                    f2_fault, load_2 = utils.get_load(f2.split(':')[0])
                    f2_pod = f2.split(':')[1]
                    if f1_pod == f2_pod:
                        if f1_fault == 'pod-delete':
                            fp.remove(f2)
                        if f1_fault == f2_fault:
                            # if int(load_1) > int(load_2):
                            # randomly remove one fault from same faults with different params
                            fp.remove(f2)
        if self.service_weights is None:
            return fp

        fp_copy = fp.copy()
        for f in fp:
            f_fault = f.split(':')[0]
            f_pod = f.split(':')[1].replace('service=', '')
            self.logger.debug('[ServiceWeights] ' + f + ' ' + str(self.service_weights[f_pod][f_fault]))
            if self.service_weights[f_pod][f_fault] == 0:
                fp_copy.remove(f)

        self.logger.debug('[Realistic] ' + str(fp_copy))
        return fp_copy

    def select_faults(self):
        max_faults = min(self.max_faults, len(self.unused_faults))
        num_faults = random.randint(1, max_faults)
        if self.all_faults:
            num_faults = len(self.unused_faults)
        if random.random() > self.epsilon:
            self.logger.info('[Exploration]')
            # faults_pods = random.sample(self.faults, k=num_faults)
            # draw from the unused-faults list to avoid starvation
            faults_pods = random.sample(self.unused_faults, k=num_faults)
            faults_pods = self.realistic(faults_pods)
            for f in faults_pods:
                self.unused_faults.remove(f)
            if len(self.unused_faults) == 0:
                self.unused_faults = self.faults.copy()
        else:
            self.logger.info('[Exploitation]')
            first_row = self.ql.Q[:, 0, :][0]
            top_k_indices = np.argpartition(first_row, -num_faults)[-num_faults:]
            faults_pods = [self.faults[i] for i in top_k_indices]
            faults_pods = self.realistic(faults_pods)

        return faults_pods

    def create_episode(self, ctd_subset=None):
        self.logger.debug('[CREATE_EPISODE]')
        episode = []

        if ctd_subset is None:
            faults_pods = self.select_faults()
        else:
            faults_pods = ctd_subset
            self.logger.info('CTD Subset: ' + str(faults_pods))

        # faults_pods = self.realistic(faults_pods)
        if len(faults_pods) == 0:
            return [], 200, 200

        engines = []
        for fp in faults_pods:
            fault = fp.split(':')[0]
            pod_name = fp.split(':')[1]
            engine = self.chaos_engine.inject_faults(fault, pod_name)
            engines.append(engine)
            episode.append({'fault': fault, 'pod_name': pod_name})
        self.logger.info('[create_episode]' + str(faults_pods))
        engines_running = self.chaos_engine.wait_engines(engines)
        self.logger.info('[create_episode] engines_running' + str(engines_running))
        if not engines_running:
            return None, None, None

        # randomly shuffling urls
        urls = random.sample(self.urls, len(self.urls))
        ep_json = []
        for url in urls:
            start_state, next_state = self.testapp.test_load(url)
            self.logger.info('[CREATE EPISODE]' + str(start_state) + ',' + str(next_state))
            # if before state tolerance is not met
            if start_state is None and next_state is None:
                # self.cleanup()
                self.chaos_engine.stop_engines()
                continue

            ### episode.append({'fault': fault, 'pod_name': pod_name})
            # self.update_q_fault(fault_pod, episode, start_state, next_state)
            url_index = self.urls.index(url)
            self.logger.info('[CREATEEPISODE]' + str(url) + ':' + str(url_index))
            for fp in faults_pods:
                self.ql.update_q_fault(fp, episode, start_state, next_state, self.urls.index(url))
            ep_json.append({'start_state': start_state, 'next_state': next_state, 'url': url, 'faults': episode})

        self.logger.debug('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
        self.logger.debug('[CREATE_EPISODE] END STATE:' + str(next_state))

        self.chaos_engine.print_result(engines)
        self.chaos_engine.stop_engines(episode=episode)
        # ep_json = {'start_state': start_state, 'next_state': next_state, 'faults': episode}

        return ep_json, start_state, next_state

    def start_chaos(self):
        self.logger.info('[INITIALIZING]')
        self.logger.info('Logfile: ' + self.logfile)
        self.logger.info('Loggerfile: ' + self.logger.handlers[0].stream.name)
        self.logger.info('Chaos Engine: ' + self.chaos_engine.get_name())
        self.logger.debug('Faults:' + str(self.faults))

        self.chaos_engine.cleanup()
        if self.ctd_subsets is None:
            for i in range(self.iterations):
                episode, start_state, end_state = self.create_episode()
                self.logger.debug('[start_chaos]' + str(i) + ' ' + str(episode))
                if episode is None:
                    continue
                # update Q matrix
                # will do it with each fault injection
                # self.update_q(episode, start_state, end_state)
                # if episode['next_state'] != '200':
                self.episodes.extend(episode)
                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
                # print(i, self.state_matrix)
                self.write_q()
                self.write_episode(episode)
        else:
            for i, subset in enumerate(self.ctd_subsets):
                episode, start_state, end_state = self.create_episode(subset)
                self.logger.debug('[start_chaos]' + str(episode))
                if episode is None:
                    continue
                self.episodes.append(episode)
                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
                self.write_q()
                self.write_episode(episode)

        self.chaos_engine.cleanup()
        # self.remove_temp_file()
        with open(self.epfile, 'w', encoding='utf-8') as f:
            json.dump(self.episodes, f, ensure_ascii=False, indent=4)
        self.logger.info('COMPLETE!!!')

    def write_q(self):
        df = pd.DataFrame(self.ql.Q[:, 0, :], index=self.urls, columns=self.faults)
        df.to_csv(self.qfile)
        return df

    def write_episode(self, episode):
        for ep in episode:
            with open(self.efile, "a") as outfile:
                x = [e['fault'] + ':' + e['pod_name'] for e in ep['faults']]
                x.append(ep['url'])
                x.append(str(ep['next_state']))
                outfile.write(','.join(x) + '\n')

    def remove_temp_file(self):
        mydir = self.chaos_dir + 'experiments'
        print('Removing temp files from: ' + mydir)
        self.logger.debug('Removing temp files: ' + mydir)
        # nothing to do if the experiments directory was never created
        if not os.path.exists(mydir):
            return
        filelist = [f for f in os.listdir(mydir) if f.endswith(".json")]
        for f in filelist:
            print(f)
            os.remove(os.path.join(mydir, f))
@@ -1,56 +0,0 @@
import random


class Experiments:
    def __init__(self):
        self.k = 0

    def monotonic(self, aichaos, num_sets=3):
        for i in range(num_sets):
            faults_pods = random.sample(aichaos.faults, k=2)
            faults_set = [[faults_pods[0]], [faults_pods[1]], [faults_pods[0], faults_pods[1]]]

            resp1, resp2, resp_both = 0, 0, 0
            for fl in faults_set:
                engines = []
                for fp in fl:
                    fault = fp.split(':')[0]
                    pod_name = fp.split(':')[1]
                    engine = aichaos.inject_faults_litmus(fault, pod_name)
                    engines.append(engine)
                aichaos.litmus.wait_engines(engines)

                for index, url in enumerate(aichaos.urls):
                    start_state, next_state = aichaos.test_load(url)
                    print(i, fl, next_state)
                    # self.write(str(fl), next_state)
                    if resp1 == 0:
                        resp1 = next_state
                    elif resp2 == 0:
                        resp2 = next_state
                    else:
                        resp_both = next_state

                aichaos.litmus.stop_engines()
            self.write_resp(str(faults_set[2]), resp1, resp2, resp_both)
        print('Experiment Complete!!!')

    @staticmethod
    def write(fault, next_state):
        with open("experiment", "a") as outfile:
            outfile.write(fault + ',' + str(next_state) + ',' + '\n')

    @staticmethod
    def write_resp(faults, resp1, resp2, resp3):
        monotonic = True
        if resp3 == 200:
            if resp1 != 200 or resp2 != 200:
                monotonic = False
        else:
            if resp1 == 200 and resp2 == 200:
                monotonic = False

        with open("experiment", "a") as outfile:
            # outfile.write(faults + ',' + str(resp1) + ',' + '\n')
            outfile.write(faults + ',' + str(resp1) + ',' + str(resp2) + ',' + str(resp3) + ',' + str(monotonic) + '\n')
@@ -1,99 +0,0 @@
import json
import os
import time
import logging

import src.utils as utils


class KrakenUtils:
    def __init__(self, namespace='robot-shop', chaos_dir='../config/',
                 chaos_experiment='experiment.json', kubeconfig='~/.kube/config', wait_checks=60, command='podman'):
        self.chaos_dir = chaos_dir
        self.chaos_experiment = chaos_experiment
        self.namespace = namespace
        self.kubeconfig = kubeconfig
        self.logger = logging.getLogger()
        self.engines = []
        self.wait_checks = wait_checks
        self.command = command

    def exp_status(self, engine='engine-cartns3'):
        substring_list = ['Waiting for the specified duration', 'Waiting for wait_duration', 'Step workload started, waiting for response']
        substr = '|'.join(substring_list)
        # cmd = "docker logs "+engine+" 2>&1 | grep Waiting"
        # cmd = "docker logs "+engine+" 2>&1 | grep -E '"+substr+"'"
        cmd = self.command + " logs " + engine + " 2>&1 | grep -E '" + substr + "'"
        line = os.popen(cmd).read()
        self.logger.debug('[exp_status]' + line)
        # if 'Waiting for the specified duration' in line:
        # if 'Waiting for' in line or 'waiting for' in line:
        # if 'Waiting for the specified duration' in line or 'Waiting for wait_duration' in line or 'Step workload started, waiting for response' in line:
        if any(map(line.__contains__, substring_list)):
            return 'Running'
        return 'Not Running'

    # print chaos result, check if litmus showed any error
    def print_result(self, engines):
        # self.logger.debug('')
        for e in engines:
            # cmd = 'kubectl describe chaosresult ' + e + ' -n ' + self.namespace + ' | grep "Fail Step:"'
            # line = os.popen(cmd).read()
            # self.logger.debug('[Chaos Result] '+e+' : '+line)
            self.logger.debug('[KRAKEN][Chaos Result] ' + e)

    def wait_engines(self, engines=[]):
        status = 'Completed'
        max_checks = self.wait_checks
        for e in engines:
            self.logger.info('[Wait Engines] ' + e)
            for i in range(max_checks):
                status = self.exp_status(e)
                if status == 'Running':
                    break
                time.sleep(1)
            # return False, if even one engine is not running
            if status != 'Running':
                return False

        self.engines = engines
        # return True if all engines are running
        return True

    def cleanup(self):
        self.logger.debug('Removing previous engines')
        # cmd = "docker rm $(docker ps -q -f 'status=exited')"
        if len(self.engines) > 0:
            cmd = self.command + " stop " + " ".join(self.engines) + " >> temp"
            os.system(cmd)
            self.engines = []

        cmd = self.command + " container prune -f >> temp"
        os.system(cmd)
        self.logger.debug('Engines removed')

    def stop_engines(self, episode=[]):
        self.cleanup()

    def get_name(self):
        return 'kraken'

    def inject_faults(self, fault, pod_name):
        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + fault + ':' + pod_name)
        fault, load = utils.get_load(fault)
        engine = 'engine-' + pod_name.replace('=', '-').replace('/', '-') + '-' + fault
        if fault == 'pod-delete':
            cmd = self.command + ' run -d -e NAMESPACE=' + self.namespace + ' -e POD_LABEL=' + pod_name + ' --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z quay.io/redhat-chaos/krkn-hub:pod-scenarios >> temp'
        elif fault == 'network-chaos':
            # 'docker run -e NODE_NAME=minikube-m03 -e DURATION=10 --name=knetwork --net=host -v /home/chaos/.kube/kube-config-raw:/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
            cmd = self.command + ' run -d -e NODE_NAME=' + pod_name + ' -e DURATION=120 --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
        elif fault == 'node-memory-hog':
            cmd = self.command + ' run -d -e NODE_NAME=' + pod_name + ' -e DURATION=120 -e NODES_AFFECTED_PERC=100 --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-memory-hog >> temp'
        elif fault == 'node-cpu-hog':
            cmd = self.command + ' run -e NODE_SELECTORS=' + pod_name + ' -e NODE_CPU_PERCENTAGE=100 -e NAMESPACE=' + self.namespace + ' -e TOTAL_CHAOS_DURATION=120 -e NODE_CPU_CORE=100 --name=' + engine + ' --net=host --env-host=true -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-cpu-hog'
        else:
            cmd = 'echo'
        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + cmd)
        os.system(cmd)
        return engine
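As a rough illustration of the flow above (hypothetical values; the pod label and kubeconfig path are placeholders, and only methods defined in the class are used):

# Hypothetical driver for KrakenUtils; label and kubeconfig path are placeholders.
ku = KrakenUtils(namespace='robot-shop', kubeconfig='/home/chaos/.kube/config', command='podman')
engine = ku.inject_faults('pod-delete', 'service=cart')   # launches a krkn-hub container
if ku.wait_engines([engine]):                             # poll container logs until the run reports progress
    pass                                                  # drive load against the application here
ku.stop_engines()                                         # stop and prune the chaos containers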
@@ -1,62 +0,0 @@
import logging

import numpy as np


class QLearning:
    def __init__(self, gamma=None, alpha=None, faults=None, states=None, rewards=None, urls=None):
        self.gamma = gamma  # Discount factor
        self.alpha = alpha  # Learning rate
        self.faults = faults
        self.states = states
        self.rewards = rewards

        # Initializing Q-Values
        # self.Q = np.array(np.zeros([len(states), len(states)]))
        self.Q = np.array(np.zeros([len(urls), len(states), len(faults)]))
        self.state_matrix = np.array(np.zeros([len(states), len(states)]))

        self.logger = logging.getLogger()

    def update_q_fault(self, fault, episode, start_state, end_state, url_index):
        self.logger.info('[UPDATE_Q] ' + str(url_index) + ' ' + fault + ' ' + str(start_state) + '->' + str(end_state))
        if end_state is None:
            end_state = start_state
        if end_state not in self.states:
            end_state = 'Other'
        # reward is dependent on the error response (eg. '404') and length of episode
        reward = self.rewards[str(end_state)] / len(episode)
        current_state = self.states[str(start_state)]
        next_state = self.states[str(end_state)]
        fault_index = self.faults.index(fault)
        # self.logger.debug('[update_q]' + fault + ' ' + str(fault_index) + ' ' + str(reward))
        # self.logger.debug('reward, gamma: ' + str(reward) + ' ' + str(self.gamma))
        # self.logger.debug(
        #     'gamma*val' + str(self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])]))
        # self.logger.debug('current state val:' + str(self.Q[url_index, current_state, fault_index]))

        TD = reward + \
             self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])] - \
             self.Q[url_index, current_state, fault_index]
        self.Q[url_index, current_state, fault_index] += self.alpha * TD

        # update state matrix
        TD_state = reward + \
                   self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
                   self.state_matrix[current_state, next_state]
        self.state_matrix[current_state, next_state] += self.alpha * TD_state
        # self.logger.debug('updated Q' + str(self.Q[url_index, current_state, fault_index]))

    # def update_q(self, episode, start_state, end_state):
    #     self.logger.info('[UPDATE_Q]')
    #     if end_state is None:
    #         end_state = start_state
    #
    #     # reward is dependent on the error response (eg. '404') and length of episode
    #     reward = self.rewards[str(end_state)] / len(episode)
    #     current_state = self.states[str(start_state)]
    #     next_state = self.states[str(end_state)]
    #     TD = reward + \
    #          self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
    #          self.Q[current_state, next_state]
    #     self.Q[current_state, next_state] += self.alpha * TD
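The update above is the standard one-step Q-learning rule, Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)), applied per URL. A small self-contained sketch of a single update, using the alpha and gamma defaults that AIChaos passes in (the numbers are illustrative only):

# Illustrative single TD update, mirroring update_q_fault with made-up values.
import numpy as np

alpha, gamma = 0.9, 0.2                  # AIChaos defaults passed to QLearning
Q = np.zeros((1, 2, 3))                  # 1 url, 2 states ('200', '500'), 3 faults
url, s, s_next, a, reward = 0, 0, 1, 2, 0.8
TD = reward + gamma * Q[url, s_next].max() - Q[url, s, a]
Q[url, s, a] += alpha * TD               # Q[0, 0, 2] becomes 0.72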
@@ -1,171 +0,0 @@
import json, os
import logging
# import numpy as np
# import pandas as pd
import threading
from datetime import datetime
from flask import Flask, request
from flasgger import Swagger
from flasgger.utils import swag_from
# import zipfile
import sys

sys.path.append("..")
from aichaos_main import AIChaos


app = Flask(__name__)
Swagger(app)
flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "experiments",
                        "flask") + '/'


class AIChaosSwagger:
    def __init__(self, flaskdir=''):
        self.flaskdir = flaskdir

    @app.route("/")
    def empty(params=''):
        return "AI Chaos Repository!"

    def startchaos(self, kubeconfigfile, file_id, params):
        print('[StartChaos]', file_id, kubeconfigfile)
        dir = flaskdir
        outfile = ''.join([dir, 'out-', file_id])
        initfile = ''.join([dir, 'init-', file_id])
        with open(initfile, 'w'):
            pass
        if os.path.exists(outfile):
            os.remove(outfile)
        # cons = ConstraintsInference(outdir=dir).get_constraints(csvfile, file_id, params, verbose=False,
        #                                                         write_local=False)
        os.environ["KUBECONFIG"] = kubeconfigfile
        params['command'] = 'podman'
        params['chaos_engine'] = 'kraken'
        params['faults'] = 'pod-delete'
        params['iterations'] = 1
        params['maxfaults'] = 5
        if os.path.isfile('/config/aichaos-config.json'):
            with open('/config/aichaos-config.json') as f:
                config_params = json.load(f)
            params['command'] = config_params['command']
            params['chaos_engine'] = config_params['chaos_engine']
            params['faults'] = config_params['faults']
            params['iterations'] = config_params['iterations']
            params['maxfaults'] = config_params['maxfaults']
        faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
        print('#faults:', len(faults), faults)
        states = {'200': 0, '500': 1, '502': 2, '503': 3, '404': 4, 'Timeout': 5}
        rewards = {'200': -1, '500': 0.8, '502': 0.8, '503': 0.8, '404': 1, 'Timeout': 1}
        logfile = self.flaskdir + 'log_' + str(file_id)
        qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
        efile = self.flaskdir + 'efile_' + str(file_id)
        epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
        probe_url = params['probeurl']
        probes = {'pod-delete': 'executeprobe', 'cpu-hog': 'wolffi/cpu_load', 'disk-fill': 'wolffi/memory_load',
                  'io_load': 'wolffi/io_load', 'http_delay': 'wolffi/http_delay', 'packet_delay': 'wolffi/packet_delay',
                  'packet_duplication': 'wolffi/packet_duplication', 'packet_loss': 'wolffi/packet_loss',
                  'packet_corruption': 'wolffi/packet_corruption',
                  'packet_reordering': 'wolffi/packet_reordering', 'network_load': 'wolffi/network_load',
                  'http_bad_request': 'wolffi/http_bad_request',
                  'http_unauthorized': 'wolffi/http_unauthorized', 'http_forbidden': 'wolffi/http_forbidden',
                  'http_not_found': 'wolffi/http_not_found',
                  'http_method_not_allowed': 'wolffi/http_method_not_allowed',
                  'http_not_acceptable': 'wolffi/http_not_acceptable',
                  'http_request_timeout': 'wolffi/http_request_timeout',
                  'http_unprocessable_entity': 'wolffi/http_unprocessable_entity',
                  'http_internal_server_error': 'wolffi/http_internal_server_error',
                  'http_not_implemented': 'wolffi/http_not_implemented',
                  'http_bad_gateway': 'wolffi/http_bad_gateway',
                  'http_service_unavailable': 'wolffi/http_service_unavailable',
                  'bandwidth_restrict': 'wolffi/bandwidth_restrict',
                  'pod_cpu_load': 'wolffi/pod_cpu_load', 'pod_memory_load': 'wolffi/pod_memory_load',
                  'pod_io_load': 'wolffi/pod_io_load',
                  'pod_network_load': 'wolffi/pod_network_load'
                  }
        dstk_probes = {k: probe_url + v for k, v in probes.items()}
        cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
                'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
                'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
        aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
                          logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
                          urls=params['urls'].split(','), namespace=params['namespace'],
                          max_faults=params['maxfaults'],
                          num_requests=10, timeout=2,
                          chaos_engine=params['chaos_engine'], dstk_probes=dstk_probes, command=params['command'],
                          loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=params['iterations'])
        aichaos.start_chaos()

        file = open(outfile, "w")
        file.write('done')
        file.close()
        os.remove(initfile)
        # os.remove(csvfile)
        # ConstraintsInference().remove_temp_files(dir, file_id)
        return 'WRITE'


@app.route('/GenerateChaos/', methods=['POST'])
@swag_from('../config/yml/chaosGen.yml')
def chaos_gen():
    dir = flaskdir
    sw = AIChaosSwagger(flaskdir=dir)
    f = request.files['file']
    list = os.listdir(dir)
    for i in range(10000):
        if str(i) not in list:
            break
    kubeconfigfile = ''.join([dir, str(i)])
    f.save(kubeconfigfile)
    print('HEADER:', f.headers)
    print('[GenerateChaos] reqs:', request.form.to_dict())
    print('[GenerateChaos]', f.filename, datetime.now())
    # thread = threading.Thread(target=sw.write_constraints, args=(csvfile, str(i), parameters))
    thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
    thread.daemon = True
    print(thread.getName())
    thread.start()
    return 'Chaos ID: ' + str(i)


@app.route('/GetStatus/<chaosid>', methods=['GET'])
@swag_from('../config/yml/status.yml')
def get_status(chaosid):
    print('[GetStatus]', chaosid, flaskdir)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        return 'Completed'
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Does not exist'


@app.route('/GetQTable/<chaosid>', methods=['GET'])
@swag_from('../config/yml/qtable.yml')
def get_qtable(chaosid):
    print('[GetQTable]', chaosid)
    qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(qfile):
        f = open(qfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


@app.route('/GetEpisodes/<chaosid>', methods=['GET'])
@swag_from('../config/yml/episodes.yml')
def get_episodes(chaosid):
    print('[GetEpisodes]', chaosid)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        f = open(epfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port='5001')
@@ -1,83 +0,0 @@
import json
import logging
import time
import requests


class TestApplication:
    def __init__(self, num_requests=10, timeout=2, sleep_time=1):
        self.num_requests = num_requests
        self.timeout = timeout
        self.sleep_time = sleep_time
        self.logger = logging.getLogger()

    def test_load(self, url=''):
        # url = 'http://192.168.49.2:31902/api/cart/health'
        timeout_count = 0
        avg_lat = 0
        for i in range(self.num_requests):
            try:
                r = requests.get(url, verify=False, timeout=self.timeout)
                avg_lat += r.elapsed.total_seconds()
                self.logger.info(
                    url + ' ' + str(i) + ':' + str(r.status_code) + " {:.2f}".format(r.elapsed.total_seconds())
                    + " {:.2f}".format(avg_lat))
                if r.status_code != 200:
                    return '200', r.status_code
            # except requests.exceptions.Timeout as toe:
            except Exception as toe:
                self.logger.info(url + ' ' + str(i) + ':' + 'Timeout Exception!')
                timeout_count += 1
                if timeout_count > 3:
                    return '200', 'Timeout'
            # except Exception as e:
            #     self.logger.debug('Connection refused!'+str(e))
            time.sleep(self.sleep_time)
        self.logger.info(url + "Avg: {:.2f}".format(avg_lat / self.num_requests))
        return '200', '200'

    # def test_load_hey(self):
    #     cmd = 'hey -c 2 -z 20s http://192.168.49.2:31902/api/cart/health > temp'
    #     os.system(cmd)
    #     with open('temp') as f:
    #         datafile = f.readlines()
    #     found = False
    #     for line in datafile:
    #         if 'Status code distribution:' in line:
    #             found = True
    #         if found:
    #             print('[test_load]', line)
    #             m = re.search(r"\[([A-Za-z0-9_]+)\]", line)
    #             if m is not None:
    #                 resp_code = m.group(1)
    #                 if resp_code != 200:
    #                     return '200', resp_code
    #     return '200', '200'

    # # End state is reached when system is down or return error code like '500','404'
    # def get_next_state(self):
    #     self.logger.info('[GET_NEXT_STATE]')
    #     f = open(self.chaos_dir + self.chaos_journal)
    #     data = json.load(f)
    #
    #     # before the experiment (if before steady state is false, after is null?)
    #     for probe in data['steady_states']['before']['probes']:
    #         if not probe['tolerance_met']:
    #             # start_state = probe['activity']['tolerance']
    #             # end_state = probe['status']
    #             start_state, end_state = None, None
    #             return start_state, end_state
    #
    #     # after the experiment
    #     for probe in data['steady_states']['after']['probes']:
    #         # if probe['output']['status'] == probe['activity']['tolerance']:
    #         if not probe['tolerance_met']:
    #             # print(probe)
    #             start_state = probe['activity']['tolerance']
    #             end_state = probe['output']['status']
    #             # end_state = probe['status']
    #             return start_state, end_state
    #     # if tolerances for all probes are met
    #     start_state = probe['activity']['tolerance']
    #     end_state = probe['activity']['tolerance']
    #     return start_state, end_state
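A quick usage sketch of the probe above: it returns a ('200', end_state) pair, where end_state is the first non-200 status code seen, 'Timeout' after more than three failed requests, or '200' if every request succeeds. The URL here comes from the commented-out example in the source.

# Hypothetical probe call against a health endpoint.
ta = TestApplication(num_requests=5, timeout=2, sleep_time=1)
start_state, end_state = ta.test_load('http://192.168.49.2:31902/api/cart/health')
print(start_state, end_state)   # e.g. '200 200' when the service is healthy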
@@ -1,10 +0,0 @@
import re


def get_load(fault):
    params = re.findall(r'\(.*?\)', fault)
    load = 100
    if len(params) > 0:
        load = params[0].strip('()')
        fault = fault.strip(params[0])
    return fault, load
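For reference, the parenthesized suffix encodes an optional load parameter, so a composite fault name splits like this (note that str.strip removes a character set rather than a substring, which happens to work for a trailing '(...)' suffix):

# get_load splits an optional parenthesized load from a fault name.
print(get_load('node-cpu-hog(50)'))   # -> ('node-cpu-hog', '50')
print(get_load('pod-delete'))         # -> ('pod-delete', 100)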
@@ -46,7 +46,7 @@ To run the recommender with a config file specify the config file path with the
You can customize the default values by editing the `recommender_config.yaml` file. The configuration file contains the following options:

- `application`: Specify the application name.
- `namespaces`: Specify the namespaces names (separated by coma or space). If you want to profile
- `namespaces`: Specify the namespaces names (separated by comma or space). If you want to profile
- `labels`: Specify the labels (not used).
- `kubeconfig`: Specify the location of the kubeconfig file (not used).
- `prometheus_endpoint`: Specify the prometheus endpoint (must).
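A minimal sketch of what such a file might look like (the values are placeholders; only the keys come from the option list above):

application: robot-shop
namespaces: robot-shop, chaos-testing
labels: ''                      # not used
kubeconfig: ~/.kube/config      # not used
prometheus_endpoint: https://prometheus.example.com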