diff --git a/CI/tests/test_app_outages.sh b/CI/tests/test_app_outages.sh index 51bf3372..8448b619 100755 --- a/CI/tests/test_app_outages.sh +++ b/CI/tests/test_app_outages.sh @@ -10,7 +10,7 @@ function functional_test_app_outage { yq -i '.application_outage.duration=10' scenarios/openshift/app_outage.yaml yq -i '.application_outage.pod_selector={"scenario":"outage"}' scenarios/openshift/app_outage.yaml yq -i '.application_outage.namespace="default"' scenarios/openshift/app_outage.yaml - export scenario_type="application_outages" + export scenario_type="application_outages_scenarios" export scenario_file="scenarios/openshift/app_outage.yaml" export post_config="" envsubst < CI/config/common_test_config.yaml > CI/config/app_outage.yaml diff --git a/CI/tests/test_arca_cpu_hog.sh b/CI/tests/test_arca_cpu_hog.sh index 51fc24e1..7989be21 100644 --- a/CI/tests/test_arca_cpu_hog.sh +++ b/CI/tests/test_arca_cpu_hog.sh @@ -7,9 +7,9 @@ trap finish EXIT function functional_test_arca_cpu_hog { - yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/arcaflow/cpu-hog/input.yaml - export scenario_type="arcaflow_scenarios" - export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml" + yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/kube/cpu-hog/input.yaml + export scenario_type="hog_scenarios" + export scenario_file="scenarios/kube/cpu-hog/input.yaml" export post_config="" envsubst < CI/config/common_test_config.yaml > CI/config/arca_cpu_hog.yaml python3 -m coverage run -a run_kraken.py -c CI/config/arca_cpu_hog.yaml diff --git a/CI/tests/test_arca_io_hog.sh b/CI/tests/test_arca_io_hog.sh index 652e883b..155cbd11 100644 --- a/CI/tests/test_arca_io_hog.sh +++ b/CI/tests/test_arca_io_hog.sh @@ -7,9 +7,9 @@ trap finish EXIT function functional_test_arca_io_hog { - yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/arcaflow/io-hog/input.yaml - export scenario_type="arcaflow_scenarios" - export scenario_file="scenarios/arcaflow/io-hog/input.yaml" + yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/kube/io-hog/input.yaml + export scenario_type="hog_scenarios" + export scenario_file="scenarios/kube/io-hog/input.yaml" export post_config="" envsubst < CI/config/common_test_config.yaml > CI/config/arca_io_hog.yaml python3 -m coverage run -a run_kraken.py -c CI/config/arca_io_hog.yaml diff --git a/CI/tests/test_arca_memory_hog.sh b/CI/tests/test_arca_memory_hog.sh index c6f6b7bb..83e12961 100644 --- a/CI/tests/test_arca_memory_hog.sh +++ b/CI/tests/test_arca_memory_hog.sh @@ -7,9 +7,9 @@ trap finish EXIT function functional_test_arca_memory_hog { - yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/arcaflow/memory-hog/input.yaml - export scenario_type="arcaflow_scenarios" - export scenario_file="scenarios/arcaflow/memory-hog/input.yaml" + yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/kube/memory-hog/input.yaml + export scenario_type="hog_scenarios" + export scenario_file="scenarios/kube/memory-hog/input.yaml" export post_config="" envsubst < CI/config/common_test_config.yaml > CI/config/arca_memory_hog.yaml python3 -m coverage run -a run_kraken.py -c CI/config/arca_memory_hog.yaml diff --git a/CI/tests/test_container.sh b/CI/tests/test_container.sh index 93e3676c..9042b021 100755 --- a/CI/tests/test_container.sh +++ b/CI/tests/test_container.sh @@ -12,7 +12,7 @@ function 
functional_test_container_crash { yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/container_etcd.yml yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/container_etcd.yml export scenario_type="container_scenarios" - export scenario_file="- scenarios/openshift/container_etcd.yml" + export scenario_file="scenarios/openshift/container_etcd.yml" export post_config="" envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml diff --git a/CI/tests/test_namespace.sh b/CI/tests/test_namespace.sh index ce5e8a01..b2a1d578 100755 --- a/CI/tests/test_namespace.sh +++ b/CI/tests/test_namespace.sh @@ -6,8 +6,8 @@ trap error ERR trap finish EXIT function funtional_test_namespace_deletion { - export scenario_type="namespace_scenarios" - export scenario_file="- scenarios/openshift/ingress_namespace.yaml" + export scenario_type="service_disruption_scenarios" + export scenario_file="scenarios/openshift/ingress_namespace.yaml" export post_config="" yq '.scenarios[0].namespace="^namespace-scenario$"' -i scenarios/openshift/ingress_namespace.yaml yq '.scenarios[0].wait_time=30' -i scenarios/openshift/ingress_namespace.yaml diff --git a/CI/tests/test_net_chaos.sh b/CI/tests/test_net_chaos.sh index b7a4eb5a..767ab0d1 100755 --- a/CI/tests/test_net_chaos.sh +++ b/CI/tests/test_net_chaos.sh @@ -15,7 +15,7 @@ function functional_test_network_chaos { yq -i 'del(.network_chaos.egress.latency)' scenarios/openshift/network_chaos.yaml yq -i 'del(.network_chaos.egress.loss)' scenarios/openshift/network_chaos.yaml - export scenario_type="network_chaos" + export scenario_type="network_chaos_scenarios" export scenario_file="scenarios/openshift/network_chaos.yaml" export post_config="" envsubst < CI/config/common_test_config.yaml > CI/config/network_chaos.yaml diff --git a/CI/tests/test_service_hijacking.sh b/CI/tests/test_service_hijacking.sh index fedb75ca..37c092c3 100644 --- a/CI/tests/test_service_hijacking.sh +++ b/CI/tests/test_service_hijacking.sh @@ -35,7 +35,7 @@ TEXT_MIME="text/plain; charset=utf-8" function functional_test_service_hijacking { - export scenario_type="service_hijacking" + export scenario_type="service_hijacking_scenarios" export scenario_file="scenarios/kube/service_hijacking.yaml" export post_config="" envsubst < CI/config/common_test_config.yaml > CI/config/service_hijacking.yaml diff --git a/CI/tests/test_telemetry.sh b/CI/tests/test_telemetry.sh index 15c47220..e1f83bf3 100644 --- a/CI/tests/test_telemetry.sh +++ b/CI/tests/test_telemetry.sh @@ -18,8 +18,8 @@ function functional_test_telemetry { yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml - export scenario_type="arcaflow_scenarios" - export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml" + export scenario_type="hog_scenarios" + export scenario_file="scenarios/kube/cpu-hog/input.yaml" export post_config="" envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml) diff --git a/README.md b/README.md index e9e9f089..c75052f9 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,12 @@ If adding a new scenario or tweaking the main config, be sure to add in updates Please read [this file]((CI/README.md#adding-a-test-case)) for more information on updates. 
+### Scenario Plugin Development + +If you're gearing up to develop new scenarios, take a moment to review our +[Scenario Plugin API Documentation](docs/scenario_plugin_api.md). +It’s the perfect starting point to tap into your chaotic creativity! + ### Community Key Members(slack_usernames/full name): paigerube14/Paige Rubendall, mffiedler/Mike Fiedler, tsebasti/Tullio Sebastiani, yogi/Yogananth Subramanian, sahil/Sahil Shah, pradeep/Pradeep Surisetty and ravielluri/Naga Ravi Chaitanya Elluri. * [**#krkn on Kubernetes Slack**](https://kubernetes.slack.com/messages/C05SFMHRWK1) diff --git a/config/config.yaml b/config/config.yaml index 7c1d52c6..f26121de 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,6 +1,6 @@ kraken: distribution: kubernetes # Distribution can be kubernetes or openshift - kubeconfig_path: ~/.kube/config # Path to kubeconfig + kubeconfig_path: ~/.kube/config # Path to kubeconfig exit_on_failure: False # Exit when a post action scenario fails publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081 signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details @@ -8,43 +8,46 @@ kraken: port: 8081 # Signal port chaos_scenarios: # List of policies/chaos scenarios to load - - arcaflow_scenarios: - - scenarios/arcaflow/cpu-hog/input.yaml - - scenarios/arcaflow/memory-hog/input.yaml - - scenarios/arcaflow/io-hog/input.yaml - - application_outages: + - hog_scenarios: + - scenarios/kube/cpu-hog/input.yaml + - scenarios/kube/memory-hog/input.yaml + - scenarios/kube/io-hog/input.yaml + - scenarios/kube/io-hog/input.yaml + - application_outages_scenarios: - scenarios/openshift/app_outage.yaml - container_scenarios: # List of chaos pod scenarios to load - - - scenarios/openshift/container_etcd.yml - - plugin_scenarios: + - scenarios/openshift/container_etcd.yml + - pod_network_scenarios: + - scenarios/openshift/network_chaos_ingress.yml + - scenarios/openshift/pod_network_outage.yml + - pod_disruption_scenarios: - scenarios/openshift/etcd.yml - scenarios/openshift/regex_openshift_pod_kill.yml - - scenarios/openshift/vmware_node_scenarios.yml - - scenarios/openshift/network_chaos_ingress.yml - scenarios/openshift/prom_kill.yml - - node_scenarios: # List of chaos node scenarios to load - - scenarios/openshift/aws_node_scenarios.yml - - plugin_scenarios: - scenarios/openshift/openshift-apiserver.yml - scenarios/openshift/openshift-kube-apiserver.yml + - vmware_node_scenarios: + - scenarios/openshift/vmware_node_scenarios.yml + - ibmcloud_node_scenarios: + - scenarios/openshift/ibmcloud_node_scenarios.yml + - node_scenarios: # List of chaos node scenarios to load + - scenarios/openshift/aws_node_scenarios.yml - time_scenarios: # List of chaos time scenarios to load - scenarios/openshift/time_scenarios_example.yml - cluster_shut_down_scenarios: - - - scenarios/openshift/cluster_shut_down_scenario.yml - - scenarios/openshift/post_action_shut_down.py + - scenarios/openshift/cluster_shut_down_scenario.yml - service_disruption_scenarios: - - - scenarios/openshift/regex_namespace.yaml - - - scenarios/openshift/ingress_namespace.yaml - - scenarios/openshift/post_action_namespace.py - - zone_outages: + - scenarios/openshift/regex_namespace.yaml + - scenarios/openshift/ingress_namespace.yaml + - zone_outages_scenarios: - scenarios/openshift/zone_outage.yaml - pvc_scenarios: - scenarios/openshift/pvc_scenario.yaml - - network_chaos: + - network_chaos_scenarios: - 
scenarios/openshift/network_chaos.yaml - - service_hijacking: + - service_hijacking_scenarios: - scenarios/kube/service_hijacking.yaml - - syn_flood: + - syn_flood_scenarios: - scenarios/kube/syn_flood.yaml cerberus: diff --git a/config/config_kind.yaml b/config/config_kind.yaml index 495528c1..6c56b78d 100644 --- a/config/config_kind.yaml +++ b/config/config_kind.yaml @@ -6,7 +6,7 @@ kraken: publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081 signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details signal_address: 0.0.0.0 # Signal listening address - chaos_scenarios: # List of policies/chaos scenarios to load + chaos_scenarios: # List of policies/chaos scenarios to load - plugin_scenarios: - scenarios/kind/scheduler.yml - node_scenarios: diff --git a/config/config_kubernetes.yaml b/config/config_kubernetes.yaml index 13003edc..b2652acc 100644 --- a/config/config_kubernetes.yaml +++ b/config/config_kubernetes.yaml @@ -7,7 +7,7 @@ kraken: signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details chaos_scenarios: # List of policies/chaos scenarios to load - container_scenarios: # List of chaos pod scenarios to load - - - scenarios/kube/container_dns.yml + - scenarios/kube/container_dns.yml - plugin_scenarios: - scenarios/kube/scheduler.yml diff --git a/config/config_performance.yaml b/config/config_performance.yaml index ec22023b..a03c38ef 100644 --- a/config/config_performance.yaml +++ b/config/config_performance.yaml @@ -12,15 +12,14 @@ kraken: - scenarios/openshift/regex_openshift_pod_kill.yml - scenarios/openshift/prom_kill.yml - node_scenarios: # List of chaos node scenarios to load - - scenarios/openshift/node_scenarios_example.yml + - scenarios/openshift/node_scenarios_example.yml - plugin_scenarios: - scenarios/openshift/openshift-apiserver.yml - scenarios/openshift/openshift-kube-apiserver.yml - time_scenarios: # List of chaos time scenarios to load - scenarios/openshift/time_scenarios_example.yml - cluster_shut_down_scenarios: - - - scenarios/openshift/cluster_shut_down_scenario.yml - - scenarios/openshift/post_action_shut_down.py + - scenarios/openshift/cluster_shut_down_scenario.yml - service_disruption_scenarios: - scenarios/openshift/regex_namespace.yaml - scenarios/openshift/ingress_namespace.yaml diff --git a/docs/scenario_plugin_api.md b/docs/scenario_plugin_api.md new file mode 100644 index 00000000..42260dcf --- /dev/null +++ b/docs/scenario_plugin_api.md @@ -0,0 +1,136 @@ +# Scenario Plugin API: + +This API enables seamless integration of Scenario Plugins for Krkn. Plugins are automatically +detected and loaded by the plugin loader, provided they extend the `AbstractPluginScenario` +abstract class, implement the required methods, and adhere to the specified [naming conventions](#naming-conventions). + +## Plugin folder: + +The plugin loader automatically loads plugins found in the `krkn/scenario_plugins` directory, +relative to the Krkn root folder. Each plugin must reside in its own directory and can consist +of one or more Python files. The entry point for each plugin is a Python class that extends the +[AbstractPluginScenario](../krkn/scenario_plugins/abstract_scenario_plugin.py) abstract class and implements its required methods. 
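+For orientation, the tree below sketches where a hypothetical plugin named `example` would live relative to the Krkn root folder (the `example` names are purely illustrative; the exact rules are covered under [naming conventions](#naming-conventions)):
+
+```
+krkn/
+└── scenario_plugins/
+    ├── abstract_scenario_plugin.py
+    └── example/
+        └── example_scenario_plugin.py   # contains the ExampleScenarioPlugin class
+```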
+
+## `AbstractPluginScenario` abstract class:
+
+This [abstract class](../krkn/scenario_plugins/abstract_scenario_plugin.py) defines the contract between the plugin and Krkn.
+It consists of two methods:
+- `run(...)`
+- `get_scenario_types()`
+
+Most IDEs can automatically suggest and implement the abstract methods defined in `AbstractPluginScenario`:
+![pycharm](scenario_plugin_pycharm.gif)
+_(IntelliJ PyCharm)_
+
+### `run(...)`
+
+```python
+    def run(
+        self,
+        run_uuid: str,
+        scenario: str,
+        krkn_config: dict[str, any],
+        lib_telemetry: KrknTelemetryOpenshift,
+        scenario_telemetry: ScenarioTelemetry,
+    ) -> int:
+```
+
+This method is the entry point of the plugin and the first method that will be executed.
+
+#### Parameters:
+
+- `run_uuid`:
+  - the UUID of the chaos run, generated by Krkn for every single run
+- `scenario`:
+  - the config file of the scenario that is currently being executed
+- `krkn_config`:
+  - the full dictionary representation of the `config.yaml`
+- `lib_telemetry`:
+  - a composite object of all the [krkn-lib](https://krkn-chaos.github.io/krkn-lib-docs/modules.html) objects and methods needed by a Krkn plugin to run
+- `scenario_telemetry`:
+  - the `ScenarioTelemetry` object of the scenario that is currently being executed
+
+### Return value:
+Returns 0 if the scenario succeeds and 1 if it fails.
+> [!WARNING]
+> All exceptions must be handled __inside__ the `run` method and must not be propagated.
+
+### `get_scenario_types()`:
+
+```python
+    def get_scenario_types(self) -> list[str]:
+```
+
+Returns the scenario types handled by the plugin, as specified in the `config.yaml`. For the plugin to be properly
+loaded, recognized, and executed, this method must be implemented and must return one or more
+strings matching the `scenario_type` values set in the config.
+> [!WARNING]
+> Multiple strings can map to a *single* `ScenarioPlugin`, but the same string cannot map
+> to different plugins; an exception will be thrown for `scenario_type` redefinition.
+
+> [!Note]
+> The `scenario_type` strings must be unique across all plugins; otherwise, an exception will be thrown.
+
+## Naming conventions:
+A key requirement for a plugin to be properly loaded
+by the plugin loader is following the established naming conventions.
+These conventions are enforced to maintain a uniform and readable codebase,
+making it easier to onboard new developers from the community.
+
+### plugin folder:
+- the plugin folder must be placed in the `krkn/scenario_plugins` directory, starting from the Krkn root folder
+- the plugin folder __cannot__ contain the words
+  - `plugin`
+  - `scenario`
+
+### plugin file name and class name:
+- the plugin file containing the main plugin class must be named in _snake case_ and must have the suffix `_scenario_plugin`:
+  - `example_scenario_plugin.py`
+- the main plugin class must be named in _capital camel case_ and must have the suffix `ScenarioPlugin`:
+  - `ExampleScenarioPlugin`
+- the file name must match the class name in the respective syntax:
+  - `example_scenario_plugin.py` -> `ExampleScenarioPlugin`
+
+### scenario type:
+- the scenario type __must__ be unique across all scenarios.
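+To make the contract and naming rules above concrete, here is a minimal, hypothetical plugin sketch. The import path and name of the abstract class are assumed from the file linked above (`krkn/scenario_plugins/abstract_scenario_plugin.py`) and should be checked against that file; the `ExampleScenarioPlugin` name and the `example_scenarios` type string are purely illustrative:
+
+```python
+# krkn/scenario_plugins/example/example_scenario_plugin.py (hypothetical)
+import logging
+
+from krkn_lib.models.telemetry import ScenarioTelemetry
+from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
+
+# assumed import path/name; see abstract_scenario_plugin.py for the real ones
+from krkn.scenario_plugins.abstract_scenario_plugin import AbstractPluginScenario
+
+
+class ExampleScenarioPlugin(AbstractPluginScenario):
+    def run(
+        self,
+        run_uuid: str,
+        scenario: str,
+        krkn_config: dict[str, any],
+        lib_telemetry: KrknTelemetryOpenshift,
+        scenario_telemetry: ScenarioTelemetry,
+    ) -> int:
+        try:
+            # load the scenario config file and inject the chaos here,
+            # using lib_telemetry.kubecli for cluster access
+            logging.info("running example scenario %s (run %s)", scenario, run_uuid)
+            return 0
+        except Exception:
+            # exceptions must be handled here and never propagated
+            logging.exception("example scenario %s failed", scenario)
+            return 1
+
+    def get_scenario_types(self) -> list[str]:
+        # one or more unique strings matching scenario_type entries in config.yaml
+        return ["example_scenarios"]
+```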
+
+### logging:
+The plugin factory logs how the scenario types in the `config.yaml` are mapped to plugin classes. If your new scenario plugin
+does not adhere to the naming conventions, an error log is also generated in the Krkn standard output, providing details about the issue:
+
+```commandline
+2024-10-03 18:06:31,136 [INFO] 📣 `ScenarioPluginFactory`: types from config.yaml mapped to respective classes for execution:
+2024-10-03 18:06:31,136 [INFO] ✅ type: application_outages_scenarios ➡️ `ApplicationOutageScenarioPlugin`
+2024-10-03 18:06:31,136 [INFO] ✅ types: [hog_scenarios, arcaflow_scenario] ➡️ `ArcaflowScenarioPlugin`
+2024-10-03 18:06:31,136 [INFO] ✅ type: container_scenarios ➡️ `ContainerScenarioPlugin`
+2024-10-03 18:06:31,136 [INFO] ✅ type: managedcluster_scenarios ➡️ `ManagedClusterScenarioPlugin`
+2024-10-03 18:06:31,137 [INFO] ✅ types: [pod_disruption_scenarios, pod_network_scenario, vmware_node_scenarios, ibmcloud_node_scenarios] ➡️ `NativeScenarioPlugin`
+2024-10-03 18:06:31,137 [INFO] ✅ type: network_chaos_scenarios ➡️ `NetworkChaosScenarioPlugin`
+2024-10-03 18:06:31,137 [INFO] ✅ type: node_scenarios ➡️ `NodeActionsScenarioPlugin`
+2024-10-03 18:06:31,137 [INFO] ✅ type: pvc_scenarios ➡️ `PvcScenarioPlugin`
+2024-10-03 18:06:31,137 [INFO] ✅ type: service_disruption_scenarios ➡️ `ServiceDisruptionScenarioPlugin`
+2024-10-03 18:06:31,137 [INFO] ✅ type: service_hijacking_scenarios ➡️ `ServiceHijackingScenarioPlugin`
+2024-10-03 18:06:31,137 [INFO] ✅ type: cluster_shut_down_scenarios ➡️ `ShutDownScenarioPlugin`
+2024-10-03 18:06:31,137 [INFO] ✅ type: syn_flood_scenarios ➡️ `SynFloodScenarioPlugin`
+2024-10-03 18:06:31,137 [INFO] ✅ type: time_scenarios ➡️ `TimeActionsScenarioPlugin`
+2024-10-03 18:06:31,137 [INFO] ✅ type: zone_outages_scenarios ➡️ `ZoneOutageScenarioPlugin`
+
+2024-09-18 14:48:41,735 [INFO] Failed to load Scenario Plugins:
+
+2024-09-18 14:48:41,735 [ERROR] ⛔ Class: ExamplePluginScenario Module: krkn.scenario_plugins.example.example_scenario_plugin
+2024-09-18 14:48:41,735 [ERROR] ⚠️ scenario plugin class name must start with a capital letter, end with `ScenarioPlugin`, and cannot be just `ScenarioPlugin`.
+```
+
+>[!NOTE]
+>If you're trying to understand how the scenario types in the `config.yaml` are mapped to
+> their corresponding plugins, this log will guide you!
+> Each scenario plugin class mentioned can be found in the `krkn/scenario_plugins` folder:
+> simply convert the camel case class name to snake case and remove the `ScenarioPlugin` suffix,
+> e.g. the `ShutDownScenarioPlugin` class can be found in the `krkn/scenario_plugins/shut_down` folder.
+
+## ExampleScenarioPlugin
+The [ExampleScenarioPlugin](../krkn/tests/test_classes/example_scenario_plugin.py) class included in the tests folder can be used as scaffolding for new plugins and is considered
+part of the documentation.
+
+For any questions or further guidance, feel free to reach out to us on the
+[Kubernetes workspace](https://kubernetes.slack.com/) in the `#krkn` channel.
+We’re happy to assist.
Now, __release the Krkn!__ + diff --git a/docs/scenario_plugin_pycharm.gif b/docs/scenario_plugin_pycharm.gif new file mode 100644 index 00000000..60221be4 Binary files /dev/null and b/docs/scenario_plugin_pycharm.gif differ diff --git a/kraken/application_outage/actions.py b/kraken/application_outage/actions.py deleted file mode 100644 index 0bd35a3c..00000000 --- a/kraken/application_outage/actions.py +++ /dev/null @@ -1,100 +0,0 @@ -import yaml -import logging -import time - -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift - -import kraken.cerberus.setup as cerberus -from jinja2 import Template -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import get_yaml_item_value, log_exception - -from kraken import utils - - -# Reads the scenario config, applies and deletes a network policy to -# block the traffic for the specified duration -def run(scenarios_list, - config, - wait_duration, - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str) -> (list[str], list[ScenarioTelemetry]): - failed_post_scenarios = "" - scenario_telemetries: list[ScenarioTelemetry] = [] - failed_scenarios = [] - for app_outage_config in scenarios_list: - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = app_outage_config - scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, app_outage_config) - if len(app_outage_config) > 1: - try: - with open(app_outage_config, "r") as f: - app_outage_config_yaml = yaml.full_load(f) - scenario_config = app_outage_config_yaml["application_outage"] - pod_selector = get_yaml_item_value( - scenario_config, "pod_selector", "{}" - ) - traffic_type = get_yaml_item_value( - scenario_config, "block", "[Ingress, Egress]" - ) - namespace = get_yaml_item_value( - scenario_config, "namespace", "" - ) - duration = get_yaml_item_value( - scenario_config, "duration", 60 - ) - - start_time = int(time.time()) - - network_policy_template = """--- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: kraken-deny -spec: - podSelector: - matchLabels: {{ pod_selector }} - policyTypes: {{ traffic_type }} -""" - t = Template(network_policy_template) - rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type) - yaml_spec = yaml.safe_load(rendered_spec) - # Block the traffic by creating network policy - logging.info("Creating the network policy") - - telemetry.kubecli.create_net_policy(yaml_spec, namespace) - - # wait for the specified duration - logging.info("Waiting for the specified duration in the config: %s" % (duration)) - time.sleep(duration) - - # unblock the traffic by deleting the network policy - logging.info("Deleting the network policy") - telemetry.kubecli.delete_net_policy("kraken-deny", namespace) - - logging.info("End of scenario. 
Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - - end_time = int(time.time()) - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - except Exception as e : - scenario_telemetry.exit_status = 1 - failed_scenarios.append(app_outage_config) - log_exception(app_outage_config) - else: - scenario_telemetry.exit_status = 0 - scenario_telemetry.end_timestamp = time.time() - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - scenario_telemetries.append(scenario_telemetry) - return failed_scenarios, scenario_telemetries - diff --git a/kraken/arcaflow_plugin/__init__.py b/kraken/arcaflow_plugin/__init__.py deleted file mode 100644 index 9438d945..00000000 --- a/kraken/arcaflow_plugin/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .arcaflow_plugin import * -from .context_auth import ContextAuth diff --git a/kraken/arcaflow_plugin/arcaflow_plugin.py b/kraken/arcaflow_plugin/arcaflow_plugin.py deleted file mode 100644 index 5cd11da8..00000000 --- a/kraken/arcaflow_plugin/arcaflow_plugin.py +++ /dev/null @@ -1,204 +0,0 @@ -import time -import arcaflow -import os -import yaml -import logging -from pathlib import Path -from typing import List - -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift - -from .context_auth import ContextAuth -from krkn_lib.models.telemetry import ScenarioTelemetry - -from .. import utils - - -def run(scenarios_list: List[str], - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str - ) -> (list[str], list[ScenarioTelemetry]): - scenario_telemetries: list[ScenarioTelemetry] = [] - failed_post_scenarios = [] - for scenario in scenarios_list: - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = scenario - start_time = time.time() - scenario_telemetry.start_timestamp = start_time - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, scenario) - engine_args = build_args(scenario) - status_code = run_workflow(engine_args, telemetry.kubecli.get_kubeconfig_path()) - end_time = time.time() - scenario_telemetry.end_timestamp = end_time - scenario_telemetry.exit_status = status_code - - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(start_time), - int(end_time)) - - # this is the design proposal for the namespaced logs collection - # check the krkn-lib latest commit to follow also the changes made here - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(start_time), - int(end_time)) - - scenario_telemetries.append(scenario_telemetry) - if status_code != 0: - failed_post_scenarios.append(scenario) - return failed_post_scenarios, scenario_telemetries - - -def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str) -> int: - set_arca_kubeconfig(engine_args, kubeconfig_path) - exit_status = arcaflow.run(engine_args) - return exit_status - - -def build_args(input_file: str) -> arcaflow.EngineArgs: - """sets the kubeconfig parsed by setArcaKubeConfig as an input to the arcaflow workflow""" - current_path = Path().resolve() - context = f"{current_path}/{Path(input_file).parent}" - workflow = f"{context}/workflow.yaml" - config = 
f"{context}/config.yaml" - if not os.path.exists(context): - raise Exception( - "context folder for arcaflow workflow not found: {}".format( - context) - ) - if not os.path.exists(input_file): - raise Exception( - "input file for arcaflow workflow not found: {}".format(input_file)) - if not os.path.exists(workflow): - raise Exception( - "workflow file for arcaflow workflow not found: {}".format( - workflow) - ) - if not os.path.exists(config): - raise Exception( - "configuration file for arcaflow workflow not found: {}".format( - config) - ) - - engine_args = arcaflow.EngineArgs() - engine_args.context = context - engine_args.config = config - engine_args.workflow = workflow - engine_args.input = f"{current_path}/{input_file}" - return engine_args - - -def set_arca_kubeconfig(engine_args: arcaflow.EngineArgs, kubeconfig_path: str): - - context_auth = ContextAuth() - if not os.path.exists(kubeconfig_path): - raise Exception("kubeconfig not found in {}".format(kubeconfig_path)) - - with open(kubeconfig_path, "r") as stream: - try: - kubeconfig = yaml.safe_load(stream) - context_auth.fetch_auth_data(kubeconfig) - except Exception as e: - logging.error("impossible to read kubeconfig file in: {}".format( - kubeconfig_path)) - raise e - - kubeconfig_str = set_kubeconfig_auth(kubeconfig, context_auth) - - with open(engine_args.input, "r") as stream: - input_file = yaml.safe_load(stream) - if "input_list" in input_file and isinstance(input_file["input_list"],list): - for index, _ in enumerate(input_file["input_list"]): - if isinstance(input_file["input_list"][index], dict): - input_file["input_list"][index]["kubeconfig"] = kubeconfig_str - else: - input_file["kubeconfig"] = kubeconfig_str - stream.close() - with open(engine_args.input, "w") as stream: - yaml.safe_dump(input_file, stream) - - with open(engine_args.config, "r") as stream: - config_file = yaml.safe_load(stream) - if config_file["deployers"]["image"]["deployer_name"] == "kubernetes": - kube_connection = set_kubernetes_deployer_auth(config_file["deployers"]["image"]["connection"], context_auth) - config_file["deployers"]["image"]["connection"]=kube_connection - with open(engine_args.config, "w") as stream: - yaml.safe_dump(config_file, stream,explicit_start=True, width=4096) - - -def set_kubernetes_deployer_auth(deployer: any, context_auth: ContextAuth) -> any: - if context_auth.clusterHost is not None : - deployer["host"] = context_auth.clusterHost - if context_auth.clientCertificateData is not None : - deployer["cert"] = context_auth.clientCertificateData - if context_auth.clientKeyData is not None: - deployer["key"] = context_auth.clientKeyData - if context_auth.clusterCertificateData is not None: - deployer["cacert"] = context_auth.clusterCertificateData - if context_auth.username is not None: - deployer["username"] = context_auth.username - if context_auth.password is not None: - deployer["password"] = context_auth.password - if context_auth.bearerToken is not None: - deployer["bearerToken"] = context_auth.bearerToken - return deployer - - -def set_kubeconfig_auth(kubeconfig: any, context_auth: ContextAuth) -> str: - """ - Builds an arcaflow-compatible kubeconfig representation and returns it as a string. - In order to run arcaflow plugins in kubernetes/openshift the kubeconfig must contain client certificate/key - and server certificate base64 encoded within the kubeconfig file itself in *-data fields. 
That is not always the - case, infact kubeconfig may contain filesystem paths to those files, this function builds an arcaflow-compatible - kubeconfig file and returns it as a string that can be safely included in input.yaml - """ - - if "current-context" not in kubeconfig.keys(): - raise Exception( - "invalid kubeconfig file, impossible to determine current-context" - ) - user_id = None - cluster_id = None - user_name = None - cluster_name = None - current_context = kubeconfig["current-context"] - for context in kubeconfig["contexts"]: - if context["name"] == current_context: - user_name = context["context"]["user"] - cluster_name = context["context"]["cluster"] - if user_name is None: - raise Exception( - "user not set for context {} in kubeconfig file".format(current_context) - ) - if cluster_name is None: - raise Exception( - "cluster not set for context {} in kubeconfig file".format(current_context) - ) - - for index, user in enumerate(kubeconfig["users"]): - if user["name"] == user_name: - user_id = index - for index, cluster in enumerate(kubeconfig["clusters"]): - if cluster["name"] == cluster_name: - cluster_id = index - - if cluster_id is None: - raise Exception( - "no cluster {} found in kubeconfig users".format(cluster_name) - ) - if "client-certificate" in kubeconfig["users"][user_id]["user"]: - kubeconfig["users"][user_id]["user"]["client-certificate-data"] = context_auth.clientCertificateDataBase64 - del kubeconfig["users"][user_id]["user"]["client-certificate"] - - if "client-key" in kubeconfig["users"][user_id]["user"]: - kubeconfig["users"][user_id]["user"]["client-key-data"] = context_auth.clientKeyDataBase64 - del kubeconfig["users"][user_id]["user"]["client-key"] - - if "certificate-authority" in kubeconfig["clusters"][cluster_id]["cluster"]: - kubeconfig["clusters"][cluster_id]["cluster"]["certificate-authority-data"] = context_auth.clusterCertificateDataBase64 - del kubeconfig["clusters"][cluster_id]["cluster"]["certificate-authority"] - kubeconfig_str = yaml.dump(kubeconfig) - return kubeconfig_str diff --git a/kraken/chaos_recommender/prometheus.py b/kraken/chaos_recommender/prometheus.py deleted file mode 100644 index 723b6d5e..00000000 --- a/kraken/chaos_recommender/prometheus.py +++ /dev/null @@ -1,144 +0,0 @@ -import logging - -from prometheus_api_client import PrometheusConnect -import pandas as pd -import urllib3 - - -saved_metrics_path = "./utilisation.txt" - - -def convert_data_to_dataframe(data, label): - df = pd.DataFrame() - df['service'] = [item['metric']['pod'] for item in data] - df[label] = [item['value'][1] for item in data] - - return df - - -def convert_data(data, service): - result = {} - for entry in data: - pod_name = entry['metric']['pod'] - value = entry['value'][1] - result[pod_name] = value - return result.get(service) # for those pods whose limits are not defined they can take as much resources, there assigning a very high value - - -def convert_data_limits(data, node_data, service, prometheus): - result = {} - for entry in data: - pod_name = entry['metric']['pod'] - value = entry['value'][1] - result[pod_name] = value - return result.get(service, get_node_capacity(node_data, service, prometheus)) # for those pods whose limits are not defined they can take as much resources, there assigning a very high value - -def get_node_capacity(node_data, pod_name, prometheus ): - - # Get the node name on which the pod is running - query = f'kube_pod_info{{pod="{pod_name}"}}' - result = prometheus.custom_query(query) - if not result: - return None - - 
node_name = result[0]['metric']['node'] - - for item in node_data: - if item['metric']['node'] == node_name: - return item['value'][1] - - return '1000000000' - - -def save_utilization_to_file(utilization, filename, prometheus): - - merged_df = pd.DataFrame(columns=['namespace', 'service', 'CPU', 'CPU_LIMITS', 'MEM', 'MEM_LIMITS', 'NETWORK']) - for namespace in utilization: - # Loading utilization_data[] for namespace - # indexes -- 0 CPU, 1 CPU limits, 2 mem, 3 mem limits, 4 network - utilization_data = utilization[namespace] - df_cpu = convert_data_to_dataframe(utilization_data[0], "CPU") - services = df_cpu.service.unique() - logging.info(f"Services for namespace {namespace}: {services}") - - for s in services: - - new_row_df = pd.DataFrame({ - "namespace": namespace, "service": s, - "CPU": convert_data(utilization_data[0], s), - "CPU_LIMITS": convert_data_limits(utilization_data[1],utilization_data[5], s, prometheus), - "MEM": convert_data(utilization_data[2], s), - "MEM_LIMITS": convert_data_limits(utilization_data[3], utilization_data[6], s, prometheus), - "NETWORK": convert_data(utilization_data[4], s)}, index=[0]) - merged_df = pd.concat([merged_df, new_row_df], ignore_index=True) - - # Convert columns to string - merged_df['CPU'] = merged_df['CPU'].astype(str) - merged_df['MEM'] = merged_df['MEM'].astype(str) - merged_df['CPU_LIMITS'] = merged_df['CPU_LIMITS'].astype(str) - merged_df['MEM_LIMITS'] = merged_df['MEM_LIMITS'].astype(str) - merged_df['NETWORK'] = merged_df['NETWORK'].astype(str) - - # Extract integer part before the decimal point - #merged_df['CPU'] = merged_df['CPU'].str.split('.').str[0] - #merged_df['MEM'] = merged_df['MEM'].str.split('.').str[0] - #merged_df['CPU_LIMITS'] = merged_df['CPU_LIMITS'].str.split('.').str[0] - #merged_df['MEM_LIMITS'] = merged_df['MEM_LIMITS'].str.split('.').str[0] - #merged_df['NETWORK'] = merged_df['NETWORK'].str.split('.').str[0] - - merged_df.to_csv(filename, sep='\t', index=False) - - -def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, - namespaces, scrape_duration): - urllib3.disable_warnings() - prometheus = PrometheusConnect(url=prometheus_endpoint, headers={ - 'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True) - - # Dicts for saving utilisation and queries -- key is namespace - utilization = {} - queries = {} - - logging.info("Fetching utilization...") - for namespace in namespaces: - - # Fetch CPU utilization - cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration) - cpu_result = prometheus.custom_query(cpu_query) - - cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace) - cpu_limits_result = prometheus.custom_query(cpu_limits_query) - - node_cpu_limits_query = 'kube_node_status_capacity{resource="cpu", unit="core"}*1000' - node_cpu_limits_result = prometheus.custom_query(node_cpu_limits_query) - - mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration) - mem_result = prometheus.custom_query(mem_query) - - mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace) - mem_limits_result = prometheus.custom_query(mem_limits_query) - - node_mem_limits_query = 'kube_node_status_capacity{resource="memory", unit="byte"}' - node_mem_limits_result = prometheus.custom_query(node_mem_limits_query) - - network_query = 
'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \ - (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration) - network_result = prometheus.custom_query(network_query) - - utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, node_cpu_limits_result, node_mem_limits_result ] - queries[namespace] = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query) - - save_utilization_to_file(utilization, saved_metrics_path, prometheus) - - return saved_metrics_path, queries - - -def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query): - queries = { - "cpu_query": cpu_query, - "cpu_limit_query": cpu_limits_query, - "memory_query": mem_query, - "memory_limit_query": mem_limits_query, - "network_query": network_query - } - return queries diff --git a/kraken/managedcluster_scenarios/manifestwork.j2 b/kraken/managedcluster_scenarios/manifestwork.j2 deleted file mode 100644 index 0d66e47f..00000000 --- a/kraken/managedcluster_scenarios/manifestwork.j2 +++ /dev/null @@ -1,68 +0,0 @@ -apiVersion: work.open-cluster-management.io/v1 -kind: ManifestWork -metadata: - namespace: {{managedcluster_name}} - name: managedcluster-scenarios-template -spec: - workload: - manifests: - - apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRole - metadata: - name: scale-deploy - namespace: open-cluster-management - rules: - - apiGroups: ["apps"] - resources: ["deployments/scale"] - verbs: ["patch"] - - apiGroups: ["apps"] - resources: ["deployments"] - verbs: ["get"] - - apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - name: scale-deploy-to-sa - namespace: open-cluster-management - subjects: - - kind: ServiceAccount - name: internal-kubectl - namespace: open-cluster-management - roleRef: - kind: ClusterRole - name: scale-deploy - apiGroup: rbac.authorization.k8s.io - - apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - name: scale-deploy-to-sa - namespace: open-cluster-management-agent - subjects: - - kind: ServiceAccount - name: internal-kubectl - namespace: open-cluster-management - roleRef: - kind: ClusterRole - name: scale-deploy - apiGroup: rbac.authorization.k8s.io - - apiVersion: v1 - kind: ServiceAccount - metadata: - name: internal-kubectl - namespace: open-cluster-management - - apiVersion: batch/v1 - kind: Job - metadata: - name: managedcluster-scenarios-template - namespace: open-cluster-management - spec: - template: - spec: - serviceAccountName: internal-kubectl - containers: - - name: kubectl - image: quay.io/sighup/kubectl-kustomize:1.21.6_3.9.1 - command: ["/bin/sh", "-c"] - args: - - {{args}} - restartPolicy: Never - backoffLimit: 0 \ No newline at end of file diff --git a/kraken/managedcluster_scenarios/run.py b/kraken/managedcluster_scenarios/run.py deleted file mode 100644 index eb6a4f1d..00000000 --- a/kraken/managedcluster_scenarios/run.py +++ /dev/null @@ -1,78 +0,0 @@ -import yaml -import logging -import time -from kraken.managedcluster_scenarios.managedcluster_scenarios import managedcluster_scenarios -import kraken.managedcluster_scenarios.common_managedcluster_functions as common_managedcluster_functions -import kraken.cerberus.setup as cerberus -from krkn_lib.k8s import KrknKubernetes -from krkn_lib.utils.functions import get_yaml_item_value - -# Get the managedcluster scenarios object of specfied cloud type 
-# krkn_lib -def get_managedcluster_scenario_object(managedcluster_scenario, kubecli: KrknKubernetes): - return managedcluster_scenarios(kubecli) - -# Run defined scenarios -# krkn_lib -def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes): - for managedcluster_scenario_config in scenarios_list: - with open(managedcluster_scenario_config, "r") as f: - managedcluster_scenario_config = yaml.full_load(f) - for managedcluster_scenario in managedcluster_scenario_config["managedcluster_scenarios"]: - managedcluster_scenario_object = get_managedcluster_scenario_object(managedcluster_scenario, kubecli) - if managedcluster_scenario["actions"]: - for action in managedcluster_scenario["actions"]: - start_time = int(time.time()) - inject_managedcluster_scenario(action, managedcluster_scenario, managedcluster_scenario_object, kubecli) - logging.info("Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - end_time = int(time.time()) - cerberus.get_status(config, start_time, end_time) - logging.info("") - - -# Inject the specified managedcluster scenario -# krkn_lib -def inject_managedcluster_scenario(action, managedcluster_scenario, managedcluster_scenario_object, kubecli: KrknKubernetes): - # Get the managedcluster scenario configurations - run_kill_count = get_yaml_item_value( - managedcluster_scenario, "runs", 1 - ) - instance_kill_count = get_yaml_item_value( - managedcluster_scenario, "instance_count", 1 - ) - managedcluster_name = get_yaml_item_value( - managedcluster_scenario, "managedcluster_name", "" - ) - label_selector = get_yaml_item_value( - managedcluster_scenario, "label_selector", "" - ) - timeout = get_yaml_item_value(managedcluster_scenario, "timeout", 120) - # Get the managedcluster to apply the scenario - if managedcluster_name: - managedcluster_name_list = managedcluster_name.split(",") - else: - managedcluster_name_list = [managedcluster_name] - for single_managedcluster_name in managedcluster_name_list: - managedclusters = common_managedcluster_functions.get_managedcluster(single_managedcluster_name, label_selector, instance_kill_count, kubecli) - for single_managedcluster in managedclusters: - if action == "managedcluster_start_scenario": - managedcluster_scenario_object.managedcluster_start_scenario(run_kill_count, single_managedcluster, timeout) - elif action == "managedcluster_stop_scenario": - managedcluster_scenario_object.managedcluster_stop_scenario(run_kill_count, single_managedcluster, timeout) - elif action == "managedcluster_stop_start_scenario": - managedcluster_scenario_object.managedcluster_stop_start_scenario(run_kill_count, single_managedcluster, timeout) - elif action == "managedcluster_termination_scenario": - managedcluster_scenario_object.managedcluster_termination_scenario(run_kill_count, single_managedcluster, timeout) - elif action == "managedcluster_reboot_scenario": - managedcluster_scenario_object.managedcluster_reboot_scenario(run_kill_count, single_managedcluster, timeout) - elif action == "stop_start_klusterlet_scenario": - managedcluster_scenario_object.stop_start_klusterlet_scenario(run_kill_count, single_managedcluster, timeout) - elif action == "start_klusterlet_scenario": - managedcluster_scenario_object.stop_klusterlet_scenario(run_kill_count, single_managedcluster, timeout) - elif action == "stop_klusterlet_scenario": - managedcluster_scenario_object.stop_klusterlet_scenario(run_kill_count, single_managedcluster, timeout) - elif action == "managedcluster_crash_scenario": - 
managedcluster_scenario_object.managedcluster_crash_scenario(run_kill_count, single_managedcluster, timeout) - else: - logging.info("There is no managedcluster action that matches %s, skipping scenario" % action) diff --git a/kraken/network_chaos/actions.py b/kraken/network_chaos/actions.py deleted file mode 100644 index 1fc851d1..00000000 --- a/kraken/network_chaos/actions.py +++ /dev/null @@ -1,228 +0,0 @@ -import yaml -import logging -import time -import os -import random - -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift - -import kraken.cerberus.setup as cerberus -import kraken.node_actions.common_node_functions as common_node_functions -from jinja2 import Environment, FileSystemLoader -from krkn_lib.k8s import KrknKubernetes -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import get_yaml_item_value, log_exception - -from kraken import utils - - -# krkn_lib -# Reads the scenario config and introduces traffic variations in Node's host network interface. -def run(scenarios_list, - config, - wait_duration, - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str) -> (list[str], list[ScenarioTelemetry]): - logging.info("Runing the Network Chaos tests") - failed_post_scenarios = "" - scenario_telemetries: list[ScenarioTelemetry] = [] - failed_scenarios = [] - for net_config in scenarios_list: - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = net_config - scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, net_config) - try: - with open(net_config, "r") as file: - param_lst = ["latency", "loss", "bandwidth"] - test_config = yaml.safe_load(file) - test_dict = test_config["network_chaos"] - test_duration = int( - get_yaml_item_value(test_dict, "duration", 300) - ) - test_interface = get_yaml_item_value( - test_dict, "interfaces", [] - ) - test_node = get_yaml_item_value(test_dict, "node_name", "") - test_node_label = get_yaml_item_value( - test_dict, "label_selector", - "node-role.kubernetes.io/master" - ) - test_execution = get_yaml_item_value( - test_dict, "execution", "serial" - ) - test_instance_count = get_yaml_item_value( - test_dict, "instance_count", 1 - ) - test_egress = get_yaml_item_value( - test_dict, "egress", {"bandwidth": "100mbit"} - ) - if test_node: - node_name_list = test_node.split(",") - else: - node_name_list = [test_node] - nodelst = [] - for single_node_name in node_name_list: - nodelst.extend(common_node_functions.get_node(single_node_name, test_node_label, test_instance_count, telemetry.kubecli)) - file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__))) - env = Environment(loader=file_loader, autoescape=True) - pod_template = env.get_template("pod.j2") - test_interface = verify_interface(test_interface, nodelst, pod_template, telemetry.kubecli) - joblst = [] - egress_lst = [i for i in param_lst if i in test_egress] - chaos_config = { - "network_chaos": { - "duration": test_duration, - "interfaces": test_interface, - "node_name": ",".join(nodelst), - "execution": test_execution, - "instance_count": test_instance_count, - "egress": test_egress, - } - } - logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config)) - job_template = env.get_template("job.j2") - try: - for i in egress_lst: - for node in nodelst: - exec_cmd = get_egress_cmd( - test_execution, test_interface, i, test_dict["egress"], duration=test_duration - ) - logging.info("Executing %s on node %s" % 
(exec_cmd, node)) - job_body = yaml.safe_load( - job_template.render(jobname=i + str(hash(node))[:5], nodename=node, cmd=exec_cmd) - ) - joblst.append(job_body["metadata"]["name"]) - api_response = telemetry.kubecli.create_job(job_body) - if api_response is None: - raise Exception("Error creating job") - if test_execution == "serial": - logging.info("Waiting for serial job to finish") - start_time = int(time.time()) - wait_for_job(joblst[:], telemetry.kubecli, test_duration + 300) - logging.info("Waiting for wait_duration %s" % wait_duration) - time.sleep(wait_duration) - end_time = int(time.time()) - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - if test_execution == "parallel": - break - if test_execution == "parallel": - logging.info("Waiting for parallel job to finish") - start_time = int(time.time()) - wait_for_job(joblst[:], telemetry.kubecli, test_duration + 300) - logging.info("Waiting for wait_duration %s" % wait_duration) - time.sleep(wait_duration) - end_time = int(time.time()) - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - except Exception as e: - logging.error("Network Chaos exiting due to Exception %s" % e) - raise RuntimeError() - finally: - logging.info("Deleting jobs") - delete_job(joblst[:], telemetry.kubecli) - except (RuntimeError, Exception): - scenario_telemetry.exit_status = 1 - failed_scenarios.append(net_config) - log_exception(net_config) - else: - scenario_telemetry.exit_status = 0 - scenario_telemetry.end_timestamp = time.time() - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - scenario_telemetries.append(scenario_telemetry) - return failed_scenarios, scenario_telemetries - - -# krkn_lib -def verify_interface(test_interface, nodelst, template, kubecli: KrknKubernetes): - pod_index = random.randint(0, len(nodelst) - 1) - pod_body = yaml.safe_load(template.render(nodename=nodelst[pod_index])) - logging.info("Creating pod to query interface on node %s" % nodelst[pod_index]) - kubecli.create_pod(pod_body, "default", 300) - try: - if test_interface == []: - cmd = "ip r | grep default | awk '/default/ {print $5}'" - output = kubecli.exec_cmd_in_pod(cmd, "fedtools", "default") - test_interface = [output.replace("\n", "")] - else: - cmd = "ip -br addr show|awk -v ORS=',' '{print $1}'" - output = kubecli.exec_cmd_in_pod(cmd, "fedtools", "default") - interface_lst = output[:-1].split(",") - for interface in test_interface: - if interface not in interface_lst: - logging.error("Interface %s not found in node %s interface list %s" % (interface, nodelst[pod_index], interface_lst)) - #sys.exit(1) - raise RuntimeError() - return test_interface - finally: - logging.info("Deleteing pod to query interface on node") - kubecli.delete_pod("fedtools", "default") - - -# krkn_lib -def get_job_pods(api_response, kubecli: KrknKubernetes): - controllerUid = api_response.metadata.labels["controller-uid"] - pod_label_selector = "controller-uid=" + controllerUid - pods_list = kubecli.list_pods(label_selector=pod_label_selector, namespace="default") - return pods_list[0] - - -# krkn_lib -def wait_for_job(joblst, kubecli: KrknKubernetes, timeout=300): - waittime = time.time() + timeout - count = 0 - joblen = 
len(joblst) - while count != joblen: - for jobname in joblst: - try: - api_response = kubecli.get_job_status(jobname, namespace="default") - if api_response.status.succeeded is not None or api_response.status.failed is not None: - count += 1 - joblst.remove(jobname) - except Exception: - logging.warning("Exception in getting job status") - if time.time() > waittime: - raise Exception("Starting pod failed") - time.sleep(5) - - -# krkn_lib -def delete_job(joblst, kubecli: KrknKubernetes): - for jobname in joblst: - try: - api_response = kubecli.get_job_status(jobname, namespace="default") - if api_response.status.failed is not None: - pod_name = get_job_pods(api_response, kubecli) - pod_stat = kubecli.read_pod(name=pod_name, namespace="default") - logging.error(pod_stat.status.container_statuses) - pod_log_response = kubecli.get_pod_log(name=pod_name, namespace="default") - pod_log = pod_log_response.data.decode("utf-8") - logging.error(pod_log) - except Exception: - logging.warning("Exception in getting job status") - kubecli.delete_job(name=jobname, namespace="default") - - -def get_egress_cmd(execution, test_interface, mod, vallst, duration=30): - tc_set = tc_unset = tc_ls = "" - param_map = {"latency": "delay", "loss": "loss", "bandwidth": "rate"} - for i in test_interface: - tc_set = "{0} tc qdisc add dev {1} root netem".format(tc_set, i) - tc_unset = "{0} tc qdisc del dev {1} root ;".format(tc_unset, i) - tc_ls = "{0} tc qdisc ls dev {1} ;".format(tc_ls, i) - if execution == "parallel": - for val in vallst.keys(): - tc_set += " {0} {1} ".format(param_map[val], vallst[val]) - tc_set += ";" - else: - tc_set += " {0} {1} ;".format(param_map[mod], vallst[mod]) - exec_cmd = "{0} {1} sleep {2};{3} sleep 20;{4}".format(tc_set, tc_ls, duration, tc_unset, tc_ls) - return exec_cmd diff --git a/kraken/node_actions/run.py b/kraken/node_actions/run.py deleted file mode 100644 index 50214dd9..00000000 --- a/kraken/node_actions/run.py +++ /dev/null @@ -1,174 +0,0 @@ -import yaml -import logging -import sys -import time - -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift - -from kraken import utils -from kraken.node_actions.aws_node_scenarios import aws_node_scenarios -from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios -from kraken.node_actions.az_node_scenarios import azure_node_scenarios -from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios -from kraken.node_actions.openstack_node_scenarios import openstack_node_scenarios -from kraken.node_actions.alibaba_node_scenarios import alibaba_node_scenarios -from kraken.node_actions.bm_node_scenarios import bm_node_scenarios -from kraken.node_actions.docker_node_scenarios import docker_node_scenarios -import kraken.node_actions.common_node_functions as common_node_functions -import kraken.cerberus.setup as cerberus -from krkn_lib.k8s import KrknKubernetes -from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import get_yaml_item_value, log_exception - -node_general = False - - -# Get the node scenarios object of specfied cloud type -# krkn_lib -def get_node_scenario_object(node_scenario, kubecli: KrknKubernetes): - if "cloud_type" not in node_scenario.keys() or node_scenario["cloud_type"] == "generic": - global node_general - node_general = True - return general_node_scenarios(kubecli) - if node_scenario["cloud_type"] == "aws": - return aws_node_scenarios(kubecli) - elif node_scenario["cloud_type"] == 
"gcp": - return gcp_node_scenarios(kubecli) - elif node_scenario["cloud_type"] == "openstack": - return openstack_node_scenarios(kubecli) - elif node_scenario["cloud_type"] == "azure" or node_scenario["cloud_type"] == "az": - return azure_node_scenarios(kubecli) - elif node_scenario["cloud_type"] == "alibaba" or node_scenario["cloud_type"] == "alicloud": - return alibaba_node_scenarios(kubecli) - elif node_scenario["cloud_type"] == "bm": - return bm_node_scenarios( - node_scenario.get("bmc_info"), node_scenario.get("bmc_user", None), node_scenario.get("bmc_password", None), - kubecli - ) - elif node_scenario["cloud_type"] == "docker": - return docker_node_scenarios(kubecli) - else: - logging.error( - "Cloud type " + node_scenario["cloud_type"] + " is not currently supported; " - "try using 'generic' if wanting to stop/start kubelet or fork bomb on any " - "cluster" - ) - sys.exit(1) - - -# Run defined scenarios -# krkn_lib -def run(scenarios_list, - config, - wait_duration, - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str) -> (list[str], list[ScenarioTelemetry]): - scenario_telemetries: list[ScenarioTelemetry] = [] - failed_scenarios = [] - for node_scenario_config in scenarios_list: - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = node_scenario_config - scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, node_scenario_config) - with open(node_scenario_config, "r") as f: - node_scenario_config = yaml.full_load(f) - for node_scenario in node_scenario_config["node_scenarios"]: - node_scenario_object = get_node_scenario_object(node_scenario, telemetry.kubecli) - if node_scenario["actions"]: - for action in node_scenario["actions"]: - start_time = int(time.time()) - try: - inject_node_scenario(action, node_scenario, node_scenario_object, telemetry.kubecli) - logging.info("Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - end_time = int(time.time()) - cerberus.get_status(config, start_time, end_time) - logging.info("") - except (RuntimeError, Exception) as e: - scenario_telemetry.exit_status = 1 - failed_scenarios.append(node_scenario_config) - log_exception(node_scenario_config) - else: - scenario_telemetry.exit_status = 0 - - scenario_telemetry.end_timestamp = time.time() - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - scenario_telemetries.append(scenario_telemetry) - - return failed_scenarios, scenario_telemetries - - -# Inject the specified node scenario -def inject_node_scenario(action, node_scenario, node_scenario_object, kubecli: KrknKubernetes): - generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario") - # Get the node scenario configurations - run_kill_count = get_yaml_item_value(node_scenario, "runs", 1) - instance_kill_count = get_yaml_item_value( - node_scenario, "instance_count", 1 - ) - node_name = get_yaml_item_value(node_scenario, "node_name", "") - label_selector = get_yaml_item_value(node_scenario, "label_selector", "") - if action == "node_stop_start_scenario": - duration = get_yaml_item_value(node_scenario, "duration", 120) - timeout = get_yaml_item_value(node_scenario, "timeout", 120) - 
service = get_yaml_item_value(node_scenario, "service", "") - ssh_private_key = get_yaml_item_value( - node_scenario, "ssh_private_key", "~/.ssh/id_rsa" - ) - # Get the node to apply the scenario - if node_name: - node_name_list = node_name.split(",") - else: - node_name_list = [node_name] - for single_node_name in node_name_list: - nodes = common_node_functions.get_node(single_node_name, label_selector, instance_kill_count, kubecli) - for single_node in nodes: - if node_general and action not in generic_cloud_scenarios: - logging.info("Scenario: " + action + " is not set up for generic cloud type, skipping action") - else: - if action == "node_start_scenario": - node_scenario_object.node_start_scenario(run_kill_count, single_node, timeout) - elif action == "node_stop_scenario": - node_scenario_object.node_stop_scenario(run_kill_count, single_node, timeout) - elif action == "node_stop_start_scenario": - node_scenario_object.node_stop_start_scenario(run_kill_count, single_node, timeout, duration) - elif action == "node_termination_scenario": - node_scenario_object.node_termination_scenario(run_kill_count, single_node, timeout) - elif action == "node_reboot_scenario": - node_scenario_object.node_reboot_scenario(run_kill_count, single_node, timeout) - elif action == "stop_start_kubelet_scenario": - node_scenario_object.stop_start_kubelet_scenario(run_kill_count, single_node, timeout) - elif action == "restart_kubelet_scenario": - node_scenario_object.restart_kubelet_scenario(run_kill_count, single_node, timeout) - elif action == "stop_kubelet_scenario": - node_scenario_object.stop_kubelet_scenario(run_kill_count, single_node, timeout) - elif action == "node_crash_scenario": - node_scenario_object.node_crash_scenario(run_kill_count, single_node, timeout) - elif action == "stop_start_helper_node_scenario": - if node_scenario["cloud_type"] != "openstack": - logging.error( - "Scenario: " + action + " is not supported for " - "cloud type " + node_scenario["cloud_type"] + ", skipping action" - ) - else: - if not node_scenario["helper_node_ip"]: - logging.error("Helper node IP address is not provided") - sys.exit(1) - node_scenario_object.helper_node_stop_start_scenario( - run_kill_count, node_scenario["helper_node_ip"], timeout - ) - node_scenario_object.helper_node_service_status( - node_scenario["helper_node_ip"], service, ssh_private_key, timeout - ) - else: - logging.info("There is no node action that matches %s, skipping scenario" % action) diff --git a/kraken/plugins/__init__.py b/kraken/plugins/__init__.py deleted file mode 100644 index 970deed6..00000000 --- a/kraken/plugins/__init__.py +++ /dev/null @@ -1,332 +0,0 @@ -import dataclasses -import json -import logging -from os.path import abspath -from typing import List, Dict, Any -import time - -from arcaflow_plugin_sdk import schema, serialization, jsonschema -from arcaflow_plugin_kill_pod import kill_pods, wait_for_pods -from krkn_lib.k8s import KrknKubernetes -from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift - -import kraken.plugins.node_scenarios.vmware_plugin as vmware_plugin -import kraken.plugins.node_scenarios.ibmcloud_plugin as ibmcloud_plugin -from kraken import utils -from kraken.plugins.run_python_plugin import run_python_file -from kraken.plugins.network.ingress_shaping import network_chaos -from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_outage -from kraken.plugins.pod_network_outage.pod_network_outage_plugin import 
pod_egress_shaping -from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes -from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_ingress_shaping -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import log_exception - - -@dataclasses.dataclass -class PluginStep: - schema: schema.StepSchema - error_output_ids: List[str] - - def render_output(self, output_id: str, output_data) -> str: - return json.dumps({ - "output_id": output_id, - "output_data": self.schema.outputs[output_id].serialize(output_data), - }, indent='\t') - - -class Plugins: - """ - Plugins is a class that can run plugins sequentially. The output is rendered to the standard output and the process - is aborted if a step fails. - """ - steps_by_id: Dict[str, PluginStep] - - def __init__(self, steps: List[PluginStep]): - self.steps_by_id = dict() - for step in steps: - if step.schema.id in self.steps_by_id: - raise Exception( - "Duplicate step ID: {}".format(step.schema.id) - ) - self.steps_by_id[step.schema.id] = step - - def unserialize_scenario(self, file: str) -> Any: - return serialization.load_from_file(abspath(file)) - - def run(self, file: str, kubeconfig_path: str, kraken_config: str, run_uuid:str): - """ - Run executes a series of steps - """ - data = self.unserialize_scenario(abspath(file)) - if not isinstance(data, list): - raise Exception( - "Invalid scenario configuration file: {} expected list, found {}".format(file, type(data).__name__) - ) - i = 0 - for entry in data: - if not isinstance(entry, dict): - raise Exception( - "Invalid scenario configuration file: {} expected a list of dict's, found {} on step {}".format( - file, - type(entry).__name__, - i - ) - ) - if "id" not in entry: - raise Exception( - "Invalid scenario configuration file: {} missing 'id' field on step {}".format( - file, - i, - ) - ) - if "config" not in entry: - raise Exception( - "Invalid scenario configuration file: {} missing 'config' field on step {}".format( - file, - i, - ) - ) - - if entry["id"] not in self.steps_by_id: - raise Exception( - "Invalid step {} in {} ID: {} expected one of: {}".format( - i, - file, - entry["id"], - ', '.join(self.steps_by_id.keys()) - ) - ) - step = self.steps_by_id[entry["id"]] - unserialized_input = step.schema.input.unserialize(entry["config"]) - if "kubeconfig_path" in step.schema.input.properties: - unserialized_input.kubeconfig_path = kubeconfig_path - if "kraken_config" in step.schema.input.properties: - unserialized_input.kraken_config = kraken_config - output_id, output_data = step.schema(params=unserialized_input, run_id=run_uuid) - - logging.info(step.render_output(output_id, output_data) + "\n") - if output_id in step.error_output_ids: - raise Exception( - "Step {} in {} ({}) failed".format(i, file, step.schema.id) - ) - i = i + 1 - - def json_schema(self): - """ - This function generates a JSON schema document and renders it from the steps passed. - """ - result = { - "$id": "https://github.com/redhat-chaos/krkn/", - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "Kraken Arcaflow scenarios", - "description": "Serial execution of Arcaflow Python plugins. 
See https://github.com/arcaflow for details.", - "type": "array", - "minContains": 1, - "items": { - "oneOf": [ - - ] - } - } - for step_id in self.steps_by_id.keys(): - step = self.steps_by_id[step_id] - step_input = jsonschema.step_input(step.schema) - del step_input["$id"] - del step_input["$schema"] - del step_input["title"] - del step_input["description"] - result["items"]["oneOf"].append({ - "type": "object", - "properties": { - "id": { - "type": "string", - "const": step_id, - }, - "config": step_input, - }, - "required": [ - "id", - "config", - ] - }) - return json.dumps(result, indent="\t") - - -PLUGINS = Plugins( - [ - PluginStep( - kill_pods, - [ - "error", - ] - ), - PluginStep( - wait_for_pods, - [ - "error" - ] - ), - PluginStep( - run_python_file, - [ - "error" - ] - ), - PluginStep( - vmware_plugin.node_start, - [ - "error" - ] - ), - PluginStep( - vmware_plugin.node_stop, - [ - "error" - ] - ), - PluginStep( - vmware_plugin.node_reboot, - [ - "error" - ] - ), - PluginStep( - vmware_plugin.node_terminate, - [ - "error" - ] - ), - PluginStep( - ibmcloud_plugin.node_start, - [ - "error" - ] - ), - PluginStep( - ibmcloud_plugin.node_stop, - [ - "error" - ] - ), - PluginStep( - ibmcloud_plugin.node_reboot, - [ - "error" - ] - ), - PluginStep( - ibmcloud_plugin.node_terminate, - [ - "error" - ] - ), - PluginStep( - network_chaos, - [ - "error" - ] - ), - PluginStep( - pod_outage, - [ - "error" - ] - ), - PluginStep( - pod_egress_shaping, - [ - "error" - ] - ), - PluginStep( - pod_ingress_shaping, - [ - "error" - ] - ) - ] -) - - -def run(scenarios: List[str], - kraken_config: str, - failed_post_scenarios: List[str], - wait_duration: int, - telemetry: KrknTelemetryOpenshift, - run_uuid: str, - telemetry_request_id: str, - ) -> (List[str], list[ScenarioTelemetry]): - - scenario_telemetries: list[ScenarioTelemetry] = [] - for scenario in scenarios: - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = scenario - scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, scenario) - logging.info('scenario ' + str(scenario)) - pool = PodsMonitorPool(telemetry.kubecli) - kill_scenarios = [kill_scenario for kill_scenario in PLUGINS.unserialize_scenario(scenario) if kill_scenario["id"] == "kill-pods"] - - try: - start_monitoring(pool, kill_scenarios) - PLUGINS.run(scenario, telemetry.kubecli.get_kubeconfig_path(), kraken_config, run_uuid) - result = pool.join() - scenario_telemetry.affected_pods = result - if result.error: - raise Exception(f"unrecovered pods: {result.error}") - - except Exception as e: - logging.error(f"scenario exception: {str(e)}") - scenario_telemetry.exit_status = 1 - pool.cancel() - failed_post_scenarios.append(scenario) - log_exception(scenario) - else: - scenario_telemetry.exit_status = 0 - logging.info("Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - scenario_telemetry.end_timestamp = time.time() - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - - scenario_telemetries.append(scenario_telemetry) - - return failed_post_scenarios, scenario_telemetries - - -def start_monitoring(pool: PodsMonitorPool, scenarios: list[Any]): - for 
kill_scenario in scenarios: - recovery_time = kill_scenario["config"]["krkn_pod_recovery_time"] - if ("namespace_pattern" in kill_scenario["config"] and - "label_selector" in kill_scenario["config"]): - namespace_pattern = kill_scenario["config"]["namespace_pattern"] - label_selector = kill_scenario["config"]["label_selector"] - pool.select_and_monitor_by_namespace_pattern_and_label( - namespace_pattern=namespace_pattern, - label_selector=label_selector, - max_timeout=recovery_time) - logging.info( - f"waiting {recovery_time} seconds for pod recovery, " - f"pod label selector: {label_selector} namespace pattern: {namespace_pattern}") - - elif ("namespace_pattern" in kill_scenario["config"] and - "name_pattern" in kill_scenario["config"]): - namespace_pattern = kill_scenario["config"]["namespace_pattern"] - name_pattern = kill_scenario["config"]["name_pattern"] - pool.select_and_monitor_by_name_pattern_and_namespace_pattern(pod_name_pattern=name_pattern, - namespace_pattern=namespace_pattern, - max_timeout=recovery_time) - logging.info(f"waiting {recovery_time} seconds for pod recovery, " - f"pod name pattern: {name_pattern} namespace pattern: {namespace_pattern}") - else: - raise Exception(f"impossible to determine monitor parameters, check {kill_scenario} configuration") diff --git a/kraken/plugins/__main__.py b/kraken/plugins/__main__.py deleted file mode 100644 index 6cbd0454..00000000 --- a/kraken/plugins/__main__.py +++ /dev/null @@ -1,4 +0,0 @@ -from kraken.plugins import PLUGINS - -if __name__ == "__main__": - print(PLUGINS.json_schema()) \ No newline at end of file diff --git a/kraken/pod_scenarios/setup.py b/kraken/pod_scenarios/setup.py deleted file mode 100644 index 38218818..00000000 --- a/kraken/pod_scenarios/setup.py +++ /dev/null @@ -1,269 +0,0 @@ -import logging -import time -from typing import Any - -import yaml -import sys -import random -import arcaflow_plugin_kill_pod -from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift - -import kraken.cerberus.setup as cerberus -import kraken.post_actions.actions as post_actions -from krkn_lib.k8s import KrknKubernetes -from krkn_lib.models.telemetry import ScenarioTelemetry -from arcaflow_plugin_sdk import serialization -from krkn_lib.utils.functions import get_yaml_item_value, log_exception - -from kraken import utils - - -# Run pod based scenarios -def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration): - # Loop to run the scenarios starts here - for pod_scenario in scenarios_list: - if len(pod_scenario) > 1: - pre_action_output = post_actions.run(kubeconfig_path, pod_scenario[1]) - else: - pre_action_output = "" - try: - # capture start time - start_time = int(time.time()) - - input = serialization.load_from_file(pod_scenario) - - s = arcaflow_plugin_kill_pod.get_schema() - input_data: arcaflow_plugin_kill_pod.KillPodConfig = s.unserialize_input("pod", input) - - if kubeconfig_path is not None: - input_data.kubeconfig_path = kubeconfig_path - - output_id, output_data = s.call_step("pod", input_data) - - if output_id == "error": - data: arcaflow_plugin_kill_pod.PodErrorOutput = output_data - logging.error("Failed to run pod scenario: {}".format(data.error)) - else: - data: arcaflow_plugin_kill_pod.PodSuccessOutput = output_data - for pod in data.pods: - print("Deleted pod {} in namespace {}\n".format(pod.pod_name, pod.pod_namespace)) - except Exception as e: - logging.error( - "Failed to run scenario: %s. 
Encountered the following " "exception: %s" % (pod_scenario[0], e) - ) - sys.exit(1) - - logging.info("Scenario: %s has been successfully injected!" % (pod_scenario[0])) - logging.info("Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - - try: - failed_post_scenarios = post_actions.check_recovery( - kubeconfig_path, pod_scenario, failed_post_scenarios, pre_action_output - ) - except Exception as e: - logging.error("Failed to run post action checks: %s" % e) - sys.exit(1) - - # capture end time - end_time = int(time.time()) - - # publish cerberus status - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - return failed_post_scenarios - - -# krkn_lib -def container_run( - scenarios_list, - config, - failed_post_scenarios, - wait_duration, - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str - ) -> (list[str], list[ScenarioTelemetry]): - - failed_scenarios = [] - scenario_telemetries: list[ScenarioTelemetry] = [] - pool = PodsMonitorPool(telemetry.kubecli) - - for container_scenario_config in scenarios_list: - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = container_scenario_config[0] - scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, container_scenario_config[0]) - if len(container_scenario_config) > 1: - pre_action_output = post_actions.run(telemetry.kubecli.get_kubeconfig_path(), container_scenario_config[1]) - else: - pre_action_output = "" - with open(container_scenario_config[0], "r") as f: - cont_scenario_config = yaml.full_load(f) - start_monitoring(kill_scenarios=cont_scenario_config["scenarios"], pool=pool) - for cont_scenario in cont_scenario_config["scenarios"]: - # capture start time - start_time = int(time.time()) - try: - killed_containers = container_killing_in_pod(cont_scenario, telemetry.kubecli) - logging.info(f"killed containers: {str(killed_containers)}") - result = pool.join() - if result.error: - raise Exception(f"pods failed to recovery: {result.error}") - scenario_telemetry.affected_pods = result - logging.info("Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - - # capture end time - end_time = int(time.time()) - - # publish cerberus status - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - except (RuntimeError, Exception): - pool.cancel() - failed_scenarios.append(container_scenario_config[0]) - log_exception(container_scenario_config[0]) - scenario_telemetry.exit_status = 1 - # removed_exit - # sys.exit(1) - else: - scenario_telemetry.exit_status = 0 - scenario_telemetry.end_timestamp = time.time() - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - scenario_telemetries.append(scenario_telemetry) - - return failed_scenarios, scenario_telemetries - -def start_monitoring(kill_scenarios: list[Any], pool: PodsMonitorPool): - for kill_scenario in kill_scenarios: - namespace_pattern = f"^{kill_scenario['namespace']}$" - label_selector = kill_scenario["label_selector"] - recovery_time = kill_scenario["expected_recovery_time"] - pool.select_and_monitor_by_namespace_pattern_and_label( - 
namespace_pattern=namespace_pattern, - label_selector=label_selector, - max_timeout=recovery_time) - - -def container_killing_in_pod(cont_scenario, kubecli: KrknKubernetes): - scenario_name = get_yaml_item_value(cont_scenario, "name", "") - namespace = get_yaml_item_value(cont_scenario, "namespace", "*") - label_selector = get_yaml_item_value(cont_scenario, "label_selector", None) - pod_names = get_yaml_item_value(cont_scenario, "pod_names", []) - container_name = get_yaml_item_value(cont_scenario, "container_name", "") - kill_action = get_yaml_item_value(cont_scenario, "action", 1) - kill_count = get_yaml_item_value(cont_scenario, "count", 1) - if not isinstance(kill_action, int): - logging.error("Please make sure the action parameter defined in the " - "config is an integer") - raise RuntimeError() - if (kill_action < 1) or (kill_action > 15): - logging.error("Only 1-15 kill signals are supported.") - raise RuntimeError() - kill_action = "kill " + str(kill_action) - if type(pod_names) != list: - logging.error("Please make sure your pod_names are in a list format") - # removed_exit - # sys.exit(1) - raise RuntimeError() - if len(pod_names) == 0: - if namespace == "*": - # returns double array of pod name and namespace - pods = kubecli.get_all_pods(label_selector) - else: - # Only returns pod names - pods = kubecli.list_pods(namespace, label_selector) - else: - if namespace == "*": - logging.error("You must specify the namespace to kill a container in a specific pod") - logging.error("Scenario " + scenario_name + " failed") - # removed_exit - # sys.exit(1) - raise RuntimeError() - pods = pod_names - # get container and pod name - container_pod_list = [] - for pod in pods: - if type(pod) == list: - pod_output = kubecli.get_pod_info(pod[0], pod[1]) - container_names = [container.name for container in pod_output.containers] - - container_pod_list.append([pod[0], pod[1], container_names]) - else: - pod_output = kubecli.get_pod_info(pod, namespace) - container_names = [container.name for container in pod_output.containers] - container_pod_list.append([pod, namespace, container_names]) - - killed_count = 0 - killed_container_list = [] - while killed_count < kill_count: - if len(container_pod_list) == 0: - logging.error("Trying to kill more containers than were found, try lowering kill count") - logging.error("Scenario " + scenario_name + " failed") - # removed_exit - # sys.exit(1) - raise RuntimeError() - selected_container_pod = container_pod_list[random.randint(0, len(container_pod_list) - 1)] - for c_name in selected_container_pod[2]: - if container_name != "": - if c_name == container_name: - killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name]) - retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name, kubecli) - break - else: - killed_container_list.append([selected_container_pod[0], selected_container_pod[1], c_name]) - retry_container_killing(kill_action, selected_container_pod[0], selected_container_pod[1], c_name, kubecli) - break - container_pod_list.remove(selected_container_pod) - killed_count += 1 - logging.info("Scenario " + scenario_name + " successfully injected") - return killed_container_list - - -def retry_container_killing(kill_action, podname, namespace, container_name, kubecli: KrknKubernetes): - i = 0 - while i < 5: - logging.info("Killing container %s in pod %s (ns %s)" % (str(container_name), str(podname), str(namespace))) - response = kubecli.exec_cmd_in_pod(kill_action, podname, 
namespace, container_name)
-        i += 1
-        # Blank response means it is done
-        if not response:
-            break
-        elif "unauthorized" in response.lower() or "authorization" in response.lower():
-            time.sleep(2)
-            continue
-        else:
-            logging.warning(response)
-            continue
-
-
-def check_failed_containers(killed_container_list, wait_time, kubecli: KrknKubernetes):
-
-    container_ready = []
-    timer = 0
-    while timer <= wait_time:
-        for killed_container in killed_container_list:
-            # pod namespace contain name
-            pod_output = kubecli.get_pod_info(killed_container[0], killed_container[1])
-
-            for container in pod_output.containers:
-                if container.name == killed_container[2]:
-                    if container.ready:
-                        container_ready.append(killed_container)
-        if len(container_ready) != 0:
-            for item in container_ready:
-                killed_container_list = killed_container_list.remove(item)
-        if killed_container_list is None or len(killed_container_list) == 0:
-            return []
-        timer += 5
-        logging.info("Waiting 5 seconds for containers to become ready")
-        time.sleep(5)
-    return killed_container_list
diff --git a/kraken/post_actions/actions.py b/kraken/post_actions/actions.py
deleted file mode 100644
index e7cb3a5a..00000000
--- a/kraken/post_actions/actions.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import logging
-import kraken.invoke.command as runcommand
-
-
-def run(kubeconfig_path, scenario, pre_action_output=""):
-
-    if scenario.endswith(".yaml") or scenario.endswith(".yml"):
-        logging.error("Powerfulseal support has recently been removed. Please switch to using plugins instead.")
-    elif scenario.endswith(".py"):
-        action_output = runcommand.invoke("python3 " + scenario).strip()
-        if pre_action_output:
-            if pre_action_output == action_output:
-                logging.info(scenario + " post action checks passed")
-            else:
-                logging.info(scenario + " post action response did not match pre check output")
-                logging.info("Pre action output: " + str(pre_action_output) + "\n")
-                logging.info("Post action output: " + str(action_output))
-                return False
-    elif scenario != "":
-        # invoke custom bash script
-        action_output = runcommand.invoke(scenario).strip()
-        if pre_action_output:
-            if pre_action_output == action_output:
-                logging.info(scenario + " post action checks passed")
-            else:
-                logging.info(scenario + " post action response did not match pre check output")
-                return False
-
-    return action_output
-
-
-# Perform the post scenario actions to see if components recovered
-def check_recovery(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output):
-    if failed_post_scenarios:
-        for failed_scenario in failed_post_scenarios:
-            post_action_output = run(kubeconfig_path, failed_scenario[0], failed_scenario[1])
-            if post_action_output is not False:
-                failed_post_scenarios.remove(failed_scenario)
-            else:
-                logging.info("Post action scenario " + str(failed_scenario) + "is still failing")
-
-    # check post actions
-    if len(scenario) > 1:
-        post_action_output = run(kubeconfig_path, scenario[1], pre_action_output)
-        if post_action_output is False:
-            failed_post_scenarios.append([scenario[1], pre_action_output])
-
-    return failed_post_scenarios
diff --git a/kraken/pvc/pvc_scenario.py b/kraken/pvc/pvc_scenario.py
deleted file mode 100644
index 2b893655..00000000
--- a/kraken/pvc/pvc_scenario.py
+++ /dev/null
@@ -1,392 +0,0 @@
-import logging
-import random
-import re
-import time
-import yaml
-from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
-
-from ..
import utils -from ..cerberus import setup as cerberus -from krkn_lib.k8s import KrknKubernetes -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import get_yaml_item_value, log_exception - - -# krkn_lib -def run(scenarios_list, - config, - wait_duration, - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str) -> (list[str], list[ScenarioTelemetry]): - """ - Reads the scenario config and creates a temp file to fill up the PVC - """ - failed_post_scenarios = "" - scenario_telemetries: list[ScenarioTelemetry] = [] - failed_scenarios = [] - for app_config in scenarios_list: - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = app_config - scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, app_config) - try: - if len(app_config) > 1: - with open(app_config, "r") as f: - config_yaml = yaml.full_load(f) - scenario_config = config_yaml["pvc_scenario"] - pvc_name = get_yaml_item_value( - scenario_config, "pvc_name", "" - ) - pod_name = get_yaml_item_value( - scenario_config, "pod_name", "" - ) - namespace = get_yaml_item_value( - scenario_config, "namespace", "" - ) - target_fill_percentage = get_yaml_item_value( - scenario_config, "fill_percentage", "50" - ) - duration = get_yaml_item_value( - scenario_config, "duration", 60 - ) - - logging.info( - "Input params:\n" - "pvc_name: '%s'\n" - "pod_name: '%s'\n" - "namespace: '%s'\n" - "target_fill_percentage: '%s%%'\nduration: '%ss'" - % ( - str(pvc_name), - str(pod_name), - str(namespace), - str(target_fill_percentage), - str(duration) - ) - ) - - # Check input params - if namespace is None: - logging.error( - "You must specify the namespace where the PVC is" - ) - #sys.exit(1) - raise RuntimeError() - if pvc_name is None and pod_name is None: - logging.error( - "You must specify the pvc_name or the pod_name" - ) - # sys.exit(1) - raise RuntimeError() - if pvc_name and pod_name: - logging.info( - "pod_name will be ignored, pod_name used will be " - "a retrieved from the pod used in the pvc_name" - ) - - # Get pod name - if pvc_name: - if pod_name: - logging.info( - "pod_name '%s' will be overridden with one of " - "the pods mounted in the PVC" % (str(pod_name)) - ) - pvc = telemetry.kubecli.get_pvc_info(pvc_name, namespace) - try: - # random generator not used for - # security/cryptographic purposes. 
- pod_name = random.choice(pvc.podNames) # nosec - logging.info("Pod name: %s" % pod_name) - except Exception: - logging.error( - "Pod associated with %s PVC, on namespace %s, " - "not found" % (str(pvc_name), str(namespace)) - ) - # sys.exit(1) - raise RuntimeError() - - # Get volume name - pod = telemetry.kubecli.get_pod_info(name=pod_name, namespace=namespace) - - if pod is None: - logging.error( - "Exiting as pod '%s' doesn't exist " - "in namespace '%s'" % ( - str(pod_name), - str(namespace) - ) - ) - # sys.exit(1) - raise RuntimeError() - - for volume in pod.volumes: - if volume.pvcName is not None: - volume_name = volume.name - pvc_name = volume.pvcName - pvc = telemetry.kubecli.get_pvc_info(pvc_name, namespace) - break - if 'pvc' not in locals(): - logging.error( - "Pod '%s' in namespace '%s' does not use a pvc" % ( - str(pod_name), - str(namespace) - ) - ) - # sys.exit(1) - raise RuntimeError() - logging.info("Volume name: %s" % volume_name) - logging.info("PVC name: %s" % pvc_name) - - # Get container name and mount path - for container in pod.containers: - for vol in container.volumeMounts: - if vol.name == volume_name: - mount_path = vol.mountPath - container_name = container.name - break - logging.info("Container path: %s" % container_name) - logging.info("Mount path: %s" % mount_path) - - # Get PVC capacity and used bytes - command = "df %s -B 1024 | sed 1d" % (str(mount_path)) - command_output = ( - telemetry.kubecli.exec_cmd_in_pod( - command, - pod_name, - namespace, - container_name - ) - ).split() - pvc_used_kb = int(command_output[2]) - pvc_capacity_kb = pvc_used_kb + int(command_output[3]) - logging.info("PVC used: %s KB" % pvc_used_kb) - logging.info("PVC capacity: %s KB" % pvc_capacity_kb) - - # Check valid fill percentage - current_fill_percentage = pvc_used_kb / pvc_capacity_kb - if not ( - current_fill_percentage * 100 - < float(target_fill_percentage) - <= 99 - ): - logging.error( - "Target fill percentage (%.2f%%) is lower than " - "current fill percentage (%.2f%%) " - "or higher than 99%%" % ( - target_fill_percentage, - current_fill_percentage * 100 - ) - ) - # sys.exit(1) - raise RuntimeError() - - # Calculate file size - file_size_kb = int( - ( - float( - target_fill_percentage / 100 - ) * float(pvc_capacity_kb) - ) - float(pvc_used_kb) - ) - logging.debug("File size: %s KB" % file_size_kb) - - file_name = "kraken.tmp" - logging.info( - "Creating %s file, %s KB size, in pod %s at %s (ns %s)" - % ( - str(file_name), - str(file_size_kb), - str(pod_name), - str(mount_path), - str(namespace) - ) - ) - - start_time = int(time.time()) - # Create temp file in the PVC - full_path = "%s/%s" % (str(mount_path), str(file_name)) - command = "fallocate -l $((%s*1024)) %s" % ( - str(file_size_kb), - str(full_path) - ) - logging.debug( - "Create temp file in the PVC command:\n %s" % command - ) - telemetry.kubecli.exec_cmd_in_pod( - command, - pod_name, - namespace, - container_name, - ) - - # Check if file is created - command = "ls -lh %s" % (str(mount_path)) - logging.debug("Check file is created command:\n %s" % command) - response = telemetry.kubecli.exec_cmd_in_pod( - command, pod_name, namespace, container_name - ) - logging.info("\n" + str(response)) - if str(file_name).lower() in str(response).lower(): - logging.info( - "%s file successfully created" % (str(full_path)) - ) - else: - logging.error( - "Failed to create tmp file with %s size" % ( - str(file_size_kb) - ) - ) - remove_temp_file( - file_name, - full_path, - pod_name, - namespace, - container_name, - 
mount_path, - file_size_kb, - telemetry.kubecli - ) - # sys.exit(1) - raise RuntimeError() - - # Calculate file size - file_size_kb = int( - ( - float( - target_fill_percentage / 100 - ) * float(pvc_capacity_kb) - ) - float(pvc_used_kb) - ) - logging.debug("File size: %s KB" % file_size_kb) - - file_name = "kraken.tmp" - logging.info( - "Creating %s file, %s KB size, in pod %s at %s (ns %s)" - % ( - str(file_name), - str(file_size_kb), - str(pod_name), - str(mount_path), - str(namespace) - ) - ) - - start_time = int(time.time()) - # Create temp file in the PVC - full_path = "%s/%s" % (str(mount_path), str(file_name)) - command = "fallocate -l $((%s*1024)) %s" % ( - str(file_size_kb), - str(full_path) - ) - logging.debug( - "Create temp file in the PVC command:\n %s" % command - ) - telemetry.kubecli.exec_cmd_in_pod( - command, pod_name, namespace, container_name - ) - - # Check if file is created - command = "ls -lh %s" % (str(mount_path)) - logging.debug("Check file is created command:\n %s" % command) - response = telemetry.kubecli.exec_cmd_in_pod( - command, pod_name, namespace, container_name - ) - logging.info("\n" + str(response)) - if str(file_name).lower() in str(response).lower(): - logging.info( - "Waiting for the specified duration in the config: %ss" % ( - duration - ) - ) - time.sleep(duration) - logging.info("Finish waiting") - - remove_temp_file( - file_name, - full_path, - pod_name, - namespace, - container_name, - mount_path, - file_size_kb, - telemetry.kubecli - ) - logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - failed_post_scenarios, - start_time, - end_time - ) - except (RuntimeError, Exception): - scenario_telemetry.exit_status = 1 - failed_scenarios.append(app_config) - log_exception(app_config) - else: - scenario_telemetry.exit_status = 0 - - scenario_telemetry.end_timestamp = time.time() - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - scenario_telemetries.append(scenario_telemetry) - - return failed_scenarios, scenario_telemetries - - - - -# krkn_lib -def remove_temp_file( - file_name, - full_path, - pod_name, - namespace, - container_name, - mount_path, - file_size_kb, - kubecli: KrknKubernetes -): - command = "rm -f %s" % (str(full_path)) - logging.debug("Remove temp file from the PVC command:\n %s" % command) - kubecli.exec_cmd_in_pod(command, pod_name, namespace, container_name) - command = "ls -lh %s" % (str(mount_path)) - logging.debug("Check temp file is removed command:\n %s" % command) - response = kubecli.exec_cmd_in_pod( - command, - pod_name, - namespace, - container_name - ) - logging.info("\n" + str(response)) - if not (str(file_name).lower() in str(response).lower()): - logging.info("Temp file successfully removed") - else: - logging.error( - "Failed to delete tmp file with %s size" % (str(file_size_kb)) - ) - raise RuntimeError() - - -def toKbytes(value): - if not re.match("^[0-9]+[K|M|G|T]i$", value): - logging.error( - "PVC capacity %s does not match expression " - "regexp '^[0-9]+[K|M|G|T]i$'" - ) - raise RuntimeError() - unit = {"K": 0, "M": 1, "G": 2, "T": 3} - base = 1024 if ("i" in value) 
else 1000 - exp = unit[value[-2:-1]] - res = int(value[:-2]) * (base**exp) - return res diff --git a/kraken/service_disruption/common_service_disruption_functions.py b/kraken/service_disruption/common_service_disruption_functions.py deleted file mode 100644 index 80e9474f..00000000 --- a/kraken/service_disruption/common_service_disruption_functions.py +++ /dev/null @@ -1,338 +0,0 @@ -import time -import random -import logging - -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift - -import kraken.cerberus.setup as cerberus -import kraken.post_actions.actions as post_actions -import yaml -from krkn_lib.k8s import KrknKubernetes -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import get_yaml_item_value, log_exception - -from kraken import utils - - -def delete_objects(kubecli, namespace): - - services = delete_all_services_namespace(kubecli, namespace) - daemonsets = delete_all_daemonset_namespace(kubecli, namespace) - statefulsets = delete_all_statefulsets_namespace(kubecli, namespace) - replicasets = delete_all_replicaset_namespace(kubecli, namespace) - deployments = delete_all_deployment_namespace(kubecli, namespace) - - objects = { "daemonsets": daemonsets, - "deployments": deployments, - "replicasets": replicasets, - "statefulsets": statefulsets, - "services": services - } - - return objects - - -def get_list_running_pods(kubecli: KrknKubernetes, namespace: str): - running_pods = [] - pods = kubecli.list_pods(namespace) - for pod in pods: - pod_status = kubecli.get_pod_info(pod, namespace) - if pod_status and pod_status.status == "Running": - running_pods.append(pod) - logging.info('all running pods ' + str(running_pods)) - return running_pods - - -def delete_all_deployment_namespace(kubecli: KrknKubernetes, namespace: str): - """ - Delete all the deployments in the specified namespace - - :param kubecli: krkn kubernetes python package - :param namespace: namespace - """ - try: - deployments = kubecli.get_deployment_ns(namespace) - for deployment in deployments: - logging.info("Deleting deployment" + deployment) - kubecli.delete_deployment(deployment, namespace) - except Exception as e: - logging.error( - "Exception when calling delete_all_deployment_namespace: %s\n", - str(e), - ) - raise e - - return deployments - - -def delete_all_daemonset_namespace(kubecli: KrknKubernetes, namespace: str): - """ - Delete all the daemonset in the specified namespace - - :param kubecli: krkn kubernetes python package - :param namespace: namespace - """ - try: - daemonsets = kubecli.get_daemonset(namespace) - for daemonset in daemonsets: - logging.info("Deleting daemonset" + daemonset) - kubecli.delete_daemonset(daemonset, namespace) - except Exception as e: - logging.error( - "Exception when calling delete_all_daemonset_namespace: %s\n", - str(e), - ) - raise e - - return daemonsets - - -def delete_all_statefulsets_namespace(kubecli: KrknKubernetes, namespace: str): - """ - Delete all the statefulsets in the specified namespace - - - :param kubecli: krkn kubernetes python package - :param namespace: namespace - """ - try: - statefulsets = kubecli.get_all_statefulset(namespace) - for statefulset in statefulsets: - logging.info("Deleting statefulsets" + statefulsets) - kubecli.delete_statefulset(statefulset, namespace) - except Exception as e: - logging.error( - "Exception when calling delete_all_statefulsets_namespace: %s\n", - str(e), - ) - raise e - - return statefulsets - - -def delete_all_replicaset_namespace(kubecli: KrknKubernetes, namespace: str): - 
""" - Delete all the replicasets in the specified namespace - - :param kubecli: krkn kubernetes python package - :param namespace: namespace - """ - try: - replicasets = kubecli.get_all_replicasets(namespace) - for replicaset in replicasets: - logging.info("Deleting replicaset" + replicaset) - kubecli.delete_replicaset(replicaset, namespace) - except Exception as e: - logging.error( - "Exception when calling delete_all_replicaset_namespace: %s\n", - str(e), - ) - raise e - - return replicasets - -def delete_all_services_namespace(kubecli: KrknKubernetes, namespace: str): - """ - Delete all the services in the specified namespace - - - :param kubecli: krkn kubernetes python package - :param namespace: namespace - """ - try: - services = kubecli.get_all_services(namespace) - for service in services: - logging.info("Deleting services" + service) - kubecli.delete_services(service, namespace) - except Exception as e: - logging.error( - "Exception when calling delete_all_services_namespace: %s\n", - str(e), - ) - raise e - - return services - - -# krkn_lib -def run( - scenarios_list, - config, - wait_duration, - failed_post_scenarios, - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str -) -> (list[str], list[ScenarioTelemetry]): - scenario_telemetries: list[ScenarioTelemetry] = [] - failed_scenarios = [] - for scenario_config in scenarios_list: - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = scenario_config[0] - scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, scenario_config[0]) - try: - if len(scenario_config) > 1: - pre_action_output = post_actions.run(telemetry.kubecli.get_kubeconfig_path(), scenario_config[1]) - else: - pre_action_output = "" - with open(scenario_config[0], "r") as f: - scenario_config_yaml = yaml.full_load(f) - for scenario in scenario_config_yaml["scenarios"]: - scenario_namespace = get_yaml_item_value( - scenario, "namespace", "" - ) - scenario_label = get_yaml_item_value( - scenario, "label_selector", "" - ) - if scenario_namespace is not None and scenario_namespace.strip() != "": - if scenario_label is not None and scenario_label.strip() != "": - logging.error("You can only have namespace or label set in your namespace scenario") - logging.error( - "Current scenario config has namespace '%s' and label selector '%s'" - % (scenario_namespace, scenario_label) - ) - logging.error( - "Please set either namespace to blank ('') or label_selector to blank ('') to continue" - ) - # removed_exit - # sys.exit(1) - raise RuntimeError() - delete_count = get_yaml_item_value( - scenario, "delete_count", 1 - ) - run_count = get_yaml_item_value(scenario, "runs", 1) - run_sleep = get_yaml_item_value(scenario, "sleep", 10) - wait_time = get_yaml_item_value(scenario, "wait_time", 30) - - logging.info(str(scenario_namespace) + str(scenario_label) + str(delete_count) + str(run_count) + str(run_sleep) + str(wait_time)) - logging.info("done") - start_time = int(time.time()) - for i in range(run_count): - killed_namespaces = {} - namespaces = telemetry.kubecli.check_namespaces([scenario_namespace], scenario_label) - for j in range(delete_count): - if len(namespaces) == 0: - logging.error( - "Couldn't delete %s namespaces, not enough namespaces matching %s with label %s" - % (str(run_count), scenario_namespace, str(scenario_label)) - ) - # removed_exit - # sys.exit(1) - raise RuntimeError() - selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)] - 
logging.info('Delete objects in selected namespace: ' + selected_namespace ) - try: - # delete all pods in namespace - objects = delete_objects(telemetry.kubecli,selected_namespace) - killed_namespaces[selected_namespace] = objects - logging.info("Deleted all objects in namespace %s was successful" % str(selected_namespace)) - except Exception as e: - logging.info("Delete all objects in namespace %s was unsuccessful" % str(selected_namespace)) - logging.info("Namespace action error: " + str(e)) - raise RuntimeError() - namespaces.remove(selected_namespace) - logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep)) - time.sleep(run_sleep) - - logging.info("Waiting for the specified duration: %s" % wait_duration) - time.sleep(wait_duration) - if len(scenario_config) > 1: - try: - failed_post_scenarios = post_actions.check_recovery( - telemetry.kubecli.get_kubeconfig_path(), scenario_config, failed_post_scenarios, pre_action_output - ) - except Exception as e: - logging.error("Failed to run post action checks: %s" % e) - # removed_exit - # sys.exit(1) - raise RuntimeError() - else: - failed_post_scenarios = check_all_running_deployment(killed_namespaces, wait_time, telemetry.kubecli) - - end_time = int(time.time()) - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - except (Exception, RuntimeError): - scenario_telemetry.exit_status = 1 - failed_scenarios.append(scenario_config[0]) - log_exception(scenario_config[0]) - else: - scenario_telemetry.exit_status = 0 - scenario_telemetry.end_timestamp = time.time() - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - scenario_telemetries.append(scenario_telemetry) - return failed_scenarios, scenario_telemetries - - -def check_all_running_pods(kubecli: KrknKubernetes, namespace_name, wait_time): - - timer = 0 - while timer < wait_time: - pod_list = kubecli.list_pods(namespace_name) - pods_running = 0 - for pod in pod_list: - pod_info = kubecli.get_pod_info(pod, namespace_name) - if pod_info.status != "Running" and pod_info.status != "Succeeded": - logging.info("Pods %s still not running or completed" % pod_info.name) - break - pods_running += 1 - if len(pod_list) == pods_running: - break - timer += 5 - time.sleep(5) - logging.info("Waiting 5 seconds for pods to become active") - -# krkn_lib -def check_all_running_deployment(killed_namespaces, wait_time, kubecli: KrknKubernetes): - - timer = 0 - while timer < wait_time and killed_namespaces: - still_missing_ns = killed_namespaces.copy() - for namespace_name, objects in killed_namespaces.items(): - still_missing_obj = objects.copy() - for obj_name, obj_list in objects.items(): - if "deployments" == obj_name: - deployments = kubecli.get_deployment_ns(namespace_name) - if len(obj_list) == len(deployments): - still_missing_obj.pop(obj_name) - elif "replicasets" == obj_name: - replicasets = kubecli.get_all_replicasets(namespace_name) - if len(obj_list) == len(replicasets): - still_missing_obj.pop(obj_name) - elif "statefulsets" == obj_name: - statefulsets = kubecli.get_all_statefulset(namespace_name) - if len(obj_list) == len(statefulsets): - still_missing_obj.pop(obj_name) - elif "services" == obj_name: - services = 
kubecli.get_all_services(namespace_name) - if len(obj_list) == len(services): - still_missing_obj.pop(obj_name) - elif "daemonsets" == obj_name: - daemonsets = kubecli.get_daemonset(namespace_name) - if len(obj_list) == len(daemonsets): - still_missing_obj.pop(obj_name) - logging.info("Still missing objects " + str(still_missing_obj)) - killed_namespaces[namespace_name] = still_missing_obj.copy() - if len(killed_namespaces[namespace_name].keys()) == 0: - logging.info("Wait for pods to become running for namespace: " + namespace_name) - check_all_running_pods(kubecli, namespace_name, wait_time) - still_missing_ns.pop(namespace_name) - killed_namespaces = still_missing_ns - if len(killed_namespaces.keys()) == 0: - return [] - - timer += 10 - time.sleep(10) - logging.info("Waiting 10 seconds for objects in namespaces to become active") - - logging.error("Objects are still not ready after waiting " + str(wait_time) + "seconds") - logging.error("Non active namespaces " + str(killed_namespaces)) - return killed_namespaces diff --git a/kraken/service_hijacking/service_hijacking.py b/kraken/service_hijacking/service_hijacking.py deleted file mode 100644 index 3f5ca1ba..00000000 --- a/kraken/service_hijacking/service_hijacking.py +++ /dev/null @@ -1,103 +0,0 @@ -import logging -import time -import yaml - -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift -from krkn_lib.utils import log_exception - -from kraken import utils - - -def run(scenarios_list: list[str], - wait_duration: int, - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str) -> (list[str], list[ScenarioTelemetry]): - - scenario_telemetries = list[ScenarioTelemetry]() - failed_post_scenarios = [] - for scenario in scenarios_list: - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = scenario - scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, scenario) - with open(scenario) as stream: - scenario_config = yaml.safe_load(stream) - - service_name = scenario_config['service_name'] - service_namespace = scenario_config['service_namespace'] - plan = scenario_config["plan"] - image = scenario_config["image"] - target_port = scenario_config["service_target_port"] - chaos_duration = scenario_config["chaos_duration"] - - logging.info(f"checking service {service_name} in namespace: {service_namespace}") - if not telemetry.kubecli.service_exists(service_name, service_namespace): - logging.error(f"service: {service_name} not found in namespace: {service_namespace}, failed to run scenario.") - fail_scenario_telemetry(scenario_telemetry) - failed_post_scenarios.append(scenario) - break - try: - logging.info(f"service: {service_name} found in namespace: {service_namespace}") - logging.info(f"creating webservice and initializing test plan...") - # both named ports and port numbers can be used - if isinstance(target_port, int): - logging.info(f"webservice will listen on port {target_port}") - webservice = telemetry.kubecli.deploy_service_hijacking(service_namespace, plan, image, port_number=target_port) - else: - logging.info(f"traffic will be redirected to named port: {target_port}") - webservice = telemetry.kubecli.deploy_service_hijacking(service_namespace, plan, image, port_name=target_port) - logging.info(f"successfully deployed pod: {webservice.pod_name} " - f"in namespace:{service_namespace} with selector {webservice.selector}!" 
- ) - logging.info(f"patching service: {service_name} to hijack traffic towards: {webservice.pod_name}") - original_service = telemetry.kubecli.replace_service_selector([webservice.selector], service_name, service_namespace) - if original_service is None: - logging.error(f"failed to patch service: {service_name}, namespace: {service_namespace} with selector {webservice.selector}") - fail_scenario_telemetry(scenario_telemetry) - failed_post_scenarios.append(scenario) - break - - logging.info(f"service: {service_name} successfully patched!") - logging.info(f"original service manifest:\n\n{yaml.dump(original_service)}") - logging.info(f"waiting {chaos_duration} before restoring the service") - time.sleep(chaos_duration) - selectors = ["=".join([key, original_service["spec"]["selector"][key]]) for key in original_service["spec"]["selector"].keys()] - logging.info(f"restoring the service selectors {selectors}") - original_service = telemetry.kubecli.replace_service_selector(selectors, service_name, service_namespace) - if original_service is None: - logging.error(f"failed to restore original service: {service_name}, namespace: {service_namespace} with selectors: {selectors}") - fail_scenario_telemetry(scenario_telemetry) - failed_post_scenarios.append(scenario) - break - logging.info("selectors successfully restored") - logging.info("undeploying service-hijacking resources...") - telemetry.kubecli.undeploy_service_hijacking(webservice) - - logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - scenario_telemetry.exit_status = 0 - logging.info("success") - except Exception as e: - logging.error(f"scenario {scenario} failed with exception: {e}") - fail_scenario_telemetry(scenario_telemetry) - log_exception(scenario) - - scenario_telemetry.end_timestamp = time.time() - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - scenario_telemetries.append(scenario_telemetry) - - return failed_post_scenarios, scenario_telemetries - -def fail_scenario_telemetry(scenario_telemetry: ScenarioTelemetry): - scenario_telemetry.exit_status = 1 - scenario_telemetry.end_timestamp = time.time() \ No newline at end of file diff --git a/kraken/shut_down/common_shut_down_func.py b/kraken/shut_down/common_shut_down_func.py deleted file mode 100644 index d89f85b7..00000000 --- a/kraken/shut_down/common_shut_down_func.py +++ /dev/null @@ -1,208 +0,0 @@ -#!/usr/bin/env python -import yaml -import logging -import time -from multiprocessing.pool import ThreadPool - -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift - -from .. 
import utils -from ..cerberus import setup as cerberus -from ..post_actions import actions as post_actions -from ..node_actions.aws_node_scenarios import AWS -from ..node_actions.openstack_node_scenarios import OPENSTACKCLOUD -from ..node_actions.az_node_scenarios import Azure -from ..node_actions.gcp_node_scenarios import GCP -from krkn_lib.k8s import KrknKubernetes -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import log_exception - -def multiprocess_nodes(cloud_object_function, nodes, processes=0): - try: - # pool object with number of element - - if processes == 0: - pool = ThreadPool(processes=len(nodes)) - else: - pool = ThreadPool(processes=processes) - logging.info("nodes type " + str(type(nodes[0]))) - if type(nodes[0]) is tuple: - node_id = [] - node_info = [] - for node in nodes: - node_id.append(node[0]) - node_info.append(node[1]) - logging.info("node id " + str(node_id)) - logging.info("node info" + str(node_info)) - pool.starmap(cloud_object_function, zip(node_info, node_id)) - - else: - logging.info("pool type" + str(type(nodes))) - pool.map(cloud_object_function, nodes) - pool.close() - except Exception as e: - logging.info("Error on pool multiprocessing: " + str(e)) - - -# Inject the cluster shut down scenario -# krkn_lib -def cluster_shut_down(shut_down_config, kubecli: KrknKubernetes): - runs = shut_down_config["runs"] - shut_down_duration = shut_down_config["shut_down_duration"] - cloud_type = shut_down_config["cloud_type"] - timeout = shut_down_config["timeout"] - processes = 0 - if cloud_type.lower() == "aws": - cloud_object = AWS() - elif cloud_type.lower() == "gcp": - cloud_object = GCP() - processes = 1 - elif cloud_type.lower() == "openstack": - cloud_object = OPENSTACKCLOUD() - elif cloud_type.lower() in ["azure", "az"]: - cloud_object = Azure() - else: - logging.error( - "Cloud type %s is not currently supported for cluster shut down" % - cloud_type - ) - # removed_exit - # sys.exit(1) - raise RuntimeError() - - nodes = kubecli.list_nodes() - node_id = [] - for node in nodes: - instance_id = cloud_object.get_instance_id(node) - node_id.append(instance_id) - logging.info("node id list " + str(node_id)) - for _ in range(runs): - logging.info("Starting cluster_shut_down scenario injection") - stopping_nodes = set(node_id) - multiprocess_nodes(cloud_object.stop_instances, node_id, processes) - stopped_nodes = stopping_nodes.copy() - while len(stopping_nodes) > 0: - for node in stopping_nodes: - if type(node) is tuple: - node_status = cloud_object.wait_until_stopped( - node[1], - node[0], - timeout - ) - else: - node_status = cloud_object.wait_until_stopped( - node, - timeout - ) - - # Only want to remove node from stopping list - # when fully stopped/no error - if node_status: - stopped_nodes.remove(node) - - stopping_nodes = stopped_nodes.copy() - - logging.info( - "Shutting down the cluster for the specified duration: %s" % - (shut_down_duration) - ) - time.sleep(shut_down_duration) - logging.info("Restarting the nodes") - restarted_nodes = set(node_id) - multiprocess_nodes(cloud_object.start_instances, node_id, processes) - logging.info("Wait for each node to be running again") - not_running_nodes = restarted_nodes.copy() - while len(not_running_nodes) > 0: - for node in not_running_nodes: - if type(node) is tuple: - node_status = cloud_object.wait_until_running( - node[1], - node[0], - timeout - ) - else: - node_status = cloud_object.wait_until_running( - node, - timeout - ) - if node_status: - 
restarted_nodes.remove(node) - not_running_nodes = restarted_nodes.copy() - logging.info( - "Waiting for 150s to allow cluster component initialization" - ) - time.sleep(150) - - logging.info("Successfully injected cluster_shut_down scenario!") - -# krkn_lib - -def run(scenarios_list, - config, - wait_duration, - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str) -> (list[str], list[ScenarioTelemetry]): - failed_post_scenarios = [] - failed_scenarios = [] - scenario_telemetries: list[ScenarioTelemetry] = [] - - for shut_down_config in scenarios_list: - config_path = shut_down_config - pre_action_output = "" - if isinstance(shut_down_config, list) : - if len(shut_down_config) == 0: - raise Exception("bad config file format for shutdown scenario") - - config_path = shut_down_config[0] - if len(shut_down_config) > 1: - pre_action_output = post_actions.run("", shut_down_config[1]) - - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = config_path - scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, config_path) - - with open(config_path, "r") as f: - shut_down_config_yaml = yaml.full_load(f) - shut_down_config_scenario = \ - shut_down_config_yaml["cluster_shut_down_scenario"] - start_time = int(time.time()) - try: - cluster_shut_down(shut_down_config_scenario, telemetry.kubecli) - logging.info( - "Waiting for the specified duration: %s" % (wait_duration) - ) - time.sleep(wait_duration) - failed_post_scenarios = post_actions.check_recovery( - "", shut_down_config, failed_post_scenarios, pre_action_output - ) - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - failed_post_scenarios, - start_time, - end_time - ) - - except (RuntimeError, Exception): - log_exception(config_path) - failed_scenarios.append(config_path) - scenario_telemetry.exit_status = 1 - else: - scenario_telemetry.exit_status = 0 - - scenario_telemetry.end_timestamp = time.time() - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - scenario_telemetries.append(scenario_telemetry) - - return failed_scenarios, scenario_telemetries - diff --git a/kraken/syn_flood/__init__.py b/kraken/syn_flood/__init__.py deleted file mode 100644 index 57180326..00000000 --- a/kraken/syn_flood/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .syn_flood import * \ No newline at end of file diff --git a/kraken/syn_flood/syn_flood.py b/kraken/syn_flood/syn_flood.py deleted file mode 100644 index 4036388d..00000000 --- a/kraken/syn_flood/syn_flood.py +++ /dev/null @@ -1,148 +0,0 @@ -import logging -import os.path -import time -from typing import List - -import krkn_lib.utils -import yaml -from krkn_lib.k8s import KrknKubernetes -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift - -from kraken import utils - - -def run(scenarios_list: list[str], - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str - ) -> (list[str], list[ScenarioTelemetry]): - scenario_telemetries: list[ScenarioTelemetry] = [] - failed_post_scenarios = [] - for scenario in scenarios_list: - scenario_telemetry = 
ScenarioTelemetry() - scenario_telemetry.scenario = scenario - scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, scenario) - - try: - pod_names = [] - config = parse_config(scenario) - if config["target-service-label"]: - target_services = telemetry.kubecli.select_service_by_label(config["namespace"], config["target-service-label"]) - else: - target_services = [config["target-service"]] - - for target in target_services: - if not telemetry.kubecli.service_exists(target, config["namespace"]): - raise Exception(f"{target} service not found") - for i in range(config["number-of-pods"]): - pod_name = "syn-flood-" + krkn_lib.utils.get_random_string(10) - telemetry.kubecli.deploy_syn_flood(pod_name, - config["namespace"], - config["image"], - target, - config["target-port"], - config["packet-size"], - config["window-size"], - config["duration"], - config["attacker-nodes"] - ) - pod_names.append(pod_name) - - logging.info("waiting all the attackers to finish:") - did_finish = False - finished_pods = [] - while not did_finish: - for pod_name in pod_names: - if not telemetry.kubecli.is_pod_running(pod_name, config["namespace"]): - finished_pods.append(pod_name) - if set(pod_names) == set(finished_pods): - did_finish = True - time.sleep(1) - - except Exception as e: - logging.error(f"Failed to run syn flood scenario {scenario}: {e}") - failed_post_scenarios.append(scenario) - scenario_telemetry.exit_status = 1 - else: - scenario_telemetry.exit_status = 0 - scenario_telemetry.end_timestamp = time.time() - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - scenario_telemetries.append(scenario_telemetry) - return failed_post_scenarios, scenario_telemetries - -def parse_config(scenario_file: str) -> dict[str,any]: - if not os.path.exists(scenario_file): - raise Exception(f"failed to load scenario file {scenario_file}") - - try: - with open(scenario_file) as stream: - config = yaml.safe_load(stream) - except Exception: - raise Exception(f"{scenario_file} is not a valid yaml file") - - missing = [] - if not check_key_value(config ,"packet-size"): - missing.append("packet-size") - if not check_key_value(config,"window-size"): - missing.append("window-size") - if not check_key_value(config, "duration"): - missing.append("duration") - if not check_key_value(config, "namespace"): - missing.append("namespace") - if not check_key_value(config, "number-of-pods"): - missing.append("number-of-pods") - if not check_key_value(config, "target-port"): - missing.append("target-port") - if not check_key_value(config, "image"): - missing.append("image") - if "target-service" not in config.keys(): - missing.append("target-service") - if "target-service-label" not in config.keys(): - missing.append("target-service-label") - - - - - if len(missing) > 0: - raise Exception(f"{(',').join(missing)} parameter(s) are missing") - - if not config["target-service"] and not config["target-service-label"]: - raise Exception("you have either to set a target service or a label") - if config["target-service"] and config["target-service-label"]: - raise Exception("you cannot select both target-service and target-service-label") - - if 'attacker-nodes' and not 
is_node_affinity_correct(config['attacker-nodes']): - raise Exception("attacker-nodes format is not correct") - return config - -def check_key_value(dictionary, key): - if key in dictionary: - value = dictionary[key] - if value is not None and value != '': - return True - return False - -def is_node_affinity_correct(obj) -> bool: - if not isinstance(obj, dict): - return False - for key in obj.keys(): - if not isinstance(key, str): - return False - if not isinstance(obj[key], list): - return False - return True - - - - diff --git a/kraken/time_actions/common_time_functions.py b/kraken/time_actions/common_time_functions.py deleted file mode 100644 index 8c9f039d..00000000 --- a/kraken/time_actions/common_time_functions.py +++ /dev/null @@ -1,402 +0,0 @@ -import datetime -import time -import logging -import re - -import yaml -import random - -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift -from kubernetes.client import ApiException - -from .. import utils -from ..cerberus import setup as cerberus -from krkn_lib.k8s import KrknKubernetes -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import get_yaml_item_value, log_exception, get_random_string - - -# krkn_lib -def pod_exec(pod_name, command, namespace, container_name, kubecli:KrknKubernetes): - for i in range(5): - response = kubecli.exec_cmd_in_pod( - command, - pod_name, - namespace, - container_name - ) - if not response: - time.sleep(2) - continue - elif ( - "unauthorized" in response.lower() or - "authorization" in response.lower() - ): - time.sleep(2) - continue - else: - break - return response - - -# krkn_lib -def get_container_name(pod_name, namespace, kubecli:KrknKubernetes, container_name=""): - - container_names = kubecli.get_containers_in_pod(pod_name, namespace) - if container_name != "": - if container_name in container_names: - return container_name - else: - logging.error( - "Container name %s not an existing container in pod %s" % ( - container_name, - pod_name - ) - ) - else: - container_name = container_names[ - # random module here is not used for security/cryptographic - # purposes - random.randint(0, len(container_names) - 1) # nosec - ] - return container_name - - - -def skew_node(node_name: str, action: str, kubecli: KrknKubernetes): - pod_namespace = "default" - status_pod_name = f"time-skew-pod-{get_random_string(5)}" - skew_pod_name = f"time-skew-pod-{get_random_string(5)}" - ntp_enabled = True - logging.info(f'Creating pod to skew {"time" if action == "skew_time" else "date"} on node {node_name}') - status_command = ["timedatectl"] - param = "2001-01-01" - skew_command = ["timedatectl", "set-time"] - if action == "skew_time": - skew_command.append("01:01:01") - else: - skew_command.append("2001-01-01") - - try: - status_response = kubecli.exec_command_on_node(node_name, status_command, status_pod_name, pod_namespace) - if "Network time on: no" in status_response: - ntp_enabled = False - - logging.warning(f'ntp unactive on node {node_name} skewing {"time" if action == "skew_time" else "date"} to {param}') - pod_exec(skew_pod_name, skew_command, pod_namespace, None, kubecli) - else: - logging.info(f'ntp active in cluster node, {"time" if action == "skew_time" else "date"} skewing will have no effect, skipping') - except ApiException: - pass - except Exception as e: - logging.error(f"failed to execute skew command in pod: {e}") - finally: - kubecli.delete_pod(status_pod_name, pod_namespace) - if not ntp_enabled : - kubecli.delete_pod(skew_pod_name, pod_namespace) - - 
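# --- Editor's note (not part of this patch): a minimal standalone sketch of the
# decision the deleted skew_node() above makes. It inspects `timedatectl` output for
# the NTP state and only issues `timedatectl set-time` on the node when NTP is off,
# since an active NTP daemon would immediately correct the skew. The sample output
# and values below are made up for illustration.
status_response = "      Local time: Mon 2001-01-01 01:01:01 UTC\nNetwork time on: no"
action = "skew_time"  # the scenario action; "skew_date" sets a date instead
skew_command = ["timedatectl", "set-time",
                "01:01:01" if action == "skew_time" else "2001-01-01"]
if "Network time on: no" in status_response:
    print("NTP off, node command would be:", " ".join(skew_command))
else:
    print("NTP active on the node, skew would have no lasting effect; skipping")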
- -# krkn_lib -def skew_time(scenario, kubecli:KrknKubernetes): - if scenario["action"] not in ["skew_date","skew_time"]: - raise RuntimeError(f'{scenario["action"]} is not a valid time skew action') - - if "node" in scenario["object_type"]: - node_names = [] - if "object_name" in scenario.keys() and scenario["object_name"]: - node_names = scenario["object_name"] - elif ( - "label_selector" in scenario.keys() and - scenario["label_selector"] - ): - node_names = kubecli.list_nodes(scenario["label_selector"]) - for node in node_names: - skew_node(node, scenario["action"], kubecli) - logging.info("Reset date/time on node " + str(node)) - return "node", node_names - - elif "pod" in scenario["object_type"]: - skew_command = "date --date " - if scenario["action"] == "skew_date": - skewed_date = "00-01-01" - skew_command += skewed_date - elif scenario["action"] == "skew_time": - skewed_time = "01:01:01" - skew_command += skewed_time - container_name = get_yaml_item_value(scenario, "container_name", "") - pod_names = [] - if "object_name" in scenario.keys() and scenario["object_name"]: - for name in scenario["object_name"]: - if "namespace" not in scenario.keys(): - logging.error("Need to set namespace when using pod name") - # removed_exit - # sys.exit(1) - raise RuntimeError() - pod_names.append([name, scenario["namespace"]]) - elif "namespace" in scenario.keys() and scenario["namespace"]: - if "label_selector" not in scenario.keys(): - logging.info( - "label_selector key not found, querying for all the pods " - "in namespace: %s" % (scenario["namespace"]) - ) - pod_names = kubecli.list_pods(scenario["namespace"]) - else: - logging.info( - "Querying for the pods matching the %s label_selector " - "in namespace %s" - % (scenario["label_selector"], scenario["namespace"]) - ) - pod_names = kubecli.list_pods( - scenario["namespace"], - scenario["label_selector"] - ) - counter = 0 - for pod_name in pod_names: - pod_names[counter] = [pod_name, scenario["namespace"]] - counter += 1 - elif ( - "label_selector" in scenario.keys() and - scenario["label_selector"] - ): - pod_names = kubecli.get_all_pods(scenario["label_selector"]) - - if len(pod_names) == 0: - logging.info( - "Cannot find pods matching the namespace/label_selector, " - "please check" - ) - # removed_exit - # sys.exit(1) - raise RuntimeError() - pod_counter = 0 - for pod in pod_names: - if len(pod) > 1: - selected_container_name = get_container_name( - pod[0], - pod[1], - kubecli, - container_name, - - ) - pod_exec_response = pod_exec( - pod[0], - skew_command, - pod[1], - selected_container_name, - kubecli, - - ) - if pod_exec_response is False: - logging.error( - "Couldn't reset time on container %s " - "in pod %s in namespace %s" - % (selected_container_name, pod[0], pod[1]) - ) - # removed_exit - # sys.exit(1) - raise RuntimeError() - pod_names[pod_counter].append(selected_container_name) - else: - selected_container_name = get_container_name( - pod, - scenario["namespace"], - kubecli, - container_name - ) - pod_exec_response = pod_exec( - pod, - skew_command, - scenario["namespace"], - selected_container_name, - kubecli - ) - if pod_exec_response is False: - logging.error( - "Couldn't reset time on container " - "%s in pod %s in namespace %s" - % ( - selected_container_name, - pod, - scenario["namespace"] - ) - ) - # removed_exit - # sys.exit(1) - raise RuntimeError() - pod_names[pod_counter].append(selected_container_name) - logging.info("Reset date/time on pod " + str(pod[0])) - pod_counter += 1 - return "pod", pod_names - - -# From 
kubectl/oc command get time output -def parse_string_date(obj_datetime): - try: - logging.info("Obj_date time " + str(obj_datetime)) - obj_datetime = re.sub(r"\s\s+", " ", obj_datetime).strip() - logging.info("Obj_date sub time " + str(obj_datetime)) - date_line = re.match( - r"[\s\S\n]*\w{3} \w{3} \d{1,} \d{2}:\d{2}:\d{2} \w{3} \d{4}[\s\S\n]*", # noqa - obj_datetime - ) - if date_line is not None: - search_response = date_line.group().strip() - logging.info("Search response: " + str(search_response)) - return search_response - else: - return "" - except Exception as e: - logging.info( - "Exception %s when trying to parse string to date" % str(e) - ) - return "" - - -# Get date and time from string returned from OC -def string_to_date(obj_datetime): - obj_datetime = parse_string_date(obj_datetime) - try: - date_time_obj = datetime.datetime.strptime( - obj_datetime, - "%a %b %d %H:%M:%S %Z %Y" - ) - return date_time_obj - except Exception: - logging.info("Couldn't parse string to datetime object") - return datetime.datetime(datetime.MINYEAR, 1, 1) - - -# krkn_lib -def check_date_time(object_type, names, kubecli:KrknKubernetes): - skew_command = "date" - not_reset = [] - max_retries = 30 - if object_type == "node": - for node_name in names: - first_date_time = datetime.datetime.utcnow() - check_pod_name = f"time-skew-pod-{get_random_string(5)}" - node_datetime_string = kubecli.exec_command_on_node(node_name, [skew_command], check_pod_name) - node_datetime = string_to_date(node_datetime_string) - counter = 0 - while not ( - first_date_time < node_datetime < datetime.datetime.utcnow() - ): - time.sleep(10) - logging.info( - "Date/time on node %s still not reset, " - "waiting 10 seconds and retrying" % node_name - ) - - node_datetime_string = kubecli.exec_cmd_in_pod([skew_command], check_pod_name, "default") - node_datetime = string_to_date(node_datetime_string) - counter += 1 - if counter > max_retries: - logging.error( - "Date and time in node %s didn't reset properly" % - node_name - ) - not_reset.append(node_name) - break - if counter < max_retries: - logging.info( - "Date in node " + str(node_name) + " reset properly" - ) - kubecli.delete_pod(check_pod_name) - - elif object_type == "pod": - for pod_name in names: - first_date_time = datetime.datetime.utcnow() - counter = 0 - pod_datetime_string = pod_exec( - pod_name[0], - skew_command, - pod_name[1], - pod_name[2], - kubecli - ) - pod_datetime = string_to_date(pod_datetime_string) - while not ( - first_date_time < pod_datetime < datetime.datetime.utcnow() - ): - time.sleep(10) - logging.info( - "Date/time on pod %s still not reset, " - "waiting 10 seconds and retrying" % pod_name[0] - ) - pod_datetime = pod_exec( - pod_name[0], - skew_command, - pod_name[1], - pod_name[2], - kubecli - ) - pod_datetime = string_to_date(pod_datetime) - counter += 1 - if counter > max_retries: - logging.error( - "Date and time in pod %s didn't reset properly" % - pod_name[0] - ) - not_reset.append(pod_name[0]) - break - if counter < max_retries: - logging.info( - "Date in pod " + str(pod_name[0]) + " reset properly" - ) - return not_reset - - -# krkn_lib -def run(scenarios_list, - config, - wait_duration, - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str) -> (list[str], list[ScenarioTelemetry]): - failed_scenarios = [] - scenario_telemetries: list[ScenarioTelemetry] = [] - for time_scenario_config in scenarios_list: - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = time_scenario_config - 
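# --- Editor's note (not part of this patch): a standalone sketch of the parsing done
# by the deleted parse_string_date()/string_to_date() above, which normalise `date`
# command output and convert it into a datetime so check_date_time() can verify the
# clock has been reset. The sample string is made up.
import datetime
import re

raw_output = "Mon Jan  1 01:01:01 UTC 2001\n"
normalised = re.sub(r"\s\s+", " ", raw_output).strip()
match = re.match(
    r"[\s\S\n]*\w{3} \w{3} \d{1,} \d{2}:\d{2}:\d{2} \w{3} \d{4}[\s\S\n]*", normalised
)
if match:
    parsed = datetime.datetime.strptime(
        match.group().strip(), "%a %b %d %H:%M:%S %Z %Y"
    )
    print(parsed)  # 2001-01-01 01:01:01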
scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, time_scenario_config) - try: - with open(time_scenario_config, "r") as f: - scenario_config = yaml.full_load(f) - for time_scenario in scenario_config["time_scenarios"]: - start_time = int(time.time()) - object_type, object_names = skew_time(time_scenario, telemetry.kubecli) - not_reset = check_date_time(object_type, object_names, telemetry.kubecli) - if len(not_reset) > 0: - logging.info("Object times were not reset") - logging.info( - "Waiting for the specified duration: %s" % (wait_duration) - ) - time.sleep(wait_duration) - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - not_reset, - start_time, - end_time - ) - except (RuntimeError, Exception): - scenario_telemetry.exit_status = 1 - log_exception(time_scenario_config) - failed_scenarios.append(time_scenario_config) - else: - scenario_telemetry.exit_status = 0 - scenario_telemetry.end_timestamp = time.time() - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - scenario_telemetries.append(scenario_telemetry) - - return failed_scenarios, scenario_telemetries diff --git a/kraken/utils/functions.py b/kraken/utils/functions.py deleted file mode 100644 index 222283ff..00000000 --- a/kraken/utils/functions.py +++ /dev/null @@ -1,60 +0,0 @@ -import krkn_lib.utils -from krkn_lib.k8s import KrknKubernetes -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift -from tzlocal.unix import get_localzone - - -def populate_cluster_events(scenario_telemetry: ScenarioTelemetry, - scenario_config: dict, - kubecli: KrknKubernetes, - start_timestamp: int, - end_timestamp: int - ): - events = [] - namespaces = __retrieve_namespaces(scenario_config, kubecli) - - if len(namespaces) == 0: - events.extend(kubecli.collect_and_parse_cluster_events(start_timestamp, end_timestamp, str(get_localzone()))) - else: - for namespace in namespaces: - events.extend(kubecli.collect_and_parse_cluster_events(start_timestamp, end_timestamp, str(get_localzone()), - namespace=namespace)) - - scenario_telemetry.set_cluster_events(events) - - -def collect_and_put_ocp_logs(telemetry_ocp: KrknTelemetryOpenshift, - scenario_config: dict, - request_id: str, - start_timestamp: int, - end_timestamp: int, - ): - if ( - telemetry_ocp.krkn_telemetry_config and - telemetry_ocp.krkn_telemetry_config["enabled"] and - telemetry_ocp.krkn_telemetry_config["logs_backup"] and - not telemetry_ocp.kubecli.is_kubernetes() - ): - namespaces = __retrieve_namespaces(scenario_config, telemetry_ocp.kubecli) - if len(namespaces) > 0: - for namespace in namespaces: - telemetry_ocp.put_ocp_logs(request_id, - telemetry_ocp.krkn_telemetry_config, - start_timestamp, - end_timestamp, - namespace) - else: - telemetry_ocp.put_ocp_logs(request_id, - telemetry_ocp.krkn_telemetry_config, - start_timestamp, - end_timestamp) - - -def __retrieve_namespaces(scenario_config: dict, kubecli: KrknKubernetes) -> set[str]: - namespaces = list() - namespaces.extend(krkn_lib.utils.deep_get_attribute("namespace", scenario_config)) - namespace_patterns = krkn_lib.utils.deep_get_attribute("namespace_pattern", 
scenario_config) - for pattern in namespace_patterns: - namespaces.extend(kubecli.list_namespaces_by_regex(pattern)) - return set(namespaces) diff --git a/kraken/zone_outage/actions.py b/kraken/zone_outage/actions.py deleted file mode 100644 index 7e42375b..00000000 --- a/kraken/zone_outage/actions.py +++ /dev/null @@ -1,138 +0,0 @@ -import yaml -import logging -import time - -from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift - -from .. import utils -from ..node_actions.aws_node_scenarios import AWS -from ..cerberus import setup as cerberus -from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import log_exception - -def run(scenarios_list, - config, - wait_duration, - telemetry: KrknTelemetryOpenshift, - telemetry_request_id: str) -> (list[str], list[ScenarioTelemetry]) : - """ - filters the subnet of interest and applies the network acl - to create zone outage - """ - failed_post_scenarios = "" - scenario_telemetries: list[ScenarioTelemetry] = [] - failed_scenarios = [] - - for zone_outage_config in scenarios_list: - scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = zone_outage_config - scenario_telemetry.start_timestamp = time.time() - parsed_scenario_config = telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config) - try: - if len(zone_outage_config) > 1: - with open(zone_outage_config, "r") as f: - zone_outage_config_yaml = yaml.full_load(f) - scenario_config = zone_outage_config_yaml["zone_outage"] - vpc_id = scenario_config["vpc_id"] - subnet_ids = scenario_config["subnet_id"] - duration = scenario_config["duration"] - cloud_type = scenario_config["cloud_type"] - ids = {} - acl_ids_created = [] - - if cloud_type.lower() == "aws": - cloud_object = AWS() - else: - logging.error( - "Cloud type %s is not currently supported for " - "zone outage scenarios" - % cloud_type - ) - # removed_exit - # sys.exit(1) - raise RuntimeError() - - start_time = int(time.time()) - - for subnet_id in subnet_ids: - logging.info("Targeting subnet_id") - network_association_ids = [] - associations, original_acl_id = \ - cloud_object.describe_network_acls(vpc_id, subnet_id) - for entry in associations: - if entry["SubnetId"] == subnet_id: - network_association_ids.append( - entry["NetworkAclAssociationId"] - ) - logging.info( - "Network association ids associated with " - "the subnet %s: %s" - % (subnet_id, network_association_ids) - ) - acl_id = cloud_object.create_default_network_acl(vpc_id) - new_association_id = \ - cloud_object.replace_network_acl_association( - network_association_ids[0], acl_id - ) - - # capture the orginal_acl_id, created_acl_id and - # new association_id to use during the recovery - ids[new_association_id] = original_acl_id - acl_ids_created.append(acl_id) - - # wait for the specified duration - logging.info( - "Waiting for the specified duration " - "in the config: %s" % (duration) - ) - time.sleep(duration) - - # replace the applied acl with the previous acl in use - for new_association_id, original_acl_id in ids.items(): - cloud_object.replace_network_acl_association( - new_association_id, - original_acl_id - ) - logging.info( - "Wating for 60 seconds to make sure " - "the changes are in place" - ) - time.sleep(60) - - # delete the network acl created for the run - for acl_id in acl_ids_created: - cloud_object.delete_network_acl(acl_id) - - logging.info( - "End of scenario. 
" - "Waiting for the specified duration: %s" % (wait_duration) - ) - time.sleep(wait_duration) - - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - failed_post_scenarios, - start_time, - end_time - ) - except (RuntimeError, Exception): - scenario_telemetry.exit_status = 1 - failed_scenarios.append(zone_outage_config) - log_exception(zone_outage_config) - else: - scenario_telemetry.exit_status = 0 - scenario_telemetry.end_timestamp = time.time() - utils.collect_and_put_ocp_logs(telemetry, - parsed_scenario_config, - telemetry_request_id, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - utils.populate_cluster_events(scenario_telemetry, - parsed_scenario_config, - telemetry.kubecli, - int(scenario_telemetry.start_timestamp), - int(scenario_telemetry.end_timestamp)) - scenario_telemetries.append(scenario_telemetry) - return failed_scenarios, scenario_telemetries - diff --git a/kraken/__init__.py b/krkn/__init__.py similarity index 100% rename from kraken/__init__.py rename to krkn/__init__.py diff --git a/krkn/cerberus/__init__.py b/krkn/cerberus/__init__.py new file mode 100644 index 00000000..9ca22d34 --- /dev/null +++ b/krkn/cerberus/__init__.py @@ -0,0 +1 @@ +from .setup import * diff --git a/kraken/cerberus/setup.py b/krkn/cerberus/setup.py similarity index 100% rename from kraken/cerberus/setup.py rename to krkn/cerberus/setup.py diff --git a/kraken/chaos_recommender/__init__.py b/krkn/chaos_recommender/__init__.py similarity index 100% rename from kraken/chaos_recommender/__init__.py rename to krkn/chaos_recommender/__init__.py diff --git a/kraken/chaos_recommender/analysis.py b/krkn/chaos_recommender/analysis.py similarity index 56% rename from kraken/chaos_recommender/analysis.py rename to krkn/chaos_recommender/analysis.py index 2c41f40e..90bf9a1c 100644 --- a/kraken/chaos_recommender/analysis.py +++ b/krkn/chaos_recommender/analysis.py @@ -1,7 +1,6 @@ import logging import pandas as pd -import kraken.chaos_recommender.kraken_tests as kraken_tests import time KRAKEN_TESTS_PATH = "./kraken_chaos_tests.txt" @@ -23,7 +22,9 @@ def calculate_zscores(data): zscores["Service"] = data["service"] zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std() zscores["Memory"] = (data["MEM"] - data["MEM"].mean()) / data["MEM"].std() - zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data["NETWORK"].std() + zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data[ + "NETWORK" + ].std() return zscores @@ -37,18 +38,28 @@ def identify_outliers(data, threshold): def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold): # Filter the DataFrame based on CPU_HEATMAP and MEM_HEATMAP thresholds - filtered_df = dataframe[((dataframe['CPU']/dataframe['CPU_LIMITS']) > cpu_threshold)] + filtered_df = dataframe[ + ((dataframe["CPU"] / dataframe["CPU_LIMITS"]) > cpu_threshold) + ] # Get the lists of services - cpu_services = filtered_df['service'].tolist() + cpu_services = filtered_df["service"].tolist() - filtered_df = dataframe[((dataframe['MEM']/dataframe['MEM_LIMITS']) > mem_threshold)] - mem_services = filtered_df['service'].tolist() + filtered_df = dataframe[ + ((dataframe["MEM"] / dataframe["MEM_LIMITS"]) > mem_threshold) + ] + mem_services = filtered_df["service"].tolist() return cpu_services, mem_services -def analysis(file_path, namespaces, chaos_tests_config, threshold, - heatmap_cpu_threshold, heatmap_mem_threshold): +def analysis( + file_path, + namespaces, + 
chaos_tests_config, + threshold, + heatmap_cpu_threshold, + heatmap_mem_threshold, +): # Load the telemetry data from file logging.info("Fetching the Telemetry data...") data = load_telemetry_data(file_path) @@ -66,29 +77,43 @@ def analysis(file_path, namespaces, chaos_tests_config, threshold, namespace_zscores = zscores.loc[zscores["Namespace"] == namespace] namespace_data = data.loc[data["namespace"] == namespace] outliers_cpu, outliers_memory, outliers_network = identify_outliers( - namespace_zscores, threshold) + namespace_zscores, threshold + ) cpu_services, mem_services = get_services_above_heatmap_threshold( - namespace_data, heatmap_cpu_threshold, heatmap_mem_threshold) + namespace_data, heatmap_cpu_threshold, heatmap_mem_threshold + ) - analysis_data[namespace] = analysis_json(outliers_cpu, outliers_memory, - outliers_network, - cpu_services, mem_services, - chaos_tests_config) + analysis_data[namespace] = analysis_json( + outliers_cpu, + outliers_memory, + outliers_network, + cpu_services, + mem_services, + chaos_tests_config, + ) if cpu_services: - logging.info(f"These services use significant CPU compared to " - f"their assigned limits: {cpu_services}") + logging.info( + f"These services use significant CPU compared to " + f"their assigned limits: {cpu_services}" + ) else: - logging.info("There are no services that are using significant " - "CPU compared to their assigned limits " - "(infinite in case no limits are set).") + logging.info( + "There are no services that are using significant " + "CPU compared to their assigned limits " + "(infinite in case no limits are set)." + ) if mem_services: - logging.info(f"These services use significant MEMORY compared to " - f"their assigned limits: {mem_services}") + logging.info( + f"These services use significant MEMORY compared to " + f"their assigned limits: {mem_services}" + ) else: - logging.info("There are no services that are using significant " - "MEMORY compared to their assigned limits " - "(infinite in case no limits are set).") + logging.info( + "There are no services that are using significant " + "MEMORY compared to their assigned limits " + "(infinite in case no limits are set)." 
+ ) time.sleep(2) logging.info("Please check data in utilisation.txt for further analysis") @@ -96,36 +121,41 @@ def analysis(file_path, namespaces, chaos_tests_config, threshold, return analysis_data -def analysis_json(outliers_cpu, outliers_memory, outliers_network, - cpu_services, mem_services, chaos_tests_config): +def analysis_json( + outliers_cpu, + outliers_memory, + outliers_network, + cpu_services, + mem_services, + chaos_tests_config, +): profiling = { "cpu_outliers": outliers_cpu, "memory_outliers": outliers_memory, - "network_outliers": outliers_network + "network_outliers": outliers_network, } heatmap = { "services_with_cpu_heatmap_above_threshold": cpu_services, - "services_with_mem_heatmap_above_threshold": mem_services + "services_with_mem_heatmap_above_threshold": mem_services, } recommendations = {} if cpu_services: - cpu_recommend = {"services": cpu_services, - "tests": chaos_tests_config['CPU']} + cpu_recommend = {"services": cpu_services, "tests": chaos_tests_config["CPU"]} recommendations["cpu_services_recommendations"] = cpu_recommend if mem_services: - mem_recommend = {"services": mem_services, - "tests": chaos_tests_config['MEM']} + mem_recommend = {"services": mem_services, "tests": chaos_tests_config["MEM"]} recommendations["mem_services_recommendations"] = mem_recommend if outliers_network: - outliers_network_recommend = {"outliers_networks": outliers_network, - "tests": chaos_tests_config['NETWORK']} - recommendations["outliers_network_recommendations"] = ( - outliers_network_recommend) + outliers_network_recommend = { + "outliers_networks": outliers_network, + "tests": chaos_tests_config["NETWORK"], + } + recommendations["outliers_network_recommendations"] = outliers_network_recommend return [profiling, heatmap, recommendations] diff --git a/kraken/chaos_recommender/kraken_tests.py b/krkn/chaos_recommender/kraken_tests.py similarity index 71% rename from kraken/chaos_recommender/kraken_tests.py rename to krkn/chaos_recommender/kraken_tests.py index deb3fa2b..8909e329 100644 --- a/kraken/chaos_recommender/kraken_tests.py +++ b/krkn/chaos_recommender/kraken_tests.py @@ -1,13 +1,13 @@ def get_entries_by_category(filename, category): # Read the file - with open(filename, 'r') as file: + with open(filename, "r") as file: content = file.read() # Split the content into sections based on the square brackets - sections = content.split('\n\n') + sections = content.split("\n\n") # Define the categories - valid_categories = ['CPU', 'NETWORK', 'MEM', 'GENERIC'] + valid_categories = ["CPU", "NETWORK", "MEM", "GENERIC"] # Validate the provided category if category not in valid_categories: @@ -25,6 +25,10 @@ def get_entries_by_category(filename, category): return [] # Extract the entries from the category section - entries = [entry.strip() for entry in target_section.split('\n') if entry and not entry.startswith('[')] + entries = [ + entry.strip() + for entry in target_section.split("\n") + if entry and not entry.startswith("[") + ] return entries diff --git a/krkn/chaos_recommender/prometheus.py b/krkn/chaos_recommender/prometheus.py new file mode 100644 index 00000000..c00f73d3 --- /dev/null +++ b/krkn/chaos_recommender/prometheus.py @@ -0,0 +1,203 @@ +import logging + +from prometheus_api_client import PrometheusConnect +import pandas as pd +import urllib3 + + +saved_metrics_path = "./utilisation.txt" + + +def convert_data_to_dataframe(data, label): + df = pd.DataFrame() + df["service"] = [item["metric"]["pod"] for item in data] + df[label] = [item["value"][1] for item in 
data] + + return df + + +def convert_data(data, service): + result = {} + for entry in data: + pod_name = entry["metric"]["pod"] + value = entry["value"][1] + result[pod_name] = value + return result.get( + service + ) # for those pods whose limits are not defined they can take as much resources, there assigning a very high value + + +def convert_data_limits(data, node_data, service, prometheus): + result = {} + for entry in data: + pod_name = entry["metric"]["pod"] + value = entry["value"][1] + result[pod_name] = value + return result.get( + service, get_node_capacity(node_data, service, prometheus) + ) # for those pods whose limits are not defined they can take as much resources, there assigning a very high value + + +def get_node_capacity(node_data, pod_name, prometheus): + + # Get the node name on which the pod is running + query = f'kube_pod_info{{pod="{pod_name}"}}' + result = prometheus.custom_query(query) + if not result: + return None + + node_name = result[0]["metric"]["node"] + + for item in node_data: + if item["metric"]["node"] == node_name: + return item["value"][1] + + return "1000000000" + + +def save_utilization_to_file(utilization, filename, prometheus): + + merged_df = pd.DataFrame( + columns=[ + "namespace", + "service", + "CPU", + "CPU_LIMITS", + "MEM", + "MEM_LIMITS", + "NETWORK", + ] + ) + for namespace in utilization: + # Loading utilization_data[] for namespace + # indexes -- 0 CPU, 1 CPU limits, 2 mem, 3 mem limits, 4 network + utilization_data = utilization[namespace] + df_cpu = convert_data_to_dataframe(utilization_data[0], "CPU") + services = df_cpu.service.unique() + logging.info(f"Services for namespace {namespace}: {services}") + + for s in services: + + new_row_df = pd.DataFrame( + { + "namespace": namespace, + "service": s, + "CPU": convert_data(utilization_data[0], s), + "CPU_LIMITS": convert_data_limits( + utilization_data[1], utilization_data[5], s, prometheus + ), + "MEM": convert_data(utilization_data[2], s), + "MEM_LIMITS": convert_data_limits( + utilization_data[3], utilization_data[6], s, prometheus + ), + "NETWORK": convert_data(utilization_data[4], s), + }, + index=[0], + ) + merged_df = pd.concat([merged_df, new_row_df], ignore_index=True) + + # Convert columns to string + merged_df["CPU"] = merged_df["CPU"].astype(str) + merged_df["MEM"] = merged_df["MEM"].astype(str) + merged_df["CPU_LIMITS"] = merged_df["CPU_LIMITS"].astype(str) + merged_df["MEM_LIMITS"] = merged_df["MEM_LIMITS"].astype(str) + merged_df["NETWORK"] = merged_df["NETWORK"].astype(str) + + # Extract integer part before the decimal point + # merged_df['CPU'] = merged_df['CPU'].str.split('.').str[0] + # merged_df['MEM'] = merged_df['MEM'].str.split('.').str[0] + # merged_df['CPU_LIMITS'] = merged_df['CPU_LIMITS'].str.split('.').str[0] + # merged_df['MEM_LIMITS'] = merged_df['MEM_LIMITS'].str.split('.').str[0] + # merged_df['NETWORK'] = merged_df['NETWORK'].str.split('.').str[0] + + merged_df.to_csv(filename, sep="\t", index=False) + + +def fetch_utilization_from_prometheus( + prometheus_endpoint, auth_token, namespaces, scrape_duration +): + urllib3.disable_warnings() + prometheus = PrometheusConnect( + url=prometheus_endpoint, + headers={"Authorization": "Bearer {}".format(auth_token)}, + disable_ssl=True, + ) + + # Dicts for saving utilisation and queries -- key is namespace + utilization = {} + queries = {} + + logging.info("Fetching utilization...") + for namespace in namespaces: + + # Fetch CPU utilization + cpu_query = ( + 'sum (rate 
(container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' + % (namespace, scrape_duration) + ) + cpu_result = prometheus.custom_query(cpu_query) + + cpu_limits_query = ( + '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' + % (namespace) + ) + cpu_limits_result = prometheus.custom_query(cpu_limits_query) + + node_cpu_limits_query = ( + 'kube_node_status_capacity{resource="cpu", unit="core"}*1000' + ) + node_cpu_limits_result = prometheus.custom_query(node_cpu_limits_query) + + mem_query = ( + 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' + % (namespace, scrape_duration) + ) + mem_result = prometheus.custom_query(mem_query) + + mem_limits_query = ( + 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' + % (namespace) + ) + mem_limits_result = prometheus.custom_query(mem_limits_query) + + node_mem_limits_query = ( + 'kube_node_status_capacity{resource="memory", unit="byte"}' + ) + node_mem_limits_result = prometheus.custom_query(node_mem_limits_query) + + network_query = ( + 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \ + (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' + % (namespace, scrape_duration, namespace, scrape_duration) + ) + network_result = prometheus.custom_query(network_query) + + utilization[namespace] = [ + cpu_result, + cpu_limits_result, + mem_result, + mem_limits_result, + network_result, + node_cpu_limits_result, + node_mem_limits_result, + ] + queries[namespace] = json_queries( + cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query + ) + + save_utilization_to_file(utilization, saved_metrics_path, prometheus) + + return saved_metrics_path, queries + + +def json_queries( + cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query +): + queries = { + "cpu_query": cpu_query, + "cpu_limit_query": cpu_limits_query, + "memory_query": mem_query, + "memory_limit_query": mem_limits_query, + "network_query": network_query, + } + return queries diff --git a/kraken/application_outage/__init__.py b/krkn/invoke/__init__.py similarity index 100% rename from kraken/application_outage/__init__.py rename to krkn/invoke/__init__.py diff --git a/kraken/invoke/command.py b/krkn/invoke/command.py similarity index 100% rename from kraken/invoke/command.py rename to krkn/invoke/command.py diff --git a/kraken/cerberus/__init__.py b/krkn/performance_dashboards/__init__.py similarity index 100% rename from kraken/cerberus/__init__.py rename to krkn/performance_dashboards/__init__.py diff --git a/kraken/performance_dashboards/setup.py b/krkn/performance_dashboards/setup.py similarity index 90% rename from kraken/performance_dashboards/setup.py rename to krkn/performance_dashboards/setup.py index 33c92a70..f8bf6fea 100644 --- a/kraken/performance_dashboards/setup.py +++ b/krkn/performance_dashboards/setup.py @@ -14,7 +14,9 @@ def setup(repo, distribution): logging.error("Provided distribution: %s is not supported" % (distribution)) sys.exit(1) delete_repo = "rm -rf performance-dashboards || exit 0" - logging.info("Cloning, installing mutable grafana on the cluster and loading the dashboards") + logging.info( + "Cloning, installing mutable grafana on the cluster and loading the dashboards" + ) try: # delete repo to clone the latest copy if exists subprocess.run(delete_repo, shell=True, universal_newlines=True, timeout=45) diff --git 
a/kraken/prometheus/__init__.py b/krkn/prometheus/__init__.py similarity index 100% rename from kraken/prometheus/__init__.py rename to krkn/prometheus/__init__.py diff --git a/kraken/prometheus/client.py b/krkn/prometheus/client.py similarity index 51% rename from kraken/prometheus/client.py rename to krkn/prometheus/client.py index e3466fdd..b444f5e8 100644 --- a/kraken/prometheus/client.py +++ b/krkn/prometheus/client.py @@ -16,15 +16,18 @@ from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) -def alerts(prom_cli: KrknPrometheus, - elastic: KrknElastic, - run_uuid, - start_time, - end_time, - alert_profile, - elastic_collect_alerts, - elastic_alerts_index - ): + + +def alerts( + prom_cli: KrknPrometheus, + elastic: KrknElastic, + run_uuid, + start_time, + end_time, + alert_profile, + elastic_collect_alerts, + elastic_alerts_index, +): if alert_profile is None or os.path.exists(alert_profile) is False: logging.error(f"{alert_profile} alert profile does not exist") @@ -33,69 +36,102 @@ def alerts(prom_cli: KrknPrometheus, with open(alert_profile) as profile: profile_yaml = yaml.safe_load(profile) if not isinstance(profile_yaml, list): - logging.error(f"{alert_profile} wrong file format, alert profile must be " - f"a valid yaml file containing a list of items with at least 3 properties: " - f"expr, description, severity" ) + logging.error( + f"{alert_profile} wrong file format, alert profile must be " + f"a valid yaml file containing a list of items with at least 3 properties: " + f"expr, description, severity" + ) sys.exit(1) for alert in profile_yaml: if list(alert.keys()).sort() != ["expr", "description", "severity"].sort(): logging.error(f"wrong alert {alert}, skipping") - processed_alert = prom_cli.process_alert(alert, - datetime.datetime.fromtimestamp(start_time), - datetime.datetime.fromtimestamp(end_time)) - if processed_alert[0] and processed_alert[1] and elastic and elastic_collect_alerts: - elastic_alert = ElasticAlert(run_uuid=run_uuid, - severity=alert["severity"], - alert=processed_alert[1], - created_at=datetime.datetime.fromtimestamp(processed_alert[0]) - ) + processed_alert = prom_cli.process_alert( + alert, + datetime.datetime.fromtimestamp(start_time), + datetime.datetime.fromtimestamp(end_time), + ) + if ( + processed_alert[0] + and processed_alert[1] + and elastic + and elastic_collect_alerts + ): + elastic_alert = ElasticAlert( + run_uuid=run_uuid, + severity=alert["severity"], + alert=processed_alert[1], + created_at=datetime.datetime.fromtimestamp(processed_alert[0]), + ) result = elastic.push_alert(elastic_alert, elastic_alerts_index) if result == -1: logging.error("failed to save alert on ElasticSearch") pass - -def critical_alerts(prom_cli: KrknPrometheus, - summary: ChaosRunAlertSummary, - run_id, - scenario, - start_time, - end_time): +def critical_alerts( + prom_cli: KrknPrometheus, + summary: ChaosRunAlertSummary, + run_id, + scenario, + start_time, + end_time, +): summary.scenario = scenario summary.run_id = run_id query = r"""ALERTS{severity="critical"}""" logging.info("Checking for critical alerts firing post chaos") during_critical_alerts = prom_cli.process_prom_query_in_range( - query, - start_time=datetime.datetime.fromtimestamp(start_time), - end_time=end_time - + query, start_time=datetime.datetime.fromtimestamp(start_time), end_time=end_time ) for alert in during_critical_alerts: if "metric" in alert: - alertname = alert["metric"]["alertname"] if "alertname" in 
alert["metric"] else "none" - alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none" - namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none" - severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none" + alertname = ( + alert["metric"]["alertname"] + if "alertname" in alert["metric"] + else "none" + ) + alertstate = ( + alert["metric"]["alertstate"] + if "alertstate" in alert["metric"] + else "none" + ) + namespace = ( + alert["metric"]["namespace"] + if "namespace" in alert["metric"] + else "none" + ) + severity = ( + alert["metric"]["severity"] if "severity" in alert["metric"] else "none" + ) alert = ChaosRunAlert(alertname, alertstate, namespace, severity) summary.chaos_alerts.append(alert) - - post_critical_alerts = prom_cli.process_query( - query - ) + post_critical_alerts = prom_cli.process_query(query) for alert in post_critical_alerts: if "metric" in alert: - alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none" - alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none" - namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none" - severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none" + alertname = ( + alert["metric"]["alertname"] + if "alertname" in alert["metric"] + else "none" + ) + alertstate = ( + alert["metric"]["alertstate"] + if "alertstate" in alert["metric"] + else "none" + ) + namespace = ( + alert["metric"]["namespace"] + if "namespace" in alert["metric"] + else "none" + ) + severity = ( + alert["metric"]["severity"] if "severity" in alert["metric"] else "none" + ) alert = ChaosRunAlert(alertname, alertstate, namespace, severity) summary.post_chaos_alerts.append(alert) @@ -113,15 +149,16 @@ def critical_alerts(prom_cli: KrknPrometheus, logging.info("No critical alerts are firing!!") -def metrics(prom_cli: KrknPrometheus, - elastic: KrknElastic, - run_uuid, - start_time, - end_time, - metrics_profile, - elastic_collect_metrics, - elastic_metrics_index - ) -> list[dict[str, list[(int, float)] | str]]: +def metrics( + prom_cli: KrknPrometheus, + elastic: KrknElastic, + run_uuid, + start_time, + end_time, + metrics_profile, + elastic_collect_metrics, + elastic_metrics_index, +) -> list[dict[str, list[(int, float)] | str]]: metrics_list: list[dict[str, list[(int, float)] | str]] = [] if metrics_profile is None or os.path.exists(metrics_profile) is False: logging.error(f"{metrics_profile} alert profile does not exist") @@ -129,22 +166,26 @@ def metrics(prom_cli: KrknPrometheus, with open(metrics_profile) as profile: profile_yaml = yaml.safe_load(profile) if not profile_yaml["metrics"] or not isinstance(profile_yaml["metrics"], list): - logging.error(f"{metrics_profile} wrong file format, alert profile must be " - f"a valid yaml file containing a list of items with 3 properties: " - f"expr, description, severity" ) + logging.error( + f"{metrics_profile} wrong file format, alert profile must be " + f"a valid yaml file containing a list of items with 3 properties: " + f"expr, description, severity" + ) sys.exit(1) for metric_query in profile_yaml["metrics"]: - if list(metric_query.keys()).sort() != ["query", "metricName", "instant"].sort(): + if ( + list(metric_query.keys()).sort() + != ["query", "metricName", "instant"].sort() + ): logging.error(f"wrong alert {metric_query}, skipping") metrics_result = prom_cli.process_prom_query_in_range( metric_query["query"], 
start_time=datetime.datetime.fromtimestamp(start_time), - end_time=datetime.datetime.fromtimestamp(end_time) - + end_time=datetime.datetime.fromtimestamp(end_time), ) - metric = {"name": metric_query["metricName"], "values":[]} + metric = {"name": metric_query["metricName"], "values": []} for returned_metric in metrics_result: if "values" in returned_metric: for value in returned_metric["values"]: @@ -155,13 +196,10 @@ def metrics(prom_cli: KrknPrometheus, metrics_list.append(metric) if elastic_collect_metrics and elastic: - result = elastic.upload_metrics_to_elasticsearch(run_uuid=run_uuid, index=elastic_metrics_index, raw_data=metrics_list) + result = elastic.upload_metrics_to_elasticsearch( + run_uuid=run_uuid, index=elastic_metrics_index, raw_data=metrics_list + ) if result == -1: logging.error("failed to save metrics on ElasticSearch") - return metrics_list - - - - diff --git a/kraken/invoke/__init__.py b/krkn/scenario_plugins/__init__.py similarity index 100% rename from kraken/invoke/__init__.py rename to krkn/scenario_plugins/__init__.py diff --git a/krkn/scenario_plugins/abstract_scenario_plugin.py b/krkn/scenario_plugins/abstract_scenario_plugin.py new file mode 100644 index 00000000..060d9ec3 --- /dev/null +++ b/krkn/scenario_plugins/abstract_scenario_plugin.py @@ -0,0 +1,115 @@ +import logging +import time +from abc import ABC, abstractmethod +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift + +from krkn import utils + + +class AbstractScenarioPlugin(ABC): + @abstractmethod + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + """ + This method serves as the entry point for a ScenarioPlugin. To make the plugin loadable, + the AbstractScenarioPlugin class must be extended, and this method must be implemented. + No exception must be propagated outside of this method. + + :param run_uuid: the uuid of the chaos run generated by krkn for every single run + :param scenario: the config file of the scenario that is currently executed + :param krkn_config: the full dictionary representation of the `config.yaml` + :param lib_telemetry: it is a composite object of all the + krkn-lib objects and methods needed by a krkn plugin to run. + :param scenario_telemetry: the `ScenarioTelemetry` object of the scenario that is currently executed + :return: 0 if the scenario suceeded 1 if failed + """ + pass + + @abstractmethod + def get_scenario_types(self) -> list[str]: + """ + Indicates the scenario types specified in the `config.yaml`. For the plugin to be properly + loaded, recognized and executed, it must be implemented and must return the matching `scenario_type` strings. + One plugin can be mapped one or many different strings unique across the other plugins otherwise an exception + will be thrown. 
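# --- Editor's note (not part of this patch): a minimal sketch of a concrete plugin
# written against the new AbstractScenarioPlugin contract above. The class name and
# the "dummy_scenarios" scenario_type are hypothetical; only run() and
# get_scenario_types() need to be implemented, and run() must swallow its own
# exceptions and report success/failure via the return code.
import logging

import yaml
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift

from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin


class DummyScenarioPlugin(AbstractScenarioPlugin):
    def run(
        self,
        run_uuid: str,
        scenario: str,
        krkn_config: dict[str, any],
        lib_telemetry: KrknTelemetryOpenshift,
        scenario_telemetry: ScenarioTelemetry,
    ) -> int:
        try:
            with open(scenario, "r") as f:
                scenario_config = yaml.full_load(f)
            logging.info("dummy scenario %s parsed: %s", scenario, scenario_config)
            return 0  # 0 = success, 1 = failure
        except Exception as e:
            logging.error("DummyScenarioPlugin exiting due to Exception %s" % e)
            return 1

    def get_scenario_types(self) -> list[str]:
        # must be unique across all loaded plugins
        return ["dummy_scenarios"]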
+ + + :return: the corresponding scenario_type as a list of strings + """ + pass + + def run_scenarios( + self, + run_uuid: str, + scenarios_list: list[str], + krkn_config: dict[str, any], + telemetry: KrknTelemetryOpenshift, + ) -> tuple[list[str], list[ScenarioTelemetry]]: + + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] + wait_duration = krkn_config["tunings"]["wait_duration"] + for scenario_config in scenarios_list: + if isinstance(scenario_config, list): + logging.error( + "post scenarios have been deprecated, please " + "remove sub-lists from `scenarios` in config.yaml" + ) + failed_scenarios.append(scenario_config) + break + + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = scenario_config + scenario_telemetry.start_timestamp = time.time() + parsed_scenario_config = telemetry.set_parameters_base64( + scenario_telemetry, scenario_config + ) + + try: + logging.info( + f"Running {self.__class__.__name__}: {self.get_scenario_types()} -> {scenario_config}" + ) + return_value = self.run( + run_uuid, + scenario_config, + krkn_config, + telemetry, + scenario_telemetry, + ) + except Exception as e: + logging.error( + f"uncaught exception on scenario `run()` method: {e} " + f"please report an issue on https://github.com/krkn-chaos/krkn" + ) + return_value = 1 + + scenario_telemetry.exit_status = return_value + scenario_telemetry.end_timestamp = time.time() + utils.collect_and_put_ocp_logs( + telemetry, + parsed_scenario_config, + telemetry.get_telemetry_request_id(), + int(scenario_telemetry.start_timestamp), + int(scenario_telemetry.end_timestamp), + ) + utils.populate_cluster_events( + scenario_telemetry, + parsed_scenario_config, + telemetry.get_lib_kubernetes(), + int(scenario_telemetry.start_timestamp), + int(scenario_telemetry.end_timestamp), + ) + + if scenario_telemetry.exit_status != 0: + failed_scenarios.append(scenario_config) + scenario_telemetries.append(scenario_telemetry) + logging.info(f"wating {wait_duration} before running the next scenario") + time.sleep(wait_duration) + return failed_scenarios, scenario_telemetries diff --git a/kraken/managedcluster_scenarios/__init__.py b/krkn/scenario_plugins/application_outage/__init__.py similarity index 100% rename from kraken/managedcluster_scenarios/__init__.py rename to krkn/scenario_plugins/application_outage/__init__.py diff --git a/krkn/scenario_plugins/application_outage/application_outage_scenario_plugin.py b/krkn/scenario_plugins/application_outage/application_outage_scenario_plugin.py new file mode 100644 index 00000000..e016c2dc --- /dev/null +++ b/krkn/scenario_plugins/application_outage/application_outage_scenario_plugin.py @@ -0,0 +1,88 @@ +import logging +import time +import yaml +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn_lib.utils import get_yaml_item_value +from jinja2 import Template +from krkn import cerberus +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class ApplicationOutageScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + wait_duration = krkn_config["tunings"]["wait_duration"] + try: + with open(scenario, "r") as f: + app_outage_config_yaml = yaml.full_load(f) + scenario_config = app_outage_config_yaml["application_outage"] + pod_selector = get_yaml_item_value( + 
scenario_config, "pod_selector", "{}" + ) + traffic_type = get_yaml_item_value( + scenario_config, "block", "[Ingress, Egress]" + ) + namespace = get_yaml_item_value(scenario_config, "namespace", "") + duration = get_yaml_item_value(scenario_config, "duration", 60) + + start_time = int(time.time()) + + network_policy_template = """--- + apiVersion: networking.k8s.io/v1 + kind: NetworkPolicy + metadata: + name: kraken-deny + spec: + podSelector: + matchLabels: {{ pod_selector }} + policyTypes: {{ traffic_type }} + """ + t = Template(network_policy_template) + rendered_spec = t.render( + pod_selector=pod_selector, traffic_type=traffic_type + ) + yaml_spec = yaml.safe_load(rendered_spec) + # Block the traffic by creating network policy + logging.info("Creating the network policy") + + lib_telemetry.get_lib_kubernetes().create_net_policy( + yaml_spec, namespace + ) + + # wait for the specified duration + logging.info( + "Waiting for the specified duration in the config: %s" % duration + ) + time.sleep(duration) + + # unblock the traffic by deleting the network policy + logging.info("Deleting the network policy") + lib_telemetry.get_lib_kubernetes().delete_net_policy( + "kraken-deny", namespace + ) + + logging.info( + "End of scenario. Waiting for the specified duration: %s" + % wait_duration + ) + time.sleep(wait_duration) + + end_time = int(time.time()) + cerberus.publish_kraken_status(krkn_config, [], start_time, end_time) + except Exception as e: + logging.error( + "ApplicationOutageScenarioPlugin exiting due to Exception %s" % e + ) + return 1 + else: + return 0 + + def get_scenario_types(self) -> list[str]: + return ["application_outages_scenarios"] diff --git a/kraken/network_chaos/__init__.py b/krkn/scenario_plugins/arcaflow/__init__.py similarity index 100% rename from kraken/network_chaos/__init__.py rename to krkn/scenario_plugins/arcaflow/__init__.py diff --git a/krkn/scenario_plugins/arcaflow/arcaflow_scenario_plugin.py b/krkn/scenario_plugins/arcaflow/arcaflow_scenario_plugin.py new file mode 100644 index 00000000..a61cd167 --- /dev/null +++ b/krkn/scenario_plugins/arcaflow/arcaflow_scenario_plugin.py @@ -0,0 +1,197 @@ +import logging +import os +from pathlib import Path +import arcaflow +import yaml +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin +from krkn.scenario_plugins.arcaflow.context_auth import ContextAuth + + +class ArcaflowScenarioPlugin(AbstractScenarioPlugin): + + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + try: + engine_args = self.build_args(scenario) + status_code = self.run_workflow( + engine_args, lib_telemetry.get_lib_kubernetes().get_kubeconfig_path() + ) + return status_code + except Exception as e: + logging.error("ArcaflowScenarioPlugin exiting due to Exception %s" % e) + return 1 + + def get_scenario_types(self) -> [str]: + return ["hog_scenarios", "arcaflow_scenario"] + + def run_workflow( + self, engine_args: arcaflow.EngineArgs, kubeconfig_path: str + ) -> int: + self.set_arca_kubeconfig(engine_args, kubeconfig_path) + exit_status = arcaflow.run(engine_args) + return exit_status + + def build_args(self, input_file: str) -> arcaflow.EngineArgs: + """sets the kubeconfig parsed by setArcaKubeConfig as an input to the arcaflow workflow""" + current_path = Path().resolve() + 
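# --- Editor's note (not part of this patch): rendering the NetworkPolicy template
# used by ApplicationOutageScenarioPlugin above with sample values, to show the
# manifest that is created before the outage and deleted afterwards. The label
# selector below is made up.
import yaml
from jinja2 import Template

network_policy_template = """---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: kraken-deny
spec:
  podSelector:
    matchLabels: {{ pod_selector }}
  policyTypes: {{ traffic_type }}
"""

rendered = Template(network_policy_template).render(
    pod_selector={"app": "my-service"},  # sample pod label selector
    traffic_type="[Ingress, Egress]",    # default: block both directions
)
policy = yaml.safe_load(rendered)
print(policy["spec"]["policyTypes"])  # ['Ingress', 'Egress']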
context = f"{current_path}/{Path(input_file).parent}" + workflow = f"{context}/workflow.yaml" + config = f"{context}/config.yaml" + if not os.path.exists(context): + raise Exception( + "context folder for arcaflow workflow not found: {}".format(context) + ) + if not os.path.exists(input_file): + raise Exception( + "input file for arcaflow workflow not found: {}".format(input_file) + ) + if not os.path.exists(workflow): + raise Exception( + "workflow file for arcaflow workflow not found: {}".format(workflow) + ) + if not os.path.exists(config): + raise Exception( + "configuration file for arcaflow workflow not found: {}".format(config) + ) + + engine_args = arcaflow.EngineArgs() + engine_args.context = context + engine_args.config = config + engine_args.workflow = workflow + engine_args.input = f"{current_path}/{input_file}" + return engine_args + + def set_arca_kubeconfig( + self, engine_args: arcaflow.EngineArgs, kubeconfig_path: str + ): + + context_auth = ContextAuth() + if not os.path.exists(kubeconfig_path): + raise Exception("kubeconfig not found in {}".format(kubeconfig_path)) + + with open(kubeconfig_path, "r") as stream: + try: + kubeconfig = yaml.safe_load(stream) + context_auth.fetch_auth_data(kubeconfig) + except Exception as e: + logging.error( + "impossible to read kubeconfig file in: {}".format(kubeconfig_path) + ) + raise e + + kubeconfig_str = self.set_kubeconfig_auth(kubeconfig, context_auth) + + with open(engine_args.input, "r") as stream: + input_file = yaml.safe_load(stream) + if "input_list" in input_file and isinstance( + input_file["input_list"], list + ): + for index, _ in enumerate(input_file["input_list"]): + if isinstance(input_file["input_list"][index], dict): + input_file["input_list"][index]["kubeconfig"] = kubeconfig_str + else: + input_file["kubeconfig"] = kubeconfig_str + stream.close() + with open(engine_args.input, "w") as stream: + yaml.safe_dump(input_file, stream) + + with open(engine_args.config, "r") as stream: + config_file = yaml.safe_load(stream) + if config_file["deployers"]["image"]["deployer_name"] == "kubernetes": + kube_connection = self.set_kubernetes_deployer_auth( + config_file["deployers"]["image"]["connection"], context_auth + ) + config_file["deployers"]["image"]["connection"] = kube_connection + with open(engine_args.config, "w") as stream: + yaml.safe_dump(config_file, stream, explicit_start=True, width=4096) + + def set_kubernetes_deployer_auth( + self, deployer: any, context_auth: ContextAuth + ) -> any: + if context_auth.clusterHost is not None: + deployer["host"] = context_auth.clusterHost + if context_auth.clientCertificateData is not None: + deployer["cert"] = context_auth.clientCertificateData + if context_auth.clientKeyData is not None: + deployer["key"] = context_auth.clientKeyData + if context_auth.clusterCertificateData is not None: + deployer["cacert"] = context_auth.clusterCertificateData + if context_auth.username is not None: + deployer["username"] = context_auth.username + if context_auth.password is not None: + deployer["password"] = context_auth.password + if context_auth.bearerToken is not None: + deployer["bearerToken"] = context_auth.bearerToken + return deployer + + def set_kubeconfig_auth(self, kubeconfig: any, context_auth: ContextAuth) -> str: + """ + Builds an arcaflow-compatible kubeconfig representation and returns it as a string. 
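# --- Editor's note (not part of this patch): a compact sketch of the rewrite that
# set_kubeconfig_auth() (continued below) applies to one kubeconfig user entry --
# file-path credential fields are replaced with base64-encoded *-data fields so the
# whole kubeconfig can be embedded as a string in the Arcaflow input.yaml. Paths and
# PEM contents here are placeholders.
import base64

import yaml

user_entry = {"client-certificate": "/tmp/client.crt", "client-key": "/tmp/client.key"}
client_cert_pem = "-----BEGIN CERTIFICATE-----\n...\n-----END CERTIFICATE-----\n"
client_key_pem = "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n"

# inline the certificate and key, then drop the path-based fields
user_entry["client-certificate-data"] = base64.b64encode(
    client_cert_pem.encode("utf-8")
).decode("ascii")
del user_entry["client-certificate"]
user_entry["client-key-data"] = base64.b64encode(
    client_key_pem.encode("utf-8")
).decode("ascii")
del user_entry["client-key"]

print(yaml.dump(user_entry))  # only *-data fields remain; safe to inline in input.yaml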
+ In order to run arcaflow plugins in kubernetes/openshift the kubeconfig must contain client certificate/key + and server certificate base64 encoded within the kubeconfig file itself in *-data fields. That is not always the + case, infact kubeconfig may contain filesystem paths to those files, this function builds an arcaflow-compatible + kubeconfig file and returns it as a string that can be safely included in input.yaml + """ + + if "current-context" not in kubeconfig.keys(): + raise Exception( + "invalid kubeconfig file, impossible to determine current-context" + ) + user_id = None + cluster_id = None + user_name = None + cluster_name = None + current_context = kubeconfig["current-context"] + for context in kubeconfig["contexts"]: + if context["name"] == current_context: + user_name = context["context"]["user"] + cluster_name = context["context"]["cluster"] + if user_name is None: + raise Exception( + "user not set for context {} in kubeconfig file".format(current_context) + ) + if cluster_name is None: + raise Exception( + "cluster not set for context {} in kubeconfig file".format( + current_context + ) + ) + + for index, user in enumerate(kubeconfig["users"]): + if user["name"] == user_name: + user_id = index + for index, cluster in enumerate(kubeconfig["clusters"]): + if cluster["name"] == cluster_name: + cluster_id = index + + if cluster_id is None: + raise Exception( + "no cluster {} found in kubeconfig users".format(cluster_name) + ) + if "client-certificate" in kubeconfig["users"][user_id]["user"]: + kubeconfig["users"][user_id]["user"][ + "client-certificate-data" + ] = context_auth.clientCertificateDataBase64 + del kubeconfig["users"][user_id]["user"]["client-certificate"] + + if "client-key" in kubeconfig["users"][user_id]["user"]: + kubeconfig["users"][user_id]["user"][ + "client-key-data" + ] = context_auth.clientKeyDataBase64 + del kubeconfig["users"][user_id]["user"]["client-key"] + + if "certificate-authority" in kubeconfig["clusters"][cluster_id]["cluster"]: + kubeconfig["clusters"][cluster_id]["cluster"][ + "certificate-authority-data" + ] = context_auth.clusterCertificateDataBase64 + del kubeconfig["clusters"][cluster_id]["cluster"]["certificate-authority"] + kubeconfig_str = yaml.dump(kubeconfig) + return kubeconfig_str diff --git a/kraken/arcaflow_plugin/context_auth.py b/krkn/scenario_plugins/arcaflow/context_auth.py similarity index 80% rename from kraken/arcaflow_plugin/context_auth.py rename to krkn/scenario_plugins/arcaflow/context_auth.py index 47866c14..bb07e926 100644 --- a/kraken/arcaflow_plugin/context_auth.py +++ b/krkn/scenario_plugins/arcaflow/context_auth.py @@ -1,4 +1,3 @@ -import yaml import os import base64 @@ -20,23 +19,25 @@ class ContextAuth: @property def clusterCertificateDataBase64(self): if self.clusterCertificateData is not None: - return base64.b64encode(bytes(self.clusterCertificateData,'utf8')).decode("ascii") + return base64.b64encode(bytes(self.clusterCertificateData, "utf8")).decode( + "ascii" + ) return @property def clientCertificateDataBase64(self): if self.clientCertificateData is not None: - return base64.b64encode(bytes(self.clientCertificateData,'utf8')).decode("ascii") + return base64.b64encode(bytes(self.clientCertificateData, "utf8")).decode( + "ascii" + ) return @property def clientKeyDataBase64(self): if self.clientKeyData is not None: - return base64.b64encode(bytes(self.clientKeyData,"utf-8")).decode("ascii") + return base64.b64encode(bytes(self.clientKeyData, "utf-8")).decode("ascii") return - - def fetch_auth_data(self, 
kubeconfig: any): context_username = None current_context = kubeconfig["current-context"] @@ -56,8 +57,10 @@ class ContextAuth: for index, user in enumerate(kubeconfig["users"]): if user["name"] == context_username: user_id = index - if user_id is None : - raise Exception("user {0} not found in kubeconfig users".format(context_username)) + if user_id is None: + raise Exception( + "user {0} not found in kubeconfig users".format(context_username) + ) for index, cluster in enumerate(kubeconfig["clusters"]): if cluster["name"] == self.clusterName: @@ -83,7 +86,9 @@ class ContextAuth: if "client-key-data" in user: try: - self.clientKeyData = base64.b64decode(user["client-key-data"]).decode('utf-8') + self.clientKeyData = base64.b64decode(user["client-key-data"]).decode( + "utf-8" + ) except Exception as e: raise Exception("impossible to decode client-key-data") @@ -96,7 +101,9 @@ class ContextAuth: if "client-certificate-data" in user: try: - self.clientCertificateData = base64.b64decode(user["client-certificate-data"]).decode('utf-8') + self.clientCertificateData = base64.b64decode( + user["client-certificate-data"] + ).decode("utf-8") except Exception as e: raise Exception("impossible to decode client-certificate-data") @@ -105,13 +112,17 @@ class ContextAuth: if "certificate-authority" in cluster: try: self.clusterCertificate = cluster["certificate-authority"] - self.clusterCertificateData = self.read_file(cluster["certificate-authority"]) + self.clusterCertificateData = self.read_file( + cluster["certificate-authority"] + ) except Exception as e: raise e if "certificate-authority-data" in cluster: try: - self.clusterCertificateData = base64.b64decode(cluster["certificate-authority-data"]).decode('utf-8') + self.clusterCertificateData = base64.b64decode( + cluster["certificate-authority-data"] + ).decode("utf-8") except Exception as e: raise Exception("impossible to decode certificate-authority-data") @@ -124,19 +135,8 @@ class ContextAuth: if "token" in user: self.bearerToken = user["token"] - def read_file(self, filename:str) -> str: + def read_file(self, filename: str) -> str: if not os.path.exists(filename): raise Exception("file not found {0} ".format(filename)) with open(filename, "rb") as file_stream: - return file_stream.read().decode('utf-8') - - - - - - - - - - - + return file_stream.read().decode("utf-8") diff --git a/kraken/arcaflow_plugin/fixtures/ca.crt b/krkn/scenario_plugins/arcaflow/fixtures/ca.crt similarity index 100% rename from kraken/arcaflow_plugin/fixtures/ca.crt rename to krkn/scenario_plugins/arcaflow/fixtures/ca.crt diff --git a/kraken/arcaflow_plugin/fixtures/client.crt b/krkn/scenario_plugins/arcaflow/fixtures/client.crt similarity index 100% rename from kraken/arcaflow_plugin/fixtures/client.crt rename to krkn/scenario_plugins/arcaflow/fixtures/client.crt diff --git a/kraken/arcaflow_plugin/fixtures/client.key b/krkn/scenario_plugins/arcaflow/fixtures/client.key similarity index 100% rename from kraken/arcaflow_plugin/fixtures/client.key rename to krkn/scenario_plugins/arcaflow/fixtures/client.key diff --git a/kraken/arcaflow_plugin/test_context_auth.py b/krkn/scenario_plugins/arcaflow/test_context_auth.py similarity index 96% rename from kraken/arcaflow_plugin/test_context_auth.py rename to krkn/scenario_plugins/arcaflow/test_context_auth.py index 5571018e..75e48113 100644 --- a/kraken/arcaflow_plugin/test_context_auth.py +++ b/krkn/scenario_plugins/arcaflow/test_context_auth.py @@ -1,7 +1,9 @@ import os import unittest -from context_auth import ContextAuth 
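The intent of set_kubeconfig_auth and the *Base64 properties above is to make the kubeconfig self-contained before it is written into the workflow input: any client-certificate, client-key or certificate-authority file reference is replaced by the corresponding base64 *-data field. A minimal sketch of that transformation for a single field (the entry and file path are hypothetical):

import base64

def inline_client_certificate(user_entry: dict) -> None:
    # Replace a client-certificate path with inline base64 data, mirroring
    # what set_kubeconfig_auth does via ContextAuth for all *-data fields.
    cert_path = user_entry.get("client-certificate")  # e.g. "/tmp/client.crt" (hypothetical)
    if cert_path:
        with open(cert_path, "rb") as f:
            user_entry["client-certificate-data"] = base64.b64encode(f.read()).decode("ascii")
        del user_entry["client-certificate"]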
+import yaml + +from .context_auth import ContextAuth class TestCurrentContext(unittest.TestCase): @@ -9,7 +11,7 @@ class TestCurrentContext(unittest.TestCase): def get_kubeconfig_with_data(self) -> str: """ This function returns a test kubeconfig file as a string. - + :return: a test kubeconfig file in string format (for unit testing purposes) """ # NOQA return """apiVersion: v1 @@ -71,7 +73,8 @@ users: def test_current_context(self): cwd = os.getcwd() current_context_data = ContextAuth() - current_context_data.fetch_auth_data(self.get_kubeconfig_with_data()) + data = yaml.safe_load(self.get_kubeconfig_with_data()) + current_context_data.fetch_auth_data(data) self.assertIsNotNone(current_context_data.clusterCertificateData) self.assertIsNotNone(current_context_data.clientCertificateData) self.assertIsNotNone(current_context_data.clientKeyData) @@ -81,7 +84,8 @@ users: self.assertIsNotNone(current_context_data.clusterHost) current_context_no_data = ContextAuth() - current_context_no_data.fetch_auth_data(self.get_kubeconfig_with_paths()) + data = yaml.safe_load(self.get_kubeconfig_with_paths()) + current_context_no_data.fetch_auth_data(data) self.assertIsNotNone(current_context_no_data.clusterCertificate) self.assertIsNotNone(current_context_no_data.clusterCertificateData) self.assertIsNotNone(current_context_no_data.clientCertificate) @@ -92,9 +96,3 @@ users: self.assertIsNotNone(current_context_no_data.password) self.assertIsNotNone(current_context_no_data.bearerToken) self.assertIsNotNone(current_context_data.clusterHost) - - - - - - diff --git a/kraken/node_actions/__init__.py b/krkn/scenario_plugins/container/__init__.py similarity index 100% rename from kraken/node_actions/__init__.py rename to krkn/scenario_plugins/container/__init__.py diff --git a/krkn/scenario_plugins/container/container_scenario_plugin.py b/krkn/scenario_plugins/container/container_scenario_plugin.py new file mode 100644 index 00000000..9da36d11 --- /dev/null +++ b/krkn/scenario_plugins/container/container_scenario_plugin.py @@ -0,0 +1,232 @@ +import logging +import random +import time + +import yaml +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn_lib.utils import get_yaml_item_value + +from krkn import cerberus +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class ContainerScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + start_time = int(time.time()) + pool = PodsMonitorPool(lib_telemetry.get_lib_kubernetes()) + wait_duration = krkn_config["tunings"]["wait_duration"] + try: + with open(scenario, "r") as f: + cont_scenario_config = yaml.full_load(f) + self.start_monitoring( + kill_scenarios=cont_scenario_config["scenarios"], pool=pool + ) + killed_containers = self.container_killing_in_pod( + cont_scenario_config, lib_telemetry.get_lib_kubernetes() + ) + logging.info(f"killed containers: {str(killed_containers)}") + result = pool.join() + if result.error: + logging.error( + f"ContainerScenarioPlugin pods failed to recover: {result.error}" + ) + return 1 + scenario_telemetry.affected_pods = result + logging.info("Waiting for the specified duration: %s" % (wait_duration)) + time.sleep(wait_duration)
+ + # capture end time + end_time = int(time.time()) + + # publish cerberus status + cerberus.publish_kraken_status(krkn_config, [], start_time, end_time) + except (RuntimeError, Exception) as e: + logging.error("ContainerScenarioPlugin exiting due to Exception %s" % e) + return 1 + else: + return 0 + + def get_scenario_types(self) -> list[str]: + return ["container_scenarios"] + + def start_monitoring(self, kill_scenarios: list[any], pool: PodsMonitorPool): + for kill_scenario in kill_scenarios: + namespace_pattern = f"^{kill_scenario['namespace']}$" + label_selector = kill_scenario["label_selector"] + recovery_time = kill_scenario["expected_recovery_time"] + pool.select_and_monitor_by_namespace_pattern_and_label( + namespace_pattern=namespace_pattern, + label_selector=label_selector, + max_timeout=recovery_time, + ) + + def container_killing_in_pod(self, cont_scenario, kubecli: KrknKubernetes): + scenario_name = get_yaml_item_value(cont_scenario, "name", "") + namespace = get_yaml_item_value(cont_scenario, "namespace", "*") + label_selector = get_yaml_item_value(cont_scenario, "label_selector", None) + pod_names = get_yaml_item_value(cont_scenario, "pod_names", []) + container_name = get_yaml_item_value(cont_scenario, "container_name", "") + kill_action = get_yaml_item_value(cont_scenario, "action", 1) + kill_count = get_yaml_item_value(cont_scenario, "count", 1) + if not isinstance(kill_action, int): + logging.error( + "Please make sure the action parameter defined in the " + "config is an integer" + ) + raise RuntimeError() + if (kill_action < 1) or (kill_action > 15): + logging.error("Only 1-15 kill signals are supported.") + raise RuntimeError() + kill_action = "kill " + str(kill_action) + if type(pod_names) != list: + logging.error("Please make sure your pod_names are in a list format") + # removed_exit + # sys.exit(1) + raise RuntimeError() + if len(pod_names) == 0: + if namespace == "*": + # returns double array of pod name and namespace + pods = kubecli.get_all_pods(label_selector) + else: + # Only returns pod names + pods = kubecli.list_pods(namespace, label_selector) + else: + if namespace == "*": + logging.error( + "You must specify the namespace to kill a container in a specific pod" + ) + logging.error("Scenario " + scenario_name + " failed") + # removed_exit + # sys.exit(1) + raise RuntimeError() + pods = pod_names + # get container and pod name + container_pod_list = [] + for pod in pods: + if type(pod) == list: + pod_output = kubecli.get_pod_info(pod[0], pod[1]) + container_names = [ + container.name for container in pod_output.containers + ] + + container_pod_list.append([pod[0], pod[1], container_names]) + else: + pod_output = kubecli.get_pod_info(pod, namespace) + container_names = [ + container.name for container in pod_output.containers + ] + container_pod_list.append([pod, namespace, container_names]) + + killed_count = 0 + killed_container_list = [] + while killed_count < kill_count: + if len(container_pod_list) == 0: + logging.error( + "Trying to kill more containers than were found, try lowering kill count" + ) + logging.error("Scenario " + scenario_name + " failed") + # removed_exit + # sys.exit(1) + raise RuntimeError() + selected_container_pod = container_pod_list[ + random.randint(0, len(container_pod_list) - 1) + ] + for c_name in selected_container_pod[2]: + if container_name != "": + if c_name == container_name: + killed_container_list.append( + [ + selected_container_pod[0], + selected_container_pod[1], + c_name, + ] + ) + self.retry_container_killing(
kill_action, + selected_container_pod[0], + selected_container_pod[1], + c_name, + kubecli, + ) + break + else: + killed_container_list.append( + [selected_container_pod[0], selected_container_pod[1], c_name] + ) + self.retry_container_killing( + kill_action, + selected_container_pod[0], + selected_container_pod[1], + c_name, + kubecli, + ) + break + container_pod_list.remove(selected_container_pod) + killed_count += 1 + logging.info("Scenario " + scenario_name + " successfully injected") + return killed_container_list + + def retry_container_killing( + self, kill_action, podname, namespace, container_name, kubecli: KrknKubernetes + ): + i = 0 + while i < 5: + logging.info( + "Killing container %s in pod %s (ns %s)" + % (str(container_name), str(podname), str(namespace)) + ) + response = kubecli.exec_cmd_in_pod( + kill_action, podname, namespace, container_name + ) + i += 1 + # Blank response means it is done + if not response: + break + elif ( + "unauthorized" in response.lower() + or "authorization" in response.lower() + ): + time.sleep(2) + continue + else: + logging.warning(response) + continue + + def check_failed_containers( + self, killed_container_list, wait_time, kubecli: KrknKubernetes + ): + + container_ready = [] + timer = 0 + while timer <= wait_time: + for killed_container in killed_container_list: + # pod namespace contain name + pod_output = kubecli.get_pod_info( + killed_container[0], killed_container[1] + ) + + for container in pod_output.containers: + if container.name == killed_container[2]: + if container.ready: + container_ready.append(killed_container) + if len(container_ready) != 0: + for item in container_ready: + killed_container_list = killed_container_list.remove(item) + if killed_container_list is None or len(killed_container_list) == 0: + return [] + timer += 5 + logging.info("Waiting 5 seconds for containers to become ready") + time.sleep(5) + return killed_container_list diff --git a/kraken/performance_dashboards/__init__.py b/krkn/scenario_plugins/managed_cluster/__init__.py similarity index 100% rename from kraken/performance_dashboards/__init__.py rename to krkn/scenario_plugins/managed_cluster/__init__.py diff --git a/kraken/managedcluster_scenarios/common_managedcluster_functions.py b/krkn/scenario_plugins/managed_cluster/common_functions.py similarity index 65% rename from kraken/managedcluster_scenarios/common_managedcluster_functions.py rename to krkn/scenario_plugins/managed_cluster/common_functions.py index b4a17c4c..15e73c13 100644 --- a/kraken/managedcluster_scenarios/common_managedcluster_functions.py +++ b/krkn/scenario_plugins/managed_cluster/common_functions.py @@ -2,28 +2,37 @@ import random import logging from krkn_lib.k8s import KrknKubernetes + # krkn_lib # Pick a random managedcluster with specified label selector def get_managedcluster( - managedcluster_name, - label_selector, - instance_kill_count, - kubecli: KrknKubernetes): + managedcluster_name, label_selector, instance_kill_count, kubecli: KrknKubernetes +): if managedcluster_name in kubecli.list_killable_managedclusters(): return [managedcluster_name] elif managedcluster_name: - logging.info("managedcluster with provided managedcluster_name does not exist or the managedcluster might " "be in unavailable state.") + logging.info( + "managedcluster with provided managedcluster_name does not exist or the managedcluster might " + "be in unavailable state." 
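Stepping back to ContainerScenarioPlugin above: the get_yaml_item_value calls in container_killing_in_pod and the fields read by start_monitoring imply a scenario file shaped roughly like the entry below. All values are illustrative; action is the numeric kill signal (1-15) sent inside the container, and expected_recovery_time bounds how long the pod monitor waits.

# Illustrative container_scenarios entry; keys mirror the plugin code above,
# values are made up for the example.
container_scenario_config = {
    "scenarios": [
        {
            "name": "kill sidecar container",
            "namespace": "demo",
            "label_selector": "app=web",
            "container_name": "proxy",
            "action": 1,                   # signal passed to `kill` inside the container
            "count": 1,
            "expected_recovery_time": 60,  # seconds PodsMonitorPool waits for recovery
        }
    ]
}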
+ ) managedclusters = kubecli.list_killable_managedclusters(label_selector) if not managedclusters: - raise Exception("Available managedclusters with the provided label selector do not exist") - logging.info("Available managedclusters with the label selector %s: %s" % (label_selector, managedclusters)) + raise Exception( + "Available managedclusters with the provided label selector do not exist" + ) + logging.info( + "Available managedclusters with the label selector %s: %s" + % (label_selector, managedclusters) + ) number_of_managedclusters = len(managedclusters) if instance_kill_count == number_of_managedclusters: return managedclusters managedclusters_to_return = [] for i in range(instance_kill_count): - managedcluster_to_add = managedclusters[random.randint(0, len(managedclusters) - 1)] + managedcluster_to_add = managedclusters[ + random.randint(0, len(managedclusters) - 1) + ] managedclusters_to_return.append(managedcluster_to_add) managedclusters.remove(managedcluster_to_add) return managedclusters_to_return diff --git a/krkn/scenario_plugins/managed_cluster/managed_cluster_scenario_plugin.py b/krkn/scenario_plugins/managed_cluster/managed_cluster_scenario_plugin.py new file mode 100644 index 00000000..b95238d8 --- /dev/null +++ b/krkn/scenario_plugins/managed_cluster/managed_cluster_scenario_plugin.py @@ -0,0 +1,127 @@ +import logging +import time + +import yaml +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn_lib.utils import get_yaml_item_value + +from krkn import cerberus, utils +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin +from krkn.scenario_plugins.managed_cluster.common_functions import get_managedcluster +from krkn.scenario_plugins.managed_cluster.scenarios import Scenarios + + +class ManagedClusterScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + with open(scenario, "r") as f: + scenario = yaml.full_load(f) + for managedcluster_scenario in scenario["managedcluster_scenarios"]: + managedcluster_scenario_object = Scenarios( + lib_telemetry.get_lib_kubernetes() + ) + if managedcluster_scenario["actions"]: + for action in managedcluster_scenario["actions"]: + start_time = int(time.time()) + try: + self.inject_managedcluster_scenario( + action, + managedcluster_scenario, + managedcluster_scenario_object, + lib_telemetry.get_lib_kubernetes(), + ) + end_time = int(time.time()) + cerberus.get_status(krkn_config, start_time, end_time) + except Exception as e: + logging.error( + "ManagedClusterScenarioPlugin exiting due to Exception %s" + % e + ) + return 1 + else: + return 0 + + def inject_managedcluster_scenario( + self, + action, + managedcluster_scenario, + managedcluster_scenario_object, + kubecli: KrknKubernetes, + ): + # Get the managedcluster scenario configurations + run_kill_count = get_yaml_item_value(managedcluster_scenario, "runs", 1) + instance_kill_count = get_yaml_item_value( + managedcluster_scenario, "instance_count", 1 + ) + managedcluster_name = get_yaml_item_value( + managedcluster_scenario, "managedcluster_name", "" + ) + label_selector = get_yaml_item_value( + managedcluster_scenario, "label_selector", "" + ) + timeout = get_yaml_item_value(managedcluster_scenario, "timeout", 120) + # Get the managedcluster to apply the scenario + if 
managedcluster_name: + managedcluster_name_list = managedcluster_name.split(",") + else: + managedcluster_name_list = [managedcluster_name] + for single_managedcluster_name in managedcluster_name_list: + managedclusters = get_managedcluster( + single_managedcluster_name, label_selector, instance_kill_count, kubecli + ) + for single_managedcluster in managedclusters: + if action == "managedcluster_start_scenario": + managedcluster_scenario_object.managedcluster_start_scenario( + run_kill_count, single_managedcluster, timeout + ) + elif action == "managedcluster_stop_scenario": + managedcluster_scenario_object.managedcluster_stop_scenario( + run_kill_count, single_managedcluster, timeout + ) + elif action == "managedcluster_stop_start_scenario": + managedcluster_scenario_object.managedcluster_stop_start_scenario( + run_kill_count, single_managedcluster, timeout + ) + elif action == "managedcluster_termination_scenario": + managedcluster_scenario_object.managedcluster_termination_scenario( + run_kill_count, single_managedcluster, timeout + ) + elif action == "managedcluster_reboot_scenario": + managedcluster_scenario_object.managedcluster_reboot_scenario( + run_kill_count, single_managedcluster, timeout + ) + elif action == "stop_start_klusterlet_scenario": + managedcluster_scenario_object.stop_start_klusterlet_scenario( + run_kill_count, single_managedcluster, timeout + ) + elif action == "start_klusterlet_scenario": + managedcluster_scenario_object.stop_klusterlet_scenario( + run_kill_count, single_managedcluster, timeout + ) + elif action == "stop_klusterlet_scenario": + managedcluster_scenario_object.stop_klusterlet_scenario( + run_kill_count, single_managedcluster, timeout + ) + elif action == "managedcluster_crash_scenario": + managedcluster_scenario_object.managedcluster_crash_scenario( + run_kill_count, single_managedcluster, timeout + ) + else: + logging.info( + "There is no managedcluster action that matches %s, skipping scenario" + % action + ) + + def get_managedcluster_scenario_object(self, kubecli: KrknKubernetes): + return Scenarios(kubecli) + + def get_scenario_types(self) -> list[str]: + return ["managedcluster_scenarios"] diff --git a/kraken/managedcluster_scenarios/managedcluster_scenarios.py b/krkn/scenario_plugins/managed_cluster/scenarios.py similarity index 60% rename from kraken/managedcluster_scenarios/managedcluster_scenarios.py rename to krkn/scenario_plugins/managed_cluster/scenarios.py index b2478067..5e22a31e 100644 --- a/kraken/managedcluster_scenarios/managedcluster_scenarios.py +++ b/krkn/scenario_plugins/managed_cluster/scenarios.py @@ -2,104 +2,148 @@ from jinja2 import Environment, FileSystemLoader import os import time import logging -import sys import yaml -import kraken.managedcluster_scenarios.common_managedcluster_functions as common_managedcluster_functions +import krkn.scenario_plugins.managed_cluster.common_functions as common_managedcluster_functions from krkn_lib.k8s import KrknKubernetes + class GENERAL: def __init__(self): pass + # krkn_lib -class managedcluster_scenarios(): +class Scenarios: kubecli: KrknKubernetes + def __init__(self, kubecli: KrknKubernetes): self.kubecli = kubecli self.general = GENERAL() # managedcluster scenario to start the managedcluster - def managedcluster_start_scenario(self, instance_kill_count, managedcluster, timeout): + def managedcluster_start_scenario( + self, instance_kill_count, managedcluster, timeout + ): for _ in range(instance_kill_count): try: logging.info("Starting managedcluster_start_scenario 
injection") - file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__))) + file_loader = FileSystemLoader( + os.path.abspath(os.path.dirname(__file__)) + ) env = Environment(loader=file_loader, autoescape=False) template = env.get_template("manifestwork.j2") body = yaml.safe_load( - template.render(managedcluster_name=managedcluster, + template.render( + managedcluster_name=managedcluster, args="""kubectl scale deployment.apps/klusterlet --replicas 3 & - kubectl scale deployment.apps/klusterlet-registration-agent --replicas 1 -n open-cluster-management-agent""") + kubectl scale deployment.apps/klusterlet-registration-agent --replicas 1 -n open-cluster-management-agent""", + ) ) self.kubecli.create_manifestwork(body, managedcluster) - logging.info("managedcluster_start_scenario has been successfully injected!") + logging.info( + "managedcluster_start_scenario has been successfully injected!" + ) logging.info("Waiting for the specified timeout: %s" % timeout) - common_managedcluster_functions.wait_for_available_status(managedcluster, timeout, self.kubecli) + common_managedcluster_functions.wait_for_available_status( + managedcluster, timeout, self.kubecli + ) except Exception as e: logging.error("managedcluster scenario exiting due to Exception %s" % e) - sys.exit(1) + raise e finally: logging.info("Deleting manifestworks") self.kubecli.delete_manifestwork(managedcluster) # managedcluster scenario to stop the managedcluster - def managedcluster_stop_scenario(self, instance_kill_count, managedcluster, timeout): + def managedcluster_stop_scenario( + self, instance_kill_count, managedcluster, timeout + ): for _ in range(instance_kill_count): try: logging.info("Starting managedcluster_stop_scenario injection") - file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)),encoding='utf-8') + file_loader = FileSystemLoader( + os.path.abspath(os.path.dirname(__file__)), encoding="utf-8" + ) env = Environment(loader=file_loader, autoescape=False) template = env.get_template("manifestwork.j2") body = yaml.safe_load( - template.render(managedcluster_name=managedcluster, + template.render( + managedcluster_name=managedcluster, args="""kubectl scale deployment.apps/klusterlet --replicas 0 && - kubectl scale deployment.apps/klusterlet-registration-agent --replicas 0 -n open-cluster-management-agent""") + kubectl scale deployment.apps/klusterlet-registration-agent --replicas 0 -n open-cluster-management-agent""", + ) ) self.kubecli.create_manifestwork(body, managedcluster) - logging.info("managedcluster_stop_scenario has been successfully injected!") + logging.info( + "managedcluster_stop_scenario has been successfully injected!" 
+ ) logging.info("Waiting for the specified timeout: %s" % timeout) - common_managedcluster_functions.wait_for_unavailable_status(managedcluster, timeout, self.kubecli) + common_managedcluster_functions.wait_for_unavailable_status( + managedcluster, timeout, self.kubecli + ) except Exception as e: logging.error("managedcluster scenario exiting due to Exception %s" % e) - sys.exit(1) + raise e finally: logging.info("Deleting manifestworks") self.kubecli.delete_manifestwork(managedcluster) # managedcluster scenario to stop and then start the managedcluster - def managedcluster_stop_start_scenario(self, instance_kill_count, managedcluster, timeout): + def managedcluster_stop_start_scenario( + self, instance_kill_count, managedcluster, timeout + ): logging.info("Starting managedcluster_stop_start_scenario injection") self.managedcluster_stop_scenario(instance_kill_count, managedcluster, timeout) time.sleep(10) self.managedcluster_start_scenario(instance_kill_count, managedcluster, timeout) - logging.info("managedcluster_stop_start_scenario has been successfully injected!") + logging.info( + "managedcluster_stop_start_scenario has been successfully injected!" + ) # managedcluster scenario to terminate the managedcluster - def managedcluster_termination_scenario(self, instance_kill_count, managedcluster, timeout): - logging.info("managedcluster termination is not implemented, " "no action is going to be taken") + def managedcluster_termination_scenario( + self, instance_kill_count, managedcluster, timeout + ): + logging.info( + "managedcluster termination is not implemented, " + "no action is going to be taken" + ) # managedcluster scenario to reboot the managedcluster - def managedcluster_reboot_scenario(self, instance_kill_count, managedcluster, timeout): - logging.info("managedcluster reboot is not implemented," " no action is going to be taken") + def managedcluster_reboot_scenario( + self, instance_kill_count, managedcluster, timeout + ): + logging.info( + "managedcluster reboot is not implemented," + " no action is going to be taken" + ) # managedcluster scenario to start the klusterlet def start_klusterlet_scenario(self, instance_kill_count, managedcluster, timeout): for _ in range(instance_kill_count): try: logging.info("Starting start_klusterlet_scenario injection") - file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__))) + file_loader = FileSystemLoader( + os.path.abspath(os.path.dirname(__file__)) + ) env = Environment(loader=file_loader, autoescape=False) template = env.get_template("manifestwork.j2") body = yaml.safe_load( - template.render(managedcluster_name=managedcluster, - args="""kubectl scale deployment.apps/klusterlet --replicas 3""") + template.render( + managedcluster_name=managedcluster, + args="""kubectl scale deployment.apps/klusterlet --replicas 3""", + ) ) self.kubecli.create_manifestwork(body, managedcluster) - logging.info("start_klusterlet_scenario has been successfully injected!") - time.sleep(30) # until https://github.com/open-cluster-management-io/OCM/issues/118 gets solved + logging.info( + "start_klusterlet_scenario has been successfully injected!" 
+ ) + time.sleep( + 30 + ) # until https://github.com/open-cluster-management-io/OCM/issues/118 gets solved except Exception as e: logging.error("managedcluster scenario exiting due to Exception %s" % e) - sys.exit(1) + raise e finally: logging.info("Deleting manifestworks") self.kubecli.delete_manifestwork(managedcluster) @@ -109,25 +153,33 @@ class managedcluster_scenarios(): for _ in range(instance_kill_count): try: logging.info("Starting stop_klusterlet_scenario injection") - file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__))) + file_loader = FileSystemLoader( + os.path.abspath(os.path.dirname(__file__)) + ) env = Environment(loader=file_loader, autoescape=False) template = env.get_template("manifestwork.j2") body = yaml.safe_load( - template.render(managedcluster_name=managedcluster, - args="""kubectl scale deployment.apps/klusterlet --replicas 0""") + template.render( + managedcluster_name=managedcluster, + args="""kubectl scale deployment.apps/klusterlet --replicas 0""", + ) ) self.kubecli.create_manifestwork(body, managedcluster) logging.info("stop_klusterlet_scenario has been successfully injected!") - time.sleep(30) # until https://github.com/open-cluster-management-io/OCM/issues/118 gets solved + time.sleep( + 30 + ) # until https://github.com/open-cluster-management-io/OCM/issues/118 gets solved except Exception as e: logging.error("managedcluster scenario exiting due to Exception %s" % e) - sys.exit(1) + raise e finally: logging.info("Deleting manifestworks") self.kubecli.delete_manifestwork(managedcluster) # managedcluster scenario to stop and start the klusterlet - def stop_start_klusterlet_scenario(self, instance_kill_count, managedcluster, timeout): + def stop_start_klusterlet_scenario( + self, instance_kill_count, managedcluster, timeout + ): logging.info("Starting stop_start_klusterlet_scenario injection") self.stop_klusterlet_scenario(instance_kill_count, managedcluster, timeout) time.sleep(10) @@ -135,6 +187,10 @@ class managedcluster_scenarios(): logging.info("stop_start_klusterlet_scenario has been successfully injected!") # managedcluster scenario to crash the managedcluster - def managedcluster_crash_scenario(self, instance_kill_count, managedcluster, timeout): - logging.info("managedcluster crash scenario is not implemented, " "no action is going to be taken") - + def managedcluster_crash_scenario( + self, instance_kill_count, managedcluster, timeout + ): + logging.info( + "managedcluster crash scenario is not implemented, " + "no action is going to be taken" + ) diff --git a/kraken/pod_scenarios/__init__.py b/krkn/scenario_plugins/native/__init__.py similarity index 100% rename from kraken/pod_scenarios/__init__.py rename to krkn/scenario_plugins/native/__init__.py diff --git a/krkn/scenario_plugins/native/native_scenario_plugin.py b/krkn/scenario_plugins/native/native_scenario_plugin.py new file mode 100644 index 00000000..4c4605b7 --- /dev/null +++ b/krkn/scenario_plugins/native/native_scenario_plugin.py @@ -0,0 +1,93 @@ +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin +from krkn.scenario_plugins.native.plugins import PLUGINS +from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from typing import Any +import logging + + +class NativeScenarioPlugin(AbstractScenarioPlugin): + + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: 
KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + pool = PodsMonitorPool(lib_telemetry.get_lib_kubernetes()) + kill_scenarios = [ + kill_scenario + for kill_scenario in PLUGINS.unserialize_scenario(scenario) + if kill_scenario["id"] == "kill-pods" + ] + + try: + self.start_monitoring(pool, kill_scenarios) + PLUGINS.run( + scenario, + lib_telemetry.get_lib_kubernetes().get_kubeconfig_path(), + krkn_config, + run_uuid, + ) + result = pool.join() + scenario_telemetry.affected_pods = result + if result.error: + logging.error(f"NativeScenarioPlugin unrecovered pods: {result.error}") + return 1 + + except Exception as e: + logging.error("NativeScenarioPlugin exiting due to Exception %s" % e) + pool.cancel() + return 1 + else: + return 0 + + def get_scenario_types(self) -> list[str]: + return [ + "pod_disruption_scenarios", + "pod_network_scenario", + "vmware_node_scenarios", + "ibmcloud_node_scenarios", + ] + + def start_monitoring(self, pool: PodsMonitorPool, scenarios: list[Any]): + for kill_scenario in scenarios: + recovery_time = kill_scenario["config"]["krkn_pod_recovery_time"] + if ( + "namespace_pattern" in kill_scenario["config"] + and "label_selector" in kill_scenario["config"] + ): + namespace_pattern = kill_scenario["config"]["namespace_pattern"] + label_selector = kill_scenario["config"]["label_selector"] + pool.select_and_monitor_by_namespace_pattern_and_label( + namespace_pattern=namespace_pattern, + label_selector=label_selector, + max_timeout=recovery_time, + ) + logging.info( + f"waiting {recovery_time} seconds for pod recovery, " + f"pod label selector: {label_selector} namespace pattern: {namespace_pattern}" + ) + + elif ( + "namespace_pattern" in kill_scenario["config"] + and "name_pattern" in kill_scenario["config"] + ): + namespace_pattern = kill_scenario["config"]["namespace_pattern"] + name_pattern = kill_scenario["config"]["name_pattern"] + pool.select_and_monitor_by_name_pattern_and_namespace_pattern( + pod_name_pattern=name_pattern, + namespace_pattern=namespace_pattern, + max_timeout=recovery_time, + ) + logging.info( + f"waiting {recovery_time} seconds for pod recovery, " + f"pod name pattern: {name_pattern} namespace pattern: {namespace_pattern}" + ) + else: + raise Exception( + f"impossible to determine monitor parameters, check {kill_scenario} configuration" + ) diff --git a/kraken/plugins/network/cerberus.py b/krkn/scenario_plugins/native/network/cerberus.py similarity index 100% rename from kraken/plugins/network/cerberus.py rename to krkn/scenario_plugins/native/network/cerberus.py diff --git a/kraken/plugins/network/ingress_shaping.py b/krkn/scenario_plugins/native/network/ingress_shaping.py similarity index 100% rename from kraken/plugins/network/ingress_shaping.py rename to krkn/scenario_plugins/native/network/ingress_shaping.py diff --git a/kraken/plugins/network/job.j2 b/krkn/scenario_plugins/native/network/job.j2 similarity index 100% rename from kraken/plugins/network/job.j2 rename to krkn/scenario_plugins/native/network/job.j2 diff --git a/kraken/plugins/network/kubernetes_functions.py b/krkn/scenario_plugins/native/network/kubernetes_functions.py similarity index 100% rename from kraken/plugins/network/kubernetes_functions.py rename to krkn/scenario_plugins/native/network/kubernetes_functions.py diff --git a/kraken/plugins/network/pod_interface.j2 b/krkn/scenario_plugins/native/network/pod_interface.j2 similarity index 100% rename from kraken/plugins/network/pod_interface.j2 rename to 
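start_monitoring in NativeScenarioPlugin above only accepts kill-pods entries that pair namespace_pattern with either label_selector or name_pattern; anything else raises. Two illustrative shapes (keys taken from the code, values made up):

# Monitored by namespace pattern + label selector.
kill_pods_by_label = {
    "id": "kill-pods",
    "config": {
        "namespace_pattern": "^demo$",
        "label_selector": "app=web",
        "krkn_pod_recovery_time": 120,
    },
}
# Monitored by namespace pattern + pod name pattern.
kill_pods_by_name = {
    "id": "kill-pods",
    "config": {
        "namespace_pattern": "^demo$",
        "name_pattern": "web-.*",
        "krkn_pod_recovery_time": 120,
    },
}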
krkn/scenario_plugins/native/network/pod_interface.j2 diff --git a/kraken/plugins/network/pod_module.j2 b/krkn/scenario_plugins/native/network/pod_module.j2 similarity index 100% rename from kraken/plugins/network/pod_module.j2 rename to krkn/scenario_plugins/native/network/pod_module.j2 diff --git a/kraken/plugins/node_scenarios/ibmcloud_plugin.py b/krkn/scenario_plugins/native/node_scenarios/ibmcloud_plugin.py similarity index 81% rename from kraken/plugins/node_scenarios/ibmcloud_plugin.py rename to krkn/scenario_plugins/native/node_scenarios/ibmcloud_plugin.py index 078bb10c..f7d52921 100644 --- a/kraken/plugins/node_scenarios/ibmcloud_plugin.py +++ b/krkn/scenario_plugins/native/node_scenarios/ibmcloud_plugin.py @@ -1,19 +1,17 @@ #!/usr/bin/env python -import sys import time import typing from os import environ from dataclasses import dataclass, field -import random from traceback import format_exc import logging -from kraken.plugins.node_scenarios import kubernetes_functions as kube_helper +from krkn.scenario_plugins.native.node_scenarios import ( + kubernetes_functions as kube_helper, +) from arcaflow_plugin_sdk import validation, plugin from kubernetes import client, watch from ibm_vpc import VpcV1 from ibm_cloud_sdk_core.authenticators import IAMAuthenticator -from ibm_cloud_sdk_core import ApiException -import requests import sys @@ -26,19 +24,15 @@ class IbmCloud: apiKey = environ.get("IBMC_APIKEY") service_url = environ.get("IBMC_URL") if not apiKey: - raise Exception( - "Environmental variable 'IBMC_APIKEY' is not set" - ) + raise Exception("Environmental variable 'IBMC_APIKEY' is not set") if not service_url: - raise Exception( - "Environmental variable 'IBMC_URL' is not set" - ) - try: + raise Exception("Environmental variable 'IBMC_URL' is not set") + try: authenticator = IAMAuthenticator(apiKey) self.service = VpcV1(authenticator=authenticator) self.service.set_service_url(service_url) - except Exception as e: + except Exception as e: logging.error("error authenticating" + str(e)) sys.exit(1) @@ -46,15 +40,11 @@ class IbmCloud: """ Deletes the Instance whose name is given by 'instance_id' """ - try: + try: self.service.delete_instance(instance_id) logging.info("Deleted Instance -- '{}'".format(instance_id)) except Exception as e: - logging.info( - "Instance '{}' could not be deleted. ".format( - instance_id - ) - ) + logging.info("Instance '{}' could not be deleted. 
".format(instance_id)) return False def reboot_instances(self, instance_id): @@ -65,17 +55,13 @@ class IbmCloud: try: self.service.create_instance_action( - instance_id, - type='reboot', - ) + instance_id, + type="reboot", + ) logging.info("Reset Instance -- '{}'".format(instance_id)) return True except Exception as e: - logging.info( - "Instance '{}' could not be rebooted".format( - instance_id - ) - ) + logging.info("Instance '{}' could not be rebooted".format(instance_id)) return False def stop_instances(self, instance_id): @@ -86,15 +72,13 @@ class IbmCloud: try: self.service.create_instance_action( - instance_id, - type='stop', - ) + instance_id, + type="stop", + ) logging.info("Stopped Instance -- '{}'".format(instance_id)) return True except Exception as e: - logging.info( - "Instance '{}' could not be stopped".format(instance_id) - ) + logging.info("Instance '{}' could not be stopped".format(instance_id)) logging.info("error" + str(e)) return False @@ -106,9 +90,9 @@ class IbmCloud: try: self.service.create_instance_action( - instance_id, - type='start', - ) + instance_id, + type="start", + ) logging.info("Started Instance -- '{}'".format(instance_id)) return True except Exception as e: @@ -120,27 +104,29 @@ class IbmCloud: Returns a list of Instances present in the datacenter """ instance_names = [] - try: + try: instances_result = self.service.list_instances().get_result() - instances_list = instances_result['instances'] + instances_list = instances_result["instances"] for vpc in instances_list: - instance_names.append({"vpc_name": vpc['name'], "vpc_id": vpc['id']}) - starting_count = instances_result['total_count'] - while instances_result['total_count'] == instances_result['limit']: - instances_result = self.service.list_instances(start=starting_count).get_result() - instances_list = instances_result['instances'] - starting_count += instances_result['total_count'] + instance_names.append({"vpc_name": vpc["name"], "vpc_id": vpc["id"]}) + starting_count = instances_result["total_count"] + while instances_result["total_count"] == instances_result["limit"]: + instances_result = self.service.list_instances( + start=starting_count + ).get_result() + instances_list = instances_result["instances"] + starting_count += instances_result["total_count"] for vpc in instances_list: instance_names.append({"vpc_name": vpc.name, "vpc_id": vpc.id}) - except Exception as e: + except Exception as e: logging.error("Error listing out instances: " + str(e)) sys.exit(1) return instance_names - - def find_id_in_list(self, name, vpc_list): + + def find_id_in_list(self, name, vpc_list): for vpc in vpc_list: - if vpc['vpc_name'] == name: - return vpc['vpc_id'] + if vpc["vpc_name"] == name: + return vpc["vpc_id"] def get_instance_status(self, instance_id): """ @@ -149,7 +135,7 @@ class IbmCloud: try: instance = self.service.get_instance(instance_id).get_result() - state = instance['status'] + state = instance["status"] return state except Exception as e: logging.error( @@ -169,7 +155,8 @@ class IbmCloud: while vpc is not None: vpc = self.get_instance_status(instance_id) logging.info( - "Instance %s is still being deleted, sleeping for 5 seconds" % instance_id + "Instance %s is still being deleted, sleeping for 5 seconds" + % instance_id ) time.sleep(5) time_counter += 5 @@ -196,7 +183,9 @@ class IbmCloud: time.sleep(5) time_counter += 5 if time_counter >= timeout: - logging.info("Instance %s is still not ready in allotted time" % instance_id) + logging.info( + "Instance %s is still not ready in allotted 
time" % instance_id + ) return False return True @@ -216,7 +205,9 @@ class IbmCloud: time.sleep(5) time_counter += 5 if time_counter >= timeout: - logging.info("Instance %s is still not stopped in allotted time" % instance_id) + logging.info( + "Instance %s is still not stopped in allotted time" % instance_id + ) return False return True @@ -236,7 +227,9 @@ class IbmCloud: time.sleep(5) time_counter += 5 if time_counter >= timeout: - logging.info("Instance %s is still restarting after allotted time" % instance_id) + logging.info( + "Instance %s is still restarting after allotted time" % instance_id + ) return False self.wait_until_running(instance_id, timeout) return True @@ -303,9 +296,7 @@ class NodeScenarioConfig: ) label_selector: typing.Annotated[ - typing.Optional[str], - validation.min(1), - validation.required_if_not("name") + typing.Optional[str], validation.min(1), validation.required_if_not("name") ] = field( default=None, metadata={ @@ -374,7 +365,7 @@ def node_start( logging.info("Starting node_start_scenario injection") logging.info("Starting the node %s " % (name)) instance_id = ibmcloud.find_id_in_list(name, node_name_id_list) - if instance_id: + if instance_id: vm_started = ibmcloud.start_instances(instance_id) if vm_started: ibmcloud.wait_until_running(instance_id, cfg.timeout) @@ -383,12 +374,19 @@ def node_start( name, cfg.timeout, watch_resource, core_v1 ) nodes_started[int(time.time_ns())] = Node(name=name) - logging.info("Node with instance ID: %s is in running state" % name) - logging.info("node_start_scenario has been successfully injected!") - else: - logging.error("Failed to find node that matched instances on ibm cloud in region") + logging.info( + "Node with instance ID: %s is in running state" % name + ) + logging.info( + "node_start_scenario has been successfully injected!" + ) + else: + logging.error( + "Failed to find node that matched instances on ibm cloud in region" + ) return "error", NodeScenarioErrorOutput( - "No matching vpc with node name " + name, kube_helper.Actions.START + "No matching vpc with node name " + name, + kube_helper.Actions.START, ) except Exception as e: logging.error("Failed to start node instance. Test Failed") @@ -417,11 +415,11 @@ def node_stop( ibmcloud = IbmCloud() core_v1 = client.CoreV1Api(cli) watch_resource = watch.Watch() - logging.info('set up done') + logging.info("set up done") node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.STOP, core_v1) logging.info("set node list" + str(node_list)) node_name_id_list = ibmcloud.list_instances() - logging.info('node names' + str(node_name_id_list)) + logging.info("node names" + str(node_name_id_list)) nodes_stopped = {} for name in node_list: try: @@ -438,12 +436,19 @@ def node_stop( name, cfg.timeout, watch_resource, core_v1 ) nodes_stopped[int(time.time_ns())] = Node(name=name) - logging.info("Node with instance ID: %s is in stopped state" % name) - logging.info("node_stop_scenario has been successfully injected!") - else: - logging.error("Failed to find node that matched instances on ibm cloud in region") + logging.info( + "Node with instance ID: %s is in stopped state" % name + ) + logging.info( + "node_stop_scenario has been successfully injected!" 
+ ) + else: + logging.error( + "Failed to find node that matched instances on ibm cloud in region" + ) return "error", NodeScenarioErrorOutput( - "No matching vpc with node name " + name, kube_helper.Actions.STOP + "No matching vpc with node name " + name, + kube_helper.Actions.STOP, ) except Exception as e: logging.error("Failed to stop node instance. Test Failed") @@ -495,11 +500,16 @@ def node_reboot( logging.info( "Node with instance ID: %s has rebooted successfully" % name ) - logging.info("node_reboot_scenario has been successfully injected!") - else: - logging.error("Failed to find node that matched instances on ibm cloud in region") + logging.info( + "node_reboot_scenario has been successfully injected!" + ) + else: + logging.error( + "Failed to find node that matched instances on ibm cloud in region" + ) return "error", NodeScenarioErrorOutput( - "No matching vpc with node name " + name, kube_helper.Actions.REBOOT + "No matching vpc with node name " + name, + kube_helper.Actions.REBOOT, ) except Exception as e: logging.error("Failed to reboot node instance. Test Failed") @@ -540,16 +550,23 @@ def node_terminate( ) instance_id = ibmcloud.find_id_in_list(name, node_name_id_list) logging.info("Deleting the node with instance ID: %s " % (name)) - if instance_id: + if instance_id: ibmcloud.delete_instance(instance_id) ibmcloud.wait_until_released(name, cfg.timeout) nodes_terminated[int(time.time_ns())] = Node(name=name) - logging.info("Node with instance ID: %s has been released" % name) - logging.info("node_terminate_scenario has been successfully injected!") - else: - logging.error("Failed to find instances that matched the node specifications on ibm cloud in the set region") + logging.info( + "Node with instance ID: %s has been released" % name + ) + logging.info( + "node_terminate_scenario has been successfully injected!" + ) + else: + logging.error( + "Failed to find instances that matched the node specifications on ibm cloud in the set region" + ) return "error", NodeScenarioErrorOutput( - "No matching vpc with node name " + name, kube_helper.Actions.TERMINATE + "No matching vpc with node name " + name, + kube_helper.Actions.TERMINATE, ) except Exception as e: logging.error("Failed to terminate node instance. 
Test Failed") diff --git a/kraken/plugins/node_scenarios/kubernetes_functions.py b/krkn/scenario_plugins/native/node_scenarios/kubernetes_functions.py similarity index 100% rename from kraken/plugins/node_scenarios/kubernetes_functions.py rename to krkn/scenario_plugins/native/node_scenarios/kubernetes_functions.py diff --git a/kraken/plugins/node_scenarios/vmware_plugin.py b/krkn/scenario_plugins/native/node_scenarios/vmware_plugin.py similarity index 79% rename from kraken/plugins/node_scenarios/vmware_plugin.py rename to krkn/scenario_plugins/native/node_scenarios/vmware_plugin.py index 270f8378..93c10252 100644 --- a/kraken/plugins/node_scenarios/vmware_plugin.py +++ b/krkn/scenario_plugins/native/node_scenarios/vmware_plugin.py @@ -9,14 +9,18 @@ from os import environ from traceback import format_exc import requests from arcaflow_plugin_sdk import plugin, validation -from com.vmware.vapi.std.errors_client import (AlreadyInDesiredState, - NotAllowedInCurrentState) +from com.vmware.vapi.std.errors_client import ( + AlreadyInDesiredState, + NotAllowedInCurrentState, +) from com.vmware.vcenter.vm_client import Power from com.vmware.vcenter_client import VM, ResourcePool from kubernetes import client, watch from vmware.vapi.vsphere.client import create_vsphere_client -from kraken.plugins.node_scenarios import kubernetes_functions as kube_helper +from krkn.scenario_plugins.native.node_scenarios import ( + kubernetes_functions as kube_helper, +) class vSphere: @@ -104,9 +108,7 @@ class vSphere: return True except NotAllowedInCurrentState: logging.info( - "VM '{}'-'({})' is not Powered On. Cannot reset it", - instance_id, - vm + "VM '{}'-'({})' is not Powered On. Cannot reset it", instance_id, vm ) return False @@ -122,9 +124,7 @@ class vSphere: logging.info(f"Stopped VM -- '{instance_id}-({vm})'") return True except AlreadyInDesiredState: - logging.info( - f"VM '{instance_id}'-'({vm})' is already Powered Off" - ) + logging.info(f"VM '{instance_id}'-'({vm})' is already Powered Off") return False def start_instances(self, instance_id): @@ -139,9 +139,7 @@ class vSphere: logging.info(f"Started VM -- '{instance_id}-({vm})'") return True except AlreadyInDesiredState: - logging.info( - f"VM '{instance_id}'-'({vm})' is already Powered On" - ) + logging.info(f"VM '{instance_id}'-'({vm})' is already Powered On") return False def list_instances(self, datacenter): @@ -152,18 +150,14 @@ class vSphere: datacenter_filter = self.client.vcenter.Datacenter.FilterSpec( names=set([datacenter]) ) - datacenter_summaries = self.client.vcenter.Datacenter.list( - datacenter_filter - ) + datacenter_summaries = self.client.vcenter.Datacenter.list(datacenter_filter) try: datacenter_id = datacenter_summaries[0].datacenter except IndexError: logging.error("Datacenter '{}' doesn't exist", datacenter) sys.exit(1) - vm_filter = self.client.vcenter.VM.FilterSpec( - datacenters={datacenter_id} - ) + vm_filter = self.client.vcenter.VM.FilterSpec(datacenters={datacenter_id}) vm_summaries = self.client.vcenter.VM.list(vm_filter) vm_names = [] for vm in vm_summaries: @@ -177,10 +171,7 @@ class vSphere: datacenter_summaries = self.client.vcenter.Datacenter.list() datacenter_names = [ - { - "datacenter_id": datacenter.datacenter, - "datacenter_name": datacenter.name - } + {"datacenter_id": datacenter.datacenter, "datacenter_name": datacenter.name} for datacenter in datacenter_summaries ] return datacenter_names @@ -194,16 +185,11 @@ class vSphere: datastore_filter = self.client.vcenter.Datastore.FilterSpec( 
datacenters={datacenter} ) - datastore_summaries = self.client.vcenter.Datastore.list( - datastore_filter - ) + datastore_summaries = self.client.vcenter.Datastore.list(datastore_filter) datastore_names = [] for datastore in datastore_summaries: datastore_names.append( - { - "datastore_name": datastore.name, - "datastore_id": datastore.datastore - } + {"datastore_name": datastore.name, "datastore_id": datastore.datastore} ) return datastore_names @@ -213,9 +199,7 @@ class vSphere: IDs belonging to a specific datacenter """ - folder_filter = self.client.vcenter.Folder.FilterSpec( - datacenters={datacenter} - ) + folder_filter = self.client.vcenter.Folder.FilterSpec(datacenters={datacenter}) folder_summaries = self.client.vcenter.Folder.list(folder_filter) folder_names = [] for folder in folder_summaries: @@ -234,17 +218,12 @@ class vSphere: filter_spec = ResourcePool.FilterSpec( datacenters=set([datacenter]), names=names ) - resource_pool_summaries = self.client.vcenter.ResourcePool.list( - filter_spec - ) + resource_pool_summaries = self.client.vcenter.ResourcePool.list(filter_spec) if len(resource_pool_summaries) > 0: resource_pool = resource_pool_summaries[0].resource_pool return resource_pool else: - logging.error( - "ResourcePool not found in Datacenter '{}'", - datacenter - ) + logging.error("ResourcePool not found in Datacenter '{}'", datacenter) return None def create_default_vm(self, guest_os="RHEL_7_64", max_attempts=10): @@ -277,9 +256,7 @@ class vSphere: # random generator not used for # security/cryptographic purposes in this loop datacenter = random.choice(datacenter_list) # nosec - resource_pool = self.get_resource_pool( - datacenter["datacenter_id"] - ) + resource_pool = self.get_resource_pool(datacenter["datacenter_id"]) folder = random.choice( # nosec self.get_folder_list(datacenter["datacenter_id"]) )["folder_id"] @@ -288,25 +265,18 @@ class vSphere: )["datastore_id"] vm_name = "Test-" + str(time.time_ns()) return ( - create_vm( - vm_name, - resource_pool, - folder, - datastore, - guest_os - ), + create_vm(vm_name, resource_pool, folder, datastore, guest_os), vm_name, ) except Exception as e: logging.error( - "Default VM could not be created, retrying. " - "Error was: %s", - str(e) + "Default VM could not be created, retrying. " "Error was: %s", + str(e), ) logging.error( "Default VM could not be created in %s attempts. 
" "Check your VMware resources", - max_attempts + max_attempts, ) return None, None @@ -338,15 +308,12 @@ class vSphere: while vm is not None: vm = self.get_vm(instance_id) logging.info( - f"VM {instance_id} is still being deleted, " - f"sleeping for 5 seconds" + f"VM {instance_id} is still being deleted, " f"sleeping for 5 seconds" ) time.sleep(5) time_counter += 5 if time_counter >= timeout: - logging.info( - f"VM {instance_id} is still not deleted in allotted time" - ) + logging.info(f"VM {instance_id} is still not deleted in allotted time") return False return True @@ -361,16 +328,12 @@ class vSphere: while status != Power.State.POWERED_ON: status = self.get_vm_status(instance_id) logging.info( - "VM %s is still not running, " - "sleeping for 5 seconds", - instance_id + "VM %s is still not running, " "sleeping for 5 seconds", instance_id ) time.sleep(5) time_counter += 5 if time_counter >= timeout: - logging.info( - f"VM {instance_id} is still not ready in allotted time" - ) + logging.info(f"VM {instance_id} is still not ready in allotted time") return False return True @@ -385,15 +348,12 @@ class vSphere: while status != Power.State.POWERED_OFF: status = self.get_vm_status(instance_id) logging.info( - f"VM {instance_id} is still not running, " - f"sleeping for 5 seconds" + f"VM {instance_id} is still not running, " f"sleeping for 5 seconds" ) time.sleep(5) time_counter += 5 if time_counter >= timeout: - logging.info( - f"VM {instance_id} is still not ready in allotted time" - ) + logging.info(f"VM {instance_id} is still not ready in allotted time") return False return True @@ -410,16 +370,16 @@ class NodeScenarioSuccessOutput: metadata={ "name": "Nodes started/stopped/terminated/rebooted", "description": "Map between timestamps and the pods " - "started/stopped/terminated/rebooted. " - "The timestamp is provided in nanoseconds", + "started/stopped/terminated/rebooted. " + "The timestamp is provided in nanoseconds", } ) action: kube_helper.Actions = field( metadata={ "name": "The action performed on the node", "description": "The action performed or attempted to be " - "performed on the node. Possible values" - "are : Start, Stop, Terminate, Reboot", + "performed on the node. Possible values" + "are : Start, Stop, Terminate, Reboot", } ) @@ -449,7 +409,7 @@ class NodeScenarioConfig: metadata={ "name": "Name", "description": "Name(s) for target nodes. " - "Required if label_selector is not set.", + "Required if label_selector is not set.", }, ) @@ -458,20 +418,18 @@ class NodeScenarioConfig: metadata={ "name": "Number of runs per node", "description": "Number of times to inject each scenario under " - "actions (will perform on same node each time)", + "actions (will perform on same node each time)", }, ) label_selector: typing.Annotated[ - typing.Optional[str], - validation.min(1), - validation.required_if_not("name") + typing.Optional[str], validation.min(1), validation.required_if_not("name") ] = field( default=None, metadata={ "name": "Label selector", "description": "Kubernetes label selector for the target nodes. 
" - "Required if name is not set.\n" + "Required if name is not set.\n" "See https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ " # noqa "for details.", }, @@ -482,19 +440,16 @@ class NodeScenarioConfig: metadata={ "name": "Timeout", "description": "Timeout to wait for the target pod(s) " - "to be removed in seconds.", + "to be removed in seconds.", }, ) - instance_count: typing.Annotated[ - typing.Optional[int], - validation.min(1) - ] = field( + instance_count: typing.Annotated[typing.Optional[int], validation.min(1)] = field( default=1, metadata={ "name": "Instance Count", "description": "Number of nodes to perform action/select " - "that match the label selector.", + "that match the label selector.", }, ) @@ -511,7 +466,7 @@ class NodeScenarioConfig: metadata={ "name": "Verify API Session", "description": "Verifies the vSphere client session. " - "It is enabled by default", + "It is enabled by default", }, ) @@ -520,7 +475,7 @@ class NodeScenarioConfig: metadata={ "name": "Kubeconfig path", "description": "Path to your Kubeconfig file. " - "Defaults to ~/.kube/config.\n" + "Defaults to ~/.kube/config.\n" "See https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ " # noqa "for details.", }, @@ -531,11 +486,8 @@ class NodeScenarioConfig: id="vmware-node-start", name="Start the node", description="Start the node(s) by starting the VMware VM " - "on which the node is configured", - outputs={ - "success": NodeScenarioSuccessOutput, - "error": NodeScenarioErrorOutput - }, + "on which the node is configured", + outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput}, ) def node_start( cfg: NodeScenarioConfig, @@ -546,11 +498,7 @@ def node_start( vsphere = vSphere(verify=cfg.verify_session) core_v1 = client.CoreV1Api(cli) watch_resource = watch.Watch() - node_list = kube_helper.get_node_list( - cfg, - kube_helper.Actions.START, - core_v1 - ) + node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.START, core_v1) nodes_started = {} for name in node_list: try: @@ -565,17 +513,12 @@ def node_start( name, cfg.timeout, watch_resource, core_v1 ) nodes_started[int(time.time_ns())] = Node(name=name) - logging.info( - f"Node with instance ID: {name} is in running state" - ) - logging.info( - "node_start_scenario has been successfully injected!" - ) + logging.info(f"Node with instance ID: {name} is in running state") + logging.info("node_start_scenario has been successfully injected!") except Exception as e: logging.error("Failed to start node instance. Test Failed") logging.error( - f"node_start_scenario injection failed! " - f"Error was: {str(e)}" + f"node_start_scenario injection failed! 
" f"Error was: {str(e)}" ) return "error", NodeScenarioErrorOutput( format_exc(), kube_helper.Actions.START @@ -590,11 +533,8 @@ def node_start( id="vmware-node-stop", name="Stop the node", description="Stop the node(s) by starting the VMware VM " - "on which the node is configured", - outputs={ - "success": NodeScenarioSuccessOutput, - "error": NodeScenarioErrorOutput - }, + "on which the node is configured", + outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput}, ) def node_stop( cfg: NodeScenarioConfig, @@ -605,11 +545,7 @@ def node_stop( vsphere = vSphere(verify=cfg.verify_session) core_v1 = client.CoreV1Api(cli) watch_resource = watch.Watch() - node_list = kube_helper.get_node_list( - cfg, - kube_helper.Actions.STOP, - core_v1 - ) + node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.STOP, core_v1) nodes_stopped = {} for name in node_list: try: @@ -624,17 +560,12 @@ def node_stop( name, cfg.timeout, watch_resource, core_v1 ) nodes_stopped[int(time.time_ns())] = Node(name=name) - logging.info( - f"Node with instance ID: {name} is in stopped state" - ) - logging.info( - "node_stop_scenario has been successfully injected!" - ) + logging.info(f"Node with instance ID: {name} is in stopped state") + logging.info("node_stop_scenario has been successfully injected!") except Exception as e: logging.error("Failed to stop node instance. Test Failed") logging.error( - f"node_stop_scenario injection failed! " - f"Error was: {str(e)}" + f"node_stop_scenario injection failed! " f"Error was: {str(e)}" ) return "error", NodeScenarioErrorOutput( format_exc(), kube_helper.Actions.STOP @@ -649,11 +580,8 @@ def node_stop( id="vmware-node-reboot", name="Reboot VMware VM", description="Reboot the node(s) by starting the VMware VM " - "on which the node is configured", - outputs={ - "success": NodeScenarioSuccessOutput, - "error": NodeScenarioErrorOutput - }, + "on which the node is configured", + outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput}, ) def node_reboot( cfg: NodeScenarioConfig, @@ -664,11 +592,7 @@ def node_reboot( vsphere = vSphere(verify=cfg.verify_session) core_v1 = client.CoreV1Api(cli) watch_resource = watch.Watch() - node_list = kube_helper.get_node_list( - cfg, - kube_helper.Actions.REBOOT, - core_v1 - ) + node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.REBOOT, core_v1) nodes_rebooted = {} for name in node_list: try: @@ -685,17 +609,13 @@ def node_reboot( ) nodes_rebooted[int(time.time_ns())] = Node(name=name) logging.info( - f"Node with instance ID: {name} has rebooted " - "successfully" - ) - logging.info( - "node_reboot_scenario has been successfully injected!" + f"Node with instance ID: {name} has rebooted " "successfully" ) + logging.info("node_reboot_scenario has been successfully injected!") except Exception as e: logging.error("Failed to reboot node instance. Test Failed") logging.error( - f"node_reboot_scenario injection failed! " - f"Error was: {str(e)}" + f"node_reboot_scenario injection failed! 
" f"Error was: {str(e)}" ) return "error", NodeScenarioErrorOutput( format_exc(), kube_helper.Actions.REBOOT @@ -733,24 +653,18 @@ def node_terminate( ) vsphere.stop_instances(name) vsphere.wait_until_stopped(name, cfg.timeout) - logging.info( - f"Releasing the node with instance ID: {name} " - ) + logging.info(f"Releasing the node with instance ID: {name} ") vsphere.release_instances(name) vsphere.wait_until_released(name, cfg.timeout) nodes_terminated[int(time.time_ns())] = Node(name=name) + logging.info(f"Node with instance ID: {name} has been released") logging.info( - f"Node with instance ID: {name} has been released" - ) - logging.info( - "node_terminate_scenario has been " - "successfully injected!" + "node_terminate_scenario has been " "successfully injected!" ) except Exception as e: logging.error("Failed to terminate node instance. Test Failed") logging.error( - f"node_terminate_scenario injection failed! " - f"Error was: {str(e)}" + f"node_terminate_scenario injection failed! " f"Error was: {str(e)}" ) return "error", NodeScenarioErrorOutput( format_exc(), kube_helper.Actions.TERMINATE diff --git a/krkn/scenario_plugins/native/plugins.py b/krkn/scenario_plugins/native/plugins.py new file mode 100644 index 00000000..34347c0d --- /dev/null +++ b/krkn/scenario_plugins/native/plugins.py @@ -0,0 +1,176 @@ +import dataclasses +import json +import logging +from os.path import abspath +from typing import List, Any, Dict +from krkn.scenario_plugins.native.run_python_plugin import run_python_file +from arcaflow_plugin_kill_pod import kill_pods, wait_for_pods +from krkn.scenario_plugins.native.network.ingress_shaping import network_chaos +from krkn.scenario_plugins.native.pod_network_outage.pod_network_outage_plugin import ( + pod_outage, +) +from krkn.scenario_plugins.native.pod_network_outage.pod_network_outage_plugin import ( + pod_egress_shaping, +) +import krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin as ibmcloud_plugin +from krkn.scenario_plugins.native.pod_network_outage.pod_network_outage_plugin import ( + pod_ingress_shaping, +) +from arcaflow_plugin_sdk import schema, serialization, jsonschema + +from krkn.scenario_plugins.native.node_scenarios import vmware_plugin + + +@dataclasses.dataclass +class PluginStep: + schema: schema.StepSchema + error_output_ids: List[str] + + def render_output(self, output_id: str, output_data) -> str: + return json.dumps( + { + "output_id": output_id, + "output_data": self.schema.outputs[output_id].serialize(output_data), + }, + indent="\t", + ) + + +class Plugins: + """ + Plugins is a class that can run plugins sequentially. The output is rendered to the standard output and the process + is aborted if a step fails. 
+ """ + + steps_by_id: Dict[str, PluginStep] + + def __init__(self, steps: List[PluginStep]): + self.steps_by_id = dict() + for step in steps: + if step.schema.id in self.steps_by_id: + raise Exception("Duplicate step ID: {}".format(step.schema.id)) + self.steps_by_id[step.schema.id] = step + + def unserialize_scenario(self, file: str) -> Any: + return serialization.load_from_file(abspath(file)) + + def run(self, file: str, kubeconfig_path: str, kraken_config: str, run_uuid: str): + """ + Run executes a series of steps + """ + data = self.unserialize_scenario(abspath(file)) + if not isinstance(data, list): + raise Exception( + "Invalid scenario configuration file: {} expected list, found {}".format( + file, type(data).__name__ + ) + ) + i = 0 + for entry in data: + if not isinstance(entry, dict): + raise Exception( + "Invalid scenario configuration file: {} expected a list of dict's, found {} on step {}".format( + file, type(entry).__name__, i + ) + ) + if "id" not in entry: + raise Exception( + "Invalid scenario configuration file: {} missing 'id' field on step {}".format( + file, + i, + ) + ) + if "config" not in entry: + raise Exception( + "Invalid scenario configuration file: {} missing 'config' field on step {}".format( + file, + i, + ) + ) + + if entry["id"] not in self.steps_by_id: + raise Exception( + "Invalid step {} in {} ID: {} expected one of: {}".format( + i, file, entry["id"], ", ".join(self.steps_by_id.keys()) + ) + ) + step = self.steps_by_id[entry["id"]] + unserialized_input = step.schema.input.unserialize(entry["config"]) + if "kubeconfig_path" in step.schema.input.properties: + unserialized_input.kubeconfig_path = kubeconfig_path + if "kraken_config" in step.schema.input.properties: + unserialized_input.kraken_config = kraken_config + output_id, output_data = step.schema( + params=unserialized_input, run_id=run_uuid + ) + + logging.info(step.render_output(output_id, output_data) + "\n") + if output_id in step.error_output_ids: + raise Exception( + "Step {} in {} ({}) failed".format(i, file, step.schema.id) + ) + i = i + 1 + + def json_schema(self): + """ + This function generates a JSON schema document and renders it from the steps passed. + """ + result = { + "$id": "https://github.com/redhat-chaos/krkn/", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Kraken Arcaflow scenarios", + "description": "Serial execution of Arcaflow Python plugins. 
See https://github.com/arcaflow for details.", + "type": "array", + "minContains": 1, + "items": {"oneOf": []}, + } + for step_id in self.steps_by_id.keys(): + step = self.steps_by_id[step_id] + step_input = jsonschema.step_input(step.schema) + del step_input["$id"] + del step_input["$schema"] + del step_input["title"] + del step_input["description"] + result["items"]["oneOf"].append( + { + "type": "object", + "properties": { + "id": { + "type": "string", + "const": step_id, + }, + "config": step_input, + }, + "required": [ + "id", + "config", + ], + } + ) + return json.dumps(result, indent="\t") + + +PLUGINS = Plugins( + [ + PluginStep( + kill_pods, + [ + "error", + ], + ), + PluginStep(wait_for_pods, ["error"]), + PluginStep(run_python_file, ["error"]), + PluginStep(vmware_plugin.node_start, ["error"]), + PluginStep(vmware_plugin.node_stop, ["error"]), + PluginStep(vmware_plugin.node_reboot, ["error"]), + PluginStep(vmware_plugin.node_terminate, ["error"]), + PluginStep(ibmcloud_plugin.node_start, ["error"]), + PluginStep(ibmcloud_plugin.node_stop, ["error"]), + PluginStep(ibmcloud_plugin.node_reboot, ["error"]), + PluginStep(ibmcloud_plugin.node_terminate, ["error"]), + PluginStep(network_chaos, ["error"]), + PluginStep(pod_outage, ["error"]), + PluginStep(pod_egress_shaping, ["error"]), + PluginStep(pod_ingress_shaping, ["error"]), + ] +) diff --git a/kraken/plugins/pod_network_outage/cerberus.py b/krkn/scenario_plugins/native/pod_network_outage/cerberus.py similarity index 100% rename from kraken/plugins/pod_network_outage/cerberus.py rename to krkn/scenario_plugins/native/pod_network_outage/cerberus.py diff --git a/kraken/plugins/pod_network_outage/job.j2 b/krkn/scenario_plugins/native/pod_network_outage/job.j2 similarity index 100% rename from kraken/plugins/pod_network_outage/job.j2 rename to krkn/scenario_plugins/native/pod_network_outage/job.j2 diff --git a/kraken/plugins/pod_network_outage/kubernetes_functions.py b/krkn/scenario_plugins/native/pod_network_outage/kubernetes_functions.py similarity index 100% rename from kraken/plugins/pod_network_outage/kubernetes_functions.py rename to krkn/scenario_plugins/native/pod_network_outage/kubernetes_functions.py diff --git a/kraken/plugins/pod_network_outage/pod_module.j2 b/krkn/scenario_plugins/native/pod_network_outage/pod_module.j2 similarity index 100% rename from kraken/plugins/pod_network_outage/pod_module.j2 rename to krkn/scenario_plugins/native/pod_network_outage/pod_module.j2 diff --git a/kraken/plugins/pod_network_outage/pod_network_outage_plugin.py b/krkn/scenario_plugins/native/pod_network_outage/pod_network_outage_plugin.py similarity index 100% rename from kraken/plugins/pod_network_outage/pod_network_outage_plugin.py rename to krkn/scenario_plugins/native/pod_network_outage/pod_network_outage_plugin.py diff --git a/kraken/plugins/run_python_plugin.py b/krkn/scenario_plugins/native/run_python_plugin.py similarity index 100% rename from kraken/plugins/run_python_plugin.py rename to krkn/scenario_plugins/native/run_python_plugin.py diff --git a/kraken/post_actions/__init__.py b/krkn/scenario_plugins/network_chaos/__init__.py similarity index 100% rename from kraken/post_actions/__init__.py rename to krkn/scenario_plugins/network_chaos/__init__.py diff --git a/kraken/network_chaos/job.j2 b/krkn/scenario_plugins/network_chaos/job.j2 similarity index 100% rename from kraken/network_chaos/job.j2 rename to krkn/scenario_plugins/network_chaos/job.j2 diff --git 
a/krkn/scenario_plugins/network_chaos/network_chaos_scenario_plugin.py b/krkn/scenario_plugins/network_chaos/network_chaos_scenario_plugin.py new file mode 100644 index 00000000..eaa0719f --- /dev/null +++ b/krkn/scenario_plugins/network_chaos/network_chaos_scenario_plugin.py @@ -0,0 +1,255 @@ +import logging +import os +import random +import time + +import yaml +from jinja2 import Environment, FileSystemLoader +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn_lib.utils import get_yaml_item_value, log_exception + +from krkn import cerberus, utils +from krkn.scenario_plugins.node_actions import common_node_functions +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class NetworkChaosScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + try: + with open(scenario, "r") as file: + param_lst = ["latency", "loss", "bandwidth"] + test_config = yaml.safe_load(file) + test_dict = test_config["network_chaos"] + test_duration = int(get_yaml_item_value(test_dict, "duration", 300)) + test_interface = get_yaml_item_value(test_dict, "interfaces", []) + test_node = get_yaml_item_value(test_dict, "node_name", "") + test_node_label = get_yaml_item_value( + test_dict, "label_selector", "node-role.kubernetes.io/master" + ) + test_execution = get_yaml_item_value(test_dict, "execution", "serial") + test_instance_count = get_yaml_item_value( + test_dict, "instance_count", 1 + ) + test_egress = get_yaml_item_value( + test_dict, "egress", {"bandwidth": "100mbit"} + ) + if test_node: + node_name_list = test_node.split(",") + else: + node_name_list = [test_node] + nodelst = [] + for single_node_name in node_name_list: + nodelst.extend( + common_node_functions.get_node( + single_node_name, + test_node_label, + test_instance_count, + lib_telemetry.get_lib_kubernetes(), + ) + ) + file_loader = FileSystemLoader( + os.path.abspath(os.path.dirname(__file__)) + ) + env = Environment(loader=file_loader, autoescape=True) + pod_template = env.get_template("pod.j2") + test_interface = self.verify_interface( + test_interface, + nodelst, + pod_template, + lib_telemetry.get_lib_kubernetes(), + ) + joblst = [] + egress_lst = [i for i in param_lst if i in test_egress] + chaos_config = { + "network_chaos": { + "duration": test_duration, + "interfaces": test_interface, + "node_name": ",".join(nodelst), + "execution": test_execution, + "instance_count": test_instance_count, + "egress": test_egress, + } + } + logging.info( + "Executing network chaos with config \n %s" + % yaml.dump(chaos_config) + ) + job_template = env.get_template("job.j2") + try: + for i in egress_lst: + for node in nodelst: + exec_cmd = self.get_egress_cmd( + test_execution, + test_interface, + i, + test_dict["egress"], + duration=test_duration, + ) + logging.info("Executing %s on node %s" % (exec_cmd, node)) + job_body = yaml.safe_load( + job_template.render( + jobname=i + str(hash(node))[:5], + nodename=node, + cmd=exec_cmd, + ) + ) + joblst.append(job_body["metadata"]["name"]) + api_response = ( + lib_telemetry.get_lib_kubernetes().create_job(job_body) + ) + if api_response is None: + logging.error( + "NetworkChaosScenarioPlugin Error creating job" + ) + return 1 + if test_execution == "serial": + logging.info("Waiting for serial job to finish") 
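For reference, the network_chaos block parsed at the top of run() supports the keys read via get_yaml_item_value. The snippet below is an illustrative scenario document with those keys, defaults noted in comments; the concrete values are examples only.

import yaml

network_chaos_yaml = """
network_chaos:
  duration: 120                  # seconds each qdisc stays applied (default 300)
  node_name: ""                  # comma-separated node names; empty -> use the label
  label_selector: node-role.kubernetes.io/worker   # default is node-role.kubernetes.io/master
  instance_count: 1              # nodes to pick per label match (default 1)
  execution: serial              # serial: one egress parameter per job; parallel: all in one qdisc
  interfaces: []                 # empty -> default-route interface is auto-detected
  egress:                        # any of latency / loss / bandwidth (default: bandwidth 100mbit)
    latency: 50ms
    loss: 0.2%
"""

config = yaml.safe_load(network_chaos_yaml)["network_chaos"]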
+ start_time = int(time.time()) + self.wait_for_job( + joblst[:], + lib_telemetry.get_lib_kubernetes(), + test_duration + 300, + ) + + end_time = int(time.time()) + cerberus.publish_kraken_status( + krkn_config, + None, + start_time, + end_time, + ) + if test_execution == "parallel": + break + if test_execution == "parallel": + logging.info("Waiting for parallel job to finish") + start_time = int(time.time()) + self.wait_for_job( + joblst[:], + lib_telemetry.get_lib_kubernetes(), + test_duration + 300, + ) + end_time = int(time.time()) + cerberus.publish_kraken_status( + krkn_config, [], start_time, end_time + ) + except Exception as e: + logging.error( + "NetworkChaosScenarioPlugin exiting due to Exception %s" % e + ) + return 1 + finally: + logging.info("Deleting jobs") + self.delete_job(joblst[:], lib_telemetry.get_lib_kubernetes()) + except (RuntimeError, Exception): + scenario_telemetry.exit_status = 1 + return 1 + else: + return 0 + + def verify_interface( + self, test_interface, nodelst, template, kubecli: KrknKubernetes + ): + pod_index = random.randint(0, len(nodelst) - 1) + pod_body = yaml.safe_load(template.render(nodename=nodelst[pod_index])) + logging.info("Creating pod to query interface on node %s" % nodelst[pod_index]) + kubecli.create_pod(pod_body, "default", 300) + try: + if test_interface == []: + cmd = "ip r | grep default | awk '/default/ {print $5}'" + output = kubecli.exec_cmd_in_pod(cmd, "fedtools", "default") + test_interface = [output.replace("\n", "")] + else: + cmd = "ip -br addr show|awk -v ORS=',' '{print $1}'" + output = kubecli.exec_cmd_in_pod(cmd, "fedtools", "default") + interface_lst = output[:-1].split(",") + for interface in test_interface: + if interface not in interface_lst: + logging.error( + "NetworkChaosScenarioPlugin Interface %s not found in node %s interface list %s" + % (interface, nodelst[pod_index], interface_lst) + ) + raise RuntimeError() + return test_interface + finally: + logging.info("Deleteing pod to query interface on node") + kubecli.delete_pod("fedtools", "default") + + # krkn_lib + def get_job_pods(self, api_response, kubecli: KrknKubernetes): + controllerUid = api_response.metadata.labels["controller-uid"] + pod_label_selector = "controller-uid=" + controllerUid + pods_list = kubecli.list_pods( + label_selector=pod_label_selector, namespace="default" + ) + return pods_list[0] + + # krkn_lib + def wait_for_job(self, joblst, kubecli: KrknKubernetes, timeout=300): + waittime = time.time() + timeout + count = 0 + joblen = len(joblst) + while count != joblen: + for jobname in joblst: + try: + api_response = kubecli.get_job_status(jobname, namespace="default") + if ( + api_response.status.succeeded is not None + or api_response.status.failed is not None + ): + count += 1 + joblst.remove(jobname) + except Exception: + logging.warning("Exception in getting job status") + if time.time() > waittime: + raise Exception("Starting pod failed") + time.sleep(5) + + # krkn_lib + def delete_job(self, joblst, kubecli: KrknKubernetes): + for jobname in joblst: + try: + api_response = kubecli.get_job_status(jobname, namespace="default") + if api_response.status.failed is not None: + pod_name = self.get_job_pods(api_response, kubecli) + pod_stat = kubecli.read_pod(name=pod_name, namespace="default") + logging.error( + f"NetworkChaosScenarioPlugin {pod_stat.status.container_statuses}" + ) + pod_log_response = kubecli.get_pod_log( + name=pod_name, namespace="default" + ) + pod_log = pod_log_response.data.decode("utf-8") + logging.error(pod_log) + except 
Exception: + logging.warning("Exception in getting job status") + kubecli.delete_job(name=jobname, namespace="default") + + def get_egress_cmd(self, execution, test_interface, mod, vallst, duration=30): + tc_set = tc_unset = tc_ls = "" + param_map = {"latency": "delay", "loss": "loss", "bandwidth": "rate"} + for i in test_interface: + tc_set = "{0} tc qdisc add dev {1} root netem".format(tc_set, i) + tc_unset = "{0} tc qdisc del dev {1} root ;".format(tc_unset, i) + tc_ls = "{0} tc qdisc ls dev {1} ;".format(tc_ls, i) + if execution == "parallel": + for val in vallst.keys(): + tc_set += " {0} {1} ".format(param_map[val], vallst[val]) + tc_set += ";" + else: + tc_set += " {0} {1} ;".format(param_map[mod], vallst[mod]) + exec_cmd = "{0} {1} sleep {2};{3} sleep 20;{4}".format( + tc_set, tc_ls, duration, tc_unset, tc_ls + ) + return exec_cmd + + def get_scenario_types(self) -> list[str]: + return ["network_chaos_scenarios"] diff --git a/kraken/network_chaos/pod.j2 b/krkn/scenario_plugins/network_chaos/pod.j2 similarity index 100% rename from kraken/network_chaos/pod.j2 rename to krkn/scenario_plugins/network_chaos/pod.j2 diff --git a/kraken/pvc/__init__.py b/krkn/scenario_plugins/node_actions/__init__.py similarity index 100% rename from kraken/pvc/__init__.py rename to krkn/scenario_plugins/node_actions/__init__.py diff --git a/kraken/node_actions/abstract_node_scenarios.py b/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py similarity index 83% rename from kraken/node_actions/abstract_node_scenarios.py rename to krkn/scenario_plugins/node_actions/abstract_node_scenarios.py index 73928375..73d3feec 100644 --- a/kraken/node_actions/abstract_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py @@ -1,15 +1,18 @@ import sys import logging import time -import kraken.invoke.command as runcommand -import kraken.node_actions.common_node_functions as nodeaction +import krkn.invoke.command as runcommand +import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction from krkn_lib.k8s import KrknKubernetes + # krkn_lib class abstract_node_scenarios: kubecli: KrknKubernetes + def __init__(self, kubecli: KrknKubernetes): self.kubecli = kubecli + # Node scenario to start the node def node_start_scenario(self, instance_kill_count, node, timeout): pass @@ -47,16 +50,19 @@ class abstract_node_scenarios: try: logging.info("Starting stop_kubelet_scenario injection") logging.info("Stopping the kubelet of the node %s" % (node)) - runcommand.run("oc debug node/" + node + " -- chroot /host systemctl stop kubelet") + runcommand.run( + "oc debug node/" + node + " -- chroot /host systemctl stop kubelet" + ) nodeaction.wait_for_unknown_status(node, timeout, self.kubecli) logging.info("The kubelet of the node %s has been stopped" % (node)) logging.info("stop_kubelet_scenario has been successfuly injected!") except Exception as e: logging.error( - "Failed to stop the kubelet of the node. Encountered following " "exception: %s. Test Failed" % (e) + "Failed to stop the kubelet of the node. Encountered following " + "exception: %s. 
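Tying get_egress_cmd to the job loop in run(): in serial mode each egress parameter gets its own job whose command applies a single netem attribute, whereas parallel mode folds every parameter into one qdisc (e.g. "netem delay 50ms loss 0.2%"). A worked serial-mode example with invented inputs, the resulting shell command shown modulo spacing:

# Inputs as they would arrive from the scenario file (illustrative values).
interface = "ens3"
egress = {"latency": "50ms"}
duration = 120
param_map = {"latency": "delay", "loss": "loss", "bandwidth": "rate"}

# Serial mode per interface: add qdisc -> list -> sleep -> delete -> list.
cmd = (
    f"tc qdisc add dev {interface} root netem "
    f"{param_map['latency']} {egress['latency']} ; "
    f"tc qdisc ls dev {interface} ; "
    f"sleep {duration}; "
    f"tc qdisc del dev {interface} root ; "
    f"sleep 20; "
    f"tc qdisc ls dev {interface} ;"
)
# cmd == "tc qdisc add dev ens3 root netem delay 50ms ; tc qdisc ls dev ens3 ; "
#        "sleep 120; tc qdisc del dev ens3 root ; sleep 20; tc qdisc ls dev ens3 ;"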
Test Failed" % (e) ) logging.error("stop_kubelet_scenario injection failed!") - sys.exit(1) + raise e # Node scenario to stop and start the kubelet def stop_start_kubelet_scenario(self, instance_kill_count, node, timeout): @@ -65,25 +71,28 @@ class abstract_node_scenarios: self.node_reboot_scenario(instance_kill_count, node, timeout) logging.info("stop_start_kubelet_scenario has been successfully injected!") - # Node scenario to restart the kubelet def restart_kubelet_scenario(self, instance_kill_count, node, timeout): for _ in range(instance_kill_count): try: logging.info("Starting restart_kubelet_scenario injection") logging.info("Restarting the kubelet of the node %s" % (node)) - runcommand.run("oc debug node/" + node + " -- chroot /host systemctl restart kubelet &") + runcommand.run( + "oc debug node/" + + node + + " -- chroot /host systemctl restart kubelet &" + ) nodeaction.wait_for_not_ready_status(node, timeout, self.kubecli) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) logging.info("The kubelet of the node %s has been restarted" % (node)) logging.info("restart_kubelet_scenario has been successfuly injected!") except Exception as e: logging.error( - "Failed to restart the kubelet of the node. Encountered following " "exception: %s. Test Failed" % (e) + "Failed to restart the kubelet of the node. Encountered following " + "exception: %s. Test Failed" % (e) ) logging.error("restart_kubelet_scenario injection failed!") - sys.exit(1) - + raise e # Node scenario to crash the node def node_crash_scenario(self, instance_kill_count, node, timeout): @@ -92,13 +101,17 @@ class abstract_node_scenarios: logging.info("Starting node_crash_scenario injection") logging.info("Crashing the node %s" % (node)) runcommand.invoke( - "oc debug node/" + node + " -- chroot /host " "dd if=/dev/urandom of=/proc/sysrq-trigger" + "oc debug node/" + node + " -- chroot /host " + "dd if=/dev/urandom of=/proc/sysrq-trigger" ) logging.info("node_crash_scenario has been successfuly injected!") except Exception as e: - logging.error("Failed to crash the node. Encountered following exception: %s. " "Test Failed" % (e)) + logging.error( + "Failed to crash the node. Encountered following exception: %s. 
" + "Test Failed" % (e) + ) logging.error("node_crash_scenario injection failed!") - sys.exit(1) + raise e # Node scenario to check service status on helper node def node_service_status(self, node, service, ssh_private_key, timeout): diff --git a/kraken/node_actions/alibaba_node_scenarios.py b/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py similarity index 75% rename from kraken/node_actions/alibaba_node_scenarios.py rename to krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py index 47c2f226..b9ce0f49 100644 --- a/kraken/node_actions/alibaba_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py @@ -1,13 +1,22 @@ import sys import time import logging -import kraken.node_actions.common_node_functions as nodeaction +import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction import os import json from aliyunsdkcore.client import AcsClient -from aliyunsdkecs.request.v20140526 import DescribeInstancesRequest, DeleteInstanceRequest -from aliyunsdkecs.request.v20140526 import StopInstanceRequest, StartInstanceRequest, RebootInstanceRequest -from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios +from aliyunsdkecs.request.v20140526 import ( + DescribeInstancesRequest, + DeleteInstanceRequest, +) +from aliyunsdkecs.request.v20140526 import ( + StopInstanceRequest, + StartInstanceRequest, + RebootInstanceRequest, +) +from krkn.scenario_plugins.node_actions.abstract_node_scenarios import ( + abstract_node_scenarios, +) from krkn_lib.k8s import KrknKubernetes @@ -46,12 +55,12 @@ class Alibaba: "variables/credentials are correct" ) logging.error(response) - sys.exit(1) + raise RuntimeError(response) return instance_list return [] except Exception as e: logging.error("ERROR while trying to get list of instances " + str(e)) - sys.exit(1) + raise e # Get the instance ID of the node def get_instance_id(self, node_name): @@ -59,8 +68,16 @@ class Alibaba: for vm in vm_list: if node_name == vm["InstanceName"]: return vm["InstanceId"] - logging.error("Couldn't find vm with name " + str(node_name) + ", you could try another region") - sys.exit(1) + logging.error( + "Couldn't find vm with name " + + str(node_name) + + ", you could try another region" + ) + raise RuntimeError( + "Couldn't find vm with name " + + str(node_name) + + ", you could try another region" + ) # Start the node instance def start_instances(self, instance_id): @@ -72,9 +89,10 @@ class Alibaba: logging.info("ECS instance with id " + str(instance_id) + " started") except Exception as e: logging.error( - "Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e) + "Failed to start node instance %s. Encountered following " + "exception: %s." % (instance_id, e) ) - sys.exit(1) + raise e # https://partners-intl.aliyun.com/help/en/doc-detail/93110.html # Stop the node instance @@ -86,8 +104,11 @@ class Alibaba: self._send_request(request) logging.info("Stop %s command submit successfully.", instance_id) except Exception as e: - logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (instance_id, e)) - sys.exit(1) + logging.error( + "Failed to stop node instance %s. Encountered following " + "exception: %s." 
% (instance_id, e) + ) + raise e # Terminate the node instance def release_instance(self, instance_id, force_release=True): @@ -99,9 +120,10 @@ class Alibaba: logging.info("ECS Instance " + str(instance_id) + " released") except Exception as e: logging.error( - "Failed to terminate node instance %s. Encountered following " "exception: %s." % (instance_id, e) + "Failed to terminate node instance %s. Encountered following " + "exception: %s." % (instance_id, e) ) - sys.exit(1) + raise e # Reboot the node instance def reboot_instances(self, instance_id, force_reboot=True): @@ -113,9 +135,10 @@ class Alibaba: logging.info("ECS Instance " + str(instance_id) + " rebooted") except Exception as e: logging.error( - "Failed to reboot node instance %s. Encountered following " "exception: %s." % (instance_id, e) + "Failed to reboot node instance %s. Encountered following " + "exception: %s." % (instance_id, e) ) - sys.exit(1) + raise e def get_vm_status(self, instance_id): @@ -132,7 +155,8 @@ class Alibaba: return "Unknown" except Exception as e: logging.error( - "Failed to get node instance status %s. Encountered following " "exception: %s." % (instance_id, e) + "Failed to get node instance status %s. Encountered following " + "exception: %s." % (instance_id, e) ) return None @@ -142,7 +166,9 @@ class Alibaba: status = self.get_vm_status(instance_id) while status != "Running": status = self.get_vm_status(instance_id) - logging.info("ECS %s is still not running, sleeping for 5 seconds" % instance_id) + logging.info( + "ECS %s is still not running, sleeping for 5 seconds" % instance_id + ) time.sleep(5) time_counter += 5 if time_counter >= timeout: @@ -156,11 +182,15 @@ class Alibaba: status = self.get_vm_status(instance_id) while status != "Stopped": status = self.get_vm_status(instance_id) - logging.info("Vm %s is still stopping, sleeping for 5 seconds" % instance_id) + logging.info( + "Vm %s is still stopping, sleeping for 5 seconds" % instance_id + ) time.sleep(5) time_counter += 5 if time_counter >= timeout: - logging.info("Vm %s is still not stopped in allotted time" % instance_id) + logging.info( + "Vm %s is still not stopped in allotted time" % instance_id + ) return False return True @@ -170,7 +200,9 @@ class Alibaba: time_counter = 0 while statuses and statuses != "Released": statuses = self.get_vm_status(instance_id) - logging.info("ECS %s is still being released, waiting 10 seconds" % instance_id) + logging.info( + "ECS %s is still being released, waiting 10 seconds" % instance_id + ) time.sleep(10) time_counter += 10 if time_counter >= timeout: @@ -180,9 +212,10 @@ class Alibaba: logging.info("ECS %s is released" % instance_id) return True + # krkn_lib class alibaba_node_scenarios(abstract_node_scenarios): - def __init__(self,kubecli: KrknKubernetes): + def __init__(self, kubecli: KrknKubernetes): self.alibaba = Alibaba() # Node scenario to start the node @@ -191,7 +224,9 @@ class alibaba_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_start_scenario injection") vm_id = self.alibaba.get_instance_id(node) - logging.info("Starting the node %s with instance ID: %s " % (node, vm_id)) + logging.info( + "Starting the node %s with instance ID: %s " % (node, vm_id) + ) self.alibaba.start_instances(vm_id) self.alibaba.wait_until_running(vm_id, timeout) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) @@ -199,10 +234,11 @@ class alibaba_node_scenarios(abstract_node_scenarios): logging.info("node_start_scenario has been successfully injected!") except Exception 
as e: logging.error( - "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) + "Failed to start node instance. Encountered following " + "exception: %s. Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - sys.exit(1) + raise e # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -210,36 +246,48 @@ class alibaba_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_stop_scenario injection") vm_id = self.alibaba.get_instance_id(node) - logging.info("Stopping the node %s with instance ID: %s " % (node, vm_id)) + logging.info( + "Stopping the node %s with instance ID: %s " % (node, vm_id) + ) self.alibaba.stop_instances(vm_id) self.alibaba.wait_until_stopped(vm_id, timeout) logging.info("Node with instance ID: %s is in stopped state" % vm_id) nodeaction.wait_for_unknown_status(node, timeout, self.kubecli) except Exception as e: - logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % e) + logging.error( + "Failed to stop node instance. Encountered following exception: %s. " + "Test Failed" % e + ) logging.error("node_stop_scenario injection failed!") - sys.exit(1) + raise e # Might need to stop and then release the instance # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): for _ in range(instance_kill_count): try: - logging.info("Starting node_termination_scenario injection by first stopping instance") + logging.info( + "Starting node_termination_scenario injection by first stopping instance" + ) vm_id = self.alibaba.get_instance_id(node) self.alibaba.stop_instances(vm_id) self.alibaba.wait_until_stopped(vm_id, timeout) - logging.info("Releasing the node %s with instance ID: %s " % (node, vm_id)) + logging.info( + "Releasing the node %s with instance ID: %s " % (node, vm_id) + ) self.alibaba.release_instance(vm_id) self.alibaba.wait_until_released(vm_id, timeout) logging.info("Node with instance ID: %s has been released" % node) - logging.info("node_termination_scenario has been successfully injected!") + logging.info( + "node_termination_scenario has been successfully injected!" + ) except Exception as e: logging.error( - "Failed to release node instance. Encountered following exception:" " %s. Test Failed" % (e) + "Failed to release node instance. Encountered following exception:" + " %s. Test Failed" % (e) ) logging.error("node_termination_scenario injection failed!") - sys.exit(1) + raise e # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): @@ -251,11 +299,14 @@ class alibaba_node_scenarios(abstract_node_scenarios): self.alibaba.reboot_instances(instance_id) nodeaction.wait_for_unknown_status(node, timeout, self.kubecli) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) - logging.info("Node with instance ID: %s has been rebooted" % (instance_id)) + logging.info( + "Node with instance ID: %s has been rebooted" % (instance_id) + ) logging.info("node_reboot_scenario has been successfully injected!") except Exception as e: logging.error( - "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) + "Failed to reboot node instance. Encountered following exception:" + " %s. 
Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - sys.exit(1) + raise e diff --git a/kraken/node_actions/aws_node_scenarios.py b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py similarity index 77% rename from kraken/node_actions/aws_node_scenarios.py rename to krkn/scenario_plugins/node_actions/aws_node_scenarios.py index 6894e620..c715a3e8 100644 --- a/kraken/node_actions/aws_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py @@ -2,10 +2,13 @@ import sys import time import boto3 import logging -import kraken.node_actions.common_node_functions as nodeaction -from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios +import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction +from krkn.scenario_plugins.node_actions.abstract_node_scenarios import ( + abstract_node_scenarios, +) from krkn_lib.k8s import KrknKubernetes + class AWS: def __init__(self): self.boto_client = boto3.client("ec2") @@ -28,10 +31,9 @@ class AWS: logging.info("EC2 instance: " + str(instance_id) + " started") except Exception as e: logging.error( - "Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e) + "Failed to start node instance %s. Encountered following " + "exception: %s." % (instance_id, e) ) - # removed_exit - # sys.exit(1) raise RuntimeError() # Stop the node instance @@ -40,9 +42,10 @@ class AWS: self.boto_client.stop_instances(InstanceIds=[instance_id]) logging.info("EC2 instance: " + str(instance_id) + " stopped") except Exception as e: - logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (instance_id, e)) - # removed_exit - # sys.exit(1) + logging.error( + "Failed to stop node instance %s. Encountered following " + "exception: %s." % (instance_id, e) + ) raise RuntimeError() # Terminate the node instance @@ -52,10 +55,9 @@ class AWS: logging.info("EC2 instance: " + str(instance_id) + " terminated") except Exception as e: logging.error( - "Failed to terminate node instance %s. Encountered following " "exception: %s." % (instance_id, e) + "Failed to terminate node instance %s. Encountered following " + "exception: %s." % (instance_id, e) ) - # removed_exit - # sys.exit(1) raise RuntimeError() # Reboot the node instance @@ -65,10 +67,9 @@ class AWS: logging.info("EC2 instance " + str(instance_id) + " rebooted") except Exception as e: logging.error( - "Failed to reboot node instance %s. Encountered following " "exception: %s." % (instance_id, e) + "Failed to reboot node instance %s. Encountered following " + "exception: %s." 
% (instance_id, e) ) - # removed_exit - # sys.exit(1) raise RuntimeError() # Below functions poll EC2.Client.describe_instances() every 15 seconds @@ -80,7 +81,10 @@ class AWS: self.boto_instance.wait_until_running(InstanceIds=[instance_id]) return True except Exception as e: - logging.error("Failed to get status waiting for %s to be running %s" % (instance_id, e)) + logging.error( + "Failed to get status waiting for %s to be running %s" + % (instance_id, e) + ) return False # Wait until the node instance is stopped @@ -89,7 +93,10 @@ class AWS: self.boto_instance.wait_until_stopped(InstanceIds=[instance_id]) return True except Exception as e: - logging.error("Failed to get status waiting for %s to be stopped %s" % (instance_id, e)) + logging.error( + "Failed to get status waiting for %s to be stopped %s" + % (instance_id, e) + ) return False # Wait until the node instance is terminated @@ -98,7 +105,10 @@ class AWS: self.boto_instance.wait_until_terminated(InstanceIds=[instance_id]) return True except Exception as e: - logging.error("Failed to get status waiting for %s to be terminated %s" % (instance_id, e)) + logging.error( + "Failed to get status waiting for %s to be terminated %s" + % (instance_id, e) + ) return False # Creates a deny network acl and returns the id @@ -111,10 +121,10 @@ class AWS: except Exception as e: logging.error( "Failed to create the default network_acl: %s" - "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e) + "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" + % (e) ) - # removed_exit - # sys.exit(1) + raise RuntimeError() return acl_id @@ -122,13 +132,14 @@ class AWS: def replace_network_acl_association(self, association_id, acl_id): try: logging.info("Replacing the network acl associated with the subnet") - status = self.boto_client.replace_network_acl_association(AssociationId=association_id, NetworkAclId=acl_id) + status = self.boto_client.replace_network_acl_association( + AssociationId=association_id, NetworkAclId=acl_id + ) logging.info(status) new_association_id = status["NewAssociationId"] except Exception as e: logging.error("Failed to replace network acl association: %s" % (e)) - # removed_exit - # sys.exit(1) + raise RuntimeError() return new_association_id @@ -144,10 +155,10 @@ class AWS: except Exception as e: logging.error( "Failed to describe network acl: %s." 
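The wait_until_* helpers above delegate to boto's built-in EC2 waiters, which is also why the surrounding comment notes a 15-second describe_instances poll. As an illustration of the same facility spelled through the client API (not the attribute the class actually uses, which is initialised outside this hunk):

import logging
import boto3

ec2 = boto3.client("ec2")  # region/credentials come from the usual AWS config


def wait_for_state(instance_id: str, waiter_name: str = "instance_running") -> bool:
    """Block until the instance reaches the named state, e.g. instance_running,
    instance_stopped or instance_terminated (polled every 15 seconds by default)."""
    try:
        ec2.get_waiter(waiter_name).wait(InstanceIds=[instance_id])
        return True
    except Exception as e:
        logging.error("Failed to get status waiting for %s: %s", instance_id, e)
        return False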
- "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e) + "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" + % (e) ) - # removed_exit - # sys.exit(1) + raise RuntimeError() associations = response["NetworkAcls"][0]["Associations"] # grab the current network_acl in use @@ -165,10 +176,10 @@ class AWS: "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (acl_id, e) ) - # removed_exit - # sys.exit(1) + raise RuntimeError() + # krkn_lib class aws_node_scenarios(abstract_node_scenarios): def __init__(self, kubecli: KrknKubernetes): @@ -181,19 +192,23 @@ class aws_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_start_scenario injection") instance_id = self.aws.get_instance_id(node) - logging.info("Starting the node %s with instance ID: %s " % (node, instance_id)) + logging.info( + "Starting the node %s with instance ID: %s " % (node, instance_id) + ) self.aws.start_instances(instance_id) self.aws.wait_until_running(instance_id) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) - logging.info("Node with instance ID: %s is in running state" % (instance_id)) + logging.info( + "Node with instance ID: %s is in running state" % (instance_id) + ) logging.info("node_start_scenario has been successfully injected!") except Exception as e: logging.error( - "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) + "Failed to start node instance. Encountered following " + "exception: %s. Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node @@ -202,16 +217,22 @@ class aws_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_stop_scenario injection") instance_id = self.aws.get_instance_id(node) - logging.info("Stopping the node %s with instance ID: %s " % (node, instance_id)) + logging.info( + "Stopping the node %s with instance ID: %s " % (node, instance_id) + ) self.aws.stop_instances(instance_id) self.aws.wait_until_stopped(instance_id) - logging.info("Node with instance ID: %s is in stopped state" % (instance_id)) + logging.info( + "Node with instance ID: %s is in stopped state" % (instance_id) + ) nodeaction.wait_for_unknown_status(node, timeout, self.kubecli) except Exception as e: - logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e)) + logging.error( + "Failed to stop node instance. Encountered following exception: %s. 
" + "Test Failed" % (e) + ) logging.error("node_stop_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to terminate the node @@ -220,7 +241,10 @@ class aws_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_termination_scenario injection") instance_id = self.aws.get_instance_id(node) - logging.info("Terminating the node %s with instance ID: %s " % (node, instance_id)) + logging.info( + "Terminating the node %s with instance ID: %s " + % (node, instance_id) + ) self.aws.terminate_instances(instance_id) self.aws.wait_until_terminated(instance_id) for _ in range(timeout): @@ -229,15 +253,17 @@ class aws_node_scenarios(abstract_node_scenarios): time.sleep(1) if node in self.kubecli.list_nodes(): raise Exception("Node could not be terminated") - logging.info("Node with instance ID: %s has been terminated" % (instance_id)) + logging.info( + "Node with instance ID: %s has been terminated" % (instance_id) + ) logging.info("node_termination_scenario has been successfuly injected!") except Exception as e: logging.error( - "Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % (e) + "Failed to terminate node instance. Encountered following exception:" + " %s. Test Failed" % (e) ) logging.error("node_termination_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to reboot the node @@ -246,17 +272,21 @@ class aws_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_reboot_scenario injection" + str(node)) instance_id = self.aws.get_instance_id(node) - logging.info("Rebooting the node %s with instance ID: %s " % (node, instance_id)) + logging.info( + "Rebooting the node %s with instance ID: %s " % (node, instance_id) + ) self.aws.reboot_instances(instance_id) nodeaction.wait_for_unknown_status(node, timeout, self.kubecli) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) - logging.info("Node with instance ID: %s has been rebooted" % (instance_id)) + logging.info( + "Node with instance ID: %s has been rebooted" % (instance_id) + ) logging.info("node_reboot_scenario has been successfuly injected!") except Exception as e: logging.error( - "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) + "Failed to reboot node instance. Encountered following exception:" + " %s. 
Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() diff --git a/kraken/node_actions/az_node_scenarios.py b/krkn/scenario_plugins/node_actions/az_node_scenarios.py similarity index 80% rename from kraken/node_actions/az_node_scenarios.py rename to krkn/scenario_plugins/node_actions/az_node_scenarios.py index 43e973af..6cad8c12 100644 --- a/kraken/node_actions/az_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/az_node_scenarios.py @@ -1,16 +1,15 @@ - import time import os -import kraken.invoke.command as runcommand import logging -import kraken.node_actions.common_node_functions as nodeaction -from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios +import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction +from krkn.scenario_plugins.node_actions.abstract_node_scenarios import ( + abstract_node_scenarios, +) from azure.mgmt.compute import ComputeManagementClient from azure.identity import DefaultAzureCredential from krkn_lib.k8s import KrknKubernetes - class Azure: def __init__(self): logging.info("azure " + str(self)) @@ -39,9 +38,10 @@ class Azure: self.compute_client.virtual_machines.begin_start(group_name, vm_name) logging.info("vm name " + str(vm_name) + " started") except Exception as e: - logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (vm_name, e)) - # removed_exit - # sys.exit(1) + logging.error( + "Failed to start node instance %s. Encountered following " + "exception: %s." % (vm_name, e) + ) raise RuntimeError() # Stop the node instance @@ -50,9 +50,10 @@ class Azure: self.compute_client.virtual_machines.begin_power_off(group_name, vm_name) logging.info("vm name " + str(vm_name) + " stopped") except Exception as e: - logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (vm_name, e)) - # removed_exit - # sys.exit(1) + logging.error( + "Failed to stop node instance %s. Encountered following " + "exception: %s." % (vm_name, e) + ) raise RuntimeError() # Terminate the node instance @@ -62,10 +63,10 @@ class Azure: logging.info("vm name " + str(vm_name) + " terminated") except Exception as e: logging.error( - "Failed to terminate node instance %s. Encountered following " "exception: %s." % (vm_name, e) + "Failed to terminate node instance %s. Encountered following " + "exception: %s." % (vm_name, e) ) - # removed_exit - # sys.exit(1) + raise RuntimeError() # Reboot the node instance @@ -74,13 +75,17 @@ class Azure: self.compute_client.virtual_machines.begin_restart(group_name, vm_name) logging.info("vm name " + str(vm_name) + " rebooted") except Exception as e: - logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (vm_name, e)) - # removed_exit - # sys.exit(1) + logging.error( + "Failed to reboot node instance %s. Encountered following " + "exception: %s." 
% (vm_name, e) + ) + raise RuntimeError() def get_vm_status(self, resource_group, vm_name): - statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name).statuses + statuses = self.compute_client.virtual_machines.instance_view( + resource_group, vm_name + ).statuses status = len(statuses) >= 2 and statuses[1] return status @@ -114,12 +119,16 @@ class Azure: # Wait until the node instance is terminated def wait_until_terminated(self, resource_group, vm_name, timeout): - statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name).statuses[0] + statuses = self.compute_client.virtual_machines.instance_view( + resource_group, vm_name + ).statuses[0] logging.info("vm status " + str(statuses)) time_counter = 0 while statuses.code == "ProvisioningState/deleting": try: - statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name).statuses[0] + statuses = self.compute_client.virtual_machines.instance_view( + resource_group, vm_name + ).statuses[0] logging.info("Vm %s is still deleting, waiting 10 seconds" % vm_name) time.sleep(10) time_counter += 10 @@ -130,6 +139,7 @@ class Azure: logging.info("Vm %s is terminated" % vm_name) return True + # krkn_lib class azure_node_scenarios(abstract_node_scenarios): def __init__(self, kubecli: KrknKubernetes): @@ -143,19 +153,22 @@ class azure_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_start_scenario injection") vm_name, resource_group = self.azure.get_instance_id(node) - logging.info("Starting the node %s with instance ID: %s " % (vm_name, resource_group)) + logging.info( + "Starting the node %s with instance ID: %s " + % (vm_name, resource_group) + ) self.azure.start_instances(resource_group, vm_name) self.azure.wait_until_running(resource_group, vm_name, timeout) - nodeaction.wait_for_ready_status(vm_name, timeout,self.kubecli) + nodeaction.wait_for_ready_status(vm_name, timeout, self.kubecli) logging.info("Node with instance ID: %s is in running state" % node) logging.info("node_start_scenario has been successfully injected!") except Exception as e: logging.error( - "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) + "Failed to start node instance. Encountered following " + "exception: %s. Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node @@ -164,16 +177,21 @@ class azure_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_stop_scenario injection") vm_name, resource_group = self.azure.get_instance_id(node) - logging.info("Stopping the node %s with instance ID: %s " % (vm_name, resource_group)) + logging.info( + "Stopping the node %s with instance ID: %s " + % (vm_name, resource_group) + ) self.azure.stop_instances(resource_group, vm_name) self.azure.wait_until_stopped(resource_group, vm_name, timeout) logging.info("Node with instance ID: %s is in stopped state" % vm_name) nodeaction.wait_for_unknown_status(vm_name, timeout, self.kubecli) except Exception as e: - logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % e) + logging.error( + "Failed to stop node instance. Encountered following exception: %s. 
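get_vm_status above relies on the instance-view convention that statuses[0] is the provisioning state and statuses[1] the power state, and wait_until_terminated watches for the ProvisioningState/deleting code. A minimal illustration of reading the power state directly with azure-mgmt-compute; the subscription lookup is a placeholder assumption.

import os
from azure.identity import DefaultAzureCredential
from azure.mgmt.compute import ComputeManagementClient

compute_client = ComputeManagementClient(
    DefaultAzureCredential(), os.environ["AZURE_SUBSCRIPTION_ID"]  # placeholder
)


def vm_power_state(resource_group: str, vm_name: str) -> str:
    # Instance-view statuses typically look like:
    #   statuses[0].code == "ProvisioningState/succeeded"
    #   statuses[1].code == "PowerState/running" (or PowerState/deallocated, ...)
    view = compute_client.virtual_machines.instance_view(resource_group, vm_name)
    power = [s.code for s in view.statuses if s.code.startswith("PowerState/")]
    return power[0] if power else "unknown"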
" + "Test Failed" % e + ) logging.error("node_stop_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to terminate the node @@ -182,7 +200,10 @@ class azure_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_termination_scenario injection") vm_name, resource_group = self.azure.get_instance_id(node) - logging.info("Terminating the node %s with instance ID: %s " % (vm_name, resource_group)) + logging.info( + "Terminating the node %s with instance ID: %s " + % (vm_name, resource_group) + ) self.azure.terminate_instances(resource_group, vm_name) self.azure.wait_until_terminated(resource_group, vm_name, timeout) for _ in range(timeout): @@ -192,14 +213,16 @@ class azure_node_scenarios(abstract_node_scenarios): if vm_name in self.kubecli.list_nodes(): raise Exception("Node could not be terminated") logging.info("Node with instance ID: %s has been terminated" % node) - logging.info("node_termination_scenario has been successfully injected!") + logging.info( + "node_termination_scenario has been successfully injected!" + ) except Exception as e: logging.error( - "Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % (e) + "Failed to terminate node instance. Encountered following exception:" + " %s. Test Failed" % (e) ) logging.error("node_termination_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to reboot the node @@ -208,7 +231,10 @@ class azure_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_reboot_scenario injection") vm_name, resource_group = self.azure.get_instance_id(node) - logging.info("Rebooting the node %s with instance ID: %s " % (vm_name, resource_group)) + logging.info( + "Rebooting the node %s with instance ID: %s " + % (vm_name, resource_group) + ) self.azure.reboot_instances(resource_group, vm_name) nodeaction.wait_for_unknown_status(vm_name, timeout, self.kubecli) nodeaction.wait_for_ready_status(vm_name, timeout, self.kubecli) @@ -216,9 +242,9 @@ class azure_node_scenarios(abstract_node_scenarios): logging.info("node_reboot_scenario has been successfully injected!") except Exception as e: logging.error( - "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) + "Failed to reboot node instance. Encountered following exception:" + " %s. 
Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - # removed_exit - # sys.exit(1) - raise RuntimeError() \ No newline at end of file + + raise RuntimeError() diff --git a/kraken/node_actions/bm_node_scenarios.py b/krkn/scenario_plugins/node_actions/bm_node_scenarios.py similarity index 79% rename from kraken/node_actions/bm_node_scenarios.py rename to krkn/scenario_plugins/node_actions/bm_node_scenarios.py index 6904c0e6..27f7d35b 100644 --- a/kraken/node_actions/bm_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/bm_node_scenarios.py @@ -1,14 +1,16 @@ -import kraken.node_actions.common_node_functions as nodeaction -from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios +import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction +from krkn.scenario_plugins.node_actions.abstract_node_scenarios import ( + abstract_node_scenarios, +) import logging import openshift as oc import pyipmi import pyipmi.interfaces -import sys import time import traceback from krkn_lib.k8s import KrknKubernetes + class BM: def __init__(self, bm_info, user, passwd): self.user = user @@ -22,7 +24,11 @@ class BM: # Get the ipmi or other BMC address of the baremetal node def get_bmc_addr(self, node_name): # Addresses in the config get higher priority. - if self.bm_info is not None and node_name in self.bm_info and "bmc_addr" in self.bm_info[node_name]: + if ( + self.bm_info is not None + and node_name in self.bm_info + and "bmc_addr" in self.bm_info[node_name] + ): return self.bm_info[node_name]["bmc_addr"] # Get the bmc addr from the BareMetalHost object. @@ -40,7 +46,10 @@ class BM: 'BMC addr empty for node "%s". Either fix the BMH object,' " or specify the address in the scenario config" % node_name ) - sys.exit(1) + raise RuntimeError( + 'BMC addr empty for node "%s". Either fix the BMH object,' + " or specify the address in the scenario config" % node_name + ) return bmh_object.model.spec.bmc.address def get_ipmi_connection(self, bmc_addr, node_name): @@ -69,10 +78,15 @@ class BM: "Missing IPMI BMI user and/or password for baremetal cloud. " "Please specify either a global or per-machine user and pass" ) - sys.exit(1) + raise RuntimeError( + "Missing IPMI BMI user and/or password for baremetal cloud. 
" + "Please specify either a global or per-machine user and pass" + ) # Establish connection - interface = pyipmi.interfaces.create_interface("ipmitool", interface_type="lanplus") + interface = pyipmi.interfaces.create_interface( + "ipmitool", interface_type="lanplus" + ) connection = pyipmi.create_connection(interface) @@ -96,14 +110,21 @@ class BM: # Wait until the node instance is running def wait_until_running(self, bmc_addr, node_name): - while not self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on: + while ( + not self.get_ipmi_connection(bmc_addr, node_name) + .get_chassis_status() + .power_on + ): time.sleep(1) # Wait until the node instance is stopped def wait_until_stopped(self, bmc_addr, node_name): - while self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on: + while ( + self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on + ): time.sleep(1) + # krkn_lib class bm_node_scenarios(abstract_node_scenarios): def __init__(self, bm_info, user, passwd, kubecli: KrknKubernetes): @@ -116,11 +137,15 @@ class bm_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_start_scenario injection") bmc_addr = self.bm.get_bmc_addr(node) - logging.info("Starting the node %s with bmc address: %s " % (node, bmc_addr)) + logging.info( + "Starting the node %s with bmc address: %s " % (node, bmc_addr) + ) self.bm.start_instances(bmc_addr, node) self.bm.wait_until_running(bmc_addr, node) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) - logging.info("Node with bmc address: %s is in running state" % (bmc_addr)) + logging.info( + "Node with bmc address: %s is in running state" % (bmc_addr) + ) logging.info("node_start_scenario has been successfully injected!") except Exception as e: logging.error( @@ -129,7 +154,7 @@ class bm_node_scenarios(abstract_node_scenarios): "an incorrect ipmi address or login" % (e) ) logging.error("node_start_scenario injection failed!") - sys.exit(1) + raise e # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -137,10 +162,14 @@ class bm_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_stop_scenario injection") bmc_addr = self.bm.get_bmc_addr(node) - logging.info("Stopping the node %s with bmc address: %s " % (node, bmc_addr)) + logging.info( + "Stopping the node %s with bmc address: %s " % (node, bmc_addr) + ) self.bm.stop_instances(bmc_addr, node) self.bm.wait_until_stopped(bmc_addr, node) - logging.info("Node with bmc address: %s is in stopped state" % (bmc_addr)) + logging.info( + "Node with bmc address: %s is in stopped state" % (bmc_addr) + ) nodeaction.wait_for_unknown_status(node, timeout, self.kubecli) except Exception as e: logging.error( @@ -149,7 +178,7 @@ class bm_node_scenarios(abstract_node_scenarios): "an incorrect ipmi address or login" % (e) ) logging.error("node_stop_scenario injection failed!") - sys.exit(1) + raise e # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): @@ -162,7 +191,9 @@ class bm_node_scenarios(abstract_node_scenarios): logging.info("Starting node_reboot_scenario injection") bmc_addr = self.bm.get_bmc_addr(node) logging.info("BMC Addr: %s" % (bmc_addr)) - logging.info("Rebooting the node %s with bmc address: %s " % (node, bmc_addr)) + logging.info( + "Rebooting the node %s with bmc address: %s " % (node, bmc_addr) + ) self.bm.reboot_instances(bmc_addr, node) nodeaction.wait_for_unknown_status(node, 
timeout, self.kubecli) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) @@ -176,4 +207,4 @@ class bm_node_scenarios(abstract_node_scenarios): ) traceback.print_exc() logging.error("node_reboot_scenario injection failed!") - sys.exit(1) + raise e diff --git a/kraken/node_actions/common_node_functions.py b/krkn/scenario_plugins/node_actions/common_node_functions.py similarity index 76% rename from kraken/node_actions/common_node_functions.py rename to krkn/scenario_plugins/node_actions/common_node_functions.py index 39827854..f4e47ae1 100644 --- a/kraken/node_actions/common_node_functions.py +++ b/krkn/scenario_plugins/node_actions/common_node_functions.py @@ -2,8 +2,9 @@ import time import random import logging import paramiko -import kraken.invoke.command as runcommand +import krkn.invoke.command as runcommand from krkn_lib.k8s import KrknKubernetes + node_general = False @@ -12,7 +13,10 @@ def get_node(node_name, label_selector, instance_kill_count, kubecli: KrknKubern if node_name in kubecli.list_killable_nodes(): return [node_name] elif node_name: - logging.info("Node with provided node_name does not exist or the node might " "be in NotReady state.") + logging.info( + "Node with provided node_name does not exist or the node might " + "be in NotReady state." + ) nodes = kubecli.list_killable_nodes(label_selector) if not nodes: raise Exception("Ready nodes with the provided label selector do not exist") @@ -34,12 +38,14 @@ def wait_for_ready_status(node, timeout, kubecli: KrknKubernetes): resource_version = kubecli.get_node_resource_version(node) kubecli.watch_node_status(node, "True", timeout, resource_version) + # krkn_lib # Wait until the node status becomes Not Ready def wait_for_not_ready_status(node, timeout, kubecli: KrknKubernetes): resource_version = kubecli.get_node_resource_version(node) kubecli.watch_node_status(node, "False", timeout, resource_version) + # krkn_lib # Wait until the node status becomes Unknown def wait_for_unknown_status(node, timeout, kubecli: KrknKubernetes): @@ -50,7 +56,8 @@ def wait_for_unknown_status(node, timeout, kubecli: KrknKubernetes): # Get the ip of the cluster node def get_node_ip(node): return runcommand.invoke( - "kubectl get node %s -o " "jsonpath='{.status.addresses[?(@.type==\"InternalIP\")].address}'" % (node) + "kubectl get node %s -o " + "jsonpath='{.status.addresses[?(@.type==\"InternalIP\")].address}'" % (node) ) @@ -74,15 +81,23 @@ def check_service_status(node, service, ssh_private_key, timeout): if connection is None: break except Exception as e: - logging.error("Failed to ssh to instance: %s within the timeout duration of %s: %s" % (node, timeout, e)) + logging.error( + "Failed to ssh to instance: %s within the timeout duration of %s: %s" + % (node, timeout, e) + ) for service_name in service: logging.info("Checking status of Service: %s" % (service_name)) stdin, stdout, stderr = ssh.exec_command( - "systemctl status %s | grep '^ Active' " "| awk '{print $2}'" % (service_name) + "systemctl status %s | grep '^ Active' " + "| awk '{print $2}'" % (service_name) ) service_status = stdout.readlines()[0] - logging.info("Status of service %s is %s \n" % (service_name, service_status.strip())) + logging.info( + "Status of service %s is %s \n" % (service_name, service_status.strip()) + ) if service_status.strip() != "active": - logging.error("Service %s is in %s state" % (service_name, service_status.strip())) + logging.error( + "Service %s is in %s state" % (service_name, service_status.strip()) + ) ssh.close() diff --git 
a/kraken/node_actions/docker_node_scenarios.py b/krkn/scenario_plugins/node_actions/docker_node_scenarios.py similarity index 67% rename from kraken/node_actions/docker_node_scenarios.py rename to krkn/scenario_plugins/node_actions/docker_node_scenarios.py index e77cce80..a2cdf116 100644 --- a/kraken/node_actions/docker_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/docker_node_scenarios.py @@ -1,17 +1,19 @@ -import kraken.node_actions.common_node_functions as nodeaction -from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios +import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction +from krkn.scenario_plugins.node_actions.abstract_node_scenarios import ( + abstract_node_scenarios, +) import logging -import sys import docker from krkn_lib.k8s import KrknKubernetes + class Docker: def __init__(self): self.client = docker.from_env() - def get_container_id(self, node_name): + def get_container_id(self, node_name): container = self.client.containers.get(node_name) - return container.id + return container.id # Start the node instance def start_instances(self, node_name): @@ -27,7 +29,7 @@ class Docker: def reboot_instances(self, node_name): container = self.client.containers.get(node_name) container.restart() - + # Terminate the node instance def terminate_instances(self, node_name): container = self.client.containers.get(node_name) @@ -46,17 +48,22 @@ class docker_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_start_scenario injection") container_id = self.docker.get_container_id(node) - logging.info("Starting the node %s with container ID: %s " % (node, container_id)) + logging.info( + "Starting the node %s with container ID: %s " % (node, container_id) + ) self.docker.start_instances(node) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) - logging.info("Node with container ID: %s is in running state" % (container_id)) + logging.info( + "Node with container ID: %s is in running state" % (container_id) + ) logging.info("node_start_scenario has been successfully injected!") except Exception as e: logging.error( - "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) + "Failed to start node instance. Encountered following " + "exception: %s. Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - sys.exit(1) + raise e # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -64,14 +71,21 @@ class docker_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_stop_scenario injection") container_id = self.docker.get_container_id(node) - logging.info("Stopping the node %s with container ID: %s " % (node, container_id)) + logging.info( + "Stopping the node %s with container ID: %s " % (node, container_id) + ) self.docker.stop_instances(node) - logging.info("Node with container ID: %s is in stopped state" % (container_id)) + logging.info( + "Node with container ID: %s is in stopped state" % (container_id) + ) nodeaction.wait_for_unknown_status(node, timeout, self.kubecli) except Exception as e: - logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e)) + logging.error( + "Failed to stop node instance. Encountered following exception: %s. 
" + "Test Failed" % (e) + ) logging.error("node_stop_scenario injection failed!") - sys.exit(1) + raise e # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): @@ -79,16 +93,22 @@ class docker_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_termination_scenario injection") container_id = self.docker.get_container_id(node) - logging.info("Terminating the node %s with container ID: %s " % (node, container_id)) + logging.info( + "Terminating the node %s with container ID: %s " + % (node, container_id) + ) self.docker.terminate_instances(node) - logging.info("Node with container ID: %s has been terminated" % (container_id)) + logging.info( + "Node with container ID: %s has been terminated" % (container_id) + ) logging.info("node_termination_scenario has been successfuly injected!") except Exception as e: logging.error( - "Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % (e) + "Failed to terminate node instance. Encountered following exception:" + " %s. Test Failed" % (e) ) logging.error("node_termination_scenario injection failed!") - sys.exit(1) + raise e # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): @@ -96,15 +116,21 @@ class docker_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_reboot_scenario injection") container_id = self.docker.get_container_id(node) - logging.info("Rebooting the node %s with container ID: %s " % (node, container_id)) + logging.info( + "Rebooting the node %s with container ID: %s " + % (node, container_id) + ) self.docker.reboot_instances(node) nodeaction.wait_for_unknown_status(node, timeout, self.kubecli) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) - logging.info("Node with container ID: %s has been rebooted" % (container_id)) + logging.info( + "Node with container ID: %s has been rebooted" % (container_id) + ) logging.info("node_reboot_scenario has been successfuly injected!") except Exception as e: logging.error( - "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) + "Failed to reboot node instance. Encountered following exception:" + " %s. 
Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - sys.exit(1) + raise e diff --git a/kraken/node_actions/gcp_node_scenarios.py b/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py similarity index 66% rename from kraken/node_actions/gcp_node_scenarios.py rename to krkn/scenario_plugins/node_actions/gcp_node_scenarios.py index f2c7ece3..437a9181 100644 --- a/kraken/node_actions/gcp_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py @@ -3,28 +3,32 @@ import sys import time import logging import json -import kraken.node_actions.common_node_functions as nodeaction -from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios +import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction +from krkn.scenario_plugins.node_actions.abstract_node_scenarios import ( + abstract_node_scenarios, +) from googleapiclient import discovery from oauth2client.client import GoogleCredentials -import kraken.invoke.command as runcommand from krkn_lib.k8s import KrknKubernetes + class GCP: def __init__(self): - try: + try: gapp_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") with open(gapp_creds, "r") as f: f_str = f.read() - self.project = json.loads(f_str)['project_id'] - #self.project = runcommand.invoke("gcloud config get-value project").split("/n")[0].strip() + self.project = json.loads(f_str)["project_id"] + # self.project = runcommand.invoke("gcloud config get-value project").split("/n")[0].strip() logging.info("project " + str(self.project) + "!") credentials = GoogleCredentials.get_application_default() - self.client = discovery.build("compute", "v1", credentials=credentials, cache_discovery=False) + self.client = discovery.build( + "compute", "v1", credentials=credentials, cache_discovery=False + ) - except Exception as e: + except Exception as e: logging.error("Error on setting up GCP connection: " + str(e)) - sys.exit(1) + raise e # Get the instance ID of the node def get_instance_id(self, node): @@ -32,7 +36,9 @@ class GCP: while zone_request is not None: zone_response = zone_request.execute() for zone in zone_response["items"]: - instances_request = self.client.instances().list(project=self.project, zone=zone["name"]) + instances_request = self.client.instances().list( + project=self.project, zone=zone["name"] + ) while instances_request is not None: instance_response = instances_request.execute() if "items" in instance_response.keys(): @@ -40,72 +46,87 @@ class GCP: if instance["name"] in node: return instance["name"], zone["name"] instances_request = self.client.zones().list_next( - previous_request=instances_request, previous_response=instance_response + previous_request=instances_request, + previous_response=instance_response, ) - zone_request = self.client.zones().list_next(previous_request=zone_request, previous_response=zone_response) + zone_request = self.client.zones().list_next( + previous_request=zone_request, previous_response=zone_response + ) logging.info("no instances ") # Start the node instance def start_instances(self, zone, instance_id): try: - self.client.instances().start(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().start( + project=self.project, zone=zone, instance=instance_id + ).execute() logging.info("vm name " + str(instance_id) + " started") except Exception as e: logging.error( - "Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e) + "Failed to start node instance %s. 
Encountered following " + "exception: %s." % (instance_id, e) ) - # removed_exit - # sys.exit(1) + raise RuntimeError() # Stop the node instance def stop_instances(self, zone, instance_id): try: - self.client.instances().stop(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().stop( + project=self.project, zone=zone, instance=instance_id + ).execute() logging.info("vm name " + str(instance_id) + " stopped") except Exception as e: - logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (instance_id, e)) - # removed_exit - # sys.exit(1) + logging.error( + "Failed to stop node instance %s. Encountered following " + "exception: %s." % (instance_id, e) + ) + raise RuntimeError() # Start the node instance def suspend_instances(self, zone, instance_id): try: - self.client.instances().suspend(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().suspend( + project=self.project, zone=zone, instance=instance_id + ).execute() logging.info("vm name " + str(instance_id) + " suspended") except Exception as e: logging.error( - "Failed to suspend node instance %s. Encountered following " "exception: %s." % (instance_id, e) + "Failed to suspend node instance %s. Encountered following " + "exception: %s." % (instance_id, e) ) - # removed_exit - # sys.exit(1) + raise RuntimeError() # Terminate the node instance def terminate_instances(self, zone, instance_id): try: - self.client.instances().delete(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().delete( + project=self.project, zone=zone, instance=instance_id + ).execute() logging.info("vm name " + str(instance_id) + " terminated") except Exception as e: logging.error( - "Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e) + "Failed to start node instance %s. Encountered following " + "exception: %s." % (instance_id, e) ) - # removed_exit - # sys.exit(1) + raise RuntimeError() # Reboot the node instance def reboot_instances(self, zone, instance_id): try: - self.client.instances().reset(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances().reset( + project=self.project, zone=zone, instance=instance_id + ).execute() logging.info("vm name " + str(instance_id) + " rebooted") except Exception as e: logging.error( - "Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e) + "Failed to start node instance %s. Encountered following " + "exception: %s." 
% (instance_id, e) ) - # removed_exit - # sys.exit(1) + raise RuntimeError() # Get instance status @@ -115,13 +136,20 @@ class GCP: i = 0 sleeper = 5 while i <= timeout: - instStatus = self.client.instances().get(project=self.project, zone=zone, instance=instance_id).execute() + instStatus = ( + self.client.instances() + .get(project=self.project, zone=zone, instance=instance_id) + .execute() + ) logging.info("Status of vm " + str(instStatus["status"])) if instStatus["status"] == expected_status: return True time.sleep(sleeper) i += sleeper - logging.error("Status of %s was not %s in %s seconds" % (instance_id, expected_status, timeout)) + logging.error( + "Status of %s was not %s in %s seconds" + % (instance_id, expected_status, timeout) + ) return False # Wait until the node instance is suspended @@ -143,7 +171,9 @@ class GCP: sleeper = 5 while i <= timeout: instStatus = ( - self.client.instances().get(project=self.project, zone=zone, instance=instance_id).execute() + self.client.instances() + .get(project=self.project, zone=zone, instance=instance_id) + .execute() ) logging.info("Status of vm " + str(instStatus["status"])) time.sleep(sleeper) @@ -164,19 +194,23 @@ class gcp_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_start_scenario injection") instance_id, zone = self.gcp.get_instance_id(node) - logging.info("Starting the node %s with instance ID: %s " % (node, instance_id)) + logging.info( + "Starting the node %s with instance ID: %s " % (node, instance_id) + ) self.gcp.start_instances(zone, instance_id) self.gcp.wait_until_running(zone, instance_id, timeout) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) - logging.info("Node with instance ID: %s is in running state" % instance_id) + logging.info( + "Node with instance ID: %s is in running state" % instance_id + ) logging.info("node_start_scenario has been successfully injected!") except Exception as e: logging.error( - "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) + "Failed to start node instance. Encountered following " + "exception: %s. Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node @@ -186,16 +220,22 @@ class gcp_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_stop_scenario injection") instance_id, zone = self.gcp.get_instance_id(node) - logging.info("Stopping the node %s with instance ID: %s " % (node, instance_id)) + logging.info( + "Stopping the node %s with instance ID: %s " % (node, instance_id) + ) self.gcp.stop_instances(zone, instance_id) self.gcp.wait_until_stopped(zone, instance_id, timeout) - logging.info("Node with instance ID: %s is in stopped state" % instance_id) + logging.info( + "Node with instance ID: %s is in stopped state" % instance_id + ) nodeaction.wait_for_unknown_status(node, timeout, self.kubecli) except Exception as e: - logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e)) + logging.error( + "Failed to stop node instance. Encountered following exception: %s. 
" + "Test Failed" % (e) + ) logging.error("node_stop_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to terminate the node @@ -204,7 +244,10 @@ class gcp_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_termination_scenario injection") instance_id, zone = self.gcp.get_instance_id(node) - logging.info("Terminating the node %s with instance ID: %s " % (node, instance_id)) + logging.info( + "Terminating the node %s with instance ID: %s " + % (node, instance_id) + ) self.gcp.terminate_instances(zone, instance_id) self.gcp.wait_until_terminated(zone, instance_id, timeout) for _ in range(timeout): @@ -212,17 +255,20 @@ class gcp_node_scenarios(abstract_node_scenarios): break time.sleep(1) if node in self.kubecli.list_nodes(): - raise Exception("Node could not be terminated") - logging.info("Node with instance ID: %s has been terminated" % instance_id) + raise RuntimeError("Node could not be terminated") + logging.info( + "Node with instance ID: %s has been terminated" % instance_id + ) logging.info("node_termination_scenario has been successfuly injected!") except Exception as e: logging.error( - "Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % e + "Failed to terminate node instance. Encountered following exception:" + " %s. Test Failed" % e ) logging.error("node_termination_scenario injection failed!") - # removed_exit - # sys.exit(1) - raise RuntimeError() + + + raise e # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): @@ -230,16 +276,20 @@ class gcp_node_scenarios(abstract_node_scenarios): try: logging.info("Starting node_reboot_scenario injection") instance_id, zone = self.gcp.get_instance_id(node) - logging.info("Rebooting the node %s with instance ID: %s " % (node, instance_id)) + logging.info( + "Rebooting the node %s with instance ID: %s " % (node, instance_id) + ) self.gcp.reboot_instances(zone, instance_id) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) - logging.info("Node with instance ID: %s has been rebooted" % instance_id) + logging.info( + "Node with instance ID: %s has been rebooted" % instance_id + ) logging.info("node_reboot_scenario has been successfuly injected!") except Exception as e: logging.error( - "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) + "Failed to reboot node instance. Encountered following exception:" + " %s. 
Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() diff --git a/kraken/node_actions/general_cloud_node_scenarios.py b/krkn/scenario_plugins/node_actions/general_cloud_node_scenarios.py similarity index 52% rename from kraken/node_actions/general_cloud_node_scenarios.py rename to krkn/scenario_plugins/node_actions/general_cloud_node_scenarios.py index 62419048..c0a7ac8b 100644 --- a/kraken/node_actions/general_cloud_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/general_cloud_node_scenarios.py @@ -1,11 +1,15 @@ import logging -from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios +from krkn.scenario_plugins.node_actions.abstract_node_scenarios import ( + abstract_node_scenarios, +) from krkn_lib.k8s import KrknKubernetes + class GENERAL: def __init__(self): pass + # krkn_lib class general_node_scenarios(abstract_node_scenarios): def __init__(self, kubecli: KrknKubernetes): @@ -14,16 +18,28 @@ class general_node_scenarios(abstract_node_scenarios): # Node scenario to start the node def node_start_scenario(self, instance_kill_count, node, timeout): - logging.info("Node start is not set up yet for this cloud type, " "no action is going to be taken") + logging.info( + "Node start is not set up yet for this cloud type, " + "no action is going to be taken" + ) # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): - logging.info("Node stop is not set up yet for this cloud type," " no action is going to be taken") + logging.info( + "Node stop is not set up yet for this cloud type," + " no action is going to be taken" + ) # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): - logging.info("Node termination is not set up yet for this cloud type, " "no action is going to be taken") + logging.info( + "Node termination is not set up yet for this cloud type, " + "no action is going to be taken" + ) # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): - logging.info("Node reboot is not set up yet for this cloud type," " no action is going to be taken") + logging.info( + "Node reboot is not set up yet for this cloud type," + " no action is going to be taken" + ) diff --git a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py new file mode 100644 index 00000000..c49afdaf --- /dev/null +++ b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py @@ -0,0 +1,219 @@ +import logging +import time + +import yaml +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn_lib.utils import get_yaml_item_value, log_exception + +from krkn import cerberus, utils +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin +from krkn.scenario_plugins.node_actions import common_node_functions +from krkn.scenario_plugins.node_actions.aws_node_scenarios import aws_node_scenarios +from krkn.scenario_plugins.node_actions.az_node_scenarios import azure_node_scenarios +from krkn.scenario_plugins.node_actions.docker_node_scenarios import ( + docker_node_scenarios, +) +from krkn.scenario_plugins.node_actions.gcp_node_scenarios import gcp_node_scenarios +from krkn.scenario_plugins.node_actions.general_cloud_node_scenarios import ( + 
general_node_scenarios, +) + +node_general = False + + +class NodeActionsScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + with open(scenario, "r") as f: + node_scenario_config = yaml.full_load(f) + for node_scenario in node_scenario_config["node_scenarios"]: + try: + node_scenario_object = self.get_node_scenario_object( + node_scenario, lib_telemetry.get_lib_kubernetes() + ) + if node_scenario["actions"]: + for action in node_scenario["actions"]: + start_time = int(time.time()) + self.inject_node_scenario( + action, + node_scenario, + node_scenario_object, + lib_telemetry.get_lib_kubernetes(), + ) + end_time = int(time.time()) + cerberus.get_status(krkn_config, start_time, end_time) + except (RuntimeError, Exception) as e: + logging.error("Node Actions exiting due to Exception %s" % e) + return 1 + else: + return 0 + + def get_node_scenario_object(self, node_scenario, kubecli: KrknKubernetes): + if ( + "cloud_type" not in node_scenario.keys() + or node_scenario["cloud_type"] == "generic" + ): + global node_general + node_general = True + return general_node_scenarios(kubecli) + if node_scenario["cloud_type"] == "aws": + return aws_node_scenarios(kubecli) + elif node_scenario["cloud_type"] == "gcp": + return gcp_node_scenarios(kubecli) + elif node_scenario["cloud_type"] == "openstack": + from krkn.scenario_plugins.node_actions.openstack_node_scenarios import ( + openstack_node_scenarios, + ) + + return openstack_node_scenarios(kubecli) + elif ( + node_scenario["cloud_type"] == "azure" + or node_scenario["cloud_type"] == "az" + ): + return azure_node_scenarios(kubecli) + elif ( + node_scenario["cloud_type"] == "alibaba" + or node_scenario["cloud_type"] == "alicloud" + ): + from krkn.scenario_plugins.node_actions.alibaba_node_scenarios import ( + alibaba_node_scenarios, + ) + + return alibaba_node_scenarios(kubecli) + elif node_scenario["cloud_type"] == "bm": + from krkn.scenario_plugins.node_actions.bm_node_scenarios import ( + bm_node_scenarios, + ) + + return bm_node_scenarios( + node_scenario.get("bmc_info"), + node_scenario.get("bmc_user", None), + node_scenario.get("bmc_password", None), + kubecli, + ) + elif node_scenario["cloud_type"] == "docker": + return docker_node_scenarios(kubecli) + else: + logging.error( + "Cloud type " + + node_scenario["cloud_type"] + + " is not currently supported; " + "try using 'generic' if wanting to stop/start kubelet or fork bomb on any " + "cluster" + ) + raise Exception( + "Cloud type " + + node_scenario["cloud_type"] + + " is not currently supported; " + "try using 'generic' if wanting to stop/start kubelet or fork bomb on any " + "cluster" + ) + + def inject_node_scenario( + self, action, node_scenario, node_scenario_object, kubecli: KrknKubernetes + ): + generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario") + # Get the node scenario configurations + run_kill_count = get_yaml_item_value(node_scenario, "runs", 1) + instance_kill_count = get_yaml_item_value(node_scenario, "instance_count", 1) + node_name = get_yaml_item_value(node_scenario, "node_name", "") + label_selector = get_yaml_item_value(node_scenario, "label_selector", "") + if action == "node_stop_start_scenario": + duration = get_yaml_item_value(node_scenario, "duration", 120) + timeout = get_yaml_item_value(node_scenario, "timeout", 120) + service = get_yaml_item_value(node_scenario, "service", "") + 
ssh_private_key = get_yaml_item_value( + node_scenario, "ssh_private_key", "~/.ssh/id_rsa" + ) + # Get the node to apply the scenario + if node_name: + node_name_list = node_name.split(",") + else: + node_name_list = [node_name] + for single_node_name in node_name_list: + nodes = common_node_functions.get_node( + single_node_name, label_selector, instance_kill_count, kubecli + ) + for single_node in nodes: + if node_general and action not in generic_cloud_scenarios: + logging.info( + "Scenario: " + + action + + " is not set up for generic cloud type, skipping action" + ) + else: + if action == "node_start_scenario": + node_scenario_object.node_start_scenario( + run_kill_count, single_node, timeout + ) + elif action == "node_stop_scenario": + node_scenario_object.node_stop_scenario( + run_kill_count, single_node, timeout + ) + elif action == "node_stop_start_scenario": + node_scenario_object.node_stop_start_scenario( + run_kill_count, single_node, timeout, duration + ) + elif action == "node_termination_scenario": + node_scenario_object.node_termination_scenario( + run_kill_count, single_node, timeout + ) + elif action == "node_reboot_scenario": + node_scenario_object.node_reboot_scenario( + run_kill_count, single_node, timeout + ) + elif action == "stop_start_kubelet_scenario": + node_scenario_object.stop_start_kubelet_scenario( + run_kill_count, single_node, timeout + ) + elif action == "restart_kubelet_scenario": + node_scenario_object.restart_kubelet_scenario( + run_kill_count, single_node, timeout + ) + elif action == "stop_kubelet_scenario": + node_scenario_object.stop_kubelet_scenario( + run_kill_count, single_node, timeout + ) + elif action == "node_crash_scenario": + node_scenario_object.node_crash_scenario( + run_kill_count, single_node, timeout + ) + elif action == "stop_start_helper_node_scenario": + if node_scenario["cloud_type"] != "openstack": + logging.error( + "Scenario: " + action + " is not supported for " + "cloud type " + + node_scenario["cloud_type"] + + ", skipping action" + ) + else: + if not node_scenario["helper_node_ip"]: + logging.error("Helper node IP address is not provided") + raise Exception( + "Helper node IP address is not provided" + ) + node_scenario_object.helper_node_stop_start_scenario( + run_kill_count, node_scenario["helper_node_ip"], timeout + ) + node_scenario_object.helper_node_service_status( + node_scenario["helper_node_ip"], + service, + ssh_private_key, + timeout, + ) + else: + logging.info( + "There is no node action that matches %s, skipping scenario" + % action + ) + + def get_scenario_types(self) -> list[str]: + return ["node_scenarios"] diff --git a/kraken/node_actions/openstack_node_scenarios.py b/krkn/scenario_plugins/node_actions/openstack_node_scenarios.py similarity index 79% rename from kraken/node_actions/openstack_node_scenarios.py rename to krkn/scenario_plugins/node_actions/openstack_node_scenarios.py index b4a33489..f7ce8563 100644 --- a/kraken/node_actions/openstack_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/openstack_node_scenarios.py @@ -1,11 +1,14 @@ import sys import time import logging -import kraken.invoke.command as runcommand -import kraken.node_actions.common_node_functions as nodeaction -from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios +import krkn.invoke.command as runcommand +import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction +from krkn.scenario_plugins.node_actions.abstract_node_scenarios import ( + abstract_node_scenarios, +) from 
krkn_lib.k8s import KrknKubernetes + class OPENSTACKCLOUD: def __init__(self): self.Wait = 30 @@ -22,9 +25,10 @@ class OPENSTACKCLOUD: runcommand.invoke("openstack server start %s" % (node)) logging.info("Instance: " + str(node) + " started") except Exception as e: - logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (node, e)) - # removed_exit - # sys.exit(1) + logging.error( + "Failed to start node instance %s. Encountered following " + "exception: %s." % (node, e) + ) raise RuntimeError() # Stop the node instance @@ -33,9 +37,10 @@ class OPENSTACKCLOUD: runcommand.invoke("openstack server stop %s" % (node)) logging.info("Instance: " + str(node) + " stopped") except Exception as e: - logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (node, e)) - # removed_exit - # sys.exit(1) + logging.error( + "Failed to stop node instance %s. Encountered following " + "exception: %s." % (node, e) + ) raise RuntimeError() # Reboot the node instance @@ -44,9 +49,10 @@ class OPENSTACKCLOUD: runcommand.invoke("openstack server reboot --soft %s" % (node)) logging.info("Instance: " + str(node) + " rebooted") except Exception as e: - logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (node, e)) - # removed_exit - # sys.exit(1) + logging.error( + "Failed to reboot node instance %s. Encountered following " + "exception: %s." % (node, e) + ) raise RuntimeError() # Wait until the node instance is running @@ -63,12 +69,16 @@ class OPENSTACKCLOUD: sleeper = 1 while i <= timeout: instStatus = runcommand.invoke( - "openstack server show %s | tr -d ' ' |" "grep '^|status' |" "cut -d '|' -f3 | tr -d '\n'" % (node) + "openstack server show %s | tr -d ' ' |" + "grep '^|status' |" + "cut -d '|' -f3 | tr -d '\n'" % (node) ) logging.info("instance status is %s" % (instStatus)) logging.info("expected status is %s" % (expected_status)) if instStatus.strip() == expected_status: - logging.info("instance status has reached desired status %s" % (instStatus)) + logging.info( + "instance status has reached desired status %s" % (instStatus) + ) return True time.sleep(sleeper) i += sleeper @@ -76,7 +86,9 @@ class OPENSTACKCLOUD: # Get the openstack instance name def get_openstack_nodename(self, os_node_ip): - server_list = runcommand.invoke("openstack server list | grep %s" % (os_node_ip)) + server_list = runcommand.invoke( + "openstack server list | grep %s" % (os_node_ip) + ) list_of_servers = server_list.split("\n") for item in list_of_servers: items = item.split("|") @@ -92,6 +104,7 @@ class OPENSTACKCLOUD: return node_name counter += 1 + # krkn_lib class openstack_node_scenarios(abstract_node_scenarios): def __init__(self, kubecli: KrknKubernetes): @@ -111,11 +124,11 @@ class openstack_node_scenarios(abstract_node_scenarios): logging.info("node_start_scenario has been successfully injected!") except Exception as e: logging.error( - "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) + "Failed to start node instance. Encountered following " + "exception: %s. 
Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node @@ -130,10 +143,12 @@ class openstack_node_scenarios(abstract_node_scenarios): logging.info("Node with instance name: %s is in stopped state" % (node)) nodeaction.wait_for_ready_status(node, timeout, self.kubecli) except Exception as e: - logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e)) + logging.error( + "Failed to stop node instance. Encountered following exception: %s. " + "Test Failed" % (e) + ) logging.error("node_stop_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to reboot the node @@ -150,11 +165,11 @@ class openstack_node_scenarios(abstract_node_scenarios): logging.info("node_reboot_scenario has been successfuly injected!") except Exception as e: logging.error( - "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) + "Failed to reboot node instance. Encountered following exception:" + " %s. Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to start the node @@ -162,7 +177,9 @@ class openstack_node_scenarios(abstract_node_scenarios): for _ in range(instance_kill_count): try: logging.info("Starting helper_node_start_scenario injection") - openstack_node_name = self.openstackcloud.get_openstack_nodename(node_ip.strip()) + openstack_node_name = self.openstackcloud.get_openstack_nodename( + node_ip.strip() + ) logging.info("Starting the helper node %s" % (openstack_node_name)) self.openstackcloud.start_instances(openstack_node_name) self.openstackcloud.wait_until_running(openstack_node_name, timeout) @@ -170,11 +187,11 @@ class openstack_node_scenarios(abstract_node_scenarios): logging.info("node_start_scenario has been successfully injected!") except Exception as e: logging.error( - "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) + "Failed to start node instance. Encountered following " + "exception: %s. Test Failed" % (e) ) logging.error("helper_node_start_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node @@ -182,27 +199,35 @@ class openstack_node_scenarios(abstract_node_scenarios): for _ in range(instance_kill_count): try: logging.info("Starting helper_node_stop_scenario injection") - openstack_node_name = self.openstackcloud.get_openstack_nodename(node_ip.strip()) + openstack_node_name = self.openstackcloud.get_openstack_nodename( + node_ip.strip() + ) logging.info("Stopping the helper node %s " % (openstack_node_name)) self.openstackcloud.stop_instances(openstack_node_name) self.openstackcloud.wait_until_stopped(openstack_node_name, timeout) logging.info("Helper node with IP: %s is in stopped state" % (node_ip)) except Exception as e: - logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e)) + logging.error( + "Failed to stop node instance. Encountered following exception: %s. 
" + "Test Failed" % (e) + ) logging.error("helper_node_stop_scenario injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout): try: logging.info("Checking service status on the helper node") - nodeaction.check_service_status(node_ip.strip(), service, ssh_private_key, timeout) + nodeaction.check_service_status( + node_ip.strip(), service, ssh_private_key, timeout + ) logging.info("Service status checked on %s" % (node_ip)) logging.info("Check service status is successfuly injected!") except Exception as e: - logging.error("Failed to check service status. Encountered following exception:" " %s. Test Failed" % (e)) + logging.error( + "Failed to check service status. Encountered following exception:" + " %s. Test Failed" % (e) + ) logging.error("helper_node_service_status injection failed!") - # removed_exit - # sys.exit(1) + raise RuntimeError() diff --git a/kraken/service_disruption/__init__.py b/krkn/scenario_plugins/pvc/__init__.py similarity index 100% rename from kraken/service_disruption/__init__.py rename to krkn/scenario_plugins/pvc/__init__.py diff --git a/krkn/scenario_plugins/pvc/pvc_scenario_plugin.py b/krkn/scenario_plugins/pvc/pvc_scenario_plugin.py new file mode 100644 index 00000000..d842e955 --- /dev/null +++ b/krkn/scenario_plugins/pvc/pvc_scenario_plugin.py @@ -0,0 +1,324 @@ +import logging +import random +import re +import time + +import yaml +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn_lib.utils import get_yaml_item_value, log_exception + +from krkn import cerberus, utils +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class PvcScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + try: + with open(scenario, "r") as f: + config_yaml = yaml.full_load(f) + scenario_config = config_yaml["pvc_scenario"] + pvc_name = get_yaml_item_value(scenario_config, "pvc_name", "") + pod_name = get_yaml_item_value(scenario_config, "pod_name", "") + namespace = get_yaml_item_value(scenario_config, "namespace", "") + target_fill_percentage = get_yaml_item_value( + scenario_config, "fill_percentage", "50" + ) + duration = get_yaml_item_value(scenario_config, "duration", 60) + + logging.info( + "Input params:\n" + "pvc_name: '%s'\n" + "pod_name: '%s'\n" + "namespace: '%s'\n" + "target_fill_percentage: '%s%%'\nduration: '%ss'" + % ( + str(pvc_name), + str(pod_name), + str(namespace), + str(target_fill_percentage), + str(duration), + ) + ) + + # Check input params + if namespace is None: + logging.error( + "PvcScenarioPlugin You must specify the namespace where the PVC is" + ) + return 1 + if pvc_name is None and pod_name is None: + logging.error( + "PvcScenarioPlugin You must specify the pvc_name or the pod_name" + ) + return 1 + if pvc_name and pod_name: + logging.info( + "pod_name will be ignored, pod_name used will be " + "a retrieved from the pod used in the pvc_name" + ) + + # Get pod name + if pvc_name: + if pod_name: + logging.info( + "pod_name '%s' will be overridden with one of " + "the pods mounted in the PVC" % (str(pod_name)) + ) + pvc = lib_telemetry.get_lib_kubernetes().get_pvc_info( + pvc_name, namespace + ) + try: + # random generator not used for + # 
security/cryptographic purposes. + pod_name = random.choice(pvc.podNames) # nosec + logging.info("Pod name: %s" % pod_name) + except Exception: + logging.error( + "PvcScenarioPlugin Pod associated with %s PVC, on namespace %s, " + "not found" % (str(pvc_name), str(namespace)) + ) + return 1 + + # Get volume name + pod = lib_telemetry.get_lib_kubernetes().get_pod_info( + name=pod_name, namespace=namespace + ) + + if pod is None: + logging.error( + "PvcScenarioPlugin Exiting as pod '%s' doesn't exist " + "in namespace '%s'" % (str(pod_name), str(namespace)) + ) + return 1 + + for volume in pod.volumes: + if volume.pvcName is not None: + volume_name = volume.name + pvc_name = volume.pvcName + pvc = lib_telemetry.get_lib_kubernetes().get_pvc_info( + pvc_name, namespace + ) + break + if "pvc" not in locals(): + logging.error( + "PvcScenarioPlugin Pod '%s' in namespace '%s' does not use a pvc" + % (str(pod_name), str(namespace)) + ) + return 1 + logging.info("Volume name: %s" % volume_name) + logging.info("PVC name: %s" % pvc_name) + + # Get container name and mount path + for container in pod.containers: + for vol in container.volumeMounts: + if vol.name == volume_name: + mount_path = vol.mountPath + container_name = container.name + break + logging.info("Container path: %s" % container_name) + logging.info("Mount path: %s" % mount_path) + + # Get PVC capacity and used bytes + command = "df %s -B 1024 | sed 1d" % (str(mount_path)) + command_output = ( + lib_telemetry.get_lib_kubernetes().exec_cmd_in_pod( + [command], pod_name, namespace, container_name + ) + ).split() + pvc_used_kb = int(command_output[2]) + pvc_capacity_kb = pvc_used_kb + int(command_output[3]) + logging.info("PVC used: %s KB" % pvc_used_kb) + logging.info("PVC capacity: %s KB" % pvc_capacity_kb) + + # Check valid fill percentage + current_fill_percentage = pvc_used_kb / pvc_capacity_kb + if not ( + current_fill_percentage * 100 < float(target_fill_percentage) <= 99 + ): + logging.error( + "PvcScenarioPlugin Target fill percentage (%.2f%%) is lower than " + "current fill percentage (%.2f%%) " + "or higher than 99%%" + % ( + target_fill_percentage, + current_fill_percentage * 100, + ) + ) + return 1 + # Calculate file size + file_size_kb = int( + (float(target_fill_percentage / 100) * float(pvc_capacity_kb)) + - float(pvc_used_kb) + ) + logging.debug("File size: %s KB" % file_size_kb) + + file_name = "kraken.tmp" + logging.info( + "Creating %s file, %s KB size, in pod %s at %s (ns %s)" + % ( + str(file_name), + str(file_size_kb), + str(pod_name), + str(mount_path), + str(namespace), + ) + ) + + start_time = int(time.time()) + # Create temp file in the PVC + full_path = "%s/%s" % (str(mount_path), str(file_name)) + command = "fallocate -l $((%s*1024)) %s" % ( + str(file_size_kb), + str(full_path), + ) + logging.debug("Create temp file in the PVC command:\n %s" % command) + lib_telemetry.get_lib_kubernetes().exec_cmd_in_pod( + [command], + pod_name, + namespace, + container_name, + ) + + # Check if file is created + command = "ls -lh %s" % (str(mount_path)) + logging.debug("Check file is created command:\n %s" % command) + response = lib_telemetry.get_lib_kubernetes().exec_cmd_in_pod( + [command], pod_name, namespace, container_name + ) + logging.info("\n" + str(response)) + if str(file_name).lower() in str(response).lower(): + logging.info("%s file successfully created" % (str(full_path))) + else: + logging.error( + "PvcScenarioPlugin Failed to create tmp file with %s size" + % (str(file_size_kb)) + ) + self.remove_temp_file( + 
file_name, + full_path, + pod_name, + namespace, + container_name, + mount_path, + file_size_kb, + lib_telemetry.get_lib_kubernetes(), + ) + return 1 + + # Calculate file size + file_size_kb = int( + (float(target_fill_percentage / 100) * float(pvc_capacity_kb)) + - float(pvc_used_kb) + ) + logging.debug("File size: %s KB" % file_size_kb) + + file_name = "kraken.tmp" + logging.info( + "Creating %s file, %s KB size, in pod %s at %s (ns %s)" + % ( + str(file_name), + str(file_size_kb), + str(pod_name), + str(mount_path), + str(namespace), + ) + ) + + start_time = int(time.time()) + # Create temp file in the PVC + full_path = "%s/%s" % (str(mount_path), str(file_name)) + command = "fallocate -l $((%s*1024)) %s" % ( + str(file_size_kb), + str(full_path), + ) + logging.debug("Create temp file in the PVC command:\n %s" % command) + lib_telemetry.get_lib_kubernetes().exec_cmd_in_pod( + [command], pod_name, namespace, container_name + ) + + # Check if file is created + command = "ls -lh %s" % (str(mount_path)) + logging.debug("Check file is created command:\n %s" % command) + response = lib_telemetry.get_lib_kubernetes().exec_cmd_in_pod( + [command], pod_name, namespace, container_name + ) + logging.info("\n" + str(response)) + if str(file_name).lower() in str(response).lower(): + logging.info( + "Waiting for the specified duration in the config: %ss" % duration + ) + time.sleep(duration) + logging.info("Finish waiting") + + self.remove_temp_file( + file_name, + full_path, + pod_name, + namespace, + container_name, + mount_path, + file_size_kb, + lib_telemetry.get_lib_kubernetes(), + ) + end_time = int(time.time()) + cerberus.publish_kraken_status(krkn_config, [], start_time, end_time) + except (RuntimeError, Exception) as e: + logging.error("PvcScenarioPlugin exiting due to Exception %s" % e) + return 1 + else: + return 0 + + # krkn_lib + def remove_temp_file( + self, + file_name, + full_path, + pod_name, + namespace, + container_name, + mount_path, + file_size_kb, + kubecli: KrknKubernetes, + ): + command = "rm -f %s" % (str(full_path)) + logging.debug("Remove temp file from the PVC command:\n %s" % command) + kubecli.exec_cmd_in_pod([command], pod_name, namespace, container_name) + command = "ls -lh %s" % (str(mount_path)) + logging.debug("Check temp file is removed command:\n %s" % command) + response = kubecli.exec_cmd_in_pod( + [command], pod_name, namespace, container_name + ) + logging.info("\n" + str(response)) + if not (str(file_name).lower() in str(response).lower()): + logging.info("Temp file successfully removed") + else: + logging.error( + "PvcScenarioPlugin Failed to delete tmp file with %s size" + % (str(file_size_kb)) + ) + raise RuntimeError() + + def to_kbytes(self, value): + if not re.match("^[0-9]+[K|M|G|T]i$", value): + logging.error( + "PvcScenarioPlugin PVC capacity %s does not match expression " + "regexp '^[0-9]+[K|M|G|T]i$'" + ) + raise RuntimeError() + unit = {"K": 0, "M": 1, "G": 2, "T": 3} + base = 1024 if ("i" in value) else 1000 + exp = unit[value[-2:-1]] + res = int(value[:-2]) * (base**exp) + return res + + def get_scenario_types(self) -> list[str]: + return ["pvc_scenarios"] diff --git a/krkn/scenario_plugins/scenario_plugin_factory.py b/krkn/scenario_plugins/scenario_plugin_factory.py new file mode 100644 index 00000000..bf945435 --- /dev/null +++ b/krkn/scenario_plugins/scenario_plugin_factory.py @@ -0,0 +1,134 @@ +import importlib +import inspect +import pkgutil +from typing import Type, Tuple, Optional +from krkn.scenario_plugins.abstract_scenario_plugin 
import AbstractScenarioPlugin + + +class ScenarioPluginNotFound(Exception): + pass + + +class ScenarioPluginFactory: + + loaded_plugins: dict[str, any] = {} + failed_plugins: list[Tuple[str, str, str]] = [] + package_name = None + + def __init__(self, package_name: str = "krkn.scenario_plugins"): + self.package_name = package_name + self.__load_plugins(AbstractScenarioPlugin) + + def create_plugin(self, scenario_type: str) -> AbstractScenarioPlugin: + """ + Creates a plugin instance based on the config.yaml scenario name. + The scenario name is provided by the method `get_scenario_type` + defined by the `AbstractScenarioPlugin` abstract class that must + be implemented by all the plugins in order to be loaded correctly + + :param scenario_type: the scenario type defined in the config.yaml + e.g. `arcaflow_scenarios`, `network_scenarios`, `plugin_scenarios` + etc. + :return: an instance of the class that implements this scenario and + inherits from the AbstractScenarioPlugin abstract class + """ + if scenario_type in self.loaded_plugins: + return self.loaded_plugins[scenario_type]() + else: + raise ScenarioPluginNotFound( + f"Failed to load the {scenario_type} scenario plugin. " + f"Please verify the logs to ensure it was loaded correctly." + ) + + def __load_plugins(self, base_class: Type): + base_package = importlib.import_module(self.package_name) + for _, module_name, is_pkg in pkgutil.walk_packages( + base_package.__path__, base_package.__name__ + "." + ): + + if not is_pkg: + module = importlib.import_module(module_name) + + for name, obj in inspect.getmembers(module, inspect.isclass): + if issubclass(obj, base_class) and obj is not base_class: + is_correct, exception_message = ( + self.is_naming_convention_correct(module_name, name) + ) + if not is_correct: + self.failed_plugins.append( + (module_name, name, exception_message) + ) + continue + + cls = getattr(module, name) + instance = cls() + get_scenario_type = getattr(instance, "get_scenario_types") + scenario_types = get_scenario_type() + has_duplicates = False + for scenario_type in scenario_types: + if scenario_type in self.loaded_plugins.keys(): + self.failed_plugins.append( + ( + module_name, + name, + f"scenario type {scenario_type} defined by {self.loaded_plugins[scenario_type].__name__} " + f"and {name} and this is not allowed.", + ) + ) + has_duplicates = True + break + if has_duplicates: + continue + for scenario_type in scenario_types: + self.loaded_plugins[scenario_type] = cls + + def is_naming_convention_correct( + self, module_name: str, class_name: str + ) -> Tuple[bool, Optional[str]]: + """ + Defines the Krkn ScenarioPlugin API naming conventions + + :param module_name: the fully qualified module name that is loaded by + walk_packages + :param class_name: the plugin class name + :return: a tuple of boolean result of the check and optional error message + """ + # plugin file names must end with _scenario_plugin + if not module_name.split(".")[-1].endswith("_scenario_plugin"): + return ( + False, + "scenario plugin module file names must end with `_scenario_plugin` suffix", + ) + + if ( + "scenario" in module_name.split(".")[-2] + or "plugin" in module_name.split(".")[-2] + ): + return ( + False, + "scenario plugin folder cannot contain `scenario` or `plugin` word", + ) + + # plugin class names must be capital camel cased and end with ScenarioPlugin + if ( + class_name == "ScenarioPlugin" + or not class_name.endswith("ScenarioPlugin") + or not class_name[0].isupper() + ): + return ( + False, + "scenario plugin class 
name must start with a capital letter, " + "end with `ScenarioPlugin`, and cannot be just `ScenarioPlugin`.", + ) + + # plugin file name in snake case must match class name in capital camel case + if self.__snake_to_capital_camel(module_name.split(".")[-1]) != class_name: + return False, ( + "module file name in snake case must match the class name in capital camel case " + "e.g. `example_scenario_plugin` -> `ExampleScenarioPlugin`" + ) + + return True, None + + def __snake_to_capital_camel(self, snake_string: str) -> str: + return snake_string.title().replace("_", "") diff --git a/kraken/service_hijacking/__init__.py b/krkn/scenario_plugins/service_disruption/__init__.py similarity index 100% rename from kraken/service_hijacking/__init__.py rename to krkn/scenario_plugins/service_disruption/__init__.py diff --git a/krkn/scenario_plugins/service_disruption/service_disruption_scenario_plugin.py b/krkn/scenario_plugins/service_disruption/service_disruption_scenario_plugin.py new file mode 100644 index 00000000..710d0a0c --- /dev/null +++ b/krkn/scenario_plugins/service_disruption/service_disruption_scenario_plugin.py @@ -0,0 +1,345 @@ +import logging +import random +import time + +import yaml +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn_lib.utils import get_yaml_item_value, log_exception + +from krkn import cerberus, utils +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class ServiceDisruptionScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + try: + with open(scenario, "r") as f: + scenario_config_yaml = yaml.full_load(f) + for scenario in scenario_config_yaml["scenarios"]: + scenario_namespace = get_yaml_item_value(scenario, "namespace", "") + scenario_label = get_yaml_item_value(scenario, "label_selector", "") + if ( + scenario_namespace is not None + and scenario_namespace.strip() != "" + ): + if scenario_label is not None and scenario_label.strip() != "": + logging.error( + "ServiceDisruptionScenarioPlugin You can only have namespace or " + "label set in your namespace scenario" + ) + logging.error( + "ServiceDisruptionScenarioPlugin Current scenario config has " + "namespace '%s' and label selector '%s'" + % (scenario_namespace, scenario_label) + ) + logging.error( + "ServiceDisruptionScenarioPlugin Please set either namespace " + "to blank ('') or label_selector to blank ('') to continue" + ) + return 1 + delete_count = get_yaml_item_value(scenario, "delete_count", 1) + run_count = get_yaml_item_value(scenario, "runs", 1) + run_sleep = get_yaml_item_value(scenario, "sleep", 10) + wait_time = get_yaml_item_value(scenario, "wait_time", 30) + + logging.info( + str(scenario_namespace) + + str(scenario_label) + + str(delete_count) + + str(run_count) + + str(run_sleep) + + str(wait_time) + ) + logging.info("done") + start_time = int(time.time()) + for i in range(run_count): + killed_namespaces = {} + namespaces = ( + lib_telemetry.get_lib_kubernetes().check_namespaces( + [scenario_namespace], scenario_label + ) + ) + for j in range(delete_count): + if len(namespaces) == 0: + logging.error( + "ServiceDisruptionScenarioPlugin Couldn't delete %s namespaces, " + "not enough namespaces matching %s with label %s" + % ( + str(run_count), + scenario_namespace, +
str(scenario_label), + ) + ) + return 1 + + selected_namespace = namespaces[ + random.randint(0, len(namespaces) - 1) + ] + logging.info( + "Delete objects in selected namespace: " + + selected_namespace + ) + try: + # delete all pods in namespace + objects = self.delete_objects( + lib_telemetry.get_lib_kubernetes(), + selected_namespace, + ) + killed_namespaces[selected_namespace] = objects + logging.info( + "Deleted all objects in namespace %s was successful" + % str(selected_namespace) + ) + except Exception as e: + logging.info( + "ServiceDisruptionScenarioPlugin Delete all " + "objects in namespace %s was unsuccessful" + % str(selected_namespace) + ) + logging.info("Namespace action error: " + str(e)) + return 1 + namespaces.remove(selected_namespace) + logging.info( + "Waiting %s seconds between namespace deletions" + % str(run_sleep) + ) + time.sleep(run_sleep) + + end_time = int(time.time()) + cerberus.publish_kraken_status( + krkn_config, [], start_time, end_time + ) + except (Exception, RuntimeError) as e: + logging.error( + "ServiceDisruptionScenarioPlugin exiting due to Exception %s" % e + ) + return 1 + else: + return 0 + + def delete_objects(self, kubecli, namespace): + + services = self.delete_all_services_namespace(kubecli, namespace) + daemonsets = self.delete_all_daemonset_namespace(kubecli, namespace) + statefulsets = self.delete_all_statefulsets_namespace(kubecli, namespace) + replicasets = self.delete_all_replicaset_namespace(kubecli, namespace) + deployments = self.delete_all_deployment_namespace(kubecli, namespace) + + objects = { + "daemonsets": daemonsets, + "deployments": deployments, + "replicasets": replicasets, + "statefulsets": statefulsets, + "services": services, + } + + return objects + + def get_list_running_pods(self, kubecli: KrknKubernetes, namespace: str): + running_pods = [] + pods = kubecli.list_pods(namespace) + for pod in pods: + pod_status = kubecli.get_pod_info(pod, namespace) + if pod_status and pod_status.status == "Running": + running_pods.append(pod) + logging.info("all running pods " + str(running_pods)) + return running_pods + + def delete_all_deployment_namespace(self, kubecli: KrknKubernetes, namespace: str): + """ + Delete all the deployments in the specified namespace + + :param kubecli: krkn kubernetes python package + :param namespace: namespace + """ + try: + deployments = kubecli.get_deployment_ns(namespace) + for deployment in deployments: + logging.info("Deleting deployment" + deployment) + kubecli.delete_deployment(deployment, namespace) + except Exception as e: + logging.error( + "Exception when calling delete_all_deployment_namespace: %s\n", + str(e), + ) + raise e + + return deployments + + def delete_all_daemonset_namespace(self, kubecli: KrknKubernetes, namespace: str): + """ + Delete all the daemonset in the specified namespace + + :param kubecli: krkn kubernetes python package + :param namespace: namespace + """ + try: + daemonsets = kubecli.get_daemonset(namespace) + for daemonset in daemonsets: + logging.info("Deleting daemonset" + daemonset) + kubecli.delete_daemonset(daemonset, namespace) + except Exception as e: + logging.error( + "Exception when calling delete_all_daemonset_namespace: %s\n", + str(e), + ) + raise e + + return daemonsets + + def delete_all_statefulsets_namespace( + self, kubecli: KrknKubernetes, namespace: str + ): + """ + Delete all the statefulsets in the specified namespace + + + :param kubecli: krkn kubernetes python package + :param namespace: namespace + """ + try: + statefulsets = 
kubecli.get_all_statefulset(namespace) + for statefulset in statefulsets: + logging.info("Deleting statefulset " + statefulset) + kubecli.delete_statefulset(statefulset, namespace) + except Exception as e: + logging.error( + "Exception when calling delete_all_statefulsets_namespace: %s\n", + str(e), + ) + raise e + + return statefulsets + + def delete_all_replicaset_namespace(self, kubecli: KrknKubernetes, namespace: str): + """ + Delete all the replicasets in the specified namespace + + :param kubecli: krkn kubernetes python package + :param namespace: namespace + """ + try: + replicasets = kubecli.get_all_replicasets(namespace) + for replicaset in replicasets: + logging.info("Deleting replicaset" + replicaset) + kubecli.delete_replicaset(replicaset, namespace) + except Exception as e: + logging.error( + "Exception when calling delete_all_replicaset_namespace: %s\n", + str(e), + ) + raise e + + return replicasets + + def delete_all_services_namespace(self, kubecli: KrknKubernetes, namespace: str): + """ + Delete all the services in the specified namespace + + + :param kubecli: krkn kubernetes python package + :param namespace: namespace + """ + try: + services = kubecli.get_all_services(namespace) + for service in services: + logging.info("Deleting services" + service) + kubecli.delete_services(service, namespace) + except Exception as e: + logging.error( + "Exception when calling delete_all_services_namespace: %s\n", + str(e), + ) + raise e + + return services + + def check_all_running_pods( + self, kubecli: KrknKubernetes, namespace_name, wait_time + ): + + timer = 0 + while timer < wait_time: + pod_list = kubecli.list_pods(namespace_name) + pods_running = 0 + for pod in pod_list: + pod_info = kubecli.get_pod_info(pod, namespace_name) + if pod_info.status != "Running" and pod_info.status != "Succeeded": + logging.info( + "Pods %s still not running or completed" % pod_info.name + ) + break + pods_running += 1 + if len(pod_list) == pods_running: + break + timer += 5 + time.sleep(5) + logging.info("Waiting 5 seconds for pods to become active") + + # krkn_lib + def check_all_running_deployment( + self, killed_namespaces, wait_time, kubecli: KrknKubernetes + ): + + timer = 0 + while timer < wait_time and killed_namespaces: + still_missing_ns = killed_namespaces.copy() + for namespace_name, objects in killed_namespaces.items(): + still_missing_obj = objects.copy() + for obj_name, obj_list in objects.items(): + if "deployments" == obj_name: + deployments = kubecli.get_deployment_ns(namespace_name) + if len(obj_list) == len(deployments): + still_missing_obj.pop(obj_name) + elif "replicasets" == obj_name: + replicasets = kubecli.get_all_replicasets(namespace_name) + if len(obj_list) == len(replicasets): + still_missing_obj.pop(obj_name) + elif "statefulsets" == obj_name: + statefulsets = kubecli.get_all_statefulset(namespace_name) + if len(obj_list) == len(statefulsets): + still_missing_obj.pop(obj_name) + elif "services" == obj_name: + services = kubecli.get_all_services(namespace_name) + if len(obj_list) == len(services): + still_missing_obj.pop(obj_name) + elif "daemonsets" == obj_name: + daemonsets = kubecli.get_daemonset(namespace_name) + if len(obj_list) == len(daemonsets): + still_missing_obj.pop(obj_name) + logging.info("Still missing objects " + str(still_missing_obj)) + killed_namespaces[namespace_name] = still_missing_obj.copy() + if len(killed_namespaces[namespace_name].keys()) == 0: + logging.info( + "Wait for pods to become running for namespace: " + + namespace_name + ) + 
self.check_all_running_pods(kubecli, namespace_name, wait_time) + still_missing_ns.pop(namespace_name) + killed_namespaces = still_missing_ns + if len(killed_namespaces.keys()) == 0: + return [] + + timer += 10 + time.sleep(10) + logging.info( + "Waiting 10 seconds for objects in namespaces to become active" + ) + + logging.error( + "Objects are still not ready after waiting " + str(wait_time) + "seconds" + ) + logging.error("Non active namespaces " + str(killed_namespaces)) + return killed_namespaces + + def get_scenario_types(self) -> list[str]: + return ["service_disruption_scenarios"] diff --git a/kraken/shut_down/__init__.py b/krkn/scenario_plugins/service_hijacking/__init__.py similarity index 100% rename from kraken/shut_down/__init__.py rename to krkn/scenario_plugins/service_hijacking/__init__.py diff --git a/krkn/scenario_plugins/service_hijacking/service_hijacking_scenario_plugin.py b/krkn/scenario_plugins/service_hijacking/service_hijacking_scenario_plugin.py new file mode 100644 index 00000000..781d3602 --- /dev/null +++ b/krkn/scenario_plugins/service_hijacking/service_hijacking_scenario_plugin.py @@ -0,0 +1,108 @@ +import logging +import time + +import yaml +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class ServiceHijackingScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + with open(scenario) as stream: + scenario_config = yaml.safe_load(stream) + + service_name = scenario_config["service_name"] + service_namespace = scenario_config["service_namespace"] + plan = scenario_config["plan"] + image = scenario_config["image"] + target_port = scenario_config["service_target_port"] + chaos_duration = scenario_config["chaos_duration"] + + logging.info( + f"checking service {service_name} in namespace: {service_namespace}" + ) + if not lib_telemetry.get_lib_kubernetes().service_exists( + service_name, service_namespace + ): + logging.error( + f"ServiceHijackingScenarioPlugin service: {service_name} not found in namespace: {service_namespace}, failed to run scenario." + ) + return 1 + try: + logging.info( + f"service: {service_name} found in namespace: {service_namespace}" + ) + logging.info(f"creating webservice and initializing test plan...") + # both named ports and port numbers can be used + if isinstance(target_port, int): + logging.info(f"webservice will listen on port {target_port}") + webservice = ( + lib_telemetry.get_lib_kubernetes().deploy_service_hijacking( + service_namespace, plan, image, port_number=target_port + ) + ) + else: + logging.info(f"traffic will be redirected to named port: {target_port}") + webservice = ( + lib_telemetry.get_lib_kubernetes().deploy_service_hijacking( + service_namespace, plan, image, port_name=target_port + ) + ) + logging.info( + f"successfully deployed pod: {webservice.pod_name} " + f"in namespace:{service_namespace} with selector {webservice.selector}!" 
+ ) + logging.info( + f"patching service: {service_name} to hijack traffic towards: {webservice.pod_name}" + ) + original_service = ( + lib_telemetry.get_lib_kubernetes().replace_service_selector( + [webservice.selector], service_name, service_namespace + ) + ) + if original_service is None: + logging.error( + f"ServiceHijackingScenarioPlugin failed to patch service: {service_name}, namespace: {service_namespace} with selector {webservice.selector}" + ) + return 1 + + logging.info(f"service: {service_name} successfully patched!") + logging.info(f"original service manifest:\n\n{yaml.dump(original_service)}") + logging.info(f"waiting {chaos_duration} before restoring the service") + time.sleep(chaos_duration) + selectors = [ + "=".join([key, original_service["spec"]["selector"][key]]) + for key in original_service["spec"]["selector"].keys() + ] + logging.info(f"restoring the service selectors {selectors}") + original_service = ( + lib_telemetry.get_lib_kubernetes().replace_service_selector( + selectors, service_name, service_namespace + ) + ) + if original_service is None: + logging.error( + f"ServiceHijackingScenarioPlugin failed to restore original " + f"service: {service_name}, namespace: {service_namespace} with selectors: {selectors}" + ) + return 1 + logging.info("selectors successfully restored") + logging.info("undeploying service-hijacking resources...") + lib_telemetry.get_lib_kubernetes().undeploy_service_hijacking(webservice) + return 0 + except Exception as e: + logging.error( + f"ServiceHijackingScenarioPlugin scenario {scenario} failed with exception: {e}" + ) + return 1 + + def get_scenario_types(self) -> list[str]: + return ["service_hijacking_scenarios"] diff --git a/kraken/time_actions/__init__.py b/krkn/scenario_plugins/shut_down/__init__.py similarity index 100% rename from kraken/time_actions/__init__.py rename to krkn/scenario_plugins/shut_down/__init__.py diff --git a/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py b/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py new file mode 100644 index 00000000..ea915e32 --- /dev/null +++ b/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py @@ -0,0 +1,151 @@ +import logging +import time +from multiprocessing.pool import ThreadPool + +import yaml +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift + +from krkn import cerberus +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin +from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS +from krkn.scenario_plugins.node_actions.az_node_scenarios import Azure +from krkn.scenario_plugins.node_actions.gcp_node_scenarios import GCP +from krkn.scenario_plugins.node_actions.openstack_node_scenarios import OPENSTACKCLOUD + + +class ShutDownScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + try: + with open(scenario, "r") as f: + shut_down_config_yaml = yaml.full_load(f) + shut_down_config_scenario = shut_down_config_yaml[ + "cluster_shut_down_scenario" + ] + start_time = int(time.time()) + self.cluster_shut_down( + shut_down_config_scenario, lib_telemetry.get_lib_kubernetes() + ) + end_time = int(time.time()) + cerberus.publish_kraken_status(krkn_config, [], start_time, end_time) + return 0 + except Exception as e: + logging.error( + 
f"ShutDownScenarioPlugin scenario {scenario} failed with exception: {e}" + ) + return 1 + + def multiprocess_nodes(self, cloud_object_function, nodes, processes=0): + try: + # pool object with number of element + + if processes == 0: + pool = ThreadPool(processes=len(nodes)) + else: + pool = ThreadPool(processes=processes) + logging.info("nodes type " + str(type(nodes[0]))) + if type(nodes[0]) is tuple: + node_id = [] + node_info = [] + for node in nodes: + node_id.append(node[0]) + node_info.append(node[1]) + logging.info("node id " + str(node_id)) + logging.info("node info" + str(node_info)) + pool.starmap(cloud_object_function, zip(node_info, node_id)) + + else: + logging.info("pool type" + str(type(nodes))) + pool.map(cloud_object_function, nodes) + pool.close() + except Exception as e: + logging.info("Error on pool multiprocessing: " + str(e)) + + # Inject the cluster shut down scenario + # krkn_lib + def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes): + runs = shut_down_config["runs"] + shut_down_duration = shut_down_config["shut_down_duration"] + cloud_type = shut_down_config["cloud_type"] + timeout = shut_down_config["timeout"] + processes = 0 + if cloud_type.lower() == "aws": + cloud_object = AWS() + elif cloud_type.lower() == "gcp": + cloud_object = GCP() + processes = 1 + elif cloud_type.lower() == "openstack": + cloud_object = OPENSTACKCLOUD() + elif cloud_type.lower() in ["azure", "az"]: + cloud_object = Azure() + else: + logging.error( + "Cloud type %s is not currently supported for cluster shut down" + % cloud_type + ) + + raise RuntimeError() + + nodes = kubecli.list_nodes() + node_id = [] + for node in nodes: + instance_id = cloud_object.get_instance_id(node) + node_id.append(instance_id) + logging.info("node id list " + str(node_id)) + for _ in range(runs): + logging.info("Starting cluster_shut_down scenario injection") + stopping_nodes = set(node_id) + self.multiprocess_nodes(cloud_object.stop_instances, node_id, processes) + stopped_nodes = stopping_nodes.copy() + while len(stopping_nodes) > 0: + for node in stopping_nodes: + if type(node) is tuple: + node_status = cloud_object.wait_until_stopped( + node[1], node[0], timeout + ) + else: + node_status = cloud_object.wait_until_stopped(node, timeout) + + # Only want to remove node from stopping list + # when fully stopped/no error + if node_status: + stopped_nodes.remove(node) + + stopping_nodes = stopped_nodes.copy() + + logging.info( + "Shutting down the cluster for the specified duration: %s" + % shut_down_duration + ) + time.sleep(shut_down_duration) + logging.info("Restarting the nodes") + restarted_nodes = set(node_id) + self.multiprocess_nodes(cloud_object.start_instances, node_id, processes) + logging.info("Wait for each node to be running again") + not_running_nodes = restarted_nodes.copy() + while len(not_running_nodes) > 0: + for node in not_running_nodes: + if type(node) is tuple: + node_status = cloud_object.wait_until_running( + node[1], node[0], timeout + ) + else: + node_status = cloud_object.wait_until_running(node, timeout) + if node_status: + restarted_nodes.remove(node) + not_running_nodes = restarted_nodes.copy() + logging.info("Waiting for 150s to allow cluster component initialization") + time.sleep(150) + + logging.info("Successfully injected cluster_shut_down scenario!") + + def get_scenario_types(self) -> list[str]: + return ["cluster_shut_down_scenarios"] diff --git a/kraken/zone_outage/__init__.py b/krkn/scenario_plugins/syn_flood/__init__.py similarity index 100% rename 
from kraken/zone_outage/__init__.py rename to krkn/scenario_plugins/syn_flood/__init__.py diff --git a/krkn/scenario_plugins/syn_flood/syn_flood_scenario_plugin.py b/krkn/scenario_plugins/syn_flood/syn_flood_scenario_plugin.py new file mode 100644 index 00000000..17e97023 --- /dev/null +++ b/krkn/scenario_plugins/syn_flood/syn_flood_scenario_plugin.py @@ -0,0 +1,139 @@ +import logging +import os +import time + +import yaml +from krkn_lib import utils as krkn_lib_utils +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class SynFloodScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + try: + pod_names = [] + config = self.parse_config(scenario) + if config["target-service-label"]: + target_services = ( + lib_telemetry.get_lib_kubernetes().select_service_by_label( + config["namespace"], config["target-service-label"] + ) + ) + else: + target_services = [config["target-service"]] + + for target in target_services: + if not lib_telemetry.get_lib_kubernetes().service_exists( + target, config["namespace"] + ): + logging.error(f"SynFloodScenarioPlugin {target} service not found") + return 1 + for i in range(config["number-of-pods"]): + pod_name = "syn-flood-" + krkn_lib_utils.get_random_string(10) + lib_telemetry.get_lib_kubernetes().deploy_syn_flood( + pod_name, + config["namespace"], + config["image"], + target, + config["target-port"], + config["packet-size"], + config["window-size"], + config["duration"], + config["attacker-nodes"], + ) + pod_names.append(pod_name) + + logging.info("waiting all the attackers to finish:") + did_finish = False + finished_pods = [] + while not did_finish: + for pod_name in pod_names: + if not lib_telemetry.get_lib_kubernetes().is_pod_running( + pod_name, config["namespace"] + ): + finished_pods.append(pod_name) + if set(pod_names) == set(finished_pods): + did_finish = True + time.sleep(1) + + except Exception as e: + logging.error( + f"SynFloodScenarioPlugin scenario {scenario} failed with exception: {e}" + ) + return 1 + else: + return 0 + + def parse_config(self, scenario_file: str) -> dict[str, any]: + if not os.path.exists(scenario_file): + raise Exception(f"failed to load scenario file {scenario_file}") + + try: + with open(scenario_file) as stream: + config = yaml.safe_load(stream) + except Exception: + raise Exception(f"{scenario_file} is not a valid yaml file") + + missing = [] + if not self.check_key_value(config, "packet-size"): + missing.append("packet-size") + if not self.check_key_value(config, "window-size"): + missing.append("window-size") + if not self.check_key_value(config, "duration"): + missing.append("duration") + if not self.check_key_value(config, "namespace"): + missing.append("namespace") + if not self.check_key_value(config, "number-of-pods"): + missing.append("number-of-pods") + if not self.check_key_value(config, "target-port"): + missing.append("target-port") + if not self.check_key_value(config, "image"): + missing.append("image") + if "target-service" not in config.keys(): + missing.append("target-service") + if "target-service-label" not in config.keys(): + missing.append("target-service-label") + + if len(missing) > 0: + raise Exception(f"{(',').join(missing)} parameter(s) are missing") + + if not 
config["target-service"] and not config["target-service-label"]: + raise Exception("you have to set either a target service or a label") + if config["target-service"] and config["target-service-label"]: + raise Exception( + "you cannot select both target-service and target-service-label" + ) + + if "attacker-nodes" in config and not self.is_node_affinity_correct( + config["attacker-nodes"] + ): + raise Exception("attacker-nodes format is not correct") + return config + + def check_key_value(self, dictionary, key): + if key in dictionary: + value = dictionary[key] + if value is not None and value != "": + return True + return False + + def is_node_affinity_correct(self, obj) -> bool: + if not isinstance(obj, dict): + return False + for key in obj.keys(): + if not isinstance(key, str): + return False + if not isinstance(obj[key], list): + return False + return True + + def get_scenario_types(self) -> list[str]: + return ["syn_flood_scenarios"] diff --git a/krkn/scenario_plugins/time_actions/__init__.py b/krkn/scenario_plugins/time_actions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/krkn/scenario_plugins/time_actions/time_actions_scenario_plugin.py b/krkn/scenario_plugins/time_actions/time_actions_scenario_plugin.py new file mode 100644 index 00000000..0ba97d2a --- /dev/null +++ b/krkn/scenario_plugins/time_actions/time_actions_scenario_plugin.py @@ -0,0 +1,352 @@ +import datetime +import logging +import random +import re +import time + +import yaml +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn_lib.utils import get_random_string, get_yaml_item_value, log_exception +from kubernetes.client import ApiException + +from krkn import cerberus, utils +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class TimeActionsScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + try: + with open(scenario, "r") as f: + scenario_config = yaml.full_load(f) + for time_scenario in scenario_config["time_scenarios"]: + start_time = int(time.time()) + object_type, object_names = self.skew_time( + time_scenario, lib_telemetry.get_lib_kubernetes() + ) + not_reset = self.check_date_time( + object_type, + object_names, + lib_telemetry.get_lib_kubernetes(), + ) + if len(not_reset) > 0: + logging.info("Object times were not reset") + end_time = int(time.time()) + cerberus.publish_kraken_status( + krkn_config, not_reset, start_time, end_time + ) + except (RuntimeError, Exception) as e: + logging.error( + f"TimeActionsScenarioPlugin scenario {scenario} failed with exception: {e}" + ) + return 1 + else: + return 0 + + def pod_exec( + self, pod_name, command, namespace, container_name, kubecli: KrknKubernetes + ): + for i in range(5): + response = kubecli.exec_cmd_in_pod( + command, pod_name, namespace, container_name + ) + if not response: + time.sleep(2) + continue + elif ( + "unauthorized" in response.lower() + or "authorization" in response.lower() + ): + time.sleep(2) + continue + else: + break + return response + + # krkn_lib + def get_container_name( + self, pod_name, namespace, kubecli: KrknKubernetes, container_name="" + ): + + container_names = kubecli.get_containers_in_pod(pod_name, namespace) + if container_name != "": + if container_name in container_names: + return container_name 
else: + logging.error( + "Container name %s not an existing container in pod %s" + % (container_name, pod_name) + ) + else: + container_name = container_names[ + # random module here is not used for security/cryptographic + # purposes + random.randint(0, len(container_names) - 1) # nosec + ] + return container_name + + def skew_node(self, node_name: str, action: str, kubecli: KrknKubernetes): + pod_namespace = "default" + status_pod_name = f"time-skew-pod-{get_random_string(5)}" + skew_pod_name = f"time-skew-pod-{get_random_string(5)}" + ntp_enabled = True + logging.info( + f'Creating pod to skew {"time" if action == "skew_time" else "date"} on node {node_name}' + ) + status_command = ["timedatectl"] + param = "2001-01-01" + skew_command = ["timedatectl", "set-time"] + if action == "skew_time": + skew_command.append("01:01:01") + else: + skew_command.append("2001-01-01") + + try: + status_response = kubecli.exec_command_on_node( + node_name, status_command, status_pod_name, pod_namespace + ) + if "Network time on: no" in status_response: + ntp_enabled = False + + logging.warning( + f'ntp unactive on node {node_name} skewing {"time" if action == "skew_time" else "date"} to {param}' + ) + self.pod_exec(skew_pod_name, skew_command, pod_namespace, None, kubecli) + else: + logging.info( + f'ntp active in cluster node, {"time" if action == "skew_time" else "date"} skewing will have no effect, skipping' + ) + except ApiException: + pass + except Exception as e: + logging.error(f"failed to execute skew command in pod: {e}") + finally: + kubecli.delete_pod(status_pod_name, pod_namespace) + if not ntp_enabled: + kubecli.delete_pod(skew_pod_name, pod_namespace) + + # krkn_lib + def skew_time(self, scenario, kubecli: KrknKubernetes): + if scenario["action"] not in ["skew_date", "skew_time"]: + raise RuntimeError(f'{scenario["action"]} is not a valid time skew action') + + if "node" in scenario["object_type"]: + node_names = [] + if "object_name" in scenario.keys() and scenario["object_name"]: + node_names = scenario["object_name"] + elif "label_selector" in scenario.keys() and scenario["label_selector"]: + node_names = kubecli.list_nodes(scenario["label_selector"]) + for node in node_names: + self.skew_node(node, scenario["action"], kubecli) + logging.info("Reset date/time on node " + str(node)) + return "node", node_names + + elif "pod" in scenario["object_type"]: + skew_command = "date --date " + if scenario["action"] == "skew_date": + skewed_date = "00-01-01" + skew_command += skewed_date + elif scenario["action"] == "skew_time": + skewed_time = "01:01:01" + skew_command += skewed_time + container_name = get_yaml_item_value(scenario, "container_name", "") + pod_names = [] + if "object_name" in scenario.keys() and scenario["object_name"]: + for name in scenario["object_name"]: + if "namespace" not in scenario.keys(): + logging.error("Need to set namespace when using pod name") + # removed_exit + # sys.exit(1) + raise RuntimeError() + pod_names.append([name, scenario["namespace"]]) + elif "namespace" in scenario.keys() and scenario["namespace"]: + if "label_selector" not in scenario.keys(): + logging.info( + "label_selector key not found, querying for all the pods " + "in namespace: %s" % (scenario["namespace"]) + ) + pod_names = kubecli.list_pods(scenario["namespace"]) + else: + logging.info( + "Querying for the pods matching the %s label_selector " + "in namespace %s" + % (scenario["label_selector"], scenario["namespace"]) + ) + pod_names = kubecli.list_pods( + scenario["namespace"], 
scenario["label_selector"] + ) + counter = 0 + for pod_name in pod_names: + pod_names[counter] = [pod_name, scenario["namespace"]] + counter += 1 + elif "label_selector" in scenario.keys() and scenario["label_selector"]: + pod_names = kubecli.get_all_pods(scenario["label_selector"]) + + if len(pod_names) == 0: + logging.info( + "Cannot find pods matching the namespace/label_selector, " + "please check" + ) + + raise RuntimeError() + pod_counter = 0 + for pod in pod_names: + if len(pod) > 1: + selected_container_name = self.get_container_name( + pod[0], + pod[1], + kubecli, + container_name, + ) + pod_exec_response = self.pod_exec( + pod[0], + skew_command, + pod[1], + selected_container_name, + kubecli, + ) + if pod_exec_response is False: + logging.error( + "Couldn't reset time on container %s " + "in pod %s in namespace %s" + % (selected_container_name, pod[0], pod[1]) + ) + # removed_exit + # sys.exit(1) + raise RuntimeError() + pod_names[pod_counter].append(selected_container_name) + else: + selected_container_name = self.get_container_name( + pod, scenario["namespace"], kubecli, container_name + ) + pod_exec_response = self.pod_exec( + pod, + skew_command, + scenario["namespace"], + selected_container_name, + kubecli, + ) + if pod_exec_response is False: + logging.error( + "Couldn't reset time on container " + "%s in pod %s in namespace %s" + % (selected_container_name, pod, scenario["namespace"]) + ) + # removed_exit + # sys.exit(1) + raise RuntimeError() + pod_names[pod_counter].append(selected_container_name) + logging.info("Reset date/time on pod " + str(pod[0])) + pod_counter += 1 + return "pod", pod_names + + # From kubectl/oc command get time output + def parse_string_date(self, obj_datetime): + try: + logging.info("Obj_date time " + str(obj_datetime)) + obj_datetime = re.sub(r"\s\s+", " ", obj_datetime).strip() + logging.info("Obj_date sub time " + str(obj_datetime)) + date_line = re.match( + r"[\s\S\n]*\w{3} \w{3} \d{1,} \d{2}:\d{2}:\d{2} \w{3} \d{4}[\s\S\n]*", # noqa + obj_datetime, + ) + if date_line is not None: + search_response = date_line.group().strip() + logging.info("Search response: " + str(search_response)) + return search_response + else: + return "" + except Exception as e: + logging.info("Exception %s when trying to parse string to date" % str(e)) + return "" + + # Get date and time from string returned from OC + def string_to_date(self, obj_datetime): + obj_datetime = self.parse_string_date(obj_datetime) + try: + date_time_obj = datetime.datetime.strptime( + obj_datetime, "%a %b %d %H:%M:%S %Z %Y" + ) + return date_time_obj + except Exception: + logging.info("Couldn't parse string to datetime object") + return datetime.datetime(datetime.MINYEAR, 1, 1) + + # krkn_lib + def check_date_time(self, object_type, names, kubecli: KrknKubernetes): + skew_command = "date" + not_reset = [] + max_retries = 30 + if object_type == "node": + for node_name in names: + first_date_time = datetime.datetime.utcnow() + check_pod_name = f"time-skew-pod-{get_random_string(5)}" + node_datetime_string = kubecli.exec_command_on_node( + node_name, [skew_command], check_pod_name + ) + node_datetime = self.string_to_date(node_datetime_string) + counter = 0 + while not ( + first_date_time < node_datetime < datetime.datetime.utcnow() + ): + time.sleep(10) + logging.info( + "Date/time on node %s still not reset, " + "waiting 10 seconds and retrying" % node_name + ) + + node_datetime_string = kubecli.exec_cmd_in_pod( + [skew_command], check_pod_name, "default" + ) + node_datetime = 
self.string_to_date(node_datetime_string) + counter += 1 + if counter > max_retries: + logging.error( + "Date and time in node %s didn't reset properly" % node_name + ) + not_reset.append(node_name) + break + if counter < max_retries: + logging.info("Date in node " + str(node_name) + " reset properly") + kubecli.delete_pod(check_pod_name) + + elif object_type == "pod": + for pod_name in names: + first_date_time = datetime.datetime.utcnow() + counter = 0 + pod_datetime_string = self.pod_exec( + pod_name[0], skew_command, pod_name[1], pod_name[2], kubecli + ) + pod_datetime = self.string_to_date(pod_datetime_string) + while not (first_date_time < pod_datetime < datetime.datetime.utcnow()): + time.sleep(10) + logging.info( + "Date/time on pod %s still not reset, " + "waiting 10 seconds and retrying" % pod_name[0] + ) + pod_datetime = self.pod_exec( + pod_name[0], skew_command, pod_name[1], pod_name[2], kubecli + ) + pod_datetime = self.string_to_date(pod_datetime) + counter += 1 + if counter > max_retries: + logging.error( + "Date and time in pod %s didn't reset properly" + % pod_name[0] + ) + not_reset.append(pod_name[0]) + break + if counter < max_retries: + logging.info("Date in pod " + str(pod_name[0]) + " reset properly") + return not_reset + + def get_scenario_types(self) -> list[str]: + return ["time_scenarios"] diff --git a/krkn/scenario_plugins/zone_outage/__init__.py b/krkn/scenario_plugins/zone_outage/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py new file mode 100644 index 00000000..c2a83ee5 --- /dev/null +++ b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py @@ -0,0 +1,102 @@ +import logging +import time + +import yaml +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn_lib.utils import log_exception + +from krkn import utils +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin +from krkn.scenario_plugins.native.network import cerberus +from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS + + +class ZoneOutageScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + try: + with open(scenario, "r") as f: + zone_outage_config_yaml = yaml.full_load(f) + scenario_config = zone_outage_config_yaml["zone_outage"] + vpc_id = scenario_config["vpc_id"] + subnet_ids = scenario_config["subnet_id"] + duration = scenario_config["duration"] + cloud_type = scenario_config["cloud_type"] + ids = {} + acl_ids_created = [] + + if cloud_type.lower() == "aws": + cloud_object = AWS() + else: + logging.error( + "ZoneOutageScenarioPlugin Cloud type %s is not currently supported for " + "zone outage scenarios" % cloud_type + ) + return 1 + + start_time = int(time.time()) + + for subnet_id in subnet_ids: + logging.info("Targeting subnet_id") + network_association_ids = [] + associations, original_acl_id = cloud_object.describe_network_acls( + vpc_id, subnet_id + ) + for entry in associations: + if entry["SubnetId"] == subnet_id: + network_association_ids.append( + entry["NetworkAclAssociationId"] + ) + logging.info( + "Network association ids associated with " + "the subnet %s: %s" % (subnet_id, network_association_ids) + ) + acl_id = 
cloud_object.create_default_network_acl(vpc_id) + new_association_id = cloud_object.replace_network_acl_association( + network_association_ids[0], acl_id + ) + + # capture the original_acl_id, created_acl_id and + # new association_id to use during the recovery + ids[new_association_id] = original_acl_id + acl_ids_created.append(acl_id) + + # wait for the specified duration + logging.info( + "Waiting for the specified duration " "in the config: %s" % duration + ) + time.sleep(duration) + + # replace the applied acl with the previous acl in use + for new_association_id, original_acl_id in ids.items(): + cloud_object.replace_network_acl_association( + new_association_id, original_acl_id + ) + logging.info( + "Waiting for 60 seconds to make sure " "the changes are in place" + ) + time.sleep(60) + + # delete the network acl created for the run + for acl_id in acl_ids_created: + cloud_object.delete_network_acl(acl_id) + + end_time = int(time.time()) + cerberus.publish_kraken_status(krkn_config, [], start_time, end_time) + except (RuntimeError, Exception) as e: + logging.error( + f"ZoneOutageScenarioPlugin scenario {scenario} failed with exception: {e}" + ) + return 1 + else: + return 0 + + def get_scenario_types(self) -> list[str]: + return ["zone_outages_scenarios"] diff --git a/krkn/tests/__init__.py b/krkn/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/krkn/tests/test_classes/__init__.py b/krkn/tests/test_classes/__init__.py new file mode 100644 index 00000000..bd575866 --- /dev/null +++ b/krkn/tests/test_classes/__init__.py @@ -0,0 +1,21 @@ +from typing import List, Tuple + +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift + +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class WrongModuleScenarioPlugin(AbstractScenarioPlugin): + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + pass + + def get_scenario_types(self) -> list[str]: + pass diff --git a/krkn/tests/test_classes/correct_scenario_plugin.py b/krkn/tests/test_classes/correct_scenario_plugin.py new file mode 100644 index 00000000..7b3d6b6c --- /dev/null +++ b/krkn/tests/test_classes/correct_scenario_plugin.py @@ -0,0 +1,22 @@ +from typing import List, Tuple + +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift + +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class CorrectScenarioPlugin(AbstractScenarioPlugin): + + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + pass + + def get_scenario_types(self) -> list[str]: + return ["correct_scenarios", "scenarios_correct"] diff --git a/krkn/tests/test_classes/duplicated_scenario_plugin.py b/krkn/tests/test_classes/duplicated_scenario_plugin.py new file mode 100644 index 00000000..ac25849d --- /dev/null +++ b/krkn/tests/test_classes/duplicated_scenario_plugin.py @@ -0,0 +1,20 @@ +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift + +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class DuplicatedScenarioPlugin(AbstractScenarioPlugin): + + def run( + self, + run_uuid: str, + scenario: str, + krkn_config:
dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + pass + + def get_scenario_types(self) -> list[str]: + return ["another_irrelevant_scenario", "duplicated_scenario"] diff --git a/krkn/tests/test_classes/duplicated_two_scenario_plugin.py b/krkn/tests/test_classes/duplicated_two_scenario_plugin.py new file mode 100644 index 00000000..da22380e --- /dev/null +++ b/krkn/tests/test_classes/duplicated_two_scenario_plugin.py @@ -0,0 +1,20 @@ +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift + +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class DuplicatedTwoScenarioPlugin(AbstractScenarioPlugin): + + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + pass + + def get_scenario_types(self) -> list[str]: + return ["duplicated_scenario", "irellevant_scenario"] diff --git a/krkn/tests/test_classes/example_scenario_plugin.py b/krkn/tests/test_classes/example_scenario_plugin.py new file mode 100644 index 00000000..86d64224 --- /dev/null +++ b/krkn/tests/test_classes/example_scenario_plugin.py @@ -0,0 +1,56 @@ +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +# Each plugin must extend the AbstractScenarioPlugin abstract class +# and implement its methods. Also the naming conventions must be respected +# you can refer to the documentation for the details: +# https://github.com/krkn-chaos/krkn/blob/main/docs/scenario_plugin_api.md +class ExampleScenarioPlugin(AbstractScenarioPlugin): + + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + """ + :param run_uuid: the uuid of the chaos run generated by krkn for every single run + :param scenario: the config file of the scenario that is currently executed + :param krkn_config: the full dictionary representation of the `config.yaml` + :param lib_telemetry: it is a composite object of all the + [krkn-lib](https://krkn-chaos.github.io/krkn-lib-docs/modules.html) + objects and methods needed by a krkn plugin to run. + :param scenario_telemetry: the `ScenarioTelemetry` object of the scenario that is currently executed + """ + + pass + + try: + # The scenario logic for each scenario must be placed + # here. A try-except it is needed to catch exceptions + # that may occur in this section and they shouldn't + # be propagated outside (only int return value is admitted). + + # krkn-lib KrknKubernetes object containing all the kubernetes primitives + # can be retrieved by the KrknTelemetryOpenshift object + krkn_kubernetes = lib_telemetry.get_lib_kubernetes() + + # krkn-lib KrknOpenshift object containing all the OCP primitives + # can be retrieved by the KrknTelemetryOpenshift object + krkn_openshift = lib_telemetry.get_lib_ocp() + + # if the scenario succeeds the telemetry exit status is 0 + return 0 + except Exception as e: + # if the scenario fails the telemetry exit status is 1 + return 1 + + # Reflects the scenario type defined in the config.yaml + # in the chaos_scenarios section and to which each class + # responds. 
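+ # For illustration only (the scenario file path below is a hypothetical + # example, not part of this change): a config.yaml entry such as + # + # chaos_scenarios: + # - example_scenarios: + # - scenarios/kube/example_scenario.yaml + # + # would be routed by ScenarioPluginFactory to this class, because + # "example_scenarios" is one of the values returned by get_scenario_types().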
+ def get_scenario_types(self) -> list[str]: + return ["example_scenarios"] diff --git a/krkn/tests/test_classes/snake_case_mismatch_scenario_plugin.py b/krkn/tests/test_classes/snake_case_mismatch_scenario_plugin.py new file mode 100644 index 00000000..1638163e --- /dev/null +++ b/krkn/tests/test_classes/snake_case_mismatch_scenario_plugin.py @@ -0,0 +1,22 @@ +from typing import List, Tuple + +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift + +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class SnakeMismatchScenarioPlugin(AbstractScenarioPlugin): + + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + pass + + def get_scenario_types(self) -> list[str]: + pass diff --git a/krkn/tests/test_classes/wrong_classname_scenario_plugin.py b/krkn/tests/test_classes/wrong_classname_scenario_plugin.py new file mode 100644 index 00000000..0a6bdd12 --- /dev/null +++ b/krkn/tests/test_classes/wrong_classname_scenario_plugin.py @@ -0,0 +1,22 @@ +from typing import List, Tuple + +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift + +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class WrongClassNamePlugin(AbstractScenarioPlugin): + + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + pass + + def get_scenario_types(self) -> list[str]: + pass diff --git a/krkn/tests/test_classes/wrong_module.py b/krkn/tests/test_classes/wrong_module.py new file mode 100644 index 00000000..b63cda9f --- /dev/null +++ b/krkn/tests/test_classes/wrong_module.py @@ -0,0 +1,22 @@ +from typing import List, Tuple + +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift + +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin + + +class WrongModuleScenarioPlugin(AbstractScenarioPlugin): + + def run( + self, + run_uuid: str, + scenario: str, + krkn_config: dict[str, any], + lib_telemetry: KrknTelemetryOpenshift, + scenario_telemetry: ScenarioTelemetry, + ) -> int: + pass + + def get_scenario_types(self) -> list[str]: + pass diff --git a/krkn/tests/test_plugin_factory.py b/krkn/tests/test_plugin_factory.py new file mode 100644 index 00000000..4494ea47 --- /dev/null +++ b/krkn/tests/test_plugin_factory.py @@ -0,0 +1,110 @@ +import unittest + +from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin +from krkn.scenario_plugins.scenario_plugin_factory import ScenarioPluginFactory +from krkn.tests.test_classes.correct_scenario_plugin import ( + CorrectScenarioPlugin, +) + + +class TestPluginFactory(unittest.TestCase): + + def test_plugin_factory(self): + factory = ScenarioPluginFactory("krkn.tests.test_classes") + self.assertEqual(len(factory.loaded_plugins), 5) + self.assertEqual(len(factory.failed_plugins), 4) + self.assertIs( + factory.loaded_plugins["correct_scenarios"].__base__, + AbstractScenarioPlugin, + ) + self.assertTrue( + isinstance( + factory.loaded_plugins["correct_scenarios"](), CorrectScenarioPlugin + ) + ) + # soLid + self.assertTrue( + isinstance( + factory.loaded_plugins["correct_scenarios"](), AbstractScenarioPlugin + ) + ) + + self.assertTrue( + 
"krkn.tests.test_classes.snake_case_mismatch_scenario_plugin" + in [p[0] for p in factory.failed_plugins] + ) + self.assertTrue( + "krkn.tests.test_classes.wrong_classname_scenario_plugin" + in [p[0] for p in factory.failed_plugins] + ) + self.assertTrue( + "krkn.tests.test_classes.wrong_module" + in [p[0] for p in factory.failed_plugins] + ) + + def test_plugin_factory_naming_convention(self): + factory = ScenarioPluginFactory() + correct_module_name = "krkn.scenario_plugins.example.correct_scenario_plugin" + correct_class_name = "CorrectScenarioPlugin" + correct_class_name_no_match = "NoMatchScenarioPlugin" + wrong_module_name = "krkn.scenario_plugins.example.correct_plugin" + wrong_class_name = "WrongScenario" + wrong_folder_name_plugin = ( + "krkn.scenario_plugins.example_plugin.example_plugin_scenario_plugin" + ) + wrong_folder_name_plugin_class_name = "ExamplePluginScenarioPlugin" + wrong_folder_name_scenario = ( + "krkn.scenario_plugins.example_scenario.example_scenario_scenario_plugin" + ) + wrong_folder_name_scenario_class_name = "ExampleScenarioScenarioPlugin" + + result, message = factory.is_naming_convention_correct( + correct_module_name, correct_class_name + ) + self.assertTrue(result) + self.assertIsNone(message) + + result, message = factory.is_naming_convention_correct( + wrong_module_name, correct_class_name + ) + self.assertFalse(result) + self.assertEqual( + message, + "scenario plugin module file names must end with `_scenario_plugin` suffix", + ) + + result, message = factory.is_naming_convention_correct( + correct_module_name, wrong_class_name + ) + self.assertFalse(result) + self.assertEqual( + message, + "scenario plugin class name must start with a capital letter, " + "end with `ScenarioPlugin`, and cannot be just `ScenarioPlugin`.", + ) + + result, message = factory.is_naming_convention_correct( + correct_module_name, correct_class_name_no_match + ) + self.assertFalse(result) + self.assertEqual( + message, + "module file name must in snake case must match class name in capital camel case " + "e.g. 
`example_scenario_plugin` -> `ExampleScenarioPlugin`", + ) + + result, message = factory.is_naming_convention_correct( + wrong_folder_name_plugin, wrong_folder_name_plugin_class_name + ) + self.assertFalse(result) + self.assertEqual( + message, "scenario plugin folder cannot contain `scenario` or `plugin` word" + ) + + result, message = factory.is_naming_convention_correct( + wrong_folder_name_scenario, wrong_folder_name_scenario_class_name + ) + self.assertFalse(result) + self.assertEqual( + message, "scenario plugin folder cannot contain `scenario` or `plugin` word" + ) diff --git a/kraken/utils/TeeLogHandler.py b/krkn/utils/TeeLogHandler.py similarity index 100% rename from kraken/utils/TeeLogHandler.py rename to krkn/utils/TeeLogHandler.py diff --git a/kraken/utils/__init__.py b/krkn/utils/__init__.py similarity index 100% rename from kraken/utils/__init__.py rename to krkn/utils/__init__.py diff --git a/krkn/utils/functions.py b/krkn/utils/functions.py new file mode 100644 index 00000000..6f66263a --- /dev/null +++ b/krkn/utils/functions.py @@ -0,0 +1,80 @@ +import krkn_lib.utils +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift +from tzlocal.unix import get_localzone + + +def populate_cluster_events( + scenario_telemetry: ScenarioTelemetry, + scenario_config: dict, + kubecli: KrknKubernetes, + start_timestamp: int, + end_timestamp: int, +): + events = [] + namespaces = __retrieve_namespaces(scenario_config, kubecli) + + if len(namespaces) == 0: + events.extend( + kubecli.collect_and_parse_cluster_events( + start_timestamp, end_timestamp, str(get_localzone()) + ) + ) + else: + for namespace in namespaces: + events.extend( + kubecli.collect_and_parse_cluster_events( + start_timestamp, + end_timestamp, + str(get_localzone()), + namespace=namespace, + ) + ) + + scenario_telemetry.set_cluster_events(events) + + +def collect_and_put_ocp_logs( + telemetry_ocp: KrknTelemetryOpenshift, + scenario_config: dict, + request_id: str, + start_timestamp: int, + end_timestamp: int, +): + if ( + telemetry_ocp.get_telemetry_config() + and telemetry_ocp.get_telemetry_config()["enabled"] + and telemetry_ocp.get_telemetry_config()["logs_backup"] + and not telemetry_ocp.get_lib_kubernetes().is_kubernetes() + ): + namespaces = __retrieve_namespaces( + scenario_config, telemetry_ocp.get_lib_kubernetes() + ) + if len(namespaces) > 0: + for namespace in namespaces: + telemetry_ocp.put_ocp_logs( + request_id, + telemetry_ocp.get_telemetry_config(), + start_timestamp, + end_timestamp, + namespace, + ) + else: + telemetry_ocp.put_ocp_logs( + request_id, + telemetry_ocp.get_telemetry_config(), + start_timestamp, + end_timestamp, + ) + + +def __retrieve_namespaces(scenario_config: dict, kubecli: KrknKubernetes) -> set[str]: + namespaces = list() + namespaces.extend(krkn_lib.utils.deep_get_attribute("namespace", scenario_config)) + namespace_patterns = krkn_lib.utils.deep_get_attribute( + "namespace_pattern", scenario_config + ) + for pattern in namespace_patterns: + namespaces.extend(kubecli.list_namespaces_by_regex(pattern)) + return set(namespaces) diff --git a/requirements.txt b/requirements.txt index d736be24..932f3bea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ google-api-python-client==2.116.0 ibm_cloud_sdk_core==3.18.0 ibm_vpc==0.20.0 jinja2==3.1.4 -krkn-lib==3.1.2 +krkn-lib==4.0.0 lxml==5.1.0 kubernetes==28.1.0 numpy==1.26.4 diff --git a/run_kraken.py b/run_kraken.py index 
db8c4626..ea6ec698 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -14,24 +14,9 @@ from krkn_lib.elastic.krkn_elastic import KrknElastic from krkn_lib.models.elastic import ElasticChaosRunTelemetry from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus -from tzlocal.unix import get_localzone - -import kraken.time_actions.common_time_functions as time_actions -import kraken.performance_dashboards.setup as performance_dashboards -import kraken.pod_scenarios.setup as pod_scenarios -import kraken.service_disruption.common_service_disruption_functions as service_disruption -import kraken.shut_down.common_shut_down_func as shut_down -import kraken.node_actions.run as nodeaction -import kraken.managedcluster_scenarios.run as managedcluster_scenarios -import kraken.zone_outage.actions as zone_outages -import kraken.application_outage.actions as application_outage -import kraken.pvc.pvc_scenario as pvc_scenario -import kraken.network_chaos.actions as network_chaos -import kraken.arcaflow_plugin as arcaflow_plugin -import kraken.prometheus as prometheus_plugin -import kraken.service_hijacking.service_hijacking as service_hijacking_plugin +import krkn.performance_dashboards.setup as performance_dashboards +import krkn.prometheus as prometheus_plugin import server as server -from kraken import plugins, syn_flood from krkn_lib.k8s import KrknKubernetes from krkn_lib.ocp import KrknOpenshift from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes @@ -40,10 +25,15 @@ from krkn_lib.models.telemetry import ChaosRunTelemetry from krkn_lib.utils import SafeLogger from krkn_lib.utils.functions import get_yaml_item_value, get_junit_test_case -from kraken.utils import TeeLogHandler +from krkn.utils import TeeLogHandler +from krkn.scenario_plugins.scenario_plugin_factory import ( + ScenarioPluginFactory, + ScenarioPluginNotFound, +) report_file = "" + # Main function def main(cfg) -> int: # Start kraken @@ -62,31 +52,25 @@ def main(cfg) -> int: get_yaml_item_value(config["kraken"], "kubeconfig_path", "") ) kraken_config = cfg - chaos_scenarios = get_yaml_item_value( - config["kraken"], "chaos_scenarios", [] - ) + chaos_scenarios = get_yaml_item_value(config["kraken"], "chaos_scenarios", []) publish_running_status = get_yaml_item_value( config["kraken"], "publish_kraken_status", False ) port = get_yaml_item_value(config["kraken"], "port", 8081) signal_address = get_yaml_item_value( - config["kraken"], "signal_address", "0.0.0.0") - run_signal = get_yaml_item_value( - config["kraken"], "signal_state", "RUN" - ) - wait_duration = get_yaml_item_value( - config["tunings"], "wait_duration", 60 + config["kraken"], "signal_address", "0.0.0.0" ) + run_signal = get_yaml_item_value(config["kraken"], "signal_state", "RUN") + wait_duration = get_yaml_item_value(config["tunings"], "wait_duration", 60) iterations = get_yaml_item_value(config["tunings"], "iterations", 1) - daemon_mode = get_yaml_item_value( - config["tunings"], "daemon_mode", False - ) + daemon_mode = get_yaml_item_value(config["tunings"], "daemon_mode", False) deploy_performance_dashboards = get_yaml_item_value( config["performance_monitoring"], "deploy_dashboards", False ) dashboard_repo = get_yaml_item_value( - config["performance_monitoring"], "repo", - "https://github.com/cloud-bulldozer/performance-dashboards.git" + config["performance_monitoring"], + "repo", + "https://github.com/cloud-bulldozer/performance-dashboards.git", ) prometheus_url = 
config["performance_monitoring"].get("prometheus_url") @@ -101,9 +85,7 @@ def main(cfg) -> int: config["performance_monitoring"], "enable_metrics", False ) # elastic search - enable_elastic = get_yaml_item_value( - config["elastic"], "enable_elastic", False - ) + enable_elastic = get_yaml_item_value(config["elastic"], "enable_elastic", False) elastic_collect_metrics = get_yaml_item_value( config["elastic"], "collect_metrics", False ) @@ -112,24 +94,16 @@ def main(cfg) -> int: config["elastic"], "collect_alerts", False ) - elastic_url = get_yaml_item_value( - config["elastic"], "elastic_url", "" - ) + elastic_url = get_yaml_item_value(config["elastic"], "elastic_url", "") elastic_verify_certs = get_yaml_item_value( config["elastic"], "verify_certs", False ) - elastic_port = get_yaml_item_value( - config["elastic"], "elastic_port", 32766 - ) + elastic_port = get_yaml_item_value(config["elastic"], "elastic_port", 32766) - elastic_username = get_yaml_item_value( - config["elastic"], "username", "" - ) - elastic_password = get_yaml_item_value( - config["elastic"], "password", "" - ) + elastic_username = get_yaml_item_value(config["elastic"], "username", "") + elastic_password = get_yaml_item_value(config["elastic"], "password", "") elastic_metrics_index = get_yaml_item_value( config["elastic"], "metrics_index", "krkn-metrics" @@ -143,8 +117,6 @@ def main(cfg) -> int: config["elastic"], "telemetry_index", "krkn-telemetry" ) - - alert_profile = config["performance_monitoring"].get("alert_profile") metrics_profile = config["performance_monitoring"].get("metrics_profile") check_critical_alerts = get_yaml_item_value( @@ -152,14 +124,13 @@ def main(cfg) -> int: ) telemetry_api_url = config["telemetry"].get("api_url") - # Initialize clients - if (not os.path.isfile(kubeconfig_path) and - not os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/token")): + if not os.path.isfile(kubeconfig_path) and not os.path.isfile( + "/var/run/secrets/kubernetes.io/serviceaccount/token" + ): logging.error( "Cannot read the kubeconfig file at %s, please check" % kubeconfig_path ) - #sys.exit(1) return 1 logging.info("Initializing client to talk to the Kubernetes cluster") @@ -175,8 +146,12 @@ def main(cfg) -> int: # request_id for telemetry is generated once here and used everywhere telemetry_request_id = f"{int(time.time())}-{run_uuid}" if config["telemetry"].get("run_tag"): - telemetry_request_id = f"{telemetry_request_id}-{config['telemetry']['run_tag']}" - telemetry_log_file = f'{config["telemetry"]["archive_path"]}/{telemetry_request_id}.log' + telemetry_request_id = ( + f"{telemetry_request_id}-{config['telemetry']['run_tag']}" + ) + telemetry_log_file = ( + f'{config["telemetry"]["archive_path"]}/{telemetry_request_id}.log' + ) safe_logger = SafeLogger(filename=telemetry_log_file) try: @@ -194,11 +169,9 @@ def main(cfg) -> int: # Set up kraken url to track signal if not 0 <= int(port) <= 65535: logging.error("%s isn't a valid port number, please check" % (port)) - #sys.exit(1) return 1 if not signal_address: logging.error("Please set the signal address in the config") - #sys.exit(1) return 1 address = (signal_address, port) @@ -223,13 +196,15 @@ def main(cfg) -> int: if connection_data: prometheus_url = connection_data.endpoint prometheus_bearer_token = connection_data.token - else: + else: # If can't make a connection, set alerts to false enable_alerts = False critical_alerts = False except Exception: - logging.error("invalid distribution selected, running openshift scenarios against kubernetes 
cluster." - "Please set 'kubernetes' in config.yaml krkn.platform and try again") + logging.error( + "invalid distribution selected, running openshift scenarios against kubernetes cluster." + "Please set 'kubernetes' in config.yaml krkn.platform and try again" + ) return 1 if cv != "": logging.info(cv) @@ -237,17 +212,22 @@ def main(cfg) -> int: logging.info("Cluster version CRD not detected, skipping") # KrknTelemetry init - telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli, config["telemetry"]) - telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli, config["telemetry"]) + telemetry_k8s = KrknTelemetryKubernetes( + safe_logger, kubecli, config["telemetry"] + ) + telemetry_ocp = KrknTelemetryOpenshift( + safe_logger, ocpcli, telemetry_request_id, config["telemetry"] + ) if enable_elastic: - elastic_search = KrknElastic(safe_logger, - elastic_url, - elastic_port, - elastic_verify_certs, - elastic_username, - elastic_password - ) - else: + elastic_search = KrknElastic( + safe_logger, + elastic_url, + elastic_port, + elastic_verify_certs, + elastic_username, + elastic_password, + ) + else: elastic_search = None summary = ChaosRunAlertSummary() if enable_metrics or enable_alerts or check_critical_alerts: @@ -259,8 +239,6 @@ def main(cfg) -> int: if deploy_performance_dashboards: performance_dashboards.setup(dashboard_repo, distribution) - - # Initialize the start iteration to 0 iteration = 0 @@ -285,11 +263,44 @@ def main(cfg) -> int: chaos_output = ChaosRunOutput() chaos_telemetry = ChaosRunTelemetry() chaos_telemetry.run_uuid = run_uuid + scenario_plugin_factory = ScenarioPluginFactory() + classes_and_types: dict[str, list[str]] = {} + for loaded in scenario_plugin_factory.loaded_plugins.keys(): + if ( + scenario_plugin_factory.loaded_plugins[loaded].__name__ + not in classes_and_types.keys() + ): + classes_and_types[ + scenario_plugin_factory.loaded_plugins[loaded].__name__ + ] = [] + classes_and_types[ + scenario_plugin_factory.loaded_plugins[loaded].__name__ + ].append(loaded) + logging.info( + "📣 `ScenarioPluginFactory`: types from config.yaml mapped to respective classes for execution:" + ) + for class_loaded in classes_and_types.keys(): + if len(classes_and_types[class_loaded]) <= 1: + logging.info( + f" ✅ type: {classes_and_types[class_loaded][0]} ➡️ `{class_loaded}` " + ) + else: + logging.info( + f" ✅ types: [{', '.join(classes_and_types[class_loaded])}] ➡️ `{class_loaded}` " + ) + logging.info("\n") + if len(scenario_plugin_factory.failed_plugins) > 0: + logging.info("Failed to load Scenario Plugins:\n") + for failed in scenario_plugin_factory.failed_plugins: + module_name, class_name, error = failed + logging.error(f"⛔ Class: {class_name} Module: {module_name}") + logging.error(f"⚠️ {error}\n") # Loop to run the chaos starts here while int(iteration) < iterations and run_signal != "STOP": # Inject chaos scenarios specified in the config logging.info("Executing scenarios for iteration " + str(iteration)) if chaos_scenarios: + for scenario in chaos_scenarios: if publish_running_status: run_signal = server.get_status(address) @@ -307,183 +318,43 @@ def main(cfg) -> int: scenario_type = list(scenario.keys())[0] scenarios_list = scenario[scenario_type] if scenarios_list: - # Inject pod chaos scenarios specified in the config - if scenario_type == "pod_scenarios": + try: + scenario_plugin = scenario_plugin_factory.create_plugin( + scenario_type + ) + except ScenarioPluginNotFound: logging.error( - "Pod scenarios have been removed, please use " - "plugin_scenarios with 
the " - "kill-pods configuration instead." + f"impossible to find scenario {scenario_type}, plugin not found. Exiting" ) - return 1 - elif scenario_type == "arcaflow_scenarios": - failed_post_scenarios, scenario_telemetries = arcaflow_plugin.run( - scenarios_list, - telemetry_ocp, - telemetry_request_id + sys.exit(1) + + failed_post_scenarios, scenario_telemetries = ( + scenario_plugin.run_scenarios( + run_uuid, scenarios_list, config, telemetry_ocp ) - chaos_telemetry.scenarios.extend(scenario_telemetries) + ) + chaos_telemetry.scenarios.extend(scenario_telemetries) - elif scenario_type == "plugin_scenarios": - failed_post_scenarios, scenario_telemetries = plugins.run( - scenarios_list, - kraken_config, - failed_post_scenarios, - wait_duration, - telemetry_ocp, - run_uuid, - telemetry_request_id - ) - chaos_telemetry.scenarios.extend(scenario_telemetries) - # krkn_lib - elif scenario_type == "container_scenarios": - logging.info("Running container scenarios") - failed_post_scenarios, scenario_telemetries = pod_scenarios.container_run( - scenarios_list, - config, - failed_post_scenarios, - wait_duration, - telemetry_ocp, - telemetry_request_id - ) - chaos_telemetry.scenarios.extend(scenario_telemetries) - - # Inject node chaos scenarios specified in the config - # krkn_lib - elif scenario_type == "node_scenarios": - logging.info("Running node scenarios") - failed_post_scenarios, scenario_telemetries = nodeaction.run(scenarios_list, - config, - wait_duration, - telemetry_ocp, - telemetry_request_id) - chaos_telemetry.scenarios.extend(scenario_telemetries) - # Inject managedcluster chaos scenarios specified in the config - # krkn_lib - elif scenario_type == "managedcluster_scenarios": - logging.info("Running managedcluster scenarios") - managedcluster_scenarios.run( - scenarios_list, - config, - wait_duration, - kubecli - ) - - # Inject time skew chaos scenarios specified - # in the config - # krkn_lib - elif scenario_type == "time_scenarios": - logging.info("Running time skew scenarios") - failed_post_scenarios, scenario_telemetries = time_actions.run(scenarios_list, - config, - wait_duration, - telemetry_ocp, - telemetry_request_id - ) - chaos_telemetry.scenarios.extend(scenario_telemetries) - # Inject cluster shutdown scenarios - # krkn_lib - elif scenario_type == "cluster_shut_down_scenarios": - failed_post_scenarios, scenario_telemetries = shut_down.run(scenarios_list, - config, - wait_duration, - telemetry_ocp, - telemetry_request_id - ) - chaos_telemetry.scenarios.extend(scenario_telemetries) - - # Inject namespace chaos scenarios - # krkn_lib - elif scenario_type == "service_disruption_scenarios": - logging.info("Running service disruption scenarios") - failed_post_scenarios, scenario_telemetries = service_disruption.run( - scenarios_list, - config, - wait_duration, - failed_post_scenarios, - telemetry_ocp, - telemetry_request_id - ) - chaos_telemetry.scenarios.extend(scenario_telemetries) - - # Inject zone failures - elif scenario_type == "zone_outages": - logging.info("Inject zone outages") - failed_post_scenarios, scenario_telemetries = zone_outages.run(scenarios_list, - config, - wait_duration, - telemetry_ocp, - telemetry_request_id - ) - chaos_telemetry.scenarios.extend(scenario_telemetries) - # Application outages - elif scenario_type == "application_outages": - logging.info("Injecting application outage") - failed_post_scenarios, scenario_telemetries = application_outage.run( - scenarios_list, - config, - wait_duration, - telemetry_ocp, - telemetry_request_id - ) - 
chaos_telemetry.scenarios.extend(scenario_telemetries) - - # PVC scenarios - # krkn_lib - elif scenario_type == "pvc_scenarios": - logging.info("Running PVC scenario") - failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, - config, - wait_duration, - telemetry_ocp, - telemetry_request_id - ) - chaos_telemetry.scenarios.extend(scenario_telemetries) - - # Network scenarios - # krkn_lib - elif scenario_type == "network_chaos": - logging.info("Running Network Chaos") - failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, - config, - wait_duration, - telemetry_ocp, - telemetry_request_id - ) - elif scenario_type == "service_hijacking": - logging.info("Running Service Hijacking Chaos") - failed_post_scenarios, scenario_telemetries = service_hijacking_plugin.run(scenarios_list, - wait_duration, - telemetry_ocp, - telemetry_request_id - ) - chaos_telemetry.scenarios.extend(scenario_telemetries) - elif scenario_type == "syn_flood": - logging.info("Running Syn Flood Chaos") - failed_post_scenarios, scenario_telemetries = syn_flood.run(scenarios_list, - telemetry_ocp, - telemetry_request_id - ) - chaos_telemetry.scenarios.extend(scenario_telemetries) - - # Check for critical alerts when enabled post_critical_alerts = 0 if check_critical_alerts: - prometheus_plugin.critical_alerts(prometheus, - summary, - run_uuid, - scenario_type, - start_time, - datetime.datetime.now()) + prometheus_plugin.critical_alerts( + prometheus, + summary, + run_uuid, + scenario_type, + start_time, + datetime.datetime.now(), + ) chaos_output.critical_alerts = summary post_critical_alerts = len(summary.post_chaos_alerts) if post_critical_alerts > 0: - logging.error("Post chaos critical alerts firing please check, exiting") + logging.error( + "Post-chaos critical alerts are firing, please check; exiting" + ) break - iteration += 1 - logging.info("") # telemetry # in order to print decoded telemetry data even if telemetry collection @@ -495,8 +366,12 @@ def main(cfg) -> int: # Cloud platform and network plugins metadata # through OCP specific APIs if distribution == "openshift": + logging.info( + "collecting OCP cluster metadata, this may take a few minutes...."
+ ) telemetry_ocp.collect_cluster_metadata(chaos_telemetry) else: + logging.info("collecting Kubernetes cluster metadata....") telemetry_k8s.collect_cluster_metadata(chaos_telemetry) telemetry_json = chaos_telemetry.to_json() @@ -504,53 +379,82 @@ def main(cfg) -> int: chaos_output.telemetry = decoded_chaos_run_telemetry logging.info(f"Chaos data:\n{chaos_output.to_json()}") if enable_elastic: - elastic_telemetry = ElasticChaosRunTelemetry(chaos_run_telemetry=decoded_chaos_run_telemetry) - result = elastic_search.push_telemetry(elastic_telemetry, elastic_telemetry_index) + elastic_telemetry = ElasticChaosRunTelemetry( + chaos_run_telemetry=decoded_chaos_run_telemetry + ) + result = elastic_search.push_telemetry( + elastic_telemetry, elastic_telemetry_index + ) if result == -1: - safe_logger.error(f"failed to save telemetry on elastic search: {chaos_output.to_json()}") + safe_logger.error( + f"failed to save telemetry on elastic search: {chaos_output.to_json()}" + ) if config["telemetry"]["enabled"]: - logging.info(f'telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/' - f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/' - f'{telemetry_request_id}') + logging.info( + f"telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/" + f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/' + f"{telemetry_request_id}" + ) logging.info(f"telemetry upload log: {safe_logger.log_file_name}") try: - telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) - telemetry_k8s.put_critical_alerts(telemetry_request_id, config["telemetry"], summary) + telemetry_k8s.send_telemetry( + config["telemetry"], telemetry_request_id, chaos_telemetry + ) + telemetry_k8s.put_critical_alerts( + telemetry_request_id, config["telemetry"], summary + ) # prometheus data collection is available only on Openshift if config["telemetry"]["prometheus_backup"]: - prometheus_archive_files = '' - if distribution == "openshift" : - prometheus_archive_files = telemetry_ocp.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) + prometheus_archive_files = "" + if distribution == "openshift": + prometheus_archive_files = ( + telemetry_ocp.get_ocp_prometheus_data( + config["telemetry"], telemetry_request_id + ) + ) else: - if (config["telemetry"]["prometheus_namespace"] and - config["telemetry"]["prometheus_pod_name"] and - config["telemetry"]["prometheus_container_name"]): + if ( + config["telemetry"]["prometheus_namespace"] + and config["telemetry"]["prometheus_pod_name"] + and config["telemetry"]["prometheus_container_name"] + ): try: - prometheus_archive_files = telemetry_k8s.get_prometheus_pod_data( - config["telemetry"], - telemetry_request_id, - config["telemetry"]["prometheus_pod_name"], - config["telemetry"]["prometheus_container_name"], - config["telemetry"]["prometheus_namespace"] + prometheus_archive_files = ( + telemetry_k8s.get_prometheus_pod_data( + config["telemetry"], + telemetry_request_id, + config["telemetry"]["prometheus_pod_name"], + config["telemetry"][ + "prometheus_container_name" + ], + config["telemetry"]["prometheus_namespace"], + ) ) except Exception as e: - logging.error(f"failed to get prometheus backup with exception {str(e)}") + logging.error( + f"failed to get prometheus backup with exception {str(e)}" + ) else: - logging.warning("impossible to backup prometheus," - "check if config contains 
telemetry.prometheus_namespace, " - "telemetry.prometheus_pod_name and " - "telemetry.prometheus_container_name") + logging.warning( + "impossible to backup prometheus," + "check if config contains telemetry.prometheus_namespace, " + "telemetry.prometheus_pod_name and " + "telemetry.prometheus_container_name" + ) if prometheus_archive_files: safe_logger.info("starting prometheus archive upload:") - telemetry_k8s.put_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) - + telemetry_k8s.put_prometheus_data( + config["telemetry"], + prometheus_archive_files, + telemetry_request_id, + ) + except Exception as e: logging.error(f"failed to send telemetry data: {str(e)}") else: logging.info("telemetry collection disabled, skipping.") - # Check for the alerts specified if enable_alerts: logging.info("Alerts checking is enabled") @@ -563,33 +467,35 @@ def main(cfg) -> int: end_time, alert_profile, elastic_colllect_alerts, - elastic_alerts_index + elastic_alerts_index, ) else: logging.error("Alert profile is not defined") return 1 - #sys.exit(1) + # sys.exit(1) if enable_metrics: - prometheus_plugin.metrics(prometheus, - elastic_search, - start_time, - run_uuid, - end_time, - metrics_profile, - elastic_collect_metrics, - elastic_metrics_index) + prometheus_plugin.metrics( + prometheus, + elastic_search, + start_time, + run_uuid, + end_time, + metrics_profile, + elastic_collect_metrics, + elastic_metrics_index, + ) if post_critical_alerts > 0: logging.error("Critical alerts are firing, please check; exiting") - #sys.exit(2) + # sys.exit(2) return 2 if failed_post_scenarios: logging.error( "Post scenarios are still failing at the end of all iterations" ) - #sys.exit(2) + # sys.exit(2) return 2 logging.info( @@ -598,13 +504,12 @@ def main(cfg) -> int: ) else: logging.error("Cannot find a config at %s, please check" % (cfg)) - #sys.exit(1) + # sys.exit(1) return 2 return 0 - if __name__ == "__main__": # Initialize the parser to read the config parser = optparse.OptionParser() @@ -623,8 +528,6 @@ if __name__ == "__main__": default="kraken.report", ) - - parser.add_option( "--junit-testcase", dest="junit_testcase", @@ -649,7 +552,11 @@ if __name__ == "__main__": (options, args) = parser.parse_args() report_file = options.output tee_handler = TeeLogHandler() - handlers = [logging.FileHandler(report_file, mode="w"), logging.StreamHandler(), tee_handler] + handlers = [ + logging.FileHandler(report_file, mode="w"), + logging.StreamHandler(), + tee_handler, + ] logging.basicConfig( level=logging.INFO, @@ -666,12 +573,16 @@ if __name__ == "__main__": junit_start_time = time.time() # checks if both mandatory options for junit are set if options.junit_testcase_path and not options.junit_testcase: - logging.error("please set junit test case description with --junit-testcase [description] option") + logging.error( + "please set junit test case description with --junit-testcase [description] option" + ) option_error = True junit_error = True if options.junit_testcase and not options.junit_testcase_path: - logging.error("please set junit test case path with --junit-testcase-path [path] option") + logging.error( + "please set junit test case path with --junit-testcase-path [path] option" + ) option_error = True junit_error = True @@ -680,17 +591,23 @@ if __name__ == "__main__": junit_normalized_path = os.path.normpath(options.junit_testcase_path) if not os.path.exists(junit_normalized_path): - logging.error(f"{junit_normalized_path} do not exists, please select a valid path") + 
logging.error( + f"{junit_normalized_path} do not exists, please select a valid path" + ) option_error = True junit_error = True if not os.path.isdir(junit_normalized_path): - logging.error(f"{junit_normalized_path} is a file, please select a valid folder path") + logging.error( + f"{junit_normalized_path} is a file, please select a valid folder path" + ) option_error = True junit_error = True if not os.access(junit_normalized_path, os.W_OK): - logging.error(f"{junit_normalized_path} is not writable, please select a valid path") + logging.error( + f"{junit_normalized_path} is not writable, please select a valid path" + ) option_error = True junit_error = True @@ -713,9 +630,11 @@ if __name__ == "__main__": test_suite_name="krkn-test-suite", test_case_description=options.junit_testcase, test_stdout=tee_handler.get_output(), - test_version=options.junit_testcase_version + test_version=options.junit_testcase_version, + ) + junit_testcase_file_path = ( + f"{junit_normalized_path}/junit_krkn_{int(time.time())}.xml" ) - junit_testcase_file_path = f"{junit_normalized_path}/junit_krkn_{int(time.time())}.xml" logging.info(f"writing junit XML testcase in {junit_testcase_file_path}") with open(junit_testcase_file_path, "w") as stream: stream.write(junit_testcase_xml) diff --git a/scenarios/arcaflow/cpu-hog/config.yaml b/scenarios/kube/cpu-hog/config.yaml similarity index 100% rename from scenarios/arcaflow/cpu-hog/config.yaml rename to scenarios/kube/cpu-hog/config.yaml diff --git a/scenarios/arcaflow/cpu-hog/input.yaml b/scenarios/kube/cpu-hog/input.yaml similarity index 100% rename from scenarios/arcaflow/cpu-hog/input.yaml rename to scenarios/kube/cpu-hog/input.yaml diff --git a/scenarios/arcaflow/cpu-hog/sub-workflow.yaml b/scenarios/kube/cpu-hog/sub-workflow.yaml similarity index 100% rename from scenarios/arcaflow/cpu-hog/sub-workflow.yaml rename to scenarios/kube/cpu-hog/sub-workflow.yaml diff --git a/scenarios/arcaflow/cpu-hog/workflow.yaml b/scenarios/kube/cpu-hog/workflow.yaml similarity index 100% rename from scenarios/arcaflow/cpu-hog/workflow.yaml rename to scenarios/kube/cpu-hog/workflow.yaml diff --git a/scenarios/arcaflow/io-hog/config.yaml b/scenarios/kube/io-hog/config.yaml similarity index 100% rename from scenarios/arcaflow/io-hog/config.yaml rename to scenarios/kube/io-hog/config.yaml diff --git a/scenarios/arcaflow/io-hog/input.yaml b/scenarios/kube/io-hog/input.yaml similarity index 100% rename from scenarios/arcaflow/io-hog/input.yaml rename to scenarios/kube/io-hog/input.yaml diff --git a/scenarios/arcaflow/io-hog/sub-workflow.yaml b/scenarios/kube/io-hog/sub-workflow.yaml similarity index 100% rename from scenarios/arcaflow/io-hog/sub-workflow.yaml rename to scenarios/kube/io-hog/sub-workflow.yaml diff --git a/scenarios/arcaflow/io-hog/workflow.yaml b/scenarios/kube/io-hog/workflow.yaml similarity index 100% rename from scenarios/arcaflow/io-hog/workflow.yaml rename to scenarios/kube/io-hog/workflow.yaml diff --git a/scenarios/arcaflow/memory-hog/config.yaml b/scenarios/kube/memory-hog/config.yaml similarity index 100% rename from scenarios/arcaflow/memory-hog/config.yaml rename to scenarios/kube/memory-hog/config.yaml diff --git a/scenarios/arcaflow/memory-hog/input.yaml b/scenarios/kube/memory-hog/input.yaml similarity index 100% rename from scenarios/arcaflow/memory-hog/input.yaml rename to scenarios/kube/memory-hog/input.yaml diff --git a/scenarios/arcaflow/memory-hog/sub-workflow.yaml b/scenarios/kube/memory-hog/sub-workflow.yaml similarity index 100% rename from 
scenarios/arcaflow/memory-hog/sub-workflow.yaml rename to scenarios/kube/memory-hog/sub-workflow.yaml diff --git a/scenarios/arcaflow/memory-hog/workflow.yaml b/scenarios/kube/memory-hog/workflow.yaml similarity index 100% rename from scenarios/arcaflow/memory-hog/workflow.yaml rename to scenarios/kube/memory-hog/workflow.yaml diff --git a/scenarios/openshift/post_action_etcd_container.py b/scenarios/openshift/post_action_etcd_container.py deleted file mode 100755 index ff39723f..00000000 --- a/scenarios/openshift/post_action_etcd_container.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 -import subprocess -import logging -import time - - -def run(cmd): - try: - output = subprocess.Popen( - cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) - (out, err) = output.communicate() - except Exception as e: - logging.error("Failed to run %s, error: %s" % (cmd, e)) - return out - - -i = 0 -while i < 100: - pods_running = run("oc get pods -n openshift-etcd -l app=etcd | grep -c '4/4'").rstrip() - if pods_running == "3": - break - time.sleep(5) - i += 1 - -if pods_running == str(3): - print("There were 3 pods running properly") -else: - print("ERROR there were " + str(pods_running) + " pods running instead of 3") diff --git a/scenarios/openshift/post_action_etcd_example_py.py b/scenarios/openshift/post_action_etcd_example_py.py deleted file mode 100755 index 1c7a2cf4..00000000 --- a/scenarios/openshift/post_action_etcd_example_py.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -import subprocess -import logging - - -def run(cmd): - try: - output = subprocess.Popen( - cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) - (out, err) = output.communicate() - logging.info("out " + str(out)) - except Exception as e: - logging.error("Failed to run %s, error: %s" % (cmd, e)) - return out - - -pods_running = run("oc get pods -n openshift-etcd | grep -c Running").rstrip() - -if pods_running == str(3): - print("There were 3 pods running properly") -else: - print("ERROR there were " + str(pods_running) + " pods running instead of 3") diff --git a/scenarios/openshift/post_action_namespace.py b/scenarios/openshift/post_action_namespace.py deleted file mode 100755 index 180a0ffd..00000000 --- a/scenarios/openshift/post_action_namespace.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env python3 -import subprocess -import time - - -def run(cmd): - try: - output = subprocess.Popen( - cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) - (out, err) = output.communicate() - except Exception as e: - print("Failed to run %s, error: %s" % (cmd, e)) - return out - - -i = 0 -while i < 100: - projects_active = run("oc get project | grep 'ingress' | grep -c Active").rstrip() - if projects_active == "3": - break - i += 1 - time.sleep(5) - -if projects_active == str(3): - print("There were 3 projects running properly") -else: - print("ERROR there were " + str(projects_active) + " projects running instead of 3") diff --git a/scenarios/openshift/post_action_prometheus.yml b/scenarios/openshift/post_action_prometheus.yml deleted file mode 100644 index eed2687c..00000000 --- a/scenarios/openshift/post_action_prometheus.yml +++ /dev/null @@ -1,6 +0,0 @@ -# yaml-language-server: $schema=../plugin.schema.json -- id: kill-pods - config: - namespace_pattern: ^openshift-monitoring$ - label_selector: app=prometheus - krkn_pod_recovery_time: 120 \ No newline at end of file diff --git 
a/scenarios/openshift/post_action_regex.py b/scenarios/openshift/post_action_regex.py deleted file mode 100755 index e7530688..00000000 --- a/scenarios/openshift/post_action_regex.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python3 -import logging -import re -import subprocess -import sys - -from kubernetes import client, config -from kubernetes.client.rest import ApiException - - -def list_namespaces(): - """ - List all namespaces - """ - spaces_list = [] - try: - config.load_kube_config() - cli = client.CoreV1Api() - ret = cli.list_namespace(pretty=True) - except ApiException as e: - logging.error( - "Exception when calling CoreV1Api->list_namespace: %s\n", - e - ) - for current_namespace in ret.items: - spaces_list.append(current_namespace.metadata.name) - return spaces_list - - -def check_namespaces(namespaces): - """ - Check if all the watch_namespaces are valid - """ - try: - valid_namespaces = list_namespaces() - regex_namespaces = set(namespaces) - set(valid_namespaces) - final_namespaces = set(namespaces) - set(regex_namespaces) - valid_regex = set() - if regex_namespaces: - for current_ns in valid_namespaces: - for regex_namespace in regex_namespaces: - if re.search(regex_namespace, current_ns): - final_namespaces.add(current_ns) - valid_regex.add(regex_namespace) - break - invalid_namespaces = regex_namespaces - valid_regex - if invalid_namespaces: - raise Exception( - "There exists no namespaces matching: %s" % ( - invalid_namespaces - ) - ) - return list(final_namespaces) - except Exception as e: - logging.error(str(e)) - sys.exit(1) - - -def run(cmd): - try: - output = subprocess.Popen( - cmd, - shell=True, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT - ) - (out, err) = output.communicate() - except Exception as e: - logging.error("Failed to run %s, error: %s", cmd, e) - return out - - -def print_running_pods(): - regex_namespace_list = ["openshift-.*"] - checked_namespaces = check_namespaces(regex_namespace_list) - pods_running = 0 - for namespace in checked_namespaces: - new_pods_running = run( - "oc get pods -n " + namespace + " | grep -c Running" - ).rstrip() - try: - pods_running += int(new_pods_running) - except Exception: - continue - print(pods_running) - - -if __name__ == '__main__': - print_running_pods() diff --git a/scenarios/openshift/post_action_regex.sh b/scenarios/openshift/post_action_regex.sh deleted file mode 100755 index 10626cc0..00000000 --- a/scenarios/openshift/post_action_regex.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -pods="$(oc get pods -n openshift-etcd | grep -c Running)" -echo "$pods" - -if [ "$pods" -eq 3 ] -then - echo "Pods Pass" -else - # need capital error for proper error catching in run_kraken - echo "ERROR pod count $pods doesnt match 3 expected pods" -fi diff --git a/scenarios/openshift/post_action_shut_down.py b/scenarios/openshift/post_action_shut_down.py deleted file mode 100644 index a8ec7e78..00000000 --- a/scenarios/openshift/post_action_shut_down.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env python3 -import subprocess -import logging -import time -import yaml - - -def run(cmd): - out = "" - try: - output = subprocess.Popen( - cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT - ) - (out, err) = output.communicate() - except Exception as e: - logging.info("Failed to run %s, error: %s" % (cmd, e)) - return out - - -# Get cluster operators and return yaml -def get_cluster_operators(): - operators_status = run("kubectl get co -o yaml") - 
status_yaml = yaml.safe_load(operators_status, Loader=yaml.FullLoader) - return status_yaml - - -# Monitor cluster operators -def monitor_cluster_operator(cluster_operators): - failed_operators = [] - for operator in cluster_operators["items"]: - # loop through the conditions in the status section to find the dedgraded condition - if "status" in operator.keys() and "conditions" in operator["status"].keys(): - for status_cond in operator["status"]["conditions"]: - # if the degraded status is not false, add it to the failed operators to return - if status_cond["type"] == "Degraded" and status_cond["status"] != "False": - failed_operators.append(operator["metadata"]["name"]) - break - else: - logging.info("Can't find status of " + operator["metadata"]["name"]) - failed_operators.append(operator["metadata"]["name"]) - # return False if there are failed operators else return True - return failed_operators - - -wait_duration = 10 -timeout = 900 -counter = 0 - -counter = 0 -co_yaml = get_cluster_operators() -failed_operators = monitor_cluster_operator(co_yaml) -while len(failed_operators) > 0: - time.sleep(wait_duration) - co_yaml = get_cluster_operators() - failed_operators = monitor_cluster_operator(co_yaml) - if counter >= timeout: - print("Cluster operators are still degraded after " + str(timeout) + "seconds") - print("Degraded operators " + str(failed_operators)) - exit(1) - counter += wait_duration - -not_ready = run("oc get nodes --no-headers | grep 'NotReady' | wc -l").rstrip() -while int(not_ready) > 0: - time.sleep(wait_duration) - not_ready = run("oc get nodes --no-headers | grep 'NotReady' | wc -l").rstrip() - if counter >= timeout: - print("Nodes are still not ready after " + str(timeout) + "seconds") - exit(1) - counter += wait_duration - -worker_nodes = run("oc get nodes --no-headers | grep worker | egrep -v NotReady | awk '{print $1}'").rstrip() -print("Worker nodes list \n" + str(worker_nodes)) -master_nodes = run("oc get nodes --no-headers | grep master | egrep -v NotReady | awk '{print $1}'").rstrip() -print("Master nodes list \n" + str(master_nodes)) -infra_nodes = run("oc get nodes --no-headers | grep infra | egrep -v NotReady | awk '{print $1}'").rstrip() -print("Infra nodes list \n" + str(infra_nodes)) diff --git a/tests/test_ingress_network_plugin.py b/tests/test_ingress_network_plugin.py index daea3f5b..6ea8f4da 100644 --- a/tests/test_ingress_network_plugin.py +++ b/tests/test_ingress_network_plugin.py @@ -1,7 +1,8 @@ import unittest import logging from arcaflow_plugin_sdk import plugin -from kraken.plugins.network import ingress_shaping + +from krkn.scenario_plugins.native.network import ingress_shaping class NetworkScenariosTest(unittest.TestCase): @@ -9,25 +10,26 @@ class NetworkScenariosTest(unittest.TestCase): def test_serialization(self): plugin.test_object_serialization( ingress_shaping.NetworkScenarioConfig( - node_interface_name={"foo": ['bar']}, + node_interface_name={"foo": ["bar"]}, network_params={ "latency": "50ms", "loss": "0.02", - "bandwidth": "100mbit" - } + "bandwidth": "100mbit", + }, ), self.fail, ) plugin.test_object_serialization( ingress_shaping.NetworkScenarioSuccessOutput( filter_direction="ingress", - test_interfaces={"foo": ['bar']}, + test_interfaces={"foo": ["bar"]}, network_parameters={ "latency": "50ms", "loss": "0.02", - "bandwidth": "100mbit" + "bandwidth": "100mbit", }, - execution_type="parallel"), + execution_type="parallel", + ), self.fail, ) plugin.test_object_serialization( @@ -45,10 +47,10 @@ class 
NetworkScenariosTest(unittest.TestCase): network_params={ "latency": "50ms", "loss": "0.02", - "bandwidth": "100mbit" - } + "bandwidth": "100mbit", + }, ), - run_id="network-shaping-test" + run_id="network-shaping-test", ) if output_id == "error": logging.error(output_data.error) diff --git a/tests/test_run_python_plugin.py b/tests/test_run_python_plugin.py index ded01312..a29c01a9 100644 --- a/tests/test_run_python_plugin.py +++ b/tests/test_run_python_plugin.py @@ -1,28 +1,37 @@ import tempfile import unittest -from kraken.plugins import run_python_file -from kraken.plugins.run_python_plugin import RunPythonFileInput +from krkn.scenario_plugins.native.run_python_plugin import ( + RunPythonFileInput, + run_python_file, +) class RunPythonPluginTest(unittest.TestCase): def test_success_execution(self): tmp_file = tempfile.NamedTemporaryFile() - tmp_file.write(bytes("print('Hello world!')", 'utf-8')) + tmp_file.write(bytes("print('Hello world!')", "utf-8")) tmp_file.flush() - output_id, output_data = run_python_file(params=RunPythonFileInput(tmp_file.name), run_id="test-python-plugin-success") + output_id, output_data = run_python_file( + params=RunPythonFileInput(tmp_file.name), + run_id="test-python-plugin-success", + ) self.assertEqual("success", output_id) self.assertEqual("Hello world!\n", output_data.stdout) def test_error_execution(self): tmp_file = tempfile.NamedTemporaryFile() - tmp_file.write(bytes("import sys\nprint('Hello world!')\nsys.exit(42)\n", 'utf-8')) + tmp_file.write( + bytes("import sys\nprint('Hello world!')\nsys.exit(42)\n", "utf-8") + ) tmp_file.flush() - output_id, output_data = run_python_file(params=RunPythonFileInput(tmp_file.name), run_id="test-python-plugin-error") + output_id, output_data = run_python_file( + params=RunPythonFileInput(tmp_file.name), run_id="test-python-plugin-error" + ) self.assertEqual("error", output_id) self.assertEqual(42, output_data.exit_code) self.assertEqual("Hello world!\n", output_data.stdout) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_vmware_plugin.py b/tests/test_vmware_plugin.py index 058beabe..2dd7fab2 100644 --- a/tests/test_vmware_plugin.py +++ b/tests/test_vmware_plugin.py @@ -2,33 +2,25 @@ import unittest import os import logging from arcaflow_plugin_sdk import plugin -from kraken.plugins.node_scenarios.kubernetes_functions import Actions -from kraken.plugins.node_scenarios import vmware_plugin + +from krkn.scenario_plugins.native.node_scenarios import vmware_plugin +from krkn.scenario_plugins.native.node_scenarios.kubernetes_functions import Actions class NodeScenariosTest(unittest.TestCase): def setUp(self): - vsphere_env_vars = [ - "VSPHERE_IP", - "VSPHERE_USERNAME", - "VSPHERE_PASSWORD" - ] + vsphere_env_vars = ["VSPHERE_IP", "VSPHERE_USERNAME", "VSPHERE_PASSWORD"] self.credentials_present = all( env_var in os.environ for env_var in vsphere_env_vars ) def test_serialization(self): plugin.test_object_serialization( - vmware_plugin.NodeScenarioConfig( - name="test", - skip_openshift_checks=True - ), + vmware_plugin.NodeScenarioConfig(name="test", skip_openshift_checks=True), self.fail, ) plugin.test_object_serialization( - vmware_plugin.NodeScenarioSuccessOutput( - nodes={}, action=Actions.START - ), + vmware_plugin.NodeScenarioSuccessOutput(nodes={}, action=Actions.START), self.fail, ) plugin.test_object_serialization( diff --git a/utils/chaos_recommender/chaos_recommender.py b/utils/chaos_recommender/chaos_recommender.py index ac9eae80..9bf9da29 100644 --- 
a/utils/chaos_recommender/chaos_recommender.py +++ b/utils/chaos_recommender/chaos_recommender.py @@ -6,16 +6,17 @@ import re import sys import time import yaml + # kraken module import for running the recommender # both from the root directory and the recommender # folder -sys.path.insert(0, './') -sys.path.insert(0, '../../') +sys.path.insert(0, "./") +sys.path.insert(0, "../../") from krkn_lib.utils import get_yaml_item_value -import kraken.chaos_recommender.analysis as analysis -import kraken.chaos_recommender.prometheus as prometheus +import krkn.chaos_recommender.analysis as analysis +import krkn.chaos_recommender.prometheus as prometheus from kubernetes import config as kube_config @@ -23,28 +24,101 @@ def parse_arguments(parser): # command line options parser.add_argument("-c", "--config-file", action="store", help="Config file path") - parser.add_argument("-o", "--options", action="store_true", help="Evaluate command line options") - parser.add_argument("-n", "--namespaces", action="store", default="", nargs="+", help="Kubernetes application namespaces separated by space") - parser.add_argument("-p", "--prometheus-endpoint", action="store", default="", help="Prometheus endpoint URI") - parser.add_argument("-k", "--kubeconfig", action="store", default=kube_config.KUBE_CONFIG_DEFAULT_LOCATION, help="Kubeconfig path") - parser.add_argument("-t", "--token", action="store", default="", help="Kubernetes authentication token") - parser.add_argument("-s", "--scrape-duration", action="store", default="10m", help="Prometheus scrape duration") - parser.add_argument("-L", "--log-level", action="store", default="INFO", help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL") + parser.add_argument( + "-o", "--options", action="store_true", help="Evaluate command line options" + ) + parser.add_argument( + "-n", + "--namespaces", + action="store", + default="", + nargs="+", + help="Kubernetes application namespaces separated by space", + ) + parser.add_argument( + "-p", + "--prometheus-endpoint", + action="store", + default="", + help="Prometheus endpoint URI", + ) + parser.add_argument( + "-k", + "--kubeconfig", + action="store", + default=kube_config.KUBE_CONFIG_DEFAULT_LOCATION, + help="Kubeconfig path", + ) + parser.add_argument( + "-t", + "--token", + action="store", + default="", + help="Kubernetes authentication token", + ) + parser.add_argument( + "-s", + "--scrape-duration", + action="store", + default="10m", + help="Prometheus scrape duration", + ) + parser.add_argument( + "-L", + "--log-level", + action="store", + default="INFO", + help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL", + ) - parser.add_argument("-J", "--json-output-file", default=False, nargs="?", action="store", - help="Create output file, the path to the folder can be specified, if not specified the default folder is used") + parser.add_argument( + "-J", + "--json-output-file", + default=False, + nargs="?", + action="store", + help="Create output file, the path to the folder can be specified, if not specified the default folder is used", + ) - parser.add_argument("-M", "--MEM", nargs='+', action="store", default=[], - help="Memory related chaos tests (space separated list)") - parser.add_argument("-C", "--CPU", nargs='+', action="store", default=[], - help="CPU related chaos tests (space separated list)") - parser.add_argument("-N", "--NETWORK", nargs='+', action="store", default=[], - help="Network related chaos tests (space separated list)") - parser.add_argument("-G", "--GENERIC", nargs='+', action="store", 
default=[], - help="Memory related chaos tests (space separated list)") + parser.add_argument( + "-M", + "--MEM", + nargs="+", + action="store", + default=[], + help="Memory related chaos tests (space separated list)", + ) + parser.add_argument( + "-C", + "--CPU", + nargs="+", + action="store", + default=[], + help="CPU related chaos tests (space separated list)", + ) + parser.add_argument( + "-N", + "--NETWORK", + nargs="+", + action="store", + default=[], + help="Network related chaos tests (space separated list)", + ) + parser.add_argument( + "-G", + "--GENERIC", + nargs="+", + action="store", + default=[], + help="Memory related chaos tests (space separated list)", + ) parser.add_argument("--threshold", action="store", default="", help="Threshold") - parser.add_argument("--cpu-threshold", action="store", default="", help="CPU threshold") - parser.add_argument("--mem-threshold", action="store", default="", help="Memory threshold") + parser.add_argument( + "--cpu-threshold", action="store", default="", help="CPU threshold" + ) + parser.add_argument( + "--mem-threshold", action="store", default="", help="Memory threshold" + ) return parser.parse_args() @@ -60,7 +134,9 @@ def read_configuration(config_file_path): log_level = config.get("log level", "INFO") namespaces = config.get("namespaces") namespaces = re.split(r",+\s+|,+|\s+", namespaces) - kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION) + kubeconfig = get_yaml_item_value( + config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION + ) prometheus_endpoint = config.get("prometheus_endpoint") auth_token = config.get("auth_token") @@ -74,9 +150,19 @@ def read_configuration(config_file_path): else: output_path = False chaos_tests = config.get("chaos_tests", {}) - return (namespaces, kubeconfig, prometheus_endpoint, auth_token, - scrape_duration, chaos_tests, log_level, threshold, - heatmap_cpu_threshold, heatmap_mem_threshold, output_path) + return ( + namespaces, + kubeconfig, + prometheus_endpoint, + auth_token, + scrape_duration, + chaos_tests, + log_level, + threshold, + heatmap_cpu_threshold, + heatmap_mem_threshold, + output_path, + ) def prompt_input(prompt, default_value): @@ -89,10 +175,7 @@ def prompt_input(prompt, default_value): def make_json_output(inputs, namespace_data, output_path): time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) - data = { - "inputs": inputs, - "analysis_outputs": namespace_data - } + data = {"inputs": inputs, "analysis_outputs": namespace_data} logging.info(f"Summary\n{json.dumps(data, indent=4)}") @@ -106,9 +189,16 @@ def make_json_output(inputs, namespace_data, output_path): logging.info(f"Recommendation output saved in {file}.") -def json_inputs(namespaces, kubeconfig, prometheus_endpoint, scrape_duration, - chaos_tests, threshold, heatmap_cpu_threshold, - heatmap_mem_threshold): +def json_inputs( + namespaces, + kubeconfig, + prometheus_endpoint, + scrape_duration, + chaos_tests, + threshold, + heatmap_cpu_threshold, + heatmap_mem_threshold, +): inputs = { "namespaces": namespaces, "kubeconfig": kubeconfig, @@ -117,7 +207,7 @@ def json_inputs(namespaces, kubeconfig, prometheus_endpoint, scrape_duration, "chaos_tests": chaos_tests, "threshold": threshold, "heatmap_cpu_threshold": heatmap_cpu_threshold, - "heatmap_mem_threshold": heatmap_mem_threshold + "heatmap_mem_threshold": heatmap_mem_threshold, } return inputs @@ -128,34 +218,38 @@ def json_namespace(namespace, queries, analysis_data): "queries": queries, "profiling": 
analysis_data[0], "heatmap_analysis": analysis_data[1], - "recommendations": analysis_data[2] + "recommendations": analysis_data[2], } return data def main(): - parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool") + parser = argparse.ArgumentParser( + description="Krkn Chaos Recommender Command-Line tool" + ) args = parse_arguments(parser) if args.config_file is None and not args.options: - logging.error("You have to either specify a config file path or pass recommender options as command line arguments") + logging.error( + "You have to either specify a config file path or pass recommender options as command line arguments" + ) parser.print_help() sys.exit(1) if args.config_file is not None: ( - namespaces, - kubeconfig, - prometheus_endpoint, - auth_token, - scrape_duration, - chaos_tests, - log_level, - threshold, - heatmap_cpu_threshold, - heatmap_mem_threshold, - output_path - ) = read_configuration(args.config_file) + namespaces, + kubeconfig, + prometheus_endpoint, + auth_token, + scrape_duration, + chaos_tests, + log_level, + threshold, + heatmap_cpu_threshold, + heatmap_mem_threshold, + output_path, + ) = read_configuration(args.config_file) if args.options: namespaces = args.namespaces @@ -165,7 +259,12 @@ def main(): log_level = args.log_level prometheus_endpoint = args.prometheus_endpoint output_path = args.json_output_file - chaos_tests = {"MEM": args.MEM, "GENERIC": args.GENERIC, "CPU": args.CPU, "NETWORK": args.NETWORK} + chaos_tests = { + "MEM": args.MEM, + "GENERIC": args.GENERIC, + "CPU": args.CPU, + "NETWORK": args.NETWORK, + } threshold = args.threshold heatmap_mem_threshold = args.mem_threshold heatmap_cpu_threshold = args.cpu_threshold @@ -179,29 +278,46 @@ def main(): if output_path is not False: if output_path is None: output_path = "./recommender_output" - logging.info(f"Path for output file not specified. " - f"Using default folder {output_path}") + logging.info( + f"Path for output file not specified. " + f"Using default folder {output_path}" + ) if not os.path.exists(os.path.expanduser(output_path)): logging.error(f"Folder {output_path} for output not found.") sys.exit(1) logging.info("Loading inputs...") - inputs = json_inputs(namespaces, kubeconfig, prometheus_endpoint, - scrape_duration, chaos_tests, threshold, - heatmap_cpu_threshold, heatmap_mem_threshold) + inputs = json_inputs( + namespaces, + kubeconfig, + prometheus_endpoint, + scrape_duration, + chaos_tests, + threshold, + heatmap_cpu_threshold, + heatmap_mem_threshold, + ) namespaces_data = [] logging.info("Starting Analysis...") file_path, queries = prometheus.fetch_utilization_from_prometheus( - prometheus_endpoint, auth_token, namespaces, scrape_duration) + prometheus_endpoint, auth_token, namespaces, scrape_duration + ) - analysis_data = analysis(file_path, namespaces, chaos_tests, threshold, - heatmap_cpu_threshold, heatmap_mem_threshold) + analysis_data = analysis( + file_path, + namespaces, + chaos_tests, + threshold, + heatmap_cpu_threshold, + heatmap_mem_threshold, + ) for namespace in namespaces: - namespace_data = json_namespace(namespace, queries[namespace], - analysis_data[namespace]) + namespace_data = json_namespace( + namespace, queries[namespace], analysis_data[namespace] + ) namespaces_data.append(namespace_data) make_json_output(inputs, namespaces_data, output_path)
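
Note on the plugin dispatch introduced in the run_kraken.py hunks above: the per-type elif chain is gone; each scenario_type from config.yaml is resolved via ScenarioPluginFactory.create_plugin(scenario_type), and the returned object is invoked as run_scenarios(run_uuid, scenarios_list, config, telemetry_ocp), which is expected to return a (failed_post_scenarios, scenario_telemetries) tuple. The sketch below is illustrative only: the class name, loop body, and lack of an explicit base class are assumptions, and only the run_scenarios() signature and return shape are taken from the call site in this patch.

# Illustrative sketch, not part of the patch: only the run_scenarios() contract
# mirrors the run_kraken.py call site; everything else here is assumed.
import logging
import time


class ExampleDelayScenarioPlugin:
    """Toy plugin: 'runs' each scenario file by sleeping, then reports results."""

    def run_scenarios(self, run_uuid, scenarios_list, config, telemetry):
        # run_kraken.py unpacks the return value as
        # (failed_post_scenarios, scenario_telemetries) and extends
        # chaos_telemetry.scenarios with the second element.
        failed_post_scenarios = []
        scenario_telemetries = []  # a real plugin appends krkn_lib ScenarioTelemetry objects here
        for scenario_file in scenarios_list:
            logging.info("[%s] running example scenario %s", run_uuid, scenario_file)
            try:
                time.sleep(1)  # a real plugin injects chaos and runs post-scenario checks here
            except Exception as e:
                logging.error("example scenario %s failed: %s", scenario_file, e)
                failed_post_scenarios.append(scenario_file)
        return failed_post_scenarios, scenario_telemetries

How the factory discovers plugins and maps config.yaml types such as hog_scenarios or service_disruption_scenarios to classes (the mapping logged at startup by the new code) is internal to ScenarioPluginFactory and is not shown in this diff.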