From ab98e416a6dec249cd424947cfe42f8095f5208d Mon Sep 17 00:00:00 2001
From: Tullio Sebastiani
Date: Tue, 23 Apr 2024 10:49:01 +0200
Subject: [PATCH] Integration of the new pod recovery monitoring strategy
 implemented in krkn-lib (#609)

* pod monitoring integration in plugin scenario

Signed-off-by: Tullio Sebastiani

* pod monitoring integration in container scenario

Signed-off-by: Tullio Sebastiani

* removed wait-for-pods step from plugin scenario config files

Signed-off-by: Tullio Sebastiani

* introduced global pod recovery time

Signed-off-by: Tullio Sebastiani

nit

Signed-off-by: Tullio Sebastiani

* introduced krkn_pod_recovery_time in plugin scenario and removed all the
  references to wait-for-pods

Signed-off-by: Tullio Sebastiani

fix

Signed-off-by: Tullio Sebastiani

* functional test fix

Signed-off-by: Tullio Sebastiani

* main branch functional test fix

Signed-off-by: Tullio Sebastiani

* increased recovery times

Signed-off-by: Tullio Sebastiani

---------

Signed-off-by: Tullio Sebastiani
---
 CI/tests/common.sh                                |  8 ++-
 CI/tests/test_container.sh                        |  8 +--
 CI/tests/test_telemetry.sh                        |  8 +--
 docs/getting_started.md                           |  6 +-
 docs/pod_scenarios.md                             |  7 +--
 kraken/plugins/__init__.py                        | 59 ++++++++++++++++++-
 kraken/pod_scenarios/setup.py                     | 34 +++++++----
 requirements.txt                                  |  2 +-
 run_kraken.py                                     |  3 +-
 scenarios/kind/scheduler.yml                      |  6 +-
 scenarios/kube/pod.yml                            |  1 +
 scenarios/kube/scheduler.yml                      |  6 +-
 scenarios/openshift/container_etcd.yml            |  2 +-
 scenarios/openshift/customapp_pod.yaml            |  6 +-
 scenarios/openshift/etcd.yml                      |  6 +-
 scenarios/openshift/openshift-apiserver.yml       |  7 +--
 scenarios/openshift/openshift-kube-apiserver.yml  |  7 +--
 scenarios/openshift/post_action_prometheus.yml    |  6 +-
 scenarios/openshift/prom_kill.yml                 |  6 +-
 scenarios/openshift/prometheus.yml                |  7 +--
 scenarios/openshift/regex_openshift_pod_kill.yml  |  1 +
 scenarios/plugin.schema.json                      | 13 ++++
 22 files changed, 124 insertions(+), 85 deletions(-)

diff --git a/CI/tests/common.sh b/CI/tests/common.sh
index cc78bb44..e93f390c 100644
--- a/CI/tests/common.sh
+++ b/CI/tests/common.sh
@@ -1,7 +1,7 @@
 ERRORED=false
 
 function finish {
-  if [ $? -eq 1 ] && [ $ERRORED != "true" ]
+  if [ $? != 0 ] && [ $ERRORED != "true" ]
   then
     error
   fi
@@ -13,8 +13,10 @@ function error {
   then
    echo "Error caught."
    ERRORED=true
-  else
-    echo "Exit code greater than zero detected: $exit_code"
+  elif [ $exit_code == 2 ]
+  then
+    echo "Exit code 2 detected: this is expected, wrapping the exit code to 0 to avoid pipeline failure"
+    exit 0
  fi
 }
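The CI helper above now treats exit code 2 from a krkn run as an expected, non-fatal outcome. For readers wiring krkn into their own pipelines, here is a minimal Python sketch of the same convention; the config path is a placeholder, and the snippet assumes run_kraken.py signals expected scenario failures with exit code 2, as the handler above implies.

```python
# Minimal sketch of the CI exit-code convention above: treat exit code 2
# from run_kraken.py as expected and wrap it to 0; fail on anything else.
import subprocess
import sys

proc = subprocess.run(
    [sys.executable, "run_kraken.py", "-c", "CI/config/common_test_config.yaml"]
)
if proc.returncode == 2:
    # expected failure (e.g. pods did not recover in time): do not fail the pipeline
    print("exit code 2 detected, expected failure, wrapping to 0")
    sys.exit(0)
sys.exit(proc.returncode)
```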
diff --git a/CI/tests/test_container.sh b/CI/tests/test_container.sh
index 725233bf..93e3676c 100755
--- a/CI/tests/test_container.sh
+++ b/CI/tests/test_container.sh
@@ -8,11 +8,11 @@ trap finish EXIT
 pod_file="CI/scenarios/hello_pod.yaml"
 
 function functional_test_container_crash {
-  yq -i '.scenarios[0].namespace="default"' scenarios/openshift/app_outage.yaml
-  yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/app_outage.yaml
-  yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/app_outage.yaml
+  yq -i '.scenarios[0].namespace="default"' scenarios/openshift/container_etcd.yml
+  yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/container_etcd.yml
+  yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/container_etcd.yml
   export scenario_type="container_scenarios"
-  export scenario_file="- scenarios/openshift/app_outage.yaml"
+  export scenario_file="- scenarios/openshift/container_etcd.yml"
   export post_config=""
   envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml
diff --git a/CI/tests/test_telemetry.sh b/CI/tests/test_telemetry.sh
index 46b0301c..15c47220 100644
--- a/CI/tests/test_telemetry.sh
+++ b/CI/tests/test_telemetry.sh
@@ -22,14 +22,14 @@ function functional_test_telemetry {
   export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
   export post_config=""
   envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
-  python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
+  retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
   RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
   $AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
   echo "checking if telemetry files are uploaded on s3"
   cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
-  cat s3_remote_files | grep critical-alerts-00.json || ( echo "FAILED: critical-alerts-00.json not uploaded" && exit 1 )
-  cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
-  cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
+  cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded" && exit 1 )
+  cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
+  cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
   echo "all files uploaded!"
   echo "Telemetry Collection: Success"
 }
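The telemetry test now expects critical-alerts-00.log (rather than .json) among the uploaded artifacts. The same verification the script performs with the AWS CLI can be sketched with boto3; the bucket name and run folder below are placeholders standing in for $AWS_BUCKET and $RUN_FOLDER.

```python
# Sketch of the S3 verification done above with `aws s3 ls`, using boto3.
import boto3

expected = ["events-00.json", "critical-alerts-00.log",
            "prometheus-00.tar", "telemetry.json"]
s3 = boto3.client("s3")
response = s3.list_objects_v2(Bucket="my-telemetry-bucket", Prefix="my-run-folder/")
# keep only the file names of the objects found under the run folder
uploaded = {obj["Key"].rsplit("/", 1)[-1] for obj in response.get("Contents", [])}
missing = [name for name in expected if name not in uploaded]
if missing:
    raise SystemExit(f"FAILED: {missing} not uploaded")
print("all files uploaded!")
```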
echo "Telemetry Collection: Success" } diff --git a/docs/getting_started.md b/docs/getting_started.md index b0aaca04..5dc5dfd0 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -14,11 +14,7 @@ For example, for adding a pod level scenario for a new application, refer to the namespace_pattern: ^$ label_selector: kill: -- id: wait-for-pods - config: - namespace_pattern: ^$ - label_selector: - count: + krkn_pod_recovery_time: ``` #### Node Scenario Yaml Template diff --git a/docs/pod_scenarios.md b/docs/pod_scenarios.md index ddf6e6d1..0eced3a6 100644 --- a/docs/pod_scenarios.md +++ b/docs/pod_scenarios.md @@ -17,11 +17,8 @@ You can then create the scenario file with the following contents: config: namespace_pattern: ^kube-system$ label_selector: k8s-app=kube-scheduler -- id: wait-for-pods - config: - namespace_pattern: ^kube-system$ - label_selector: k8s-app=kube-scheduler - count: 3 + krkn_pod_recovery_time: 120 + ``` Please adjust the schema reference to point to the [schema file](../scenarios/plugin.schema.json). This file will give you code completion and documentation for the available options in your IDE. diff --git a/kraken/plugins/__init__.py b/kraken/plugins/__init__.py index 40cdf4ad..8718180e 100644 --- a/kraken/plugins/__init__.py +++ b/kraken/plugins/__init__.py @@ -2,11 +2,14 @@ import dataclasses import json import logging from os.path import abspath -from typing import List, Dict +from typing import List, Dict, Any import time from arcaflow_plugin_sdk import schema, serialization, jsonschema from arcaflow_plugin_kill_pod import kill_pods, wait_for_pods +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool + import kraken.plugins.node_scenarios.vmware_plugin as vmware_plugin import kraken.plugins.node_scenarios.ibmcloud_plugin as ibmcloud_plugin from kraken.plugins.run_python_plugin import run_python_file @@ -47,11 +50,14 @@ class Plugins: ) self.steps_by_id[step.schema.id] = step + def unserialize_scenario(self, file: str) -> Any: + return serialization.load_from_file(abspath(file)) + def run(self, file: str, kubeconfig_path: str, kraken_config: str): """ Run executes a series of steps """ - data = serialization.load_from_file(abspath(file)) + data = self.unserialize_scenario(abspath(file)) if not isinstance(data, list): raise Exception( "Invalid scenario configuration file: {} expected list, found {}".format(file, type(data).__name__) @@ -241,7 +247,15 @@ PLUGINS = Plugins( ) -def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetryKubernetes) -> (List[str], list[ScenarioTelemetry]): +def run(scenarios: List[str], + kubeconfig_path: str, + kraken_config: str, + failed_post_scenarios: List[str], + wait_duration: int, + telemetry: KrknTelemetryKubernetes, + kubecli: KrknKubernetes + ) -> (List[str], list[ScenarioTelemetry]): + scenario_telemetries: list[ScenarioTelemetry] = [] for scenario in scenarios: scenario_telemetry = ScenarioTelemetry() @@ -249,10 +263,21 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p scenario_telemetry.startTimeStamp = time.time() telemetry.set_parameters_base64(scenario_telemetry, scenario) logging.info('scenario ' + str(scenario)) + pool = PodsMonitorPool(kubecli) + kill_scenarios = [kill_scenario for kill_scenario in PLUGINS.unserialize_scenario(scenario) if kill_scenario["id"] == "kill-pods"] + try: + start_monitoring(pool, kill_scenarios) 
diff --git a/kraken/pod_scenarios/setup.py b/kraken/pod_scenarios/setup.py
index 80fc00b9..8280ca18 100644
--- a/kraken/pod_scenarios/setup.py
+++ b/kraken/pod_scenarios/setup.py
@@ -1,9 +1,13 @@
 import logging
 import time
+from typing import Any
+
 import yaml
 import sys
 import random
 import arcaflow_plugin_kill_pod
+from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
+
 import kraken.cerberus.setup as cerberus
 import kraken.post_actions.actions as post_actions
 from krkn_lib.k8s import KrknKubernetes
@@ -79,6 +83,7 @@ def container_run(kubeconfig_path,
     failed_scenarios = []
     scenario_telemetries: list[ScenarioTelemetry] = []
 
+    pool = PodsMonitorPool(kubecli)
     for container_scenario_config in scenarios_list:
         scenario_telemetry = ScenarioTelemetry()
@@ -91,23 +96,17 @@ def container_run(kubeconfig_path,
             pre_action_output = ""
             with open(container_scenario_config[0], "r") as f:
                 cont_scenario_config = yaml.full_load(f)
+                start_monitoring(kill_scenarios=cont_scenario_config["scenarios"], pool=pool)
                 for cont_scenario in cont_scenario_config["scenarios"]:
                     # capture start time
                     start_time = int(time.time())
                     try:
                         killed_containers = container_killing_in_pod(cont_scenario, kubecli)
-                        if len(container_scenario_config) > 1:
-                            failed_post_scenarios = post_actions.check_recovery(
-                                kubeconfig_path,
-                                container_scenario_config,
-                                failed_post_scenarios,
-                                pre_action_output
-                            )
-                        else:
-                            failed_post_scenarios = check_failed_containers(
-                                killed_containers, cont_scenario.get("retry_wait", 120), kubecli
-                            )
-
+                        logging.info(f"killed containers: {str(killed_containers)}")
+                        result = pool.join()
+                        if result.error:
+                            raise Exception(f"pods failed to recover: {result.error}")
+                        scenario_telemetry.affected_pods = result
                         logging.info("Waiting for the specified duration: %s" % (wait_duration))
                         time.sleep(wait_duration)
@@ -117,6 +116,7 @@ def container_run(kubeconfig_path,
                 # publish cerberus status
                 cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
         except (RuntimeError, Exception):
+            pool.cancel()
             failed_scenarios.append(container_scenario_config[0])
             log_exception(container_scenario_config[0])
             scenario_telemetry.exitStatus = 1
@@ -129,6 +129,16 @@ def container_run(kubeconfig_path,
 
     return failed_scenarios, scenario_telemetries
 
+
+def start_monitoring(kill_scenarios: list[Any], pool: PodsMonitorPool):
+    for kill_scenario in kill_scenarios:
+        namespace_pattern = f"^{kill_scenario['namespace']}$"
+        label_selector = kill_scenario["label_selector"]
+        recovery_time = kill_scenario["expected_recovery_time"]
+        pool.select_and_monitor_by_namespace_pattern_and_label(
+            namespace_pattern=namespace_pattern,
+            label_selector=label_selector,
+            max_timeout=recovery_time)
 
 def container_killing_in_pod(cont_scenario, kubecli: KrknKubernetes):
     scenario_name = get_yaml_item_value(cont_scenario, "name", "")
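Container scenarios keep their flat structure (namespace, label_selector, and expected_recovery_time at the top level of each scenario entry), so this start_monitoring() anchors the plain namespace into an exact-match regex before handing it to the pool. A small illustration follows; the values are illustrative, with expected_recovery_time mirroring container_etcd.yml above.

```python
# How a container scenario entry maps onto a pool monitor, following
# start_monitoring() above. Values are illustrative placeholders.
kill_scenario = {
    "namespace": "openshift-etcd",
    "label_selector": "k8s-app=etcd",
    "expected_recovery_time": 120,
}

# the plain namespace is anchored into an exact-match pattern
namespace_pattern = f"^{kill_scenario['namespace']}$"
print(namespace_pattern)                        # ^openshift-etcd$
print(kill_scenario["expected_recovery_time"])  # passed to the pool as max_timeout
```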
diff --git a/requirements.txt b/requirements.txt
index 00d911c7..772bda34 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ google-api-python-client==2.116.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.3
-krkn-lib==2.1.1
+krkn-lib==2.1.2
 lxml==5.1.0
 kubernetes==26.1.0
 oauth2client==4.1.3
diff --git a/run_kraken.py b/run_kraken.py
index 96fb3bd1..c9eb58f8 100644
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -264,7 +264,8 @@ def main(cfg):
                 kraken_config,
                 failed_post_scenarios,
                 wait_duration,
-                telemetry_k8s
+                telemetry_k8s,
+                kubecli
             )
             chaos_telemetry.scenarios.extend(scenario_telemetries)
             # krkn_lib
diff --git a/scenarios/kind/scheduler.yml b/scenarios/kind/scheduler.yml
index 3296b3dc..25d81bbf 100755
--- a/scenarios/kind/scheduler.yml
+++ b/scenarios/kind/scheduler.yml
@@ -3,8 +3,4 @@
   config:
     namespace_pattern: ^kube-system$
     label_selector: component=kube-scheduler
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^kube-system$
-    label_selector: component=kube-scheduler
-    count: 3
+    krkn_pod_recovery_time: 120
diff --git a/scenarios/kube/pod.yml b/scenarios/kube/pod.yml
index b9f30a0e..29d931be 100644
--- a/scenarios/kube/pod.yml
+++ b/scenarios/kube/pod.yml
@@ -4,3 +4,4 @@
     name_pattern: ^nginx-.*$
     namespace_pattern: ^default$
     kill: 1
+    krkn_pod_recovery_time: 120
diff --git a/scenarios/kube/scheduler.yml b/scenarios/kube/scheduler.yml
index e2168dcb..74ea7602 100755
--- a/scenarios/kube/scheduler.yml
+++ b/scenarios/kube/scheduler.yml
@@ -3,8 +3,4 @@
   config:
     namespace_pattern: ^kube-system$
     label_selector: k8s-app=kube-scheduler
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^kube-system$
-    label_selector: k8s-app=kube-scheduler
-    count: 3
+    krkn_pod_recovery_time: 120
diff --git a/scenarios/openshift/container_etcd.yml b/scenarios/openshift/container_etcd.yml
index be7e46c6..4f4d5f4f 100755
--- a/scenarios/openshift/container_etcd.yml
+++ b/scenarios/openshift/container_etcd.yml
@@ -5,4 +5,4 @@ scenarios:
     container_name: "etcd"
     action: 1
     count: 1
-    expected_recovery_time: 60
+    expected_recovery_time: 120
diff --git a/scenarios/openshift/customapp_pod.yaml b/scenarios/openshift/customapp_pod.yaml
index cd836521..b060119a 100644
--- a/scenarios/openshift/customapp_pod.yaml
+++ b/scenarios/openshift/customapp_pod.yaml
@@ -3,8 +3,4 @@
   config:
     namespace_pattern: ^acme-air$
     name_pattern: .*
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^acme-air$
-    name_pattern: .*
-    count: 8
\ No newline at end of file
+    krkn_pod_recovery_time: 120
\ No newline at end of file
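With the wait-for-pods step gone, the recovery window now lives directly in the kill-pods config as krkn_pod_recovery_time (the schema default is 30 seconds; these scenario files pin it to 120). A short sketch of reading it back from one of the scenario files above:

```python
# Reading krkn_pod_recovery_time from a plugin scenario file, falling back
# to the plugin.schema.json default of 30 seconds when it is omitted.
import yaml

with open("scenarios/kube/pod.yml") as f:
    steps = yaml.safe_load(f)

for step in steps:
    if step["id"] == "kill-pods":
        recovery_time = step["config"].get("krkn_pod_recovery_time", 30)
        print(f"monitoring pod recovery for up to {recovery_time}s")
```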
diff --git a/scenarios/openshift/etcd.yml b/scenarios/openshift/etcd.yml
index 650bfb5d..bb91b399 100755
--- a/scenarios/openshift/etcd.yml
+++ b/scenarios/openshift/etcd.yml
@@ -3,8 +3,4 @@
   config:
     namespace_pattern: ^openshift-etcd$
     label_selector: k8s-app=etcd
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-etcd$
-    label_selector: k8s-app=etcd
-    count: 3
+    krkn_pod_recovery_time: 120
diff --git a/scenarios/openshift/openshift-apiserver.yml b/scenarios/openshift/openshift-apiserver.yml
index eedfeca0..bd8458ad 100755
--- a/scenarios/openshift/openshift-apiserver.yml
+++ b/scenarios/openshift/openshift-apiserver.yml
@@ -3,8 +3,5 @@
   config:
     namespace_pattern: ^openshift-apiserver$
     label_selector: app=openshift-apiserver-a
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-apiserver$
-    label_selector: app=openshift-apiserver-a
-    count: 3
+    krkn_pod_recovery_time: 120
+
diff --git a/scenarios/openshift/openshift-kube-apiserver.yml b/scenarios/openshift/openshift-kube-apiserver.yml
index 6043eb5c..324653fa 100755
--- a/scenarios/openshift/openshift-kube-apiserver.yml
+++ b/scenarios/openshift/openshift-kube-apiserver.yml
@@ -3,8 +3,5 @@
   config:
     namespace_pattern: ^openshift-kube-apiserver$
     label_selector: app=openshift-kube-apiserver
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-kube-apiserver$
-    label_selector: app=openshift-kube-apiserver
-    count: 3
+    krkn_pod_recovery_time: 120
+
diff --git a/scenarios/openshift/post_action_prometheus.yml b/scenarios/openshift/post_action_prometheus.yml
index 33bd4545..eed2687c 100644
--- a/scenarios/openshift/post_action_prometheus.yml
+++ b/scenarios/openshift/post_action_prometheus.yml
@@ -3,8 +3,4 @@
   config:
     namespace_pattern: ^openshift-monitoring$
     label_selector: app=prometheus
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: app=prometheus
-    count: 2
\ No newline at end of file
+    krkn_pod_recovery_time: 120
\ No newline at end of file
diff --git a/scenarios/openshift/prom_kill.yml b/scenarios/openshift/prom_kill.yml
index 8186440d..d1b89570 100644
--- a/scenarios/openshift/prom_kill.yml
+++ b/scenarios/openshift/prom_kill.yml
@@ -2,8 +2,4 @@
   config:
     namespace_pattern: ^openshift-monitoring$
     label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
-    count: 1
\ No newline at end of file
+    krkn_pod_recovery_time: 120
\ No newline at end of file
diff --git a/scenarios/openshift/prometheus.yml b/scenarios/openshift/prometheus.yml
index 0dfbddc7..2b9eeb02 100644
--- a/scenarios/openshift/prometheus.yml
+++ b/scenarios/openshift/prometheus.yml
@@ -3,9 +3,4 @@
   config:
     namespace_pattern: ^openshift-monitoring$
     label_selector: app=prometheus
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: app=prometheus
-    count: 2
-    timeout: 180
\ No newline at end of file
+    krkn_pod_recovery_time: 120
diff --git a/scenarios/openshift/regex_openshift_pod_kill.yml b/scenarios/openshift/regex_openshift_pod_kill.yml
index c0f676ca..377cd829 100755
--- a/scenarios/openshift/regex_openshift_pod_kill.yml
+++ b/scenarios/openshift/regex_openshift_pod_kill.yml
@@ -4,3 +4,4 @@
     namespace_pattern: ^openshift-.*$
     name_pattern: .*
     kill: 3
+    krkn_pod_recovery_time: 120
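The scenario files above select pods either by label or by name pattern, and start_monitoring() in kraken/plugins/__init__.py picks the matching monitor the same way: the namespace_pattern/label_selector pair takes precedence, then namespace_pattern/name_pattern, and anything else is rejected. A reduced sketch of that decision:

```python
# Which monitor a kill-pods config selects, reduced from start_monitoring()
# in kraken/plugins/__init__.py above.
def monitor_mode(config: dict) -> str:
    if "namespace_pattern" in config and "label_selector" in config:
        return "by namespace pattern and label selector"
    if "namespace_pattern" in config and "name_pattern" in config:
        return "by name pattern and namespace pattern"
    raise Exception(f"impossible to determine monitor parameters, check {config}")

# regex_openshift_pod_kill.yml selects pods by name pattern:
print(monitor_mode({"namespace_pattern": "^openshift-.*$", "name_pattern": ".*"}))
```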
diff --git a/scenarios/plugin.schema.json b/scenarios/plugin.schema.json
index 2f3dc6c3..ec647888 100644
--- a/scenarios/plugin.schema.json
+++ b/scenarios/plugin.schema.json
@@ -60,7 +60,14 @@
           "default": 1,
           "title": "Backoff",
           "description": "How many seconds to wait between checks for the target pod status."
+        },
+        "krkn_pod_recovery_time": {
+          "type": "integer",
+          "default": 30,
+          "title": "Recovery Time",
+          "description": "The expected recovery time of the pod (used by Krkn to monitor the pod lifecycle)."
         }
+
       },
       "required": [
         "namespace_pattern"
@@ -112,6 +119,12 @@
           "default": 1,
           "title": "Backoff",
           "description": "How many seconds to wait between checks for the target pod status."
+        },
+        "krkn_pod_recovery_time": {
+          "type": "integer",
+          "default": 30,
+          "title": "Recovery Time",
+          "description": "The expected recovery time of the pod (used by Krkn to monitor the pod lifecycle)."
         }
       },
       "required": [
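Since the schema now documents krkn_pod_recovery_time (integer, default 30), scenario files can be validated ahead of a run. A minimal sketch using the jsonschema package; the paths assume the repository layout shown in this patch.

```python
# Validate a scenario file against the updated plugin.schema.json.
import json

import yaml
from jsonschema import validate

with open("scenarios/plugin.schema.json") as f:
    schema = json.load(f)
with open("scenarios/kube/pod.yml") as f:
    scenario = yaml.safe_load(f)

validate(instance=scenario, schema=schema)  # raises ValidationError on mismatch
print("scenario file is valid")
```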