From dd4d0d038937f0ecf95598b671a91c457bc79749 Mon Sep 17 00:00:00 2001 From: kattameghana <83449251+kattameghana@users.noreply.github.com> Date: Tue, 18 Mar 2025 17:38:30 +0530 Subject: [PATCH] Health checks implementation for application endpoints (#761) * Hog scenario porting from arcaflow to native (#748) * added new native hog scenario * removed arcaflow dependency + legacy hog scenarios * config update * changed hog configuration structure + added average samples * fix on cpu count * removes tripledes warning * changed selector format * changed selector syntax * number of nodes option * documentation * functional tests * exception handling on hog deployment thread Signed-off-by: kattameghana * Hog scenario porting from arcaflow to native (#748) * added new native hog scenario * removed arcaflow dependency + legacy hog scenarios * config update * changed hog configuration structure + added average samples * fix on cpu count * removes tripledes warning * changed selector format * changed selector syntax * number of nodes option * documentation * functional tests * exception handling on hog deployment thread Signed-off-by: Paige Patton Signed-off-by: kattameghana * adding vsphere updates to non native Signed-off-by: Paige Patton Signed-off-by: kattameghana * adding node id to affected node Signed-off-by: kattameghana * Fixed the spelling mistake Signed-off-by: Meghana Katta Signed-off-by: kattameghana * adding v4.0.8 version (#756) Signed-off-by: Paige Patton Signed-off-by: kattameghana * Add autodetecting distribution (#753) Used is_openshift function from krkn lib Remove distribution from config Remove distribution from documentation Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com> Signed-off-by: kattameghana * initial version of health checks Signed-off-by: kattameghana * Changes for appending success response and health check config format Signed-off-by: kattameghana * Changes include health check doc and exit_on_failure config 
Signed-off-by: kattameghana * Update config.yaml Signed-off-by: kattameghana * initial version of health checks Signed-off-by: kattameghana * Changes for appending success response and health check config format Signed-off-by: kattameghana * Update config.yaml Signed-off-by: kattameghana * initial version of health checks Signed-off-by: kattameghana * Changes for appending success response and health check config format Signed-off-by: kattameghana * Changes include health check doc and exit_on_failure config Signed-off-by: kattameghana * Update config.yaml Signed-off-by: kattameghana * initial version of health checks Signed-off-by: kattameghana * Changes for appending success response and health check config format Signed-off-by: kattameghana * Update config.yaml Signed-off-by: kattameghana * Added the health check config in functional test config Signed-off-by: kattameghana * Modified the health checks documentation Signed-off-by: kattameghana * Changes for debugging the functional test failing Signed-off-by: kattameghana * changed the code for debugging in run_test.sh Signed-off-by: kattameghana * Debugging Signed-off-by: kattameghana * Removed the functional test running line Signed-off-by: kattameghana * Removing the health check config in common_test_config for debugging Signed-off-by: kattameghana * Fixing functional test fialure Signed-off-by: kattameghana * Removing the changes that are added for debugging Signed-off-by: kattameghana * few modifications Signed-off-by: kattameghana * Renamed timestamp Signed-off-by: kattameghana * Changed the start timestamp and end timestamp data type to the datetime Signed-off-by: kattameghana * initial version of health checks Signed-off-by: kattameghana * Changes for appending success response and health check config format Signed-off-by: kattameghana * Changes include health check doc and exit_on_failure config Signed-off-by: kattameghana * Update config.yaml Signed-off-by: kattameghana * initial version of health 
checks Signed-off-by: kattameghana * Changes for appending success response and health check config format Signed-off-by: kattameghana * Update config.yaml Signed-off-by: kattameghana * Hog scenario porting from arcaflow to native (#748) * added new native hog scenario * removed arcaflow dependency + legacy hog scenarios * config update * changed hog configuration structure + added average samples * fix on cpu count * removes tripledes warning * changed selector format * changed selector syntax * number of nodes option * documentation * functional tests * exception handling on hog deployment thread Signed-off-by: Paige Patton Signed-off-by: kattameghana * adding node id to affected node Signed-off-by: kattameghana * initial version of health checks Signed-off-by: kattameghana * Changes for appending success response and health check config format Signed-off-by: kattameghana * Changes include health check doc and exit_on_failure config Signed-off-by: kattameghana * Update config.yaml Signed-off-by: kattameghana * initial version of health checks Signed-off-by: kattameghana * Changes for appending success response and health check config format Signed-off-by: kattameghana * Update config.yaml Signed-off-by: kattameghana * Added the health check config in functional test config Signed-off-by: kattameghana * Modified the health checks documentation Signed-off-by: kattameghana * Changes for debugging the functional test failing Signed-off-by: kattameghana * changed the code for debugging in run_test.sh Signed-off-by: kattameghana * Debugging Signed-off-by: kattameghana * Removed the functional test running line Signed-off-by: kattameghana * Removing the health check config in common_test_config for debugging Signed-off-by: kattameghana * Fixing functional test fialure Signed-off-by: kattameghana * Removing the changes that are added for debugging Signed-off-by: kattameghana * few modifications Signed-off-by: kattameghana * Renamed timestamp Signed-off-by: kattameghana * 
initial version of health checks Signed-off-by: kattameghana * Changes for appending success response and health check config format Signed-off-by: kattameghana * initial version of health checks Signed-off-by: kattameghana * Hog scenario porting from arcaflow to native (#748) * added new native hog scenario * removed arcaflow dependency + legacy hog scenarios * config update * changed hog configuration structure + added average samples * fix on cpu count * removes tripledes warning * changed selector format * changed selector syntax * number of nodes option * documentation * functional tests * exception handling on hog deployment thread Signed-off-by: kattameghana * Hog scenario porting from arcaflow to native (#748) * added new native hog scenario * removed arcaflow dependency + legacy hog scenarios * config update * changed hog configuration structure + added average samples * fix on cpu count * removes tripledes warning * changed selector format * changed selector syntax * number of nodes option * documentation * functional tests * exception handling on hog deployment thread Signed-off-by: Paige Patton Signed-off-by: kattameghana * adding node id to affected node Signed-off-by: kattameghana * initial version of health checks Signed-off-by: kattameghana * Changes include health check doc and exit_on_failure config Signed-off-by: kattameghana * Update config.yaml Signed-off-by: kattameghana * initial version of health checks Signed-off-by: kattameghana * Changes for appending success response and health check config format Signed-off-by: kattameghana * Update config.yaml Signed-off-by: kattameghana * Added the health check config in functional test config Signed-off-by: kattameghana * Changes for debugging the functional test failing Signed-off-by: kattameghana * changed the code for debugging in run_test.sh Signed-off-by: kattameghana * Debugging Signed-off-by: kattameghana * Removed the functional test running line Signed-off-by: kattameghana * Removing the 
health check config in common_test_config for debugging Signed-off-by: kattameghana * Fixing functional test fialure Signed-off-by: kattameghana * Removing the changes that are added for debugging Signed-off-by: kattameghana * few modifications Signed-off-by: kattameghana * Renamed timestamp Signed-off-by: kattameghana * passing the health check response as HealthCheck object Signed-off-by: kattameghana * Updated the krkn-lib version in requirements.txt Signed-off-by: kattameghana * Changed the coverage Signed-off-by: kattameghana --------- Signed-off-by: kattameghana Signed-off-by: Paige Patton Signed-off-by: Meghana Katta Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com> Co-authored-by: Tullio Sebastiani Co-authored-by: Paige Patton Co-authored-by: Meghana Katta Co-authored-by: Paige Patton <64206430+paigerube14@users.noreply.github.com> Co-authored-by: jtydlack <139967002+jtydlack@users.noreply.github.com> --- CI/config/common_test_config.yaml | 8 ++ README.md | 1 + config/config.yaml | 11 ++- docs/health_checks.md | 59 +++++++++++++ .../shut_down/shut_down_scenario_plugin.py | 4 +- krkn/utils/HealthChecker.py | 83 +++++++++++++++++++ requirements.txt | 4 +- run_kraken.py | 33 ++++++-- 8 files changed, 188 insertions(+), 15 deletions(-) create mode 100644 docs/health_checks.md create mode 100644 krkn/utils/HealthChecker.py diff --git a/CI/config/common_test_config.yaml b/CI/config/common_test_config.yaml index edf1ae86..a0db6d84 100644 --- a/CI/config/common_test_config.yaml +++ b/CI/config/common_test_config.yaml @@ -62,3 +62,11 @@ elastic: metrics_index: "krkn-metrics" alerts_index: "krkn-alerts" telemetry_index: "krkn-telemetry" + +health_checks: # Utilizing health check endpoints to observe application behavior during chaos injection. 
+ interval: # Interval in seconds to perform health checks, default value is 2 seconds + config: # Provide list of health check configurations for applications + - url: # Provide application endpoint + bearer_token: # Bearer token for authentication if any + auth: # Provide authentication credentials (username , password) in tuple format if any, ex:("admin","secretpassword") + exit_on_failure: # If value is True exits when health check failed for application, values can be True/False diff --git a/README.md b/README.md index 184c4897..8d58a9cf 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,7 @@ It is important to make sure to check if the targeted component recovered from t - Having built in checks for pod and node based scenarios to ensure the expected number of replicas and nodes are up. It also supports running custom scripts with the checks. - Leveraging [Cerberus](https://github.com/krkn-chaos/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail post chaos. It is highly recommended to turn on the Cerberus health check feature available in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation) or can be installed from Kraken using the [instructions](https://github.com/krkn-chaos/krkn#setting-up-infrastructure-dependencies). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes go/no-go signal in the Kraken config file. Cerberus can monitor [application routes](https://github.com/redhat-chaos/cerberus/blob/main/docs/config.md#watch-routes) during the chaos and fails the run if it encounters downtime as it is a potential downtime in a customers, or users environment as well. It is especially important during the control plane chaos scenarios including the API server, Etcd, Ingress etc. 
It can be enabled by setting `check_applicaton_routes: True` in the [Kraken config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) provided application routes are being monitored in the [cerberus config](https://github.com/redhat-chaos/krkn/blob/main/config/cerberus.yaml). - Leveraging built-in alert collection feature to fail the runs in case of critical alerts. +- Utilizing health check endpoints to observe application behavior during chaos injection [Health checks](https://github.com/krkn-chaos/krkn/docs/health_checks.md) ### Signaling In CI runs or any external job it is useful to stop Kraken once a certain test or state gets reached. We created a way to signal to kraken to pause the chaos or stop it completely using a signal posted to a port of your choice. diff --git a/config/config.yaml b/config/config.yaml index 09ec99a4..56556569 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -111,7 +111,10 @@ telemetry: oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH events_backup: True # enables/disables cluster events collection - - - - +health_checks: # Utilizing health check endpoints to observe application behavior during chaos injection. 
+ interval: # Interval in seconds to perform health checks, default value is 2 seconds + config: # Provide list of health check configurations for applications + - url: # Provide application endpoint + bearer_token: # Bearer token for authentication if any + auth: # Provide authentication credentials (username , password) in tuple format if any, ex:("admin","secretpassword") + exit_on_failure: # If value is True exits when health check failed for application, values can be True/False diff --git a/docs/health_checks.md b/docs/health_checks.md new file mode 100644 index 00000000..43897516 --- /dev/null +++ b/docs/health_checks.md @@ -0,0 +1,59 @@ +### Health Checks + +Health checks provide real-time visibility into the impact of chaos scenarios on application availability and performance. Health check configuration supports application endpoints accessible via http / https along with authentication mechanism such as bearer token and authentication credentials. +Health checks are configured in the ```config.yaml``` + +The system periodically checks the provided URLs based on the defined interval and records the results in Telemetry. The telemetry data includes: + +- Success response ```200``` when the application is running normally. +- Failure response other than 200 if the application experiences downtime or errors. + +This helps users quickly identify application health issues and take necessary actions. + +#### Sample health check config +``` +health_checks: + interval: # Defines the frequency of health checks, default value is 2 seconds + config: # List of application endpoints to check + - url: "https://example.com/health" + bearer_token: "hfjauljl..." 
# Bearer token for authentication if any + auth: + exit_on_failure: True # If value is True exits when health check failed for application, values can be True/False + - url: "https://another-service.com/status" + bearer_token: + auth: ("admin","secretpassword") # Provide authentication credentials (username , password) in tuple format if any, ex:("admin","secretpassword") + exit_on_failure: False + - url: http://general-service.com + bearer_token: + auth: + exit_on_failure: + ``` +#### Sample health check telemetry +``` +"health_checks": [ + { + "url": "https://example.com/health", + "status": False, + "status_code": "503", + "start_timestamp": "2025-02-25 11:51:33", + "end_timestamp": "2025-02-25 11:51:40", + "duration": "0:00:07" + }, + { + "url": "https://another-service.com/status", + "status": True, + "status_code": 200, + "start_timestamp": "2025-02-25 22:18:19", + "end_timestamp": "2025-02-25 22:22:46", + "duration": "0:04:27" + }, + { + "url": "http://general-service.com", + "status": True, + "status_code": 200, + "start_timestamp": "2025-02-25 22:18:19", + "end_timestamp": "2025-02-25 22:22:46", + "duration": "0:04:27" + } + ], +``` \ No newline at end of file diff --git a/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py b/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py index 1ddd4242..89d40d0e 100644 --- a/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py +++ b/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py @@ -117,7 +117,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin): while len(stopping_nodes) > 0: for node in stopping_nodes: affected_node = affected_nodes_status.get_affected_node_index(node) - + if type(node) is tuple: node_status = cloud_object.wait_until_stopped( node[1], node[0], timeout, affected_node ) @@ -149,7 +149,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin): for node in not_running_nodes: affected_node = affected_nodes_status.get_affected_node_index(node) # need to add in
import time
import logging
import queue
from datetime import datetime


class HealthChecker:
    """Polls configured application endpoints while a chaos run is in progress.

    The main loop (run_kraken) advances ``current_iterations`` after each chaos
    iteration; polling stops once it reaches the iteration count given to the
    constructor.  Results are handed back through a ``queue.Queue`` as a list of
    ``HealthCheck`` telemetry objects: one record per completed downtime window,
    one for any downtime still open when polling stops, and a single success
    record per URL that never failed.
    """

    def __init__(self, iterations: int):
        # Total number of chaos iterations the main loop will run.
        self.iterations = iterations
        # Incremented externally (by run_kraken) after every chaos iteration.
        self.current_iterations = 0
        # Exit code for the run: 0 = healthy, 2 = an endpoint with
        # exit_on_failure enabled went down at least once.
        self.ret_value = 0

    @staticmethod
    def _auth_headers(config: dict) -> tuple:
        """Build the (auth, headers) pair for one health-check config entry.

        Uses .get() throughout so partially-filled YAML entries (missing
        bearer_token/auth keys) do not raise KeyError as before.
        """
        headers = None
        bearer_token = config.get("bearer_token")
        if bearer_token:
            headers = {"Authorization": "Bearer " + bearer_token}
        # requests accepts a (user, password) tuple; normalize missing/empty
        # values to None so requests skips authentication entirely.
        auth = config.get("auth") or None
        return auth, headers

    def make_request(self, url, auth=None, headers=None):
        """GET *url* and return {"url", "status", "status_code"}.

        Network-level errors (DNS failure, refused connection, timeout) are
        reported as an unhealthy response with ``status_code`` None instead of
        propagating and silently killing the worker thread, which previously
        lost all health-check telemetry for the run.
        """
        # Local import keeps the module importable when requests is absent.
        import requests

        response_data = {"url": url}
        try:
            response = requests.get(url, auth=auth, headers=headers)
            response_data["status"] = response.status_code == 200
            response_data["status_code"] = response.status_code
        except requests.RequestException as exc:
            logging.warning("health check request to %s failed: %s", url, exc)
            response_data["status"] = False
            response_data["status_code"] = None
        return response_data

    def run_health_check(self, health_check_config, health_check_telemetry_queue: queue.Queue):
        """Poll every configured URL on *interval* until the chaos run ends.

        Puts the accumulated list of HealthCheck records on
        *health_check_telemetry_queue*; does nothing (beyond an info log) when
        no endpoint is configured.
        """
        configs = (health_check_config or {}).get("config") or []
        if not any(entry.get("url") for entry in configs):
            logging.info("health checks config is not defined, skipping them")
            return

        # Imported lazily so the skip path above works without krkn_lib.
        from krkn_lib.models.telemetry.models import HealthCheck

        start_time_stamp = datetime.now()
        telemetry = []
        # url -> {"status_code", "start_timestamp"} for an outage in progress.
        open_outages = {}
        # Default polling interval is 2 seconds when not configured.
        interval = health_check_config.get("interval") or 2
        # URLs that never returned a failure during the whole run.
        never_failed = {entry["url"]: True for entry in configs if entry.get("url")}

        def record_outage(url, outage, end_timestamp):
            # One telemetry record per completed downtime window.
            duration = (end_timestamp - outage["start_timestamp"]).total_seconds()
            telemetry.append(HealthCheck({
                "url": url,
                "status": False,
                "status_code": str(outage["status_code"]),
                "start_timestamp": outage["start_timestamp"].isoformat(),
                "end_timestamp": end_timestamp.isoformat(),
                "duration": duration,
            }))

        while self.current_iterations < self.iterations:
            for entry in configs:
                url = entry.get("url")
                if not url:
                    # Entry without an endpoint: nothing to poll (previously
                    # this left `url` unbound and raised NameError).
                    continue
                auth, headers = self._auth_headers(entry)
                response = self.make_request(url, auth, headers)
                if response["status_code"] != 200:
                    if url not in open_outages:
                        open_outages[url] = {
                            "status_code": response["status_code"],
                            "start_timestamp": datetime.now(),
                        }
                    never_failed[url] = False
                    if entry.get("exit_on_failure") and self.ret_value == 0:
                        # Signal the main loop to exit with a failure code.
                        self.ret_value = 2
                elif url in open_outages:
                    # Endpoint recovered: close out the downtime window.
                    record_outage(url, open_outages.pop(url), datetime.now())
            time.sleep(interval)

        end_time_stamp = datetime.now()
        # Fix: outages still open when polling stops were previously dropped
        # from telemetry entirely.
        for url, outage in open_outages.items():
            record_outage(url, outage, end_time_stamp)
        for url, healthy in never_failed.items():
            if healthy:
                telemetry.append(HealthCheck({
                    "url": url,
                    "status": True,
                    "status_code": 200,
                    "start_timestamp": start_time_stamp.isoformat(),
                    "end_timestamp": end_time_stamp.isoformat(),
                    "duration": (end_time_stamp - start_time_stamp).total_seconds(),
                }))
        health_check_telemetry_queue.put(telemetry)
google-cloud-compute==1.22.0 ibm_cloud_sdk_core==3.18.0 ibm_vpc==0.20.0 jinja2==3.1.6 -krkn-lib==4.0.8 +krkn-lib==5.0.0 lxml==5.1.0 kubernetes==28.1.0 numpy==1.26.4 diff --git a/run_kraken.py b/run_kraken.py index 1125c349..dcd310bd 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -9,6 +9,8 @@ import optparse import pyfiglet import uuid import time +import queue +import threading from krkn_lib.elastic.krkn_elastic import KrknElastic from krkn_lib.models.elastic import ElasticChaosRunTelemetry @@ -26,6 +28,7 @@ from krkn_lib.utils import SafeLogger from krkn_lib.utils.functions import get_yaml_item_value, get_junit_test_case from krkn.utils import TeeLogHandler +from krkn.utils.HealthChecker import HealthChecker from krkn.scenario_plugins.scenario_plugin_factory import ( ScenarioPluginFactory, ScenarioPluginNotFound, @@ -125,10 +128,11 @@ def main(cfg) -> int: config["performance_monitoring"], "check_critical_alerts", False ) telemetry_api_url = config["telemetry"].get("api_url") + health_check_config = config["health_checks"] # Initialize clients if not os.path.isfile(kubeconfig_path) and not os.path.isfile( - "/var/run/secrets/kubernetes.io/serviceaccount/token" + "/var/run/secrets/kubernetes.io/serviceaccount/token" ): logging.error( "Cannot read the kubeconfig file at %s, please check" % kubeconfig_path @@ -274,8 +278,8 @@ def main(cfg) -> int: classes_and_types: dict[str, list[str]] = {} for loaded in scenario_plugin_factory.loaded_plugins.keys(): if ( - scenario_plugin_factory.loaded_plugins[loaded].__name__ - not in classes_and_types.keys() + scenario_plugin_factory.loaded_plugins[loaded].__name__ + not in classes_and_types.keys() ): classes_and_types[ scenario_plugin_factory.loaded_plugins[loaded].__name__ @@ -302,6 +306,12 @@ def main(cfg) -> int: module_name, class_name, error = failed logging.error(f"⛔ Class: {class_name} Module: {module_name}") logging.error(f"⚠️ {error}\n") + health_check_telemetry_queue = queue.Queue() + health_checker = 
HealthChecker(iterations) + health_check_worker = threading.Thread(target=health_checker.run_health_check, + args=(health_check_config, health_check_telemetry_queue)) + health_check_worker.start() + # Loop to run the chaos starts here while int(iteration) < iterations and run_signal != "STOP": # Inject chaos scenarios specified in the config @@ -362,12 +372,18 @@ def main(cfg) -> int: break iteration += 1 + health_checker.current_iterations += 1 # telemetry # in order to print decoded telemetry data even if telemetry collection # is disabled, it's necessary to serialize the ChaosRunTelemetry object # to json, and recreate a new object from it. end_time = int(time.time()) + health_check_worker.join() + try: + chaos_telemetry.health_checks = health_check_telemetry_queue.get_nowait() + except queue.Empty: + chaos_telemetry.health_checks = None # if platform is openshift will be collected # Cloud platform and network plugins metadata @@ -422,9 +438,9 @@ def main(cfg) -> int: ) else: if ( - config["telemetry"]["prometheus_namespace"] - and config["telemetry"]["prometheus_pod_name"] - and config["telemetry"]["prometheus_container_name"] + config["telemetry"]["prometheus_namespace"] + and config["telemetry"]["prometheus_pod_name"] + and config["telemetry"]["prometheus_container_name"] ): try: prometheus_archive_files = ( @@ -504,6 +520,9 @@ def main(cfg) -> int: ) # sys.exit(2) return 2 + if health_checker.ret_value != 0: + logging.error("Health check failed for the applications, Please check; exiting") + return health_checker.ret_value logging.info( "Successfully finished running Kraken. UUID for the run: " @@ -646,4 +665,4 @@ if __name__ == "__main__": with open(junit_testcase_file_path, "w") as stream: stream.write(junit_testcase_xml) - sys.exit(retval) + sys.exit(retval) \ No newline at end of file