mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-04-15 06:57:28 +00:00
The current Kraken integration with Cerberus monitors the cluster as well as application health after the chaos, and passes or fails the run depending on whether they are healthy. This commit adds the ability to monitor user application health during the chaos and fails the run if downtime occurs, since that would potentially mean downtime in a customer's environment as well. It is especially useful for control plane failure scenarios involving the API server, etcd, Ingress, etc.
94 lines
3.9 KiB
Python
94 lines
3.9 KiB
Python
import logging
|
|
import requests
|
|
import sys
|
|
import json
|
|
|
|
|
|
# Get cerberus status
|
|
def get_status(config, start_time, end_time):
    """Check the Cerberus go/no-go signal and, optionally, route downtime.

    When Cerberus is enabled, the body served at the configured URL is read
    and treated as healthy only if it is exactly ``b"True"``. If route
    checking is enabled as well, ``application_status`` is asked whether any
    monitored route failed between ``start_time`` and ``end_time``.

    Args:
        config: Parsed kraken configuration. Reads
            ``config["cerberus"]["cerberus_enabled"]`` and, when that is
            truthy, ``config["cerberus"]["cerberus_url"]`` and
            ``config["cerberus"]["check_applicaton_routes"]`` (the key is
            spelled this way in the configuration).
        start_time: Start of the run; forwarded unchanged to
            ``application_status``.
        end_time: End of the run; forwarded unchanged to
            ``application_status``.

    Returns:
        True. Nothing is checked when Cerberus is disabled, and every
        unhealthy outcome terminates the process instead of returning.

    Raises:
        SystemExit: With exit code 1 when the Cerberus URL is empty, the
            go/no-go request fails, Cerberus reports a no-go signal, or a
            monitored application route encountered downtime.
    """
    cerberus_config = config["cerberus"]
    if not cerberus_config["cerberus_enabled"]:
        return True

    cerberus_url = cerberus_config["cerberus_url"]
    check_application_routes = cerberus_config["check_applicaton_routes"]
    if not cerberus_url:
        logging.error("url where Cerberus publishes True/False signal is not provided.")
        sys.exit(1)

    # A transport failure is reported the same way as every other failure in
    # this module (logged, then exit code 1) instead of surfacing as a raw
    # traceback from the HTTP client.
    try:
        signal_body = requests.get(cerberus_url, timeout=60).content
    except requests.exceptions.RequestException as e:
        logging.error("Failed to get the go/no-go signal from cerberus at %s: %s", cerberus_url, e)
        sys.exit(1)
    cerberus_status = signal_body == b"True"

    # Fail if the application routes monitored by cerberus experience
    # downtime during the chaos
    application_routes_status = True
    if check_application_routes:
        application_routes_status, unavailable_routes = application_status(cerberus_url, start_time, end_time)
        if not application_routes_status:
            logging.error(
                "Application routes: %s monitored by cerberus encountered downtime during the run, failing",
                unavailable_routes,
            )
        else:
            logging.info("Application routes being monitored didn't encounter any downtime during the run!")

    if not cerberus_status:
        logging.error(
            "Received a no-go signal from Cerberus, looks like "
            "the cluster is unhealthy. Please check the Cerberus "
            "report for more details. Test failed."
        )

    if not application_routes_status or not cerberus_status:
        sys.exit(1)

    logging.info("Received a go signal from Ceberus, the cluster is healthy. " "Test passed.")
    return cerberus_status
|
|
|
|
|
|
# Function to publish kraken status to cerberus
|
|
def publish_kraken_status(config, failed_post_scenarios, start_time, end_time):
    """Log the combined Cerberus / post-action outcome and exit if configured.

    The Cerberus status is always fetched through ``get_status``. Nothing
    further happens unless ``failed_post_scenarios`` is truthy; in that case
    a message describing the Cerberus health is logged and, when
    ``config["kraken"]["exit_on_failure"]`` is truthy, the process exits.

    Args:
        config: Parsed kraken configuration; passed to ``get_status`` and
            read for ``config["kraken"]["exit_on_failure"]``.
        failed_post_scenarios: Post-action scenarios that are still failing;
            only its truthiness is used.
        start_time: Forwarded unchanged to ``get_status``.
        end_time: Forwarded unchanged to ``get_status``.

    Raises:
        SystemExit: With exit code 1 when post-action scenarios are still
            failing and ``exit_on_failure`` is set (``get_status`` may also
            exit on its own).
    """
    healthy = get_status(config, start_time, end_time)

    # With no failing post-action scenarios there is nothing to report.
    if not failed_post_scenarios:
        return

    # NOTE(review): get_status in this file exits instead of returning a
    # falsy status, so the "not healthy" wording looks unreachable - confirm.
    if healthy:
        summary = "Cerberus status is healthy but post action scenarios are still failing"
    else:
        summary = "Cerberus status is not healthy and post action scenarios are still failing"

    if config["kraken"]["exit_on_failure"]:
        logging.info(summary + ", exiting kraken run")
        sys.exit(1)

    logging.info(summary)
|
|
|
|
|
|
# Check application availability
|
|
def application_status(cerberus_url, start_time, end_time):
    """Report which Cerberus-monitored routes failed during the run.

    Requests ``<cerberus_url>/history?loopback=<duration>``, where
    ``duration`` is ``(end_time - start_time) / 60``, parses the body as
    JSON and collects the ``name`` of every entry under
    ``["history"]["failures"]`` whose ``component`` equals ``"route"``.

    Args:
        cerberus_url: Base URL of the Cerberus server.
        start_time: Start of the window; must support ``end_time - start_time``.
        end_time: End of the window.

    Returns:
        A ``(status, failed_routes)`` tuple: ``status`` is True when no route
        failure was reported, and ``failed_routes`` is the set of failed
        route names.

    Raises:
        SystemExit: With exit code 1 when ``cerberus_url`` is falsy, or when
            fetching, decoding or walking the history raises any Exception.
    """
    if not cerberus_url:
        logging.error("url where Cerberus publishes True/False signal is not provided.")
        sys.exit(1)

    loopback = (end_time - start_time) / 60
    history_url = cerberus_url + "/history?loopback=" + str(loopback)
    logging.info("Scraping the metrics for the test duration from cerberus url: %s", history_url)

    try:
        history = json.loads(requests.get(history_url, timeout=60).content)
        route_failures = [
            failure["name"]
            for failure in history["history"]["failures"]
            if failure["component"] == "route"
        ]
    except Exception as err:
        logging.error("Failed to scrape metrics from cerberus API at %s: %s", history_url, err)
        sys.exit(1)

    # The set is built outside the try block so de-duplication errors are not
    # reported as scrape failures.
    return not route_failures, set(route_failures)
|