Files
krkn/kraken/cerberus/setup.py
Naga Ravi Chaitanya Elluri 716057eab6 Monitor user application availability during chaos
The current Kraken integration with Cerberus monitors the cluster as well as the
application health after the chaos, and fails the run if either is unhealthy.
This commit adds the ability to monitor user application health during the chaos
as well, and fails the run on any downtime, since that would potentially mean
downtime in a customer's environment too. It is especially useful for control
plane failure scenarios, including the API server, etcd, Ingress, etc.
2021-07-27 13:15:57 -04:00

94 lines
3.9 KiB
Python

import logging
import requests
import sys
import json
# Get cerberus status
def get_status(config, start_time, end_time):
    """Fetch the Cerberus go/no-go signal and abort the run when it is unhealthy.

    Args:
        config: Parsed kraken configuration. Reads ``config["cerberus"]`` keys
            ``cerberus_enabled`` and ``cerberus_url``, plus the optional
            ``check_applicaton_routes`` flag (the spelling matches the
            existing config key and is kept for compatibility).
        start_time: Start of the chaos run; passed through unchanged to
            ``application_status``.
        end_time: End of the chaos run; passed through unchanged to
            ``application_status``.

    Returns:
        bool: ``True`` when Cerberus monitoring is disabled, or when Cerberus
        reported a go signal and no monitored route saw downtime.

    Raises:
        SystemExit: With code 1 when Cerberus is enabled and the URL is
            missing, Cerberus returns anything other than ``True``, or a
            monitored application route experienced downtime.
    """
    cerberus_settings = config["cerberus"]
    if not cerberus_settings["cerberus_enabled"]:
        # Monitoring is switched off: report healthy without contacting Cerberus.
        return True

    cerberus_url = cerberus_settings["cerberus_url"]
    # Route monitoring is opt-in; configs that do not define the key must
    # keep working, so default to False instead of raising KeyError.
    check_application_routes = cerberus_settings.get("check_applicaton_routes", False)
    if not cerberus_url:
        logging.error("url where Cerberus publishes True/False signal is not provided.")
        sys.exit(1)

    # Only the exact body b"True" counts as a go signal.
    cerberus_status = requests.get(cerberus_url, timeout=60).content == b"True"

    # Fail if the application routes monitored by cerberus experience downtime during the chaos
    application_routes_status = True
    if check_application_routes:
        application_routes_status, unavailable_routes = application_status(cerberus_url, start_time, end_time)
        if application_routes_status:
            logging.info("Application routes being monitored didn't encounter any downtime during the run!")
        else:
            logging.error(
                "Application routes: %s monitored by cerberus encountered downtime during the run, failing",
                unavailable_routes,
            )

    if not cerberus_status:
        logging.error(
            "Received a no-go signal from Cerberus, looks like "
            "the cluster is unhealthy. Please check the Cerberus "
            "report for more details. Test failed."
        )

    if not (application_routes_status and cerberus_status):
        sys.exit(1)

    logging.info("Received a go signal from Cerberus, the cluster is healthy. Test passed.")
    return cerberus_status
# Function to publish kraken status to cerberus
def publish_kraken_status(config, failed_post_scenarios, start_time, end_time):
    """Log the Cerberus health together with any failing post action scenarios.

    Args:
        config: Parsed kraken configuration. ``config["kraken"]["exit_on_failure"]``
            is read only when post action scenarios are failing.
        failed_post_scenarios: Post action scenarios that are still failing;
            only its truthiness is used here.
        start_time: Passed through to ``get_status``.
        end_time: Passed through to ``get_status``.

    Raises:
        SystemExit: With code 1 when post action scenarios are failing and
            ``exit_on_failure`` is set. ``get_status`` may also exit.
    """
    cluster_healthy = get_status(config, start_time, end_time)

    # Nothing to report when every post action scenario recovered.
    if not failed_post_scenarios:
        return

    abort_run = config["kraken"]["exit_on_failure"]
    if cluster_healthy:
        if abort_run:
            message = "Cerberus status is healthy but post action scenarios " "are still failing, exiting kraken run"
        else:
            message = "Cerberus status is healthy but post action scenarios " "are still failing"
    else:
        if abort_run:
            message = (
                "Cerberus status is not healthy and post action scenarios " "are still failing, exiting kraken run"
            )
        else:
            message = "Cerberus status is not healthy and post action scenarios " "are still failing"

    logging.info(message)
    if abort_run:
        sys.exit(1)
# Check application availability
def application_status(cerberus_url, start_time, end_time):
    """Ask Cerberus which monitored routes failed during the test window.

    Args:
        cerberus_url: Base URL of the Cerberus server. A trailing ``/`` is
            tolerated and stripped before the history path is appended.
        start_time: Start of the window.
        end_time: End of the window. ``(end_time - start_time) / 60`` is sent
            as the ``loopback`` query parameter — presumably seconds
            converted to minutes; confirm against the caller.

    Returns:
        tuple[bool, set]: ``(status, failed_routes)`` where ``failed_routes``
        holds the names of failure entries whose component is ``"route"``
        and ``status`` is ``True`` only when that set is empty.

    Raises:
        SystemExit: With code 1 when ``cerberus_url`` is empty, or when the
            history request or the parsing of its response fails.
    """
    if not cerberus_url:
        logging.error("url where Cerberus publishes True/False signal is not provided.")
        sys.exit(1)

    duration = (end_time - start_time) / 60
    # Strip a trailing slash so a base URL such as "http://host:8080/" does
    # not turn into "http://host:8080//history".
    url = cerberus_url.rstrip("/") + "/history?loopback=" + str(duration)
    logging.info("Scraping the metrics for the test duration from cerberus url: %s", url)
    try:
        metrics = requests.get(url, timeout=60).content
        failures = json.loads(metrics)["history"]["failures"]
        # Only route failures matter here; other components are ignored.
        failed_routes = {entry["name"] for entry in failures if entry["component"] == "route"}
    except Exception as e:
        # Any failure to obtain or parse the history makes the result
        # untrustworthy, so abort the run rather than report success.
        logging.error("Failed to scrape metrics from cerberus API at %s: %s", url, e)
        sys.exit(1)
    return not failed_routes, failed_routes