mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-04-15 06:57:28 +00:00
Merge pull request #21 from paigerube14/component_recovered
Check if killed component(s) recovered
This commit is contained in:
@@ -1,11 +1,15 @@
|
||||
kraken:
|
||||
kubeconfig_path: /root/.kube/config # Path to kubeconfig
|
||||
exit_on_failure: False # Exit when a post action scenario fails
|
||||
scenarios: # List of policies/chaos scenarios to load
|
||||
- scenarios/etcd.yml
|
||||
- scenarios/openshift-kube-apiserver.yml
|
||||
- scenarios/openshift-apiserver.yml
|
||||
- scenarios/regex_openshift_pod_kill.yml
|
||||
|
||||
- - scenarios/etcd.yml
|
||||
- scenarios/post_action_etcd_example.sh
|
||||
- - scenarios/openshift-kube-apiserver.yml
|
||||
- scenarios/post_action_openshift-kube-apiserver.yml
|
||||
- - scenarios/openshift-apiserver.yml
|
||||
- scenarios/post_action_openshift-apiserver.yml
|
||||
- - scenarios/regex_openshift_pod_kill.yml
|
||||
- scenarios/post_action_regex.py
|
||||
cerberus:
|
||||
cerberus_enabled: False # Enable it when cerberus is previously installed
|
||||
cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
|
||||
|
||||
@@ -5,8 +5,10 @@ import logging
|
||||
# Invokes a given command and returns the stdout
|
||||
def invoke(command):
    """Run ``command`` through the shell and return its captured output.

    stderr is redirected into stdout, so the returned text holds both
    streams. The exit status of the command is not inspected.

    Args:
        command: Shell command line to execute. It is passed to the shell
            unmodified (``shell=True``), so it must come from a trusted
            source such as the kraken config.

    Returns:
        The combined stdout/stderr text, or an empty string when the process
        could not be started.
    """
    try:
        completed = subprocess.run(command, shell=True,
                                   universal_newlines=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
    except Exception as e:
        # The output variable used to be unbound on this path, so the final
        # return raised UnboundLocalError right after the error was logged.
        logging.error("Failed to run %s, error: %s" % (command, e))
        return ""
    return completed.stdout
|
||||
|
||||
@@ -4,6 +4,7 @@ import logging
|
||||
import kraken.invoke.command as runcommand
|
||||
import json
|
||||
|
||||
|
||||
kraken_node_name = ""
|
||||
|
||||
|
||||
|
||||
137
run_kraken.py
137
run_kraken.py
@@ -12,6 +12,109 @@ import kraken.invoke.command as runcommand
|
||||
import pyfiglet
|
||||
|
||||
|
||||
# Get cerberus status
|
||||
def cerberus_integration(config):
    """Fetch the Cerberus go/no-go signal when the integration is enabled.

    Args:
        config: Parsed kraken configuration. Reads
            ``config["cerberus"]["cerberus_enabled"]`` and, when that is
            truthy, ``config["cerberus"]["cerberus_url"]``.

    Returns:
        True when Cerberus is disabled or when it published ``True``.

    Raises:
        SystemExit: With status 1 when Cerberus is enabled but no url is
            configured, or when the published signal is anything other than
            ``True``.
    """
    if not config["cerberus"]["cerberus_enabled"]:
        # Nothing to check; treat the cluster as healthy.
        return True

    cerberus_url = config["cerberus"]["cerberus_url"]
    if not cerberus_url:
        logging.error("url where Cerberus publishes True/False signal is not provided.")
        sys.exit(1)

    # The endpoint body is compared as raw bytes; only b'True' means "go".
    cerberus_status = requests.get(cerberus_url).content == b'True'
    if not cerberus_status:
        logging.error("Received a no-go signal from Cerberus, looks like "
                      "the cluster is unhealthy. Please check the Cerberus "
                      "report for more details. Test failed.")
        sys.exit(1)

    logging.info("Received a go signal from Cerberus, the cluster is healthy. "
                 "Test passed.")
    return cerberus_status
|
||||
|
||||
|
||||
# Function to publish kraken status to cerberus
|
||||
def publish_kraken_status(config, failed_post_scenarios):
    """Report the run status and stop kraken when configured to do so.

    Queries Cerberus first, then, if any post action scenarios are still
    failing, logs a message that reflects the Cerberus status. When
    ``config['kraken']['exit_on_failure']`` is truthy the process exits with
    status 1 after logging.

    Args:
        config: Parsed kraken configuration.
        failed_post_scenarios: Collection of post action scenarios that are
            currently failing; nothing is logged when it is empty.
    """
    healthy = cerberus_integration(config)

    # Nothing to report unless at least one post action is still failing.
    if not failed_post_scenarios:
        return

    stop_run = config['kraken']['exit_on_failure']
    if healthy:
        if stop_run:
            logging.info("Cerberus status is healthy but post action scenarios "
                         "are still failing, exiting kraken run")
        else:
            logging.info("Cerberus status is healthy but post action scenarios "
                         "are still failing")
    else:
        if stop_run:
            logging.info("Cerberus status is not healthy and post action scenarios "
                         "are still failing, exiting kraken run")
        else:
            logging.info("Cerberus status is not healthy and post action scenarios "
                         "are still failing")

    if stop_run:
        sys.exit(1)
|
||||
|
||||
|
||||
def run_post_action(kubeconfig_path, scenario, pre_action_output=""):
    """Run a pre/post action check for a scenario.

    The kind of check is chosen from the file name of ``scenario``:

    * ``.yaml`` / ``.yml`` - run as a PowerfulSeal policy; the check fails
      when the output contains ``ERROR``.
    * ``.py`` - run with ``python3``.
    * anything else - invoked directly as a command (custom script).

    For the script kinds, a non-empty ``pre_action_output`` is compared with
    the stripped output of this run; a difference is a failure.

    Args:
        kubeconfig_path: Path to the kubeconfig handed to PowerfulSeal.
        scenario: Path of the check to run.
        pre_action_output: Output captured by the pre-action run. Leave empty
            when this call *is* the pre-action run.

    Returns:
        The output of the check, or ``False`` when the check failed.
    """
    if scenario.endswith((".yaml", ".yml")):
        action_output = runcommand.invoke("powerfulseal autonomous "
                                          "--use-pod-delete-instead-of-ssh-kill"
                                          " --policy-file %s --kubeconfig %s --no-cloud"
                                          " --inventory-kubernetes --headless"
                                          % (scenario, kubeconfig_path))
        # read output to make sure no error
        if "ERROR" in action_output:
            # Text following the first ERROR marker, up to the end of its
            # line. It used to be computed and thrown away; log it instead.
            error_detail = action_output.split("ERROR", 1)[1].split('\n')[0].strip()
            if not pre_action_output:
                logging.info("Powerful seal pre action check failed for %s: %s"
                             % (scenario, error_detail))
            else:
                logging.info("Powerful seal post action check failed for %s: %s"
                             % (scenario, error_detail))
            return False
        logging.info(scenario + " post action checks passed")
        return action_output

    # Python checks go through the interpreter; anything else is treated as
    # a custom script and invoked as-is.
    command = "python3 " + scenario if scenario.endswith(".py") else scenario
    action_output = runcommand.invoke(command).strip()
    if pre_action_output:
        if pre_action_output != action_output:
            logging.info(scenario + ' post action response did not match pre check output')
            return False
        logging.info(scenario + " post action checks passed")

    return action_output
|
||||
|
||||
|
||||
# Perform the post scenario actions to see if components recovered
|
||||
# Perform the post scenario actions to see if components recovered
def post_actions(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output):
    """Re-check earlier failures and run the post action of ``scenario``.

    Args:
        kubeconfig_path: Path to the kubeconfig passed to the checks.
        scenario: Sequence whose second item, when present, is the post
            action check to run for the scenario that was just injected.
        failed_post_scenarios: List of ``[check, pre_action_output]`` pairs
            that failed earlier. It is updated in place.
        pre_action_output: Output of the pre-action run of ``scenario[1]``.

    Returns:
        ``failed_post_scenarios`` (the same list object) holding the checks
        that are failing after this pass.
    """
    # Build the survivors separately: removing entries from the list while
    # iterating over it skips the element that follows each removal.
    still_failing = []
    for failed_scenario in failed_post_scenarios:
        post_action_output = run_post_action(kubeconfig_path,
                                             failed_scenario[0], failed_scenario[1])
        if post_action_output is False:
            logging.info('Post action scenario ' + str(failed_scenario) + " is still failing")
            still_failing.append(failed_scenario)
    # Slice-assign so callers holding a reference to the list see the update.
    failed_post_scenarios[:] = still_failing

    # check post actions
    if len(scenario) > 1:
        post_action_output = run_post_action(kubeconfig_path, scenario[1], pre_action_output)
        if post_action_output is False:
            failed_post_scenarios.append([scenario[1], pre_action_output])

    return failed_post_scenarios
|
||||
|
||||
|
||||
# Main function
|
||||
def main(cfg):
|
||||
# Start kraken
|
||||
@@ -24,7 +127,6 @@ def main(cfg):
|
||||
config = yaml.full_load(f)
|
||||
kubeconfig_path = config["kraken"]["kubeconfig_path"]
|
||||
scenarios = config["kraken"]["scenarios"]
|
||||
cerberus_enabled = config["cerberus"]["cerberus_enabled"]
|
||||
wait_duration = config["tunings"]["wait_duration"]
|
||||
iterations = config["tunings"]["iterations"]
|
||||
daemon_mode = config["tunings"]['daemon_mode']
|
||||
@@ -59,41 +161,34 @@ def main(cfg):
|
||||
% str(iterations))
|
||||
iterations = int(iterations)
|
||||
|
||||
failed_post_scenarios = []
|
||||
# Loop to run the chaos starts here
|
||||
while (int(iteration) < iterations):
|
||||
# Inject chaos scenarios specified in the config
|
||||
logging.info("Executing scenarios for iteration " + str(iteration))
|
||||
try:
|
||||
# Loop to run the scenarios starts here
|
||||
for scenario in scenarios:
|
||||
logging.info("Injecting scenario: %s" % (scenario))
|
||||
pre_action_output = run_post_action(kubeconfig_path, scenario[1])
|
||||
runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-ssh-kill"
|
||||
" --policy-file %s --kubeconfig %s --no-cloud"
|
||||
" --inventory-kubernetes --headless"
|
||||
% (scenario, kubeconfig_path))
|
||||
logging.info("Scenario: %s has been successfully injected!" % (scenario))
|
||||
% (scenario[0], kubeconfig_path))
|
||||
|
||||
if cerberus_enabled:
|
||||
cerberus_url = config["cerberus"]["cerberus_url"]
|
||||
if not cerberus_url:
|
||||
logging.error("url where Cerberus publishes True/False signal "
|
||||
"is not provided.")
|
||||
sys.exit(1)
|
||||
cerberus_status = requests.get(cerberus_url).content
|
||||
cerberus_status = True if cerberus_status == b'True' else False
|
||||
if not cerberus_status:
|
||||
logging.error("Received a no-go signal from Cerberus, looks like the"
|
||||
" cluster is unhealthy. Please check the Cerberus report"
|
||||
" for more details. Test failed.")
|
||||
sys.exit(1)
|
||||
else:
|
||||
logging.info("Received a go signal from Ceberus, the cluster is "
|
||||
"healthy. Test passed.")
|
||||
logging.info("Scenario: %s has been successfully injected!" % (scenario[0]))
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
failed_post_scenarios = post_actions(kubeconfig_path, scenario,
|
||||
failed_post_scenarios, pre_action_output)
|
||||
publish_kraken_status(config, failed_post_scenarios)
|
||||
except Exception as e:
|
||||
logging.error("Failed to run scenario: %s. Encountered the following exception: %s"
|
||||
% (scenario, e))
|
||||
% (scenario[0], e))
|
||||
iteration += 1
|
||||
logging.info("")
|
||||
if failed_post_scenarios:
|
||||
logging.error("Post scenarios are still failing at the end of all iterations")
|
||||
sys.exit(1)
|
||||
else:
|
||||
logging.error("Cannot find a config at %s, please check" % (cfg))
|
||||
sys.exit(1)
|
||||
|
||||
3
scenarios/etcd.yml
Normal file → Executable file
3
scenarios/etcd.yml
Normal file → Executable file
@@ -11,12 +11,9 @@ scenarios:
|
||||
- labels:
|
||||
namespace: "openshift-etcd"
|
||||
selector: "k8s-app=etcd"
|
||||
|
||||
filters:
|
||||
- randomSample:
|
||||
size: 1
|
||||
|
||||
# The actions will be executed in the order specified
|
||||
actions:
|
||||
- kill:
|
||||
probability: 1
|
||||
|
||||
0
scenarios/openshift-apiserver.yml
Normal file → Executable file
0
scenarios/openshift-apiserver.yml
Normal file → Executable file
1
scenarios/openshift-kube-apiserver.yml
Normal file → Executable file
1
scenarios/openshift-kube-apiserver.yml
Normal file → Executable file
@@ -11,7 +11,6 @@ scenarios:
|
||||
- labels:
|
||||
namespace: "openshift-kube-apiserver"
|
||||
selector: "app=openshift-kube-apiserver"
|
||||
|
||||
filters:
|
||||
- randomSample:
|
||||
size: 1
|
||||
|
||||
21
scenarios/post_action_etcd.yml
Executable file
21
scenarios/post_action_etcd.yml
Executable file
@@ -0,0 +1,21 @@
|
||||
config:
|
||||
runStrategy:
|
||||
runs: 1
|
||||
maxSecondsBetweenRuns: 10
|
||||
minSecondsBetweenRuns: 1
|
||||
scenarios:
|
||||
- name: "check 3 pods are in namespace with selector: etcd"
|
||||
steps:
|
||||
- podAction:
|
||||
matches:
|
||||
- labels:
|
||||
namespace: "openshift-etcd"
|
||||
selector: "k8s-app=etcd"
|
||||
filters:
|
||||
- property:
|
||||
name: "state"
|
||||
value: "Running"
|
||||
# The actions will be executed in the order specified
|
||||
actions:
|
||||
- checkPodCount:
|
||||
count: 3
|
||||
3
scenarios/post_action_etcd_example.sh
Executable file
3
scenarios/post_action_etcd_example.sh
Executable file
@@ -0,0 +1,3 @@
|
||||
#!/bin/bash
# Print how many lines of `oc get pods -n openshift-etcd` contain "Running".
pods=$(oc get pods -n openshift-etcd | grep -c Running)
echo "${pods}"
|
||||
23
scenarios/post_action_etcd_example_py.py
Executable file
23
scenarios/post_action_etcd_example_py.py
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import logging
|
||||
|
||||
|
||||
def run(cmd):
    """Run ``cmd`` through the shell and return its captured output.

    stderr is redirected into stdout. The captured text is also logged at
    INFO level.

    Args:
        cmd: Shell command line to execute (run with ``shell=True``).

    Returns:
        The combined stdout/stderr text, or an empty string when the process
        could not be started.
    """
    try:
        completed = subprocess.run(cmd, shell=True,
                                   universal_newlines=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
    except Exception as e:
        # The output variable used to be unbound here, which made the final
        # return raise UnboundLocalError instead of reporting the failure.
        logging.error("Failed to run %s, error: %s" % (cmd, e))
        return ""
    out = completed.stdout
    logging.info("out " + str(out))
    return out
|
||||
|
||||
|
||||
# Count the lines of `oc get pods` output for openshift-etcd that contain
# "Running" and report whether that count is exactly 3.
pods_running = run("oc get pods -n openshift-etcd | grep -c Running").rstrip()

if pods_running != "3":
    print("ERROR there were " + str(pods_running) + " pods running instead of 3")
else:
    print("There were 3 pods running properly")
|
||||
23
scenarios/post_action_openshift-apiserver.yml
Executable file
23
scenarios/post_action_openshift-apiserver.yml
Executable file
@@ -0,0 +1,23 @@
|
||||
config:
|
||||
runStrategy:
|
||||
runs: 1
|
||||
maxSecondsBetweenRuns: 30
|
||||
minSecondsBetweenRuns: 1
|
||||
scenarios:
|
||||
- name: "check 3 pods are in namespace with selector: openshift-apiserver"
|
||||
steps:
|
||||
- podAction:
|
||||
matches:
|
||||
- labels:
|
||||
namespace: "openshift-apiserver"
|
||||
selector: "app=openshift-apiserver"
|
||||
|
||||
filters:
|
||||
- property:
|
||||
name: "state"
|
||||
value: "Running"
|
||||
|
||||
# The actions will be executed in the order specified
|
||||
actions:
|
||||
- checkPodCount:
|
||||
count: 3
|
||||
21
scenarios/post_action_openshift-kube-apiserver.yml
Executable file
21
scenarios/post_action_openshift-kube-apiserver.yml
Executable file
@@ -0,0 +1,21 @@
|
||||
config:
|
||||
runStrategy:
|
||||
runs: 1
|
||||
maxSecondsBetweenRuns: 30
|
||||
minSecondsBetweenRuns: 1
|
||||
scenarios:
|
||||
- name: "check 3 pods are in namespace with selector: openshift-kube-apiserver"
|
||||
steps:
|
||||
- podAction:
|
||||
matches:
|
||||
- labels:
|
||||
namespace: "openshift-kube-apiserver"
|
||||
selector: "app=openshift-kube-apiserver"
|
||||
filters:
|
||||
- property:
|
||||
name: "state"
|
||||
value: "Running"
|
||||
# The actions will be executed in the order specified
|
||||
actions:
|
||||
- checkPodCount:
|
||||
count: 3
|
||||
68
scenarios/post_action_regex.py
Executable file
68
scenarios/post_action_regex.py
Executable file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import re
|
||||
import sys
|
||||
from kubernetes import client, config
|
||||
from kubernetes.client.rest import ApiException
|
||||
import logging
|
||||
|
||||
|
||||
# List all namespaces
|
||||
def list_namespaces():
    """Return the names of all namespaces reported by the cluster.

    Loads the default kubeconfig and lists namespaces through the CoreV1
    API.

    Returns:
        A list of namespace names; an empty list when the API call raised
        ``ApiException``.
    """
    try:
        config.load_kube_config()
        cli = client.CoreV1Api()
        ret = cli.list_namespace(pretty=True)
    except ApiException as e:
        logging.error("Exception when calling CoreV1Api->list_namespace: %s\n" % e)
        # ``ret`` was never assigned on this path; falling through to the
        # loop used to raise NameError on top of the logged error.
        return []
    return [namespace.metadata.name for namespace in ret.items]
|
||||
|
||||
|
||||
# Check if all the watch_namespaces are valid
|
||||
def check_namespaces(namespaces):
    """Expand namespace names and regexes into existing namespace names.

    Entries of ``namespaces`` that are exact names of existing namespaces are
    kept as they are. Every other entry is treated as a regular expression
    and replaced by all existing namespaces it matches (``re.search``).

    Args:
        namespaces: Iterable of namespace names and/or regex patterns.

    Returns:
        A list (unordered) of the resolved namespace names.

    Raises:
        SystemExit: With status 1, after logging the reason, when a pattern
            matches no namespace or any other error occurs.
    """
    try:
        valid_namespaces = list_namespaces()
        requested = set(namespaces)
        # Anything that is not an exact namespace name is tried as a regex.
        regex_namespaces = requested - set(valid_namespaces)
        final_namespaces = requested - regex_namespaces
        valid_regex = set()
        # Evaluate each pattern against every namespace. Stopping at the
        # first pattern that matched a namespace could leave another pattern
        # matching the same namespaces wrongly flagged as matching nothing.
        for regex_namespace in regex_namespaces:
            pattern = re.compile(regex_namespace)
            matched = [name for name in valid_namespaces if pattern.search(name)]
            if matched:
                final_namespaces.update(matched)
                valid_regex.add(regex_namespace)
        invalid_namespaces = regex_namespaces - valid_regex
        if invalid_namespaces:
            raise Exception("There exists no namespaces matching: %s" % (invalid_namespaces))
        return list(final_namespaces)
    except Exception as e:
        logging.error("%s" % (e))
        sys.exit(1)
|
||||
|
||||
|
||||
def run(cmd):
    """Run ``cmd`` through the shell and return its captured output.

    stderr is redirected into stdout, so both streams end up in the result.

    Args:
        cmd: Shell command line to execute (run with ``shell=True``).

    Returns:
        The combined stdout/stderr text, or an empty string when the process
        could not be started.
    """
    try:
        completed = subprocess.run(cmd, shell=True,
                                   universal_newlines=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
    except Exception as e:
        # The output variable used to be unbound here, which made the final
        # return raise UnboundLocalError instead of reporting the failure.
        logging.error("Failed to run %s, error: %s" % (cmd, e))
        return ""
    return completed.stdout
|
||||
|
||||
|
||||
# Sum, over every namespace matching the pattern below, the number of lines
# of `oc get pods` output that contain "Running", then print the total.
regex_namespace = ["openshift-.*"]
namespaces = check_namespaces(regex_namespace)


def _running_pods_in(namespace):
    """Return the Running count for ``namespace``; 0 if it is not an int."""
    raw_count = run("oc get pods -n " + namespace + " | grep -c Running").rstrip()
    try:
        return int(raw_count)
    except Exception:
        # Unparseable output contributes nothing to the total.
        return 0


pods_running = sum(_running_pods_in(namespace) for namespace in namespaces)
print(pods_running)
|
||||
11
scenarios/post_action_regex.sh
Executable file
11
scenarios/post_action_regex.sh
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
# Print how many lines of `oc get pods -n openshift-etcd` contain "Running",
# then report whether that count equals 3.
pods=$(oc get pods -n openshift-etcd | grep -c Running)
echo "${pods}"

if ! [ "${pods}" -eq 3 ]; then
    # need capital error for proper error catching in run_kraken
    echo "ERROR pod count $pods doesnt match 3 expected pods"
else
    echo "Pods Pass"
fi
|
||||
18
scenarios/post_action_regex_openshift_pod_kill.yml
Executable file
18
scenarios/post_action_regex_openshift_pod_kill.yml
Executable file
@@ -0,0 +1,18 @@
|
||||
config:
|
||||
runStrategy:
|
||||
runs: 1
|
||||
maxSecondsBetweenRuns: 30
|
||||
minSecondsBetweenRuns: 1
|
||||
scenarios:
|
||||
- name: kill up to 3 pods in any openshift namespace
|
||||
steps:
|
||||
- podAction:
|
||||
matches:
|
||||
- namespace: "openshift-.*"
|
||||
filters:
|
||||
- property:
|
||||
name: "state"
|
||||
value: "Running"
|
||||
actions:
|
||||
- checkPodCount:
|
||||
count: 146
|
||||
0
scenarios/regex_openshift_pod_kill.yml
Normal file → Executable file
0
scenarios/regex_openshift_pod_kill.yml
Normal file → Executable file
Reference in New Issue
Block a user