Merge pull request #21 from paigerube14/component_recovered

Check if killed component(s) recovered
This commit is contained in:
Mike Fiedler
2020-08-20 15:18:12 -04:00
committed by GitHub
16 changed files with 321 additions and 35 deletions

View File

@@ -1,11 +1,15 @@
kraken:
kubeconfig_path: /root/.kube/config # Path to kubeconfig
exit_on_failure: False # Exit when a post action scenario fails
scenarios: # List of policies/chaos scenarios to load
- scenarios/etcd.yml
- scenarios/openshift-kube-apiserver.yml
- scenarios/openshift-apiserver.yml
- scenarios/regex_openshift_pod_kill.yml
- - scenarios/etcd.yml
- scenarios/post_action_etcd_example.sh
- - scenarios/openshift-kube-apiserver.yml
- scenarios/post_action_openshift-kube-apiserver.yml
- - scenarios/openshift-apiserver.yml
- scenarios/post_action_openshift-apiserver.yml
- - scenarios/regex_openshift_pod_kill.yml
- scenarios/post_action_regex.py
cerberus:
cerberus_enabled: False # Enable it when cerberus is previously installed
cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal

View File

@@ -5,8 +5,10 @@ import logging
# Invokes a given command and returns the stdout
def invoke(command):
    """Run *command* in a shell and return its combined stdout/stderr text.

    stderr is merged into stdout (stderr=subprocess.STDOUT) so callers see a
    single output stream. On failure the error is logged and an empty string
    is returned instead of raising.
    """
    # Initialize so a Popen failure returns "" rather than hitting a
    # NameError on the return below.
    out = ""
    try:
        proc = subprocess.Popen(command, shell=True,
                                universal_newlines=True, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        (out, err) = proc.communicate()
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (command, e))
    return out

View File

@@ -4,6 +4,7 @@ import logging
import kraken.invoke.command as runcommand
import json
kraken_node_name = ""

View File

@@ -12,6 +12,109 @@ import kraken.invoke.command as runcommand
import pyfiglet
# Get cerberus status
def cerberus_integration(config):
    """Poll the Cerberus go/no-go signal if cerberus is enabled.

    Returns True when cerberus is disabled or reports a healthy cluster.
    Exits the process (status 1) when the url is missing or cerberus reports
    an unhealthy cluster.
    """
    cerberus_status = True
    if config["cerberus"]["cerberus_enabled"]:
        cerberus_url = config["cerberus"]["cerberus_url"]
        if not cerberus_url:
            logging.error("url where Cerberus publishes True/False signal is not provided.")
            sys.exit(1)
        # Cerberus publishes the literal bytes b'True' / b'False'.
        cerberus_status = requests.get(cerberus_url).content
        cerberus_status = True if cerberus_status == b'True' else False
        if not cerberus_status:
            logging.error("Received a no-go signal from Cerberus, looks like "
                          "the cluster is unhealthy. Please check the Cerberus "
                          "report for more details. Test failed.")
            sys.exit(1)
        else:
            # Fixed typo: "Ceberus" -> "Cerberus".
            logging.info("Received a go signal from Cerberus, the cluster is healthy. "
                         "Test passed.")
    return cerberus_status
# Function to publish kraken status to cerberus
def publish_kraken_status(config, failed_post_scenarios):
    """Log the combined cerberus/post-scenario health state.

    Exits the process (status 1) when post action scenarios are failing and
    the config asks for exit_on_failure.
    """
    cerberus_status = cerberus_integration(config)
    if not failed_post_scenarios:
        # Nothing failing on the kraken side; nothing to report.
        return
    if cerberus_status:
        if config['kraken']['exit_on_failure']:
            logging.info("Cerberus status is healthy but post action scenarios "
                         "are still failing, exiting kraken run")
            sys.exit(1)
        logging.info("Cerberus status is healthy but post action scenarios "
                     "are still failing")
    else:
        if config['kraken']['exit_on_failure']:
            logging.info("Cerberus status is not healthy and post action scenarios "
                         "are still failing, exiting kraken run")
            sys.exit(1)
        logging.info("Cerberus status is not healthy and post action scenarios "
                     "are still failing")
def run_post_action(kubeconfig_path, scenario, pre_action_output=""):
    """Run a post-chaos check scenario and report whether it passed.

    scenario may be a powerfulseal policy (.yml/.yaml), a python script (.py)
    or any other executable (treated as a bash script). For .py and bash
    scenarios the output is compared against pre_action_output when given.

    Returns False on failure, otherwise the scenario's output (used later as
    the pre-check baseline).
    """
    if scenario.endswith(".yaml") or scenario.endswith(".yml"):
        action_output = runcommand.invoke("powerfulseal autonomous "
                                          "--use-pod-delete-instead-of-ssh-kill"
                                          " --policy-file %s --kubeconfig %s --no-cloud"
                                          " --inventory-kubernetes --headless"
                                          % (scenario, kubeconfig_path))
        # read output to make sure no error
        if "ERROR" in action_output:
            # Removed a no-op line that split out the first error message and
            # then discarded it.
            if not pre_action_output:
                logging.info("Powerful seal pre action check failed for " + str(scenario))
            return False
        else:
            logging.info(scenario + " post action checks passed")
    elif scenario.endswith(".py"):
        action_output = runcommand.invoke("python3 " + scenario).strip()
        if pre_action_output:
            if pre_action_output == action_output:
                logging.info(scenario + " post action checks passed")
            else:
                logging.info(scenario + ' post action response did not match pre check output')
                return False
    else:
        # invoke custom bash script
        action_output = runcommand.invoke(scenario).strip()
        if pre_action_output:
            if pre_action_output == action_output:
                logging.info(scenario + " post action checks passed")
            else:
                logging.info(scenario + ' post action response did not match pre check output')
                return False
    return action_output
# Perform the post scenario actions to see if components recovered
def post_actions(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output):
    """Re-check previously failed post scenarios, then run the current one.

    failed_post_scenarios holds [scenario, pre_action_output] pairs; entries
    that now pass are removed. If the current scenario (scenario[1], when
    present) fails, it is appended. Returns the updated list.
    """
    # Iterate over a snapshot: the original code removed elements from the
    # list it was iterating, which silently skips the following element.
    for failed_scenario in list(failed_post_scenarios):
        post_action_output = run_post_action(kubeconfig_path,
                                             failed_scenario[0], failed_scenario[1])
        if post_action_output is not False:
            failed_post_scenarios.remove(failed_scenario)
        else:
            # Added the missing space before "is still failing".
            logging.info('Post action scenario ' + str(failed_scenario) + " is still failing")
    # check post actions
    if len(scenario) > 1:
        post_action_output = run_post_action(kubeconfig_path, scenario[1], pre_action_output)
        if post_action_output is False:
            failed_post_scenarios.append([scenario[1], pre_action_output])
    return failed_post_scenarios
# Main function
def main(cfg):
# Start kraken
# NOTE(review): the remainder of this block is a rendered unified diff.
# Hunk headers (@@ ... @@) separate regions, and removed/added lines appear
# interleaved WITHOUT +/- markers, so this span is not runnable as shown.
@@ -24,7 +127,6 @@ def main(cfg):
config = yaml.full_load(f)
kubeconfig_path = config["kraken"]["kubeconfig_path"]
scenarios = config["kraken"]["scenarios"]
# NOTE(review): the next line appears to be the pre-change (removed) read of
# cerberus_enabled — the inline cerberus block below was replaced by
# cerberus_integration()/publish_kraken_status() in this commit. TODO confirm.
cerberus_enabled = config["cerberus"]["cerberus_enabled"]
wait_duration = config["tunings"]["wait_duration"]
iterations = config["tunings"]["iterations"]
daemon_mode = config["tunings"]['daemon_mode']
@@ -59,41 +161,34 @@ def main(cfg):
% str(iterations))
iterations = int(iterations)
failed_post_scenarios = []
# Loop to run the chaos starts here
while (int(iteration) < iterations):
# Inject chaos scenarios specified in the config
logging.info("Executing scenarios for iteration " + str(iteration))
try:
# Loop to run the scenarios starts here
for scenario in scenarios:
logging.info("Injecting scenario: %s" % (scenario))
# NOTE(review): run a pre-chaos baseline of the post action check; its
# output is compared after the chaos injection (see post_actions()).
pre_action_output = run_post_action(kubeconfig_path, scenario[1])
runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-ssh-kill"
" --policy-file %s --kubeconfig %s --no-cloud"
" --inventory-kubernetes --headless"
# NOTE(review): the next two "%" lines are the old (scenario) vs new
# (scenario[0]) format-argument variants from the diff.
% (scenario, kubeconfig_path))
logging.info("Scenario: %s has been successfully injected!" % (scenario))
% (scenario[0], kubeconfig_path))
# NOTE(review): the following if-block is the PRE-change inline cerberus
# check (removed lines); it is superseded by cerberus_integration().
if cerberus_enabled:
cerberus_url = config["cerberus"]["cerberus_url"]
if not cerberus_url:
logging.error("url where Cerberus publishes True/False signal "
"is not provided.")
sys.exit(1)
cerberus_status = requests.get(cerberus_url).content
cerberus_status = True if cerberus_status == b'True' else False
if not cerberus_status:
logging.error("Received a no-go signal from Cerberus, looks like the"
" cluster is unhealthy. Please check the Cerberus report"
" for more details. Test failed.")
sys.exit(1)
else:
logging.info("Received a go signal from Ceberus, the cluster is "
"healthy. Test passed.")
logging.info("Scenario: %s has been successfully injected!" % (scenario[0]))
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
# NOTE(review): post-change flow — verify component recovery and publish
# the combined status to cerberus.
failed_post_scenarios = post_actions(kubeconfig_path, scenario,
failed_post_scenarios, pre_action_output)
publish_kraken_status(config, failed_post_scenarios)
except Exception as e:
logging.error("Failed to run scenario: %s. Encountered the following exception: %s"
% (scenario, e))
% (scenario[0], e))
iteration += 1
logging.info("")
# NOTE(review): fail the run if any post action scenario never recovered.
if failed_post_scenarios:
logging.error("Post scenarios are still failing at the end of all iterations")
sys.exit(1)
else:
logging.error("Cannot find a config at %s, please check" % (cfg))
sys.exit(1)

3
scenarios/etcd.yml Normal file → Executable file
View File

@@ -11,12 +11,9 @@ scenarios:
- labels:
namespace: "openshift-etcd"
selector: "k8s-app=etcd"
filters:
- randomSample:
size: 1
# The actions will be executed in the order specified
actions:
- kill:
probability: 1

0
scenarios/openshift-apiserver.yml Normal file → Executable file
View File

1
scenarios/openshift-kube-apiserver.yml Normal file → Executable file
View File

@@ -11,7 +11,6 @@ scenarios:
- labels:
namespace: "openshift-kube-apiserver"
selector: "app=openshift-kube-apiserver"
filters:
- randomSample:
size: 1

21
scenarios/post_action_etcd.yml Executable file
View File

@@ -0,0 +1,21 @@
config:
runStrategy:
runs: 1
maxSecondsBetweenRuns: 10
minSecondsBetweenRuns: 1
scenarios:
- name: "check 3 pods are in namespace with selector: etcd"
steps:
- podAction:
matches:
- labels:
namespace: "openshift-etcd"
selector: "k8s-app=etcd"
filters:
- property:
name: "state"
value: "Running"
# The actions will be executed in the order specified
actions:
- checkPodCount:
count: 3

View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Print the number of Running pods in the openshift-etcd namespace.
# NOTE(review): run_kraken compares this script's output before and after the
# chaos scenario (see run_post_action's bash branch) — keep the output stable.
pods="$(oc get pods -n openshift-etcd | grep -c Running)"
echo "$pods"

View File

@@ -0,0 +1,23 @@
#!/usr/bin/env python3
import subprocess
import logging
def run(cmd):
    """Run *cmd* in a shell and return its combined stdout/stderr text.

    On failure the error is logged and an empty string is returned; the
    original fell through to `return out` with `out` undefined (NameError).
    """
    out = ""
    try:
        proc = subprocess.Popen(cmd, shell=True,
                                universal_newlines=True, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        (out, err) = proc.communicate()
        logging.info("out " + str(out))
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (cmd, e))
    return out
# Count Running pods in openshift-etcd and report whether the expected 3 are up.
# The "ERROR" prefix is what run_kraken greps for to detect a failed check.
running_count = run("oc get pods -n openshift-etcd | grep -c Running").rstrip()
if running_count == str(3):
    print("There were 3 pods running properly")
else:
    print("ERROR there were " + str(running_count) + " pods running instead of 3")

View File

@@ -0,0 +1,23 @@
config:
runStrategy:
runs: 1
maxSecondsBetweenRuns: 30
minSecondsBetweenRuns: 1
scenarios:
- name: "check 3 pods are in namespace with selector: openshift-apiserver"
steps:
- podAction:
matches:
- labels:
namespace: "openshift-apiserver"
selector: "app=openshift-apiserver"
filters:
- property:
name: "state"
value: "Running"
# The actions will be executed in the order specified
actions:
- checkPodCount:
count: 3

View File

@@ -0,0 +1,21 @@
config:
runStrategy:
runs: 1
maxSecondsBetweenRuns: 30
minSecondsBetweenRuns: 1
scenarios:
- name: "check 3 pods are in namespace with selector: openshift-kube-apiserver"
steps:
- podAction:
matches:
- labels:
namespace: "openshift-kube-apiserver"
selector: "app=openshift-kube-apiserver"
filters:
- property:
name: "state"
value: "Running"
# The actions will be executed in the order specified
actions:
- checkPodCount:
count: 3

68
scenarios/post_action_regex.py Executable file
View File

@@ -0,0 +1,68 @@
#!/usr/bin/env python3
import subprocess
import re
import sys
from kubernetes import client, config
from kubernetes.client.rest import ApiException
import logging
# List all namespaces
def list_namespaces():
    """Return the names of every namespace in the cluster.

    Loads kubeconfig from the default location. Re-raises ApiException after
    logging it; the original logged and fell through to `ret.items` with
    `ret` undefined, masking the real error with a NameError.
    """
    namespaces = []
    try:
        config.load_kube_config()
        cli = client.CoreV1Api()
        ret = cli.list_namespace(pretty=True)
    except ApiException as e:
        logging.error("Exception when calling \
                       CoreV1Api->list_namespaced_pod: %s\n" % e)
        raise
    for namespace in ret.items:
        namespaces.append(namespace.metadata.name)
    return namespaces
# Check if all the watch_namespaces are valid
def check_namespaces(namespaces):
    """Resolve a mix of literal names and regexes to real cluster namespaces.

    Entries that are not exact namespace names are treated as regexes and
    matched against the cluster's namespaces. Exits the process if any entry
    matches nothing.
    """
    try:
        existing = list_namespaces()
        # Entries not present verbatim are interpreted as regex patterns.
        patterns = set(namespaces) - set(existing)
        matched = set(namespaces) - patterns
        used_patterns = set()
        for candidate in existing:
            for pattern in patterns:
                if re.search(pattern, candidate):
                    matched.add(candidate)
                    used_patterns.add(pattern)
                    break
        unused = patterns - used_patterns
        if unused:
            raise Exception("There exists no namespaces matching: %s" % (unused))
        return list(matched)
    except Exception as e:
        logging.error("%s" % (e))
        sys.exit(1)
def run(cmd):
    """Run *cmd* in a shell and return its combined stdout/stderr text.

    On failure the error is logged and an empty string is returned; the
    original fell through to `return out` with `out` undefined (NameError).
    """
    out = ""
    try:
        proc = subprocess.Popen(cmd, shell=True,
                                universal_newlines=True, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        (out, err) = proc.communicate()
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (cmd, e))
    return out
# Total the Running pods across every namespace matching openshift-.* and
# print the count; run_kraken compares this output before/after the scenario.
namespace_patterns = ["openshift-.*"]
matched_namespaces = check_namespaces(namespace_patterns)
total_running = 0
for ns in matched_namespaces:
    count_output = run("oc get pods -n " + ns + " | grep -c Running").rstrip()
    try:
        total_running += int(count_output)
    except Exception:
        # Non-numeric output (e.g. an oc error) — skip this namespace.
        continue
print(total_running)

11
scenarios/post_action_regex.sh Executable file
View File

@@ -0,0 +1,11 @@
#!/bin/bash
# Post action check: verify the openshift-etcd namespace still has 3 Running
# pods. Prints the count, then a pass message or an ERROR line.
pods="$(oc get pods -n openshift-etcd | grep -c Running)"
echo "$pods"
if [ "$pods" -eq 3 ]
then
    echo "Pods Pass"
else
    # need capital error for proper error catching in run_kraken
    # Fixed typo in the message: "doesnt" -> "doesn't".
    echo "ERROR pod count $pods doesn't match 3 expected pods"
fi

View File

@@ -0,0 +1,18 @@
config:
runStrategy:
runs: 1
maxSecondsBetweenRuns: 30
minSecondsBetweenRuns: 1
scenarios:
- name: kill up to 3 pods in any openshift namespace
steps:
- podAction:
matches:
- namespace: "openshift-.*"
filters:
- property:
name: "state"
value: "Running"
actions:
- checkPodCount:
count: 146

0
scenarios/regex_openshift_pod_kill.yml Normal file → Executable file
View File