mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-02-14 09:59:59 +00:00
Core Refactoring, Krkn Scenario Plugin API (#694)
* relocated shared libraries from `kraken` to `krkn` folder Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * AbstractScenarioPlugin and ScenarioPluginFactory Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * application_outage porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * arcaflow_scenarios porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * managedcluster_scenarios porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * network_chaos porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * node_actions porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * plugin_scenarios porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * pvc_scenarios porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * service_disruption porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * service_hijacking porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * cluster_shut_down_scenarios porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * syn_flood porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * time_scenarios porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * zone_outages porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * ScenarioPluginFactory tests Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * unit tests update Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * pod_scenarios and post actions deprecated Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> scenarios post_actions Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * funtests and config update Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * run_krkn.py update Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * utils porting Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * API Documentation Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * container_scenarios porting Signed-off-by: Tullio Sebastiani 
<tsebasti@redhat.com> fix Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * funtest fix Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * document gif update Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * Documentation + tests update Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * removed example plugin Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * global renaming Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> test fix Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> test fix Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * config.yaml typos Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> typos Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * removed `plugin_scenarios` from NativScenarioPlugin class Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * pod_network_scenarios type added Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * documentation update Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * krkn-lib update Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> typo Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> --------- Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
This commit is contained in:
committed by
GitHub
parent
a13fb43d94
commit
d91172d9b2
@@ -1,29 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import logging
|
||||
import time
|
||||
|
||||
|
||||
def run(cmd):
    """Run *cmd* in a shell and return its combined stdout/stderr text.

    If the process cannot be launched, the error is logged and an empty
    string is returned instead of raising.
    """
    out = ""  # fix: previously unbound if Popen raised, causing UnboundLocalError at `return out`
    try:
        output = subprocess.Popen(
            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        (out, err) = output.communicate()
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (cmd, e))
    return out
|
||||
|
||||
|
||||
# Poll up to 100 times (5 s apart) until exactly three etcd pods report 4/4 ready.
for _attempt in range(100):
    pods_running = run("oc get pods -n openshift-etcd -l app=etcd | grep -c '4/4'").rstrip()
    if pods_running == "3":
        break
    time.sleep(5)

# Report the final state; the "ERROR" prefix is what run_kraken greps for.
if pods_running == "3":
    print("There were 3 pods running properly")
else:
    print("ERROR there were " + str(pods_running) + " pods running instead of 3")
|
||||
@@ -1,23 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import logging
|
||||
|
||||
|
||||
def run(cmd):
    """Run *cmd* in a shell, log its output, and return the combined
    stdout/stderr text.

    If the process cannot be launched, the error is logged and an empty
    string is returned instead of raising.
    """
    out = ""  # fix: previously unbound if Popen raised, causing UnboundLocalError at `return out`
    try:
        output = subprocess.Popen(
            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        (out, err) = output.communicate()
        logging.info("out " + str(out))
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (cmd, e))
    return out
|
||||
|
||||
|
||||
# One-shot check: count Running pods in openshift-etcd and compare against the expected 3.
pods_running = run("oc get pods -n openshift-etcd | grep -c Running").rstrip()

# The "ERROR" prefix is what run_kraken greps for on failure.
message = (
    "There were 3 pods running properly"
    if pods_running == "3"
    else "ERROR there were " + str(pods_running) + " pods running instead of 3"
)
print(message)
|
||||
@@ -1,28 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
|
||||
def run(cmd):
    """Run *cmd* in a shell and return its combined stdout/stderr text.

    If the process cannot be launched, the error is printed and an empty
    string is returned instead of raising.
    """
    out = ""  # fix: previously unbound if Popen raised, causing UnboundLocalError at `return out`
    try:
        output = subprocess.Popen(
            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        (out, err) = output.communicate()
    except Exception as e:
        print("Failed to run %s, error: %s" % (cmd, e))
    return out
|
||||
|
||||
|
||||
# Poll up to 100 times (5 s apart) until exactly three ingress projects are Active.
for _attempt in range(100):
    projects_active = run("oc get project | grep 'ingress' | grep -c Active").rstrip()
    if projects_active == "3":
        break
    time.sleep(5)

# Report the final state; the "ERROR" prefix is what run_kraken greps for.
if projects_active == "3":
    print("There were 3 projects running properly")
else:
    print("ERROR there were " + str(projects_active) + " projects running instead of 3")
|
||||
@@ -1,6 +0,0 @@
|
||||
# yaml-language-server: $schema=../plugin.schema.json
# Chaos scenario: kill Prometheus pods in the openshift-monitoring namespace
# and wait for them to recover.
- id: kill-pods
  config:
    # Regex matched against namespace names; anchored so it matches exactly openshift-monitoring.
    namespace_pattern: ^openshift-monitoring$
    # Only pods carrying this label are targeted.
    label_selector: app=prometheus
    # Seconds to wait for the killed pods to come back before flagging failure.
    krkn_pod_recovery_time: 120
|
||||
@@ -1,90 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from kubernetes import client, config
|
||||
from kubernetes.client.rest import ApiException
|
||||
|
||||
|
||||
def list_namespaces():
    """
    List the names of all namespaces in the cluster.

    Returns an empty list if the API call fails; the error is logged.
    """
    spaces_list = []
    try:
        config.load_kube_config()
        cli = client.CoreV1Api()
        ret = cli.list_namespace(pretty=True)
    except ApiException as e:
        logging.error(
            "Exception when calling CoreV1Api->list_namespace: %s\n",
            e
        )
        # fix: previously execution fell through to the loop below with
        # `ret` unbound, raising NameError; return the (empty) list instead
        return spaces_list
    for current_namespace in ret.items:
        spaces_list.append(current_namespace.metadata.name)
    return spaces_list
|
||||
|
||||
|
||||
def check_namespaces(namespaces):
    """
    Validate *namespaces* against the namespaces present in the cluster.

    Entries that are not exact namespace names are treated as regular
    expressions; the returned list contains every cluster namespace that
    is named directly or matched by a pattern. If any entry matches
    nothing, the error is logged and the process exits with status 1.
    """
    try:
        cluster_namespaces = list_namespaces()
        # entries not found verbatim are treated as regex patterns
        patterns = set(namespaces) - set(cluster_namespaces)
        resolved = set(namespaces) - patterns
        matched_patterns = set()
        if patterns:
            for ns in cluster_namespaces:
                for pattern in patterns:
                    if re.search(pattern, ns):
                        resolved.add(ns)
                        matched_patterns.add(pattern)
                        break
        unmatched = patterns - matched_patterns
        if unmatched:
            raise Exception(
                "There exists no namespaces matching: %s" % (
                    unmatched
                )
            )
        return list(resolved)
    except Exception as e:
        logging.error(str(e))
        sys.exit(1)
|
||||
|
||||
|
||||
def run(cmd):
    """Run *cmd* in a shell and return its combined stdout/stderr text.

    If the process cannot be launched, the error is logged and an empty
    string is returned instead of raising.
    """
    out = ""  # fix: previously unbound if Popen raised, causing UnboundLocalError at `return out`
    try:
        output = subprocess.Popen(
            cmd,
            shell=True,
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT
        )
        (out, err) = output.communicate()
    except Exception as e:
        logging.error("Failed to run %s, error: %s", cmd, e)
    return out
|
||||
|
||||
|
||||
def print_running_pods():
    """Print the total number of Running pods across all openshift-* namespaces."""
    total = 0
    for ns in check_namespaces(["openshift-.*"]):
        count_text = run(
            "oc get pods -n " + ns + " | grep -c Running"
        ).rstrip()
        try:
            total += int(count_text)
        except Exception:
            # non-numeric output (e.g. command failure) — skip this namespace
            continue
    print(total)
|
||||
|
||||
|
||||
# Script entry point: compute and print the cluster-wide Running pod total.
if __name__ == '__main__':
    print_running_pods()
|
||||
@@ -1,11 +0,0 @@
|
||||
#!/bin/bash
# Count Running etcd pods and report whether the expected 3 are up.
pods="$(oc get pods -n openshift-etcd | grep -c Running)"
echo "$pods"

if [ "$pods" -ne 3 ]; then
    # need capital error for proper error catching in run_kraken
    echo "ERROR pod count $pods doesnt match 3 expected pods"
else
    echo "Pods Pass"
fi
|
||||
@@ -1,76 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import logging
|
||||
import time
|
||||
import yaml
|
||||
|
||||
|
||||
def run(cmd):
    """Run *cmd* in a shell and return its combined stdout/stderr text.

    If the process cannot be launched, the error is logged and an empty
    string is returned instead of raising.
    """
    out = ""
    try:
        output = subprocess.Popen(
            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        (out, err) = output.communicate()
    except Exception as e:
        # fix: failures were logged at INFO level; use ERROR (lazy %-args)
        # consistent with the other run() helpers in this repo
        logging.error("Failed to run %s, error: %s", cmd, e)
    return out
|
||||
|
||||
|
||||
# Get cluster operators and return yaml
def get_cluster_operators():
    """Fetch all ClusterOperators via kubectl and return them parsed into a dict."""
    operators_status = run("kubectl get co -o yaml")
    # fix: yaml.safe_load() accepts no Loader argument — passing
    # Loader=yaml.FullLoader raised TypeError on every call
    status_yaml = yaml.safe_load(operators_status)
    return status_yaml
|
||||
|
||||
|
||||
# Monitor cluster operators
def monitor_cluster_operator(cluster_operators):
    """
    Scan a parsed ClusterOperator listing and return the names of operators
    that are degraded (their "Degraded" condition status is not "False") or
    that expose no status/conditions at all. An empty list means healthy.
    """
    failed_operators = []
    for operator in cluster_operators["items"]:
        if "status" in operator.keys() and "conditions" in operator["status"].keys():
            # degraded unless the Degraded condition explicitly reports "False"
            degraded = any(
                cond["type"] == "Degraded" and cond["status"] != "False"
                for cond in operator["status"]["conditions"]
            )
            if degraded:
                failed_operators.append(operator["metadata"]["name"])
        else:
            # no usable status — treat the operator as failed
            logging.info("Can't find status of " + operator["metadata"]["name"])
            failed_operators.append(operator["metadata"]["name"])
    return failed_operators
|
||||
|
||||
|
||||
# Post-action: wait (up to `timeout` seconds total) for cluster operators to
# stop being degraded and for all nodes to become Ready, then list the nodes.
wait_duration = 10  # seconds between polls
timeout = 900       # overall budget shared by both wait loops
counter = 0         # fix: was initialized twice

# Wait for all cluster operators to leave the Degraded state.
co_yaml = get_cluster_operators()
failed_operators = monitor_cluster_operator(co_yaml)
while len(failed_operators) > 0:
    time.sleep(wait_duration)
    co_yaml = get_cluster_operators()
    failed_operators = monitor_cluster_operator(co_yaml)
    if counter >= timeout:
        # fix: message previously rendered as "...after 900seconds"
        print("Cluster operators are still degraded after " + str(timeout) + " seconds")
        print("Degraded operators " + str(failed_operators))
        exit(1)
    counter += wait_duration

# Wait for every node to be Ready (counter carries over so the total
# wait across both loops stays bounded by `timeout`).
not_ready = run("oc get nodes --no-headers | grep 'NotReady' | wc -l").rstrip()
while int(not_ready) > 0:
    time.sleep(wait_duration)
    not_ready = run("oc get nodes --no-headers | grep 'NotReady' | wc -l").rstrip()
    if counter >= timeout:
        # fix: message previously rendered as "...after 900seconds"
        print("Nodes are still not ready after " + str(timeout) + " seconds")
        exit(1)
    counter += wait_duration

# Print the Ready nodes per role for the operator's reference.
worker_nodes = run("oc get nodes --no-headers | grep worker | egrep -v NotReady | awk '{print $1}'").rstrip()
print("Worker nodes list \n" + str(worker_nodes))
master_nodes = run("oc get nodes --no-headers | grep master | egrep -v NotReady | awk '{print $1}'").rstrip()
print("Master nodes list \n" + str(master_nodes))
infra_nodes = run("oc get nodes --no-headers | grep infra | egrep -v NotReady | awk '{print $1}'").rstrip()
print("Infra nodes list \n" + str(infra_nodes))
|
||||
Reference in New Issue
Block a user