Add support to run on Kubernetes

This commit:
- Uses the distribution flag set by the user in the config to skip
  features that are only supported on OpenShift, so that scenarios
  can run on plain Kubernetes.
- Adds sample config and scenario files that work on Kubernetes.
Naga Ravi Chaitanya Elluri
2022-05-17 17:04:49 -04:00
parent 23d9a26f52
commit 9208f39e06
36 changed files with 179 additions and 90 deletions
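
The mechanism is straightforward: scenario types that depend on OpenShift-only machinery (time skew and Litmus, per the README table and run_kraken.py changes below) now check the user-set distribution flag and refuse to run when it is kubernetes. A minimal sketch of the pattern, distilled from the diffs below; the helper name is hypothetical, kraken inlines these checks:

    import logging
    import sys

    # Scenario types this commit gates behind distribution == "openshift".
    OPENSHIFT_ONLY = {"time_scenarios", "litmus_scenarios"}

    def ensure_supported(distribution: str, scenario_type: str) -> None:
        # Exit up front rather than failing mid-run on an unsupported cluster.
        if distribution != "openshift" and scenario_type in OPENSHIFT_ONLY:
            logging.error("%s are currently supported only on openshift", scenario_type)
            sys.exit(1)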


@@ -56,27 +56,19 @@ Instructions on how to setup the config and the options supported can be found a
 ### Kubernetes/OpenShift chaos scenarios supported
-- [Pod Scenarios](docs/pod_scenarios.md)
-- [Container Scenarios](docs/container_scenarios.md)
-- [Node Scenarios](docs/node_scenarios.md)
-- [Time Scenarios](docs/time_scenarios.md)
-- [Litmus Scenarios](docs/litmus_scenarios.md)
-- [Cluster Shut Down Scenarios](docs/cluster_shut_down_scenarios.md)
-- [Namespace Scenarios](docs/namespace_scenarios.md)
-- [Zone Outage Scenarios](docs/zone_outage.md)
-- [Application_outages](docs/application_outages.md)
-- [PVC scenario](docs/pvc_scenario.md)
-- [Network_Chaos](docs/network_chaos.md)
+Scenario type | Kubernetes | OpenShift
+------------- | ---------- | ---------
+[Pod Scenarios](docs/pod_scenarios.md) | :heavy_check_mark: | :heavy_check_mark:
+[Container Scenarios](docs/container_scenarios.md) | :heavy_check_mark: | :heavy_check_mark:
+[Node Scenarios](docs/node_scenarios.md) | :heavy_check_mark: | :heavy_check_mark:
+[Time Scenarios](docs/time_scenarios.md) | :x: | :heavy_check_mark:
+[Litmus Scenarios](docs/litmus_scenarios.md) | :x: | :heavy_check_mark:
+[Cluster Shut Down Scenarios](docs/cluster_shut_down_scenarios.md) | :heavy_check_mark: | :heavy_check_mark:
+[Namespace Scenarios](docs/namespace_scenarios.md) | :heavy_check_mark: | :heavy_check_mark:
+[Zone Outage Scenarios](docs/zone_outage.md) | :heavy_check_mark: | :heavy_check_mark:
+[Application_outages](docs/application_outages.md) | :heavy_check_mark: | :heavy_check_mark:
+[PVC scenario](docs/pvc_scenario.md) | :heavy_check_mark: | :heavy_check_mark:
+[Network_Chaos](docs/network_chaos.md) | :heavy_check_mark: | :heavy_check_mark:
 ### Kraken scenario pass/fail criteria and report


@@ -10,40 +10,40 @@ kraken:
     litmus_uninstall_before_run: True # If you want to uninstall litmus before a new run starts
     chaos_scenarios: # List of policies/chaos scenarios to load
         - container_scenarios: # List of chaos pod scenarios to load
-            - - scenarios/container_etcd.yml
+            - - scenarios/openshift/container_etcd.yml
         - pod_scenarios:
-            - - scenarios/etcd.yml
-            - - scenarios/regex_openshift_pod_kill.yml
-              - scenarios/post_action_regex.py
+            - - scenarios/openshift/etcd.yml
+            - - scenarios/openshift/regex_openshift_pod_kill.yml
+              - scenarios/openshift/post_action_regex.py
         - node_scenarios: # List of chaos node scenarios to load
-            - scenarios/node_scenarios_example.yml
+            - scenarios/openshift/node_scenarios_example.yml
         - pod_scenarios:
-            - - scenarios/openshift-apiserver.yml
-            - - scenarios/openshift-kube-apiserver.yml
+            - - scenarios/openshift/openshift-apiserver.yml
+            - - scenarios/openshift/openshift-kube-apiserver.yml
         - time_scenarios: # List of chaos time scenarios to load
-            - scenarios/time_scenarios_example.yml
+            - scenarios/openshift/time_scenarios_example.yml
         - litmus_scenarios: # List of litmus scenarios to load
-            - - scenarios/templates/litmus-rbac.yaml
-              - scenarios/node_cpu_hog_engine.yaml
-            - - scenarios/templates/litmus-rbac.yaml
-              - scenarios/node_mem_engine.yaml
-            - - scenarios/templates/litmus-rbac.yaml
-              - scenarios/node_io_engine.yaml
+            - - scenarios/openshift/templates/litmus-rbac.yaml
+              - scenarios/openshift/node_cpu_hog_engine.yaml
+            - - scenarios/openshift/templates/litmus-rbac.yaml
+              - scenarios/openshift/node_mem_engine.yaml
+            - - scenarios/openshift/templates/litmus-rbac.yaml
+              - scenarios/openshift/node_io_engine.yaml
         - cluster_shut_down_scenarios:
-            - - scenarios/cluster_shut_down_scenario.yml
-              - scenarios/post_action_shut_down.py
+            - - scenarios/openshift/cluster_shut_down_scenario.yml
+              - scenarios/openshift/post_action_shut_down.py
         - namespace_scenarios:
-            - - scenarios/regex_namespace.yaml
-            - - scenarios/ingress_namespace.yaml
-              - scenarios/post_action_namespace.py
+            - - scenarios/openshift/regex_namespace.yaml
+            - - scenarios/openshift/ingress_namespace.yaml
+              - scenarios/openshift/post_action_namespace.py
         - zone_outages:
-            - scenarios/zone_outage.yaml
+            - scenarios/openshift/zone_outage.yaml
         - application_outages:
-            - scenarios/app_outage.yaml
+            - scenarios/openshift/app_outage.yaml
         - pvc_scenarios:
-            - scenarios/pvc_scenario.yaml
+            - scenarios/openshift/pvc_scenario.yaml
         - network_chaos:
-            - scenarios/network_chaos.yaml
+            - scenarios/openshift/network_chaos.yaml
 cerberus:
     cerberus_enabled: False # Enable it when cerberus is previously installed


@@ -0,0 +1,38 @@
+kraken:
+    distribution: kubernetes # Distribution can be kubernetes or openshift
+    kubeconfig_path: /root/.kube/config # Path to kubeconfig
+    exit_on_failure: False # Exit when a post action scenario fails
+    port: 8081
+    publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
+    signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details
+    litmus_version: v1.13.6 # Litmus version to install
+    litmus_uninstall: False # If you want to uninstall litmus if failure
+    litmus_uninstall_before_run: True # If you want to uninstall litmus before a new run starts
+    chaos_scenarios: # List of policies/chaos scenarios to load
+        - container_scenarios: # List of chaos pod scenarios to load
+            - - scenarios/kube/container_dns.yml
+        - pod_scenarios:
+            - - scenarios/kube/scheduler.yml
+cerberus:
+    cerberus_enabled: False # Enable it when cerberus is previously installed
+    cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
+    check_applicaton_routes: False # When enabled will look for application unavailability using the routes specified in the cerberus config and fails the run
+performance_monitoring:
+    deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
+    repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
+    kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
+    capture_metrics: False
+    config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
+    metrics_profile_path: config/metrics-aggregated.yaml
+    prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
+    prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
+    uuid: # uuid for the run is generated by default if not set
+    enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
+    alert_profile: config/alerts # Path to alert profile with the prometheus queries
+tunings:
+    wait_duration: 60 # Duration to wait between each chaos scenario
+    iterations: 1 # Number of times to execute the scenarios
+    daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever
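
A quick way to sanity-check a sample config like the one above is to load it the way kraken would and inspect the flag. The path below is a hypothetical placeholder, since the viewer dropped the new file's name:

    import yaml

    # Hypothetical path for the Kubernetes sample config added above.
    with open("config/config_kube.yaml") as f:
        cfg = yaml.safe_load(f)

    assert cfg["kraken"]["distribution"] in ("kubernetes", "openshift")
    print(cfg["kraken"]["distribution"])  # -> kubernetes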


@@ -10,33 +10,34 @@ kraken:
     litmus_uninstall_before_run: True # If you want to uninstall litmus before a new run starts
     chaos_scenarios: # List of policies/chaos scenarios to load
         - pod_scenarios: # List of chaos pod scenarios to load
-            - - scenarios/etcd.yml
-            - - scenarios/regex_openshift_pod_kill.yml
-              - scenarios/post_action_regex.py
+            - - scenarios/openshift/etcd.yml
+            - - scenarios/openshift/regex_openshift_pod_kill.yml
+              - scenarios/openshift/post_action_regex.py
         - node_scenarios: # List of chaos node scenarios to load
-            - scenarios/node_scenarios_example.yml
+            - scenarios/openshift/node_scenarios_example.yml
         - pod_scenarios:
-            - - scenarios/openshift-apiserver.yml
-            - - scenarios/openshift-kube-apiserver.yml
+            - - scenarios/openshift/openshift-apiserver.yml
+            - - scenarios/openshift/openshift-kube-apiserver.yml
         - time_scenarios: # List of chaos time scenarios to load
-            - scenarios/time_scenarios_example.yml
+            - scenarios/openshift/time_scenarios_example.yml
         - litmus_scenarios: # List of litmus scenarios to load
             - - https://hub.litmuschaos.io/api/chaos/1.10.0?file=charts/generic/node-cpu-hog/rbac.yaml
-              - scenarios/node_cpu_hog_engine.yaml
+              - scenarios/openshift/node_cpu_hog_engine.yaml
         - cluster_shut_down_scenarios:
-            - - scenarios/cluster_shut_down_scenario.yml
-              - scenarios/post_action_shut_down.py
+            - - scenarios/openshift/cluster_shut_down_scenario.yml
+              - scenarios/openshift/post_action_shut_down.py
         - namespace_scenarios:
-            - scenarios/regex_namespace.yaml
-            - scenarios/ingress_namespace.yaml
+            - scenarios/openshift/regex_namespace.yaml
+            - scenarios/openshift/ingress_namespace.yaml
         - zone_outages:
-            - scenarios/zone_outage.yaml
+            - scenarios/openshift/zone_outage.yaml
         - application_outages:
-            - scenarios/app_outage.yaml
+            - scenarios/openshift/app_outage.yaml
         - pvc_scenarios:
-            - scenarios/pvc_scenario.yaml
+            - scenarios/openshift/pvc_scenario.yaml
         - network_chaos:
-            - scenarios/network_chaos.yaml
+            - scenarios/openshift/network_chaos.yaml
 cerberus:
     cerberus_enabled: True # Enable it when cerberus is previously installed
     cerberus_url: http://0.0.0.0:8080 # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal


@@ -34,10 +34,14 @@ def scrape_metrics(
"""
if not prometheus_url:
logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
prometheus_url, prometheus_bearer_token = prometheus.instance(
distribution, prometheus_url, prometheus_bearer_token
)
if distribution == "openshift":
logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
prometheus_url, prometheus_bearer_token = prometheus.instance(
distribution, prometheus_url, prometheus_bearer_token
)
else:
logging.error("Looks like proemtheus url is not defined, exiting")
sys.exit(1)
command = (
"./kube-burner index --uuid "
+ str(uuid)
@@ -69,10 +73,14 @@ def alerts(distribution, prometheus_url, prometheus_bearer_token, start_time, en
"""
if not prometheus_url:
logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
prometheus_url, prometheus_bearer_token = prometheus.instance(
distribution, prometheus_url, prometheus_bearer_token
)
if distribution == "openshift":
logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
prometheus_url, prometheus_bearer_token = prometheus.instance(
distribution, prometheus_url, prometheus_bearer_token
)
else:
logging.error("Looks like proemtheus url is not defined, exiting")
sys.exit(1)
command = (
"./kube-burner check-alerts "
+ " -u "


@@ -120,7 +120,7 @@ def container_killing_in_pod(cont_scenario):
     for pod in pods:
         if type(pod) == list:
             container_names = runcommand.invoke(
-                'oc get pods %s -n %s -o jsonpath="{.spec.containers[*].name}"' % (pod[0], pod[1])
+                'kubectl get pods %s -n %s -o jsonpath="{.spec.containers[*].name}"' % (pod[0], pod[1])
             ).split(" ")
             container_pod_list.append([pod[0], pod[1], container_names])
         else:
@@ -176,7 +176,9 @@ def check_failed_containers(killed_container_list, wait_time):
     while timer <= wait_time:
         for killed_container in killed_container_list:
             # killed_container holds [pod, namespace, container name]
-            pod_output = runcommand.invoke("oc get pods %s -n %s -o yaml" % (killed_container[0], killed_container[1]))
+            pod_output = runcommand.invoke(
+                "kubectl get pods %s -n %s -o yaml" % (killed_container[0], killed_container[1])
+            )
             pod_output_yaml = yaml.full_load(pod_output)
             for statuses in pod_output_yaml["status"]["containerStatuses"]:
                 if statuses["name"] == killed_container[2]:


@@ -175,29 +175,37 @@ def main(cfg):
         # Inject time skew chaos scenarios specified in the config
         elif scenario_type == "time_scenarios":
-            logging.info("Running time skew scenarios")
-            time_actions.run(scenarios_list, config, wait_duration)
+            if distribution == "openshift":
+                logging.info("Running time skew scenarios")
+                time_actions.run(scenarios_list, config, wait_duration)
+            else:
+                logging.error("Time scenarios are currently supported only on openshift")
+                sys.exit(1)
         # Inject litmus based chaos scenarios
         elif scenario_type == "litmus_scenarios":
-            logging.info("Running litmus scenarios")
-            litmus_namespace = "litmus"
-            if not litmus_installed:
-                # Remove Litmus resources before running the scenarios
-                common_litmus.delete_chaos(litmus_namespace)
-                common_litmus.delete_chaos_experiments(litmus_namespace)
-                if litmus_uninstall_before_run:
-                    common_litmus.uninstall_litmus(litmus_version, litmus_namespace)
-                common_litmus.install_litmus(litmus_version, litmus_namespace)
-                common_litmus.deploy_all_experiments(litmus_version, litmus_namespace)
-                litmus_installed = True
-            common_litmus.run(
-                scenarios_list,
-                config,
-                litmus_uninstall,
-                wait_duration,
-                litmus_namespace,
-            )
+            if distribution == "openshift":
+                logging.info("Running litmus scenarios")
+                litmus_namespace = "litmus"
+                if not litmus_installed:
+                    # Remove Litmus resources before running the scenarios
+                    common_litmus.delete_chaos(litmus_namespace)
+                    common_litmus.delete_chaos_experiments(litmus_namespace)
+                    if litmus_uninstall_before_run:
+                        common_litmus.uninstall_litmus(litmus_version, litmus_namespace)
+                    common_litmus.install_litmus(litmus_version, litmus_namespace)
+                    common_litmus.deploy_all_experiments(litmus_version, litmus_namespace)
+                    litmus_installed = True
+                common_litmus.run(
+                    scenarios_list,
+                    config,
+                    litmus_uninstall,
+                    wait_duration,
+                    litmus_namespace,
+                )
+            else:
+                logging.error("Litmus scenarios are currently only supported on openshift")
+                sys.exit(1)
         # Inject cluster shutdown scenarios
         elif scenario_type == "cluster_shut_down_scenarios":


@@ -0,0 +1,8 @@
+scenarios:
+  - name: "kill dns container"
+    namespace: "kube-system"
+    label_selector: "k8s-app=kube-dns"
+    container_name: ""
+    action: "kill 1"
+    count: 1
+    retry_wait: 60
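
This scenario matches pods labeled k8s-app=kube-dns in kube-system, runs kill 1 inside a matched container (terminating PID 1, the entrypoint, so the container runtime restarts it), and waits retry_wait seconds before checking recovery. Done by hand, the injection step is roughly the following, assuming the image ships a kill binary; the pod and container names are hypothetical placeholders:

    import subprocess

    # Kill PID 1 (the container entrypoint) so the runtime restarts the container.
    subprocess.run(
        ["kubectl", "exec", "-n", "kube-system", "kube-dns-0000000000-00000",
         "-c", "kubedns", "--", "kill", "1"],
        check=True,
    )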

scenarios/kube/scheduler.yml (new executable file)

@@ -0,0 +1,32 @@
+config:
+  runStrategy:
+    runs: 1
+    maxSecondsBetweenRuns: 30
+    minSecondsBetweenRuns: 1
+scenarios:
+  - name: "delete scheduler pods"
+    steps:
+      - podAction:
+          matches:
+            - labels:
+                namespace: "kube-system"
+                selector: "k8s-app=kube-scheduler"
+          filters:
+            - randomSample:
+                size: 1
+          actions:
+            - kill:
+                probability: 1
+                force: true
+      - podAction:
+          matches:
+            - labels:
+                namespace: "kube-system"
+                selector: "k8s-app=kube-scheduler"
+          retries:
+            retriesTimeout:
+              timeout: 180
+          actions:
+            - checkPodCount:
+                count: 3
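
The policy kills one randomly sampled kube-scheduler pod, then the second podAction asserts the pod count is back to 3 (i.e. a cluster with three control-plane nodes) within the 180-second retry window. The check step is equivalent to polling along these lines; a hand-rolled sketch, not kraken code:

    import subprocess
    import sys
    import time

    EXPECTED_PODS, TIMEOUT_S = 3, 180  # mirrors count: 3 and timeout: 180 above

    deadline = time.time() + TIMEOUT_S
    while time.time() < deadline:
        out = subprocess.run(
            ["kubectl", "get", "pods", "-n", "kube-system",
             "-l", "k8s-app=kube-scheduler", "--no-headers"],
            capture_output=True, text=True, check=True,
        ).stdout
        if len(out.strip().splitlines()) == EXPECTED_PODS:
            print("kube-scheduler pods recovered")
            sys.exit(0)
        time.sleep(5)
    sys.exit("kube-scheduler pods did not recover in time")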