mirror of https://github.com/krkn-chaos/krkn.git
synced 2026-02-14 18:10:00 +00:00
Add support to run on Kubernetes

This commit:
- Leverages the distribution flag set by the user in the config to skip features that are supported only on OpenShift, so that scenarios can run on Kubernetes.
- Adds sample config and scenario files that work on Kubernetes.
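The gate itself is just a branch on the configured distribution. A minimal sketch of the pattern (illustrative names only; the real change is in the run_kraken.py hunk below):

import logging
import sys

OPENSHIFT_ONLY = {"time_scenarios", "litmus_scenarios"}  # scenario types gated by this commit

def gate_scenario(scenario_type, distribution):
    # Fail fast when the config sets distribution: kubernetes but the
    # scenario type relies on OpenShift-specific behavior.
    if scenario_type in OPENSHIFT_ONLY and distribution != "openshift":
        logging.error("%s are currently supported only on openshift", scenario_type)
        sys.exit(1)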
34 README.md
@@ -56,27 +56,19 @@ Instructions on how to setup the config and the options supported can be found a
 ### Kubernetes/OpenShift chaos scenarios supported

-- [Pod Scenarios](docs/pod_scenarios.md)
-- [Container Scenarios](docs/container_scenarios.md)
-- [Node Scenarios](docs/node_scenarios.md)
-- [Time Scenarios](docs/time_scenarios.md)
-- [Litmus Scenarios](docs/litmus_scenarios.md)
-- [Cluster Shut Down Scenarios](docs/cluster_shut_down_scenarios.md)
-- [Namespace Scenarios](docs/namespace_scenarios.md)
-- [Zone Outage Scenarios](docs/zone_outage.md)
-- [Application_outages](docs/application_outages.md)
-- [PVC scenario](docs/pvc_scenario.md)
-- [Network_Chaos](docs/network_chaos.md)
+Scenario type                                                       | Kubernetes         | OpenShift
+------------------------------------------------------------------- | ------------------ | -------------------- |
+[Pod Scenarios](docs/pod_scenarios.md)                              | :heavy_check_mark: | :heavy_check_mark: |
+[Container Scenarios](docs/container_scenarios.md)                  | :heavy_check_mark: | :heavy_check_mark: |
+[Node Scenarios](docs/node_scenarios.md)                            | :heavy_check_mark: | :heavy_check_mark: |
+[Time Scenarios](docs/time_scenarios.md)                            | :x:                | :heavy_check_mark: |
+[Litmus Scenarios](docs/litmus_scenarios.md)                        | :x:                | :heavy_check_mark: |
+[Cluster Shut Down Scenarios](docs/cluster_shut_down_scenarios.md)  | :heavy_check_mark: | :heavy_check_mark: |
+[Namespace Scenarios](docs/namespace_scenarios.md)                  | :heavy_check_mark: | :heavy_check_mark: |
+[Zone Outage Scenarios](docs/zone_outage.md)                        | :heavy_check_mark: | :heavy_check_mark: |
+[Application_outages](docs/application_outages.md)                  | :heavy_check_mark: | :heavy_check_mark: |
+[PVC scenario](docs/pvc_scenario.md)                                | :heavy_check_mark: | :heavy_check_mark: |
+[Network_Chaos](docs/network_chaos.md)                              | :heavy_check_mark: | :heavy_check_mark: |

 ### Kraken scenario pass/fail criteria and report
@@ -10,40 +10,40 @@ kraken:
     litmus_uninstall_before_run: True # If you want to uninstall litmus before a new run starts
     chaos_scenarios: # List of policies/chaos scenarios to load
         - container_scenarios: # List of chaos pod scenarios to load
-            - - scenarios/container_etcd.yml
+            - - scenarios/openshift/container_etcd.yml
         - pod_scenarios:
-            - - scenarios/etcd.yml
-            - - scenarios/regex_openshift_pod_kill.yml
-              - scenarios/post_action_regex.py
+            - - scenarios/openshift/etcd.yml
+            - - scenarios/openshift/regex_openshift_pod_kill.yml
+              - scenarios/openshift/post_action_regex.py
         - node_scenarios: # List of chaos node scenarios to load
-            - scenarios/node_scenarios_example.yml
+            - scenarios/openshift/node_scenarios_example.yml
         - pod_scenarios:
-            - - scenarios/openshift-apiserver.yml
-            - - scenarios/openshift-kube-apiserver.yml
+            - - scenarios/openshift/openshift-apiserver.yml
+            - - scenarios/openshift/openshift-kube-apiserver.yml
         - time_scenarios: # List of chaos time scenarios to load
-            - scenarios/time_scenarios_example.yml
+            - scenarios/openshift/time_scenarios_example.yml
         - litmus_scenarios: # List of litmus scenarios to load
-            - - scenarios/templates/litmus-rbac.yaml
-              - scenarios/node_cpu_hog_engine.yaml
-            - - scenarios/templates/litmus-rbac.yaml
-              - scenarios/node_mem_engine.yaml
-            - - scenarios/templates/litmus-rbac.yaml
-              - scenarios/node_io_engine.yaml
+            - - scenarios/openshift/templates/litmus-rbac.yaml
+              - scenarios/openshift/node_cpu_hog_engine.yaml
+            - - scenarios/openshift/templates/litmus-rbac.yaml
+              - scenarios/openshift/node_mem_engine.yaml
+            - - scenarios/openshift/templates/litmus-rbac.yaml
+              - scenarios/openshift/node_io_engine.yaml
         - cluster_shut_down_scenarios:
-            - - scenarios/cluster_shut_down_scenario.yml
-              - scenarios/post_action_shut_down.py
+            - - scenarios/openshift/cluster_shut_down_scenario.yml
+              - scenarios/openshift/post_action_shut_down.py
         - namespace_scenarios:
-            - - scenarios/regex_namespace.yaml
-            - - scenarios/ingress_namespace.yaml
-              - scenarios/post_action_namespace.py
+            - - scenarios/openshift/regex_namespace.yaml
+            - - scenarios/openshift/ingress_namespace.yaml
+              - scenarios/openshift/post_action_namespace.py
         - zone_outages:
-            - scenarios/zone_outage.yaml
+            - scenarios/openshift/zone_outage.yaml
         - application_outages:
-            - scenarios/app_outage.yaml
+            - scenarios/openshift/app_outage.yaml
         - pvc_scenarios:
-            - scenarios/pvc_scenario.yaml
+            - scenarios/openshift/pvc_scenario.yaml
         - network_chaos:
-            - scenarios/network_chaos.yaml
+            - scenarios/openshift/network_chaos.yaml

 cerberus:
     cerberus_enabled: False # Enable it when cerberus is previously installed
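A note on the chaos_scenarios blocks above: each "- -" entry is plain YAML for a list nested inside a list, which is how a scenario file gets paired with an optional post-action script. A quick, self-contained illustration of that parsing (illustrative snippet, not krkn code; requires PyYAML):

import yaml

snippet = """
- pod_scenarios:
    - - scenarios/openshift/etcd.yml
      - scenarios/openshift/post_action_regex.py
"""

parsed = yaml.safe_load(snippet)
pair = parsed[0]["pod_scenarios"][0]
# pair is the scenario file first, its post-action script second
assert pair == ["scenarios/openshift/etcd.yml", "scenarios/openshift/post_action_regex.py"]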
38 config/config_kubernetes.yaml Normal file
@@ -0,0 +1,38 @@
+kraken:
+    distribution: kubernetes # Distribution can be kubernetes or openshift
+    kubeconfig_path: /root/.kube/config # Path to kubeconfig
+    exit_on_failure: False # Exit when a post action scenario fails
+    port: 8081
+    publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
+    signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details
+    litmus_version: v1.13.6 # Litmus version to install
+    litmus_uninstall: False # If you want to uninstall litmus if failure
+    litmus_uninstall_before_run: True # If you want to uninstall litmus before a new run starts
+    chaos_scenarios: # List of policies/chaos scenarios to load
+        - container_scenarios: # List of chaos pod scenarios to load
+            - - scenarios/kube/container_dns.yml
+        - pod_scenarios:
+            - - scenarios/kube/scheduler.yml
+
+cerberus:
+    cerberus_enabled: False # Enable it when cerberus is previously installed
+    cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
+    check_applicaton_routes: False # When enabled will look for application unavailability using the routes specified in the cerberus config and fails the run
+
+performance_monitoring:
+    deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
+    repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
+    kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
+    capture_metrics: False
+    config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
+    metrics_profile_path: config/metrics-aggregated.yaml
+    prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
+    prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
+    uuid: # uuid for the run is generated by default if not set
+    enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
+    alert_profile: config/alerts # Path to alert profile with the prometheus queries
+
+tunings:
+    wait_duration: 60 # Duration to wait between each chaos scenario
+    iterations: 1 # Number of times to execute the scenarios
+    daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever
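With this file in place, a Kubernetes run would presumably be launched the usual way, e.g. python run_kraken.py --config config/config_kubernetes.yaml. Note that on plain Kubernetes the performance_monitoring section needs prometheus_url and prometheus_bearer_token filled in by hand, since there is no OpenShift route or token for kraken to discover (see the kube_burner hunks below).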
@@ -10,33 +10,34 @@ kraken:
     litmus_uninstall_before_run: True # If you want to uninstall litmus before a new run starts
     chaos_scenarios: # List of policies/chaos scenarios to load
         - pod_scenarios: # List of chaos pod scenarios to load
-            - - scenarios/etcd.yml
-            - - scenarios/regex_openshift_pod_kill.yml
-              - scenarios/post_action_regex.py
+            - - scenarios/openshift/etcd.yml
+            - - scenarios/openshift/regex_openshift_pod_kill.yml
+              - scenarios/openshift/post_action_regex.py
         - node_scenarios: # List of chaos node scenarios to load
-            - scenarios/node_scenarios_example.yml
+            - scenarios/openshift/node_scenarios_example.yml
         - pod_scenarios:
-            - - scenarios/openshift-apiserver.yml
-            - - scenarios/openshift-kube-apiserver.yml
+            - - scenarios/openshift/openshift-apiserver.yml
+            - - scenarios/openshift/openshift-kube-apiserver.yml
         - time_scenarios: # List of chaos time scenarios to load
-            - scenarios/time_scenarios_example.yml
+            - scenarios/openshift/time_scenarios_example.yml
         - litmus_scenarios: # List of litmus scenarios to load
             - - https://hub.litmuschaos.io/api/chaos/1.10.0?file=charts/generic/node-cpu-hog/rbac.yaml
-              - scenarios/node_cpu_hog_engine.yaml
+              - scenarios/openshift/node_cpu_hog_engine.yaml
         - cluster_shut_down_scenarios:
-            - - scenarios/cluster_shut_down_scenario.yml
-              - scenarios/post_action_shut_down.py
+            - - scenarios/openshift/cluster_shut_down_scenario.yml
+              - scenarios/openshift/post_action_shut_down.py
         - namespace_scenarios:
-            - scenarios/regex_namespace.yaml
-            - scenarios/ingress_namespace.yaml
+            - scenarios/openshift/regex_namespace.yaml
+            - scenarios/openshift/ingress_namespace.yaml
         - zone_outages:
-            - scenarios/zone_outage.yaml
+            - scenarios/openshift/zone_outage.yaml
         - application_outages:
-            - scenarios/app_outage.yaml
+            - scenarios/openshift/app_outage.yaml
         - pvc_scenarios:
-            - scenarios/pvc_scenario.yaml
+            - scenarios/openshift/pvc_scenario.yaml
         - network_chaos:
-            - scenarios/network_chaos.yaml
+            - scenarios/openshift/network_chaos.yaml

 cerberus:
     cerberus_enabled: True # Enable it when cerberus is previously installed
     cerberus_url: http://0.0.0.0:8080 # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
@@ -34,10 +34,14 @@ def scrape_metrics(
     """

     if not prometheus_url:
-        logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
-        prometheus_url, prometheus_bearer_token = prometheus.instance(
-            distribution, prometheus_url, prometheus_bearer_token
-        )
+        if distribution == "openshift":
+            logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
+            prometheus_url, prometheus_bearer_token = prometheus.instance(
+                distribution, prometheus_url, prometheus_bearer_token
+            )
+        else:
+            logging.error("Looks like prometheus url is not defined, exiting")
+            sys.exit(1)
     command = (
         "./kube-burner index --uuid "
         + str(uuid)
@@ -69,10 +73,14 @@ def alerts(distribution, prometheus_url, prometheus_bearer_token, start_time, en
     """

     if not prometheus_url:
-        logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
-        prometheus_url, prometheus_bearer_token = prometheus.instance(
-            distribution, prometheus_url, prometheus_bearer_token
-        )
+        if distribution == "openshift":
+            logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
+            prometheus_url, prometheus_bearer_token = prometheus.instance(
+                distribution, prometheus_url, prometheus_bearer_token
+            )
+        else:
+            logging.error("Looks like prometheus url is not defined, exiting")
+            sys.exit(1)
     command = (
         "./kube-burner check-alerts "
         + " -u "
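Both kube_burner hunks add the same guard. The shared logic could be factored into a helper along these lines (a sketch of the pattern under the commit's apparent intent, not code from the repo):

import logging
import sys

def resolve_prometheus(distribution, prometheus_url, prometheus_bearer_token, prometheus):
    """Return a usable (url, bearer_token) pair or exit the run.

    On OpenShift the in-cluster instance can be discovered via
    prometheus.instance(); on plain Kubernetes the caller must supply
    both values in the config.
    """
    if prometheus_url:
        return prometheus_url, prometheus_bearer_token
    if distribution == "openshift":
        logging.info("prometheus_url is not defined, trying the default instance on the cluster")
        return prometheus.instance(distribution, prometheus_url, prometheus_bearer_token)
    logging.error("prometheus url is not defined, exiting")
    sys.exit(1)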
@@ -120,7 +120,7 @@ def container_killing_in_pod(cont_scenario):
     for pod in pods:
         if type(pod) == list:
             container_names = runcommand.invoke(
-                'oc get pods %s -n %s -o jsonpath="{.spec.containers[*].name}"' % (pod[0], pod[1])
+                'kubectl get pods %s -n %s -o jsonpath="{.spec.containers[*].name}"' % (pod[0], pod[1])
             ).split(" ")
             container_pod_list.append([pod[0], pod[1], container_names])
         else:
@@ -176,7 +176,9 @@ def check_failed_containers(killed_container_list, wait_time):
     while timer <= wait_time:
         for killed_container in killed_container_list:
             # killed_container holds [pod name, namespace, container name]
-            pod_output = runcommand.invoke("oc get pods %s -n %s -o yaml" % (killed_container[0], killed_container[1]))
+            pod_output = runcommand.invoke(
+                "kubectl get pods %s -n %s -o yaml" % (killed_container[0], killed_container[1])
+            )
             pod_output_yaml = yaml.full_load(pod_output)
             for statuses in pod_output_yaml["status"]["containerStatuses"]:
                 if statuses["name"] == killed_container[2]:
@@ -175,29 +175,37 @@ def main(cfg):
         # Inject time skew chaos scenarios specified in the config
         elif scenario_type == "time_scenarios":
-            logging.info("Running time skew scenarios")
-            time_actions.run(scenarios_list, config, wait_duration)
+            if distribution == "openshift":
+                logging.info("Running time skew scenarios")
+                time_actions.run(scenarios_list, config, wait_duration)
+            else:
+                logging.error("Time scenarios are currently supported only on openshift")
+                sys.exit(1)

         # Inject litmus based chaos scenarios
         elif scenario_type == "litmus_scenarios":
-            logging.info("Running litmus scenarios")
-            litmus_namespace = "litmus"
-            if not litmus_installed:
-                # Remove Litmus resources before running the scenarios
-                common_litmus.delete_chaos(litmus_namespace)
-                common_litmus.delete_chaos_experiments(litmus_namespace)
-                if litmus_uninstall_before_run:
-                    common_litmus.uninstall_litmus(litmus_version, litmus_namespace)
-                common_litmus.install_litmus(litmus_version, litmus_namespace)
-                common_litmus.deploy_all_experiments(litmus_version, litmus_namespace)
-                litmus_installed = True
-            common_litmus.run(
-                scenarios_list,
-                config,
-                litmus_uninstall,
-                wait_duration,
-                litmus_namespace,
-            )
+            if distribution == "openshift":
+                logging.info("Running litmus scenarios")
+                litmus_namespace = "litmus"
+                if not litmus_installed:
+                    # Remove Litmus resources before running the scenarios
+                    common_litmus.delete_chaos(litmus_namespace)
+                    common_litmus.delete_chaos_experiments(litmus_namespace)
+                    if litmus_uninstall_before_run:
+                        common_litmus.uninstall_litmus(litmus_version, litmus_namespace)
+                    common_litmus.install_litmus(litmus_version, litmus_namespace)
+                    common_litmus.deploy_all_experiments(litmus_version, litmus_namespace)
+                    litmus_installed = True
+                common_litmus.run(
+                    scenarios_list,
+                    config,
+                    litmus_uninstall,
+                    wait_duration,
+                    litmus_namespace,
+                )
+            else:
+                logging.error("Litmus scenarios are currently only supported on openshift")
+                sys.exit(1)

         # Inject cluster shutdown scenarios
         elif scenario_type == "cluster_shut_down_scenarios":
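For context, the distribution value branched on in this hunk comes from the kraken section of the config; it is presumably read along these lines (a guess at the exact call, not verified against the repo):

    distribution = config["kraken"].get("distribution", "openshift")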
8 scenarios/kube/container_dns.yml Executable file
@@ -0,0 +1,8 @@
+scenarios:
+- name: "kill dns container"
+  namespace: "kube-system"
+  label_selector: "k8s-app=kube-dns"
+  container_name: ""
+  action: "kill 1"
+  count: 1
+  retry_wait: 60
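Roughly, per docs/container_scenarios.md: this matches pods in the kube-system namespace carrying the k8s-app=kube-dns label, the empty container_name appears to let kraken pick a container inside the matched pod, action is the signal sent to it (kill 1, i.e. SIGHUP), count is how many containers to target, and retry_wait bounds how long to wait for the killed container to come back before the scenario is marked failed.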
32 scenarios/kube/scheduler.yml Executable file
@@ -0,0 +1,32 @@
+config:
+  runStrategy:
+    runs: 1
+    maxSecondsBetweenRuns: 30
+    minSecondsBetweenRuns: 1
+scenarios:
+- name: "delete scheduler pods"
+  steps:
+  - podAction:
+      matches:
+        - labels:
+            namespace: "kube-system"
+            selector: "k8s-app=kube-scheduler"
+      filters:
+        - randomSample:
+            size: 1
+      actions:
+        - kill:
+            probability: 1
+            force: true
+  - podAction:
+      matches:
+        - labels:
+            namespace: "kube-system"
+            selector: "k8s-app=kube-scheduler"
+      retries:
+        retriesTimeout:
+          timeout: 180
+
+      actions:
+        - checkPodCount:
+            count: 3
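This appears to be the PowerfulSeal-style policy format that kraken's pod scenarios consume: the first podAction matches kube-scheduler pods in kube-system, randomly samples one, and force-kills it; the second podAction re-matches the same selector and, retrying for up to 180 seconds, checks that the pod count is back to 3 (the expected scheduler replicas on a cluster with three control-plane nodes), which serves as the scenario's pass/fail criterion.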