mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-02-14 18:10:00 +00:00
Adding litmus scenario options
This commit is contained in:
@@ -1,7 +1,9 @@
|
||||
kraken:
|
||||
kubeconfig_path: /root/.kube/config # Path to kubeconfig
|
||||
exit_on_failure: False # Exit when a post action scenario fails
|
||||
chaos_scenarios: # List of policies/chaos scenarios to load
|
||||
litmus_version: v1.10.0 # Litmus version to install
|
||||
litmus_uninstall: False # Uninstall litmus and delete the applied scenario resources after the run
|
||||
chaos_scenarios: # List of policies/chaos scenarios to load
|
||||
- pod_scenarios: # List of chaos pod scenarios to load
|
||||
- - scenarios/etcd.yml
|
||||
- - scenarios/regex_openshift_pod_kill.yml
|
||||
@@ -13,6 +15,9 @@ kraken:
|
||||
- - scenarios/openshift-kube-apiserver.yml
|
||||
- time_scenarios: # List of chaos time scenarios to load
|
||||
- scenarios/time_scenarios_example.yml
|
||||
- litmus_scenarios: # List of litmus scenarios to load
|
||||
- - https://hub.litmuschaos.io/api/chaos/1.10.0?file=charts/generic/node-cpu-hog/rbac.yaml
|
||||
- scenarios/node_hog_engine.yaml
|
||||
|
||||
cerberus:
|
||||
cerberus_enabled: False # Enable it when cerberus is previously installed
|
||||
|
||||
41
docs/litmus_scenarios.md
Normal file
41
docs/litmus_scenarios.md
Normal file
@@ -0,0 +1,41 @@
|
||||
### Litmus Scenarios
|
||||
Kraken consumes [Litmus](https://github.com/litmuschaos/litmus) under the hood for some infrastructure, pod, and node scenarios
|
||||
|
||||
The official Litmus documentation, which has more information on the specifics of Litmus resources, can be found [here](https://docs.litmuschaos.io/docs/next/getstarted/)
|
||||
|
||||
|
||||
#### Litmus Chaos Custom Resources
|
||||
There are 3 custom resources that are created during each Litmus scenario. Below is a description of the resources:
|
||||
* ChaosEngine: A resource to link a Kubernetes application or Kubernetes node to a ChaosExperiment. ChaosEngine is watched by Litmus' Chaos-Operator which then invokes Chaos-Experiments
|
||||
* ChaosExperiment: A resource to group the configuration parameters of a chaos experiment. ChaosExperiment CRs are created by the operator when experiments are invoked by ChaosEngine.
|
||||
* ChaosResult: A resource to hold the results of a chaos-experiment. The Chaos-exporter reads the results and exports the metrics into a configured Prometheus server.
|
||||
|
||||
### Understanding Litmus Scenarios
|
||||
|
||||
To run Litmus scenarios we need to apply 3 different resources/yaml files to our cluster
|
||||
1. **Chaos experiments** contain the actual chaos details of a scenario
|
||||
|
||||
i. This is installed automatically by Kraken (does not need to be specified in kraken scenario configuration)
|
||||
|
||||
2. **Service Account**: should be created to allow chaosengine to run experiments in your application namespace. Usually sets just enough permissions to a specific namespace to be able to run the experiment properly
|
||||
|
||||
i. This can be defined using either a link to a yaml file or a downloaded file in the scenarios folder
|
||||
|
||||
3. **Chaos Engine** connects the application instance to a Chaos Experiment. This is where you define the specifics of your scenario, e.g. the name of the node or pod you want to cause chaos within
|
||||
|
||||
i. This is a downloaded yaml file in the scenarios folder, full list of scenarios can be found [here](https://hub.litmuschaos.io/)
|
||||
|
||||
**NOTE**: By default all chaos experiments will be installed based on the version you give in the config file.
|
||||
|
||||
Adding a new Litmus-based scenario is as simple as adding references to 2 new yaml files (the Service Account and Chaos Engine files for your scenario) in the Kraken config.
|
||||
|
||||
### Current Scenarios
|
||||
|
||||
The following are the scenarios for which a chaos scenario config exists today.
|
||||
|
||||
Component | Description | Working |
|
||||
------------------------ | ---------------------------------------------------------------------------------------------------| ------------------------- |
|
||||
Node CPU Hog | Chaos scenario that hogs up the CPU on a defined node for a specific amount of time | :heavy_check_mark: |
|
||||
|
||||
|
||||
|
||||
103
kraken/litmus/common_litmus.py
Normal file
103
kraken/litmus/common_litmus.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import kraken.invoke.command as runcommand
|
||||
import logging
|
||||
import time
|
||||
import sys
|
||||
|
||||
|
||||
# Install litmus and wait until pod is running
|
||||
def install_litmus(version):
    """Install the Litmus operator for ``version`` and wait for its deployment.

    Runs three commands in order through ``runcommand.invoke``:

    1. ``kubectl apply`` of the operator manifest published for ``version``.
    2. ``oc patch`` adding an ``ANALYTICS=FALSE`` entry to the env list of the
       first container of ``deployment.apps/chaos-operator-ce`` in the
       ``litmus`` namespace.
    3. ``oc wait`` for that deployment to report ``condition=Available``.
    """
    operator_manifest = (
        "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version
    )
    apply_operator = "kubectl apply -f " + operator_manifest

    # JSON patch; the "/env/-" path adds the entry at the end of the env array.
    disable_analytics = (
        "oc patch -n litmus deployment.apps/chaos-operator-ce --type=json --patch ' "
        "[ { \"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/env/-\", "
        "\"value\": { \"name\": \"ANALYTICS\", \"value\": \"FALSE\" } } ]'"
    )

    wait_for_operator = (
        "oc wait deploy -n litmus chaos-operator-ce --for=condition=Available"
    )

    for command in (apply_operator, disable_analytics, wait_for_operator):
        runcommand.invoke(command)
|
||||
|
||||
|
||||
def deploy_all_experiments(version_string):
    """Apply the generic Litmus chaos experiments for the given version.

    Args:
        version_string: Litmus version in the form ``v<number>...`` (for
            example ``v1.10.0``). The leading ``v`` is stripped before the
            version is inserted into the chaos hub URL.

    Exits the process with status 1 (after logging an error) when the value
    does not start with ``v`` immediately followed by a digit.
    """
    # Config values may not be strings (e.g. YAML parses 1.10 as a float);
    # normalise so validation reports a clear error instead of raising.
    version_string = str(version_string)

    # The error message promises "'v' followed by a number", so check the
    # character after the "v" as well; "v" or "vlatest" would otherwise
    # produce a broken hub URL.
    if not (version_string.startswith("v") and version_string[1:2].isdigit()):
        logging.error("Incorrect version string for litmus, needs to start with 'v' "
                      "followed by a number")
        sys.exit(1)
    version = version_string[1:]

    runcommand.invoke("kubectl apply -f "
                      "https://hub.litmuschaos.io/api/chaos/%s?file=charts/generic/experiments.yaml"
                      % version)
|
||||
|
||||
|
||||
def delete_experiments():
    """Run ``kubectl delete chaosengine --all`` (no namespace flag is passed)."""
    delete_engines = "kubectl delete chaosengine --all"
    runcommand.invoke(delete_engines)
|
||||
|
||||
|
||||
# Check status of experiment
|
||||
def check_experiment(engine_name, experiment_name, namespace):
    """Poll a ChaosEngine and its ChaosResult and report whether it passed.

    Args:
        engine_name: name of the ChaosEngine resource to query.
        experiment_name: experiment name; the ChaosResult queried is named
            ``<engine_name>-<experiment_name>``.
        namespace: namespace passed to every ``kubectl`` query.

    Returns:
        True only when the ChaosResult verdict is ``Pass``. False when the
        engine or result polling runs out of tries, when a polled status
        contains ``notfound``, or when the final verdict is anything else
        (the ``failStep`` field is logged in that case).
    """
    poll_limit = 30
    poll_interval = 10  # seconds between two kubectl queries

    def fetch_engine_state():
        # Raw output of the engine status query (not stripped).
        return runcommand.invoke(
            "kubectl get chaosengines/%s -n %s -o jsonpath="
            "'{.status.engineStatus}'" % (engine_name, namespace)
        )

    def fetch_result_field(field):
        # Raw output of one `.status.experimentstatus.<field>` query.
        return runcommand.invoke(
            "kubectl get chaosresult %s-%s -n %s -o "
            "jsonpath='{.status.experimentstatus.%s}'"
            % (engine_name, experiment_name, namespace, field)
        )

    # Phase 1: wait for the engine to be running or completed.
    engine_output = fetch_engine_state()
    engine_state = engine_output.strip()
    engine_polls = 0
    while engine_state.lower() not in ("running", "completed"):
        time.sleep(poll_interval)
        logging.info("Waiting for engine to start running.")
        engine_output = fetch_engine_state()
        engine_state = engine_output.strip()
        if engine_polls >= poll_limit:
            logging.error("Chaos engine took longer than 5 minutes to be running or complete")
            return False
        engine_polls += 1
        # need to see if error in run
        if "notfound" in engine_state.lower():
            logging.info("Chaos engine was not found")
            return False

    if not engine_output:
        return False

    # Phase 2: wait for the result verdict to leave the "Awaited" state.
    verdict = fetch_result_field("verdict").strip()
    result_polls = 0
    while verdict == "Awaited":
        logging.info("Waiting for chaos result to finish, sleeping 10 seconds")
        time.sleep(poll_interval)
        verdict = fetch_result_field("verdict").strip()
        if result_polls >= poll_limit:
            logging.error("Chaos results took longer than 5 minutes to get a final result")
            return False
        result_polls += 1
        if "notfound" in verdict.lower():
            logging.info("Chaos result was not found")
            return False

    if verdict != "Pass":
        fail_step = fetch_result_field("failStep")
        logging.info("Chaos result failed information: " + str(fail_step))
        return False
    return True
|
||||
|
||||
|
||||
# Delete all chaos engines in a given namespace
|
||||
def delete_chaos(namespace):
    """Delete every chaosengine, chaosexperiment and chaosresult in ``namespace``.

    One ``kubectl delete <kind> --all -n <namespace>`` command is issued per
    resource kind, in the order listed below.
    """
    for resource_kind in ("chaosengine", "chaosexperiment", "chaosresult"):
        runcommand.invoke("kubectl delete %s --all -n %s" % (resource_kind, namespace))
|
||||
|
||||
|
||||
# Uninstall litmus operator
|
||||
def uninstall_litmus(version):
    """Delete the resources defined by the Litmus operator manifest for ``version``."""
    operator_manifest = (
        "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version
    )
    runcommand.invoke("kubectl delete -f " + operator_manifest)
|
||||
@@ -10,6 +10,7 @@ import requests
|
||||
import pyfiglet
|
||||
import kraken.kubernetes.client as kubecli
|
||||
import kraken.invoke.command as runcommand
|
||||
import kraken.litmus.common_litmus as common_litmus
|
||||
import kraken.node_actions.common_node_functions as nodeaction
|
||||
from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
|
||||
from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios
|
||||
@@ -229,6 +230,53 @@ def time_scenarios(scenarios_list, config):
|
||||
publish_kraken_status(config, not_reset)
|
||||
|
||||
|
||||
def litmus_scenarios(scenarios_list, config, litmus_namespaces, litmus_uninstall):
    """Apply each litmus scenario and log whether its experiments passed.

    Args:
        scenarios_list: iterable of scenarios; each scenario is an iterable of
            manifest references, either an ``http(s)://`` URL or a local path.
        config: passed unchanged to ``cerberus_integration``.
        litmus_namespaces: list collecting the namespace of every ChaosEngine
            that was applied. It is extended in place (without duplicates).
        litmus_uninstall: when truthy, the scenario's manifests are removed
            with ``kubectl delete -f`` after a failed experiment and again
            once the whole scenario has been processed.

    Returns:
        The ``litmus_namespaces`` list that was passed in.

    Any exception raised while processing a scenario is logged and the loop
    moves on to the next scenario.
    """
    # Loop to run the scenarios starts here
    for l_scenario in scenarios_list:
        # Bound before the try block so the except handler can always report
        # something, even when the failure happens before the first item.
        item = None
        try:
            for item in l_scenario:
                runcommand.invoke("kubectl apply -f %s" % item)

                # Only real URLs are fetched; a local path that merely
                # contains "http" is still opened as a file.
                if str(item).startswith(("http://", "https://")):
                    response = requests.get(item)
                    # Fail loudly on 4xx/5xx instead of parsing an error page
                    # as YAML.
                    response.raise_for_status()
                    yaml_item = list(yaml.safe_load_all(response.content))[0]
                else:
                    with open(item, "r") as manifest:
                        logging.info("opened yaml" + str(item))
                        yaml_item = list(yaml.safe_load_all(manifest))[0]

                # Only ChaosEngine manifests have experiments to verify.
                if yaml_item['kind'] != "ChaosEngine":
                    continue

                engine_name = yaml_item['metadata']['name']
                namespace = yaml_item['metadata']['namespace']
                # Record each namespace once so later cleanup is not repeated.
                if namespace not in litmus_namespaces:
                    litmus_namespaces.append(namespace)

                for expr in yaml_item['spec']['experiments']:
                    expr_name = expr['name']
                    experiment_result = common_litmus.check_experiment(engine_name,
                                                                       expr_name,
                                                                       namespace)
                    if experiment_result:
                        logging.info("Scenario: %s has been successfully injected!"
                                     % item)
                    else:
                        logging.info("Scenario: %s was not successfully injected!"
                                     % item)
                        if litmus_uninstall:
                            for l_item in l_scenario:
                                logging.info('item ' + str(l_item))
                                runcommand.invoke("kubectl delete -f %s" % l_item)

            # Final cleanup also covers manifests applied after a failure.
            if litmus_uninstall:
                for l_item in l_scenario:
                    logging.info('item ' + str(l_item))
                    runcommand.invoke("kubectl delete -f %s" % l_item)
            cerberus_integration(config)
            logging.info("Waiting for the specified duration: %s" % wait_duration)
            time.sleep(wait_duration)
        except Exception as e:
            # Fall back to the scenario itself when no item was reached.
            failed = item if item is not None else l_scenario
            logging.error("Failed to run litmus scenario: %s. Encountered "
                          "the following exception: %s" % (failed, e))
    return litmus_namespaces
|
||||
|
||||
|
||||
# Main function
|
||||
def main(cfg):
|
||||
# Start kraken
|
||||
@@ -242,6 +290,8 @@ def main(cfg):
|
||||
global kubeconfig_path, wait_duration
|
||||
kubeconfig_path = config["kraken"].get("kubeconfig_path", "")
|
||||
chaos_scenarios = config["kraken"].get("chaos_scenarios", [])
|
||||
litmus_version = config['kraken'].get("litmus_version", 'v1.9.1')
|
||||
litmus_uninstall = config['kraken'].get("litmus_uninstall", False)
|
||||
wait_duration = config["tunings"].get("wait_duration", 60)
|
||||
iterations = config["tunings"].get("iterations", 1)
|
||||
daemon_mode = config["tunings"].get("daemon_mode", False)
|
||||
@@ -277,6 +327,8 @@ def main(cfg):
|
||||
iterations = int(iterations)
|
||||
|
||||
failed_post_scenarios = []
|
||||
litmus_namespaces = []
|
||||
litmus_installed = False
|
||||
# Loop to run the chaos starts here
|
||||
while (int(iteration) < iterations):
|
||||
# Inject chaos scenarios specified in the config
|
||||
@@ -298,9 +350,23 @@ def main(cfg):
|
||||
# Inject time skew chaos scenarios specified in the config
|
||||
elif scenario_type == "time_scenarios":
|
||||
time_scenarios(scenarios_list, config)
|
||||
elif scenario_type == "litmus_scenarios":
|
||||
if not litmus_installed:
|
||||
common_litmus.install_litmus(litmus_version)
|
||||
common_litmus.deploy_all_experiments(litmus_version)
|
||||
litmus_installed = True
|
||||
litmus_namespaces = litmus_scenarios(scenarios_list, config,
|
||||
litmus_namespaces,
|
||||
litmus_uninstall)
|
||||
|
||||
iteration += 1
|
||||
logging.info("")
|
||||
if litmus_uninstall and litmus_installed:
|
||||
for namespace in litmus_namespaces:
|
||||
common_litmus.delete_chaos(namespace)
|
||||
common_litmus.delete_experiments()
|
||||
common_litmus.uninstall_litmus(litmus_version)
|
||||
|
||||
if failed_post_scenarios:
|
||||
logging.error("Post scenarios are still failing at the end of all iterations")
|
||||
sys.exit(1)
|
||||
|
||||
25
scenarios/node_hog_engine.yaml
Normal file
25
scenarios/node_hog_engine.yaml
Normal file
@@ -0,0 +1,25 @@
|
||||
apiVersion: litmuschaos.io/v1alpha1
|
||||
kind: ChaosEngine
|
||||
metadata:
|
||||
name: nginx-chaos
|
||||
namespace: default
|
||||
spec:
|
||||
# It can be true/false
|
||||
annotationCheck: 'false'
|
||||
# It can be active/stop
|
||||
engineState: 'active'
|
||||
chaosServiceAccount: node-cpu-hog-sa
|
||||
monitoring: false
|
||||
# It can be delete/retain
|
||||
jobCleanUpPolicy: 'delete'
|
||||
experiments:
|
||||
- name: node-cpu-hog
|
||||
spec:
|
||||
components:
|
||||
env:
|
||||
# set chaos duration (in sec) as desired
|
||||
- name: TOTAL_CHAOS_DURATION
|
||||
value: '60'
|
||||
# ENTER THE COMMA SEPARATED TARGET NODES NAME
|
||||
- name: TARGET_NODES
|
||||
value: ''
|
||||
Reference in New Issue
Block a user