From 6280a392500b9c17e58f6b58f27eb194dc2c9405 Mon Sep 17 00:00:00 2001
From: gsteeds
Date: Wed, 4 May 2022 00:26:02 -0500
Subject: [PATCH] Fixed links within docs; also read through the docs files and corrected some spelling and grammar issues.

---
 CI/config/common_test_config.yaml   |  38 +++----
 README.md                           |  48 ++++-----
 containers/README.md                |   8 +-
 docs/alerts.md                      |   8 +-
 docs/application_outages.md         |   2 +-
 docs/cloud_setup.md                 |  18 ++--
 docs/cluster_shut_down_scenarios.md |  12 +--
 docs/config.md                      |   4 +-
 docs/container_scenarios.md         |  14 +--
 docs/contribute.md                  |  28 ++---
 docs/getting_started.md             |  24 ++---
 docs/index.md                       | 154 ++++++++++++++--------------
 docs/installation.md                |  14 +--
 docs/litmus_scenarios.md            |  32 +++---
 docs/metrics.md                     |  10 +-
 docs/namespace_scenarios.md         |  26 ++---
 docs/network_chaos.md               |  16 +--
 docs/node_scenarios.md              |  66 ++++++------
 docs/pod_scenarios.md               |  18 ++--
 docs/pvc_scenario.md                |  28 ++---
 docs/signal.md                      |  26 ++---
 docs/test_your_changes.md           |  26 ++---
 docs/time_scenarios.md              |  16 +--
 docs/zone_outage.md                 |  20 ++--
 24 files changed, 328 insertions(+), 328 deletions(-)

diff --git a/CI/config/common_test_config.yaml b/CI/config/common_test_config.yaml
index f9637766..1967c4f1 100644
--- a/CI/config/common_test_config.yaml
+++ b/CI/config/common_test_config.yaml
@@ -1,31 +1,31 @@
 kraken:
-    distribution: openshift # Distribution can be kubernetes or openshift
-    kubeconfig_path: /root/.kube/config # Path to kubeconfig
-    exit_on_failure: False # Exit when a post action scenario fails
-    litmus_version: v1.13.6 # Litmus version to install
-    litmus_uninstall: False # If you want to uninstall litmus if failure
-    chaos_scenarios: # List of policies/chaos scenarios to load
-        - $scenario_type: # List of chaos pod scenarios to load
+    distribution: openshift # Distribution can be kubernetes or openshift.
+    kubeconfig_path: /root/.kube/config # Path to kubeconfig.
+    exit_on_failure: False # Exit when a post action scenario fails.
+    litmus_version: v1.13.6 # Litmus version to install.
+    litmus_uninstall: False # Uninstall litmus in case of failure.
+    chaos_scenarios: # List of policies/chaos scenarios to load.
+        - $scenario_type: # List of chaos pod scenarios to load.
            - $scenario_file
              $post_config
 cerberus:
-    cerberus_enabled: False # Enable it when cerberus is previously installed
-    cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
+    cerberus_enabled: False # Enable it when cerberus is previously installed.
+    cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes the go/no-go signal.
 performance_monitoring:
-    deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
+    deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift.
     repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
     kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
     capture_metrics: False
-    config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config
+    config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config.
     metrics_profile_path: config/metrics-aggregated.yaml
-    prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
-    prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
-    uuid: # uuid for the run is generated by default if not set
-    enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
-    alert_profile: config/alerts # Path to alert profile with the prometheus queries
+    prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
+    prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
+    uuid: # uuid for the run is generated by default if not set.
+    enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error.
+    alert_profile: config/alerts # Path to alert profile with the prometheus queries.
 tunings:
-    wait_duration: 6 # Duration to wait between each chaos scenario
-    iterations: 1 # Number of times to execute the scenarios
-    daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever
+    wait_duration: 6 # Duration to wait between each chaos scenario.
+    iterations: 1 # Number of times to execute the scenarios.
+    daemon_mode: False # When True, iterations are set to infinity, which means that Kraken will cause chaos forever.
diff --git a/README.md b/README.md
index 10407c51..ce615693 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # Krkn aka Kraken
-[![Docker Repository on Quay](https://quay.io/repository/openshift-scale/kraken/status "Docker Repository on Quay")](https://quay.io/chaos-kubox/krkn)
+[![Docker Repository on Quay](https://quay.io/repository/chaos-kubox/krkn?tab=tags&tag=latest "Docker Repository on Quay")](https://quay.io/chaos-kubox/krkn)
 ![Krkn logo](media/logo.png)
@@ -16,12 +16,12 @@ Kraken injects deliberate failures into Kubernetes/OpenShift clusters to check i
 ### Chaos Testing Guide
 [Guide](docs/index.md) encapsulates:
-- Test methodology that needs to be embraced
-- Best practices that an OpenShift cluster, platform and applications running on top of it should take into account for best user experience, performance, resilience and reliability
-- Tooling
-- Scenarios supported
-- Test environment recommendations as to how and where to run chaos tests
-- Chaos testing in practice
+- Test methodology that needs to be embraced.
+- Best practices that an OpenShift cluster, platform and applications running on top of it should take into account for best user experience, performance, resilience and reliability.
+- Tooling.
+- Scenarios supported.
+- Test environment recommendations as to how and where to run chaos tests.
+- Chaos testing in practice.
 The guide is hosted at [https://chaos-kubox.github.io/krkn/](https://chaos-kubox.github.io/krkn/).
@@ -35,15 +35,15 @@ After installation, refer back to the below sections for supported scenarios and
 #### Running Kraken with minimal configuration tweaks
-For cases where you want to run Kraken with minimal configuration changes, refer to [Kraken-hub](https://github.com/cloud-bulldozer/kraken-hub). One use case is CI integration where you don't want to carry around different configuration files for the scenarios.
+For cases where you want to run Kraken with minimal configuration changes, refer to [Kraken-hub](https://github.com/chaos-kubox/krkn-hub). One use case is CI integration where you do not want to carry around different configuration files for the scenarios.
 ### Setting up infrastructure dependencies
-Kraken indexes the metrics specified in the profile into Elasticsearch in addition to leveraging Cerberus for understanding the health of the Kubernetes/OpenShift cluster under test. More information on the features is documented below. The infrastruture pieces can be easily installed and uninstalled by running:
+Kraken indexes the metrics specified in the profile into Elasticsearch in addition to leveraging Cerberus for understanding the health of the Kubernetes/OpenShift cluster under test. More information on the features is documented below. The infrastructure pieces can be easily installed and uninstalled by running:
 ```
 $ cd kraken
-$ podman-compose up or $ docker-compose up # Spins up the containers specified in the docker-compose.yml file present in the run directory
-$ podman-compose down or $ docker-compose down # Delete the containers installed
+$ podman-compose up or $ docker-compose up # Spins up the containers specified in the docker-compose.yml file present in the run directory.
+$ podman-compose down or $ docker-compose down # Deletes the containers installed.
 ```
 This will manage the Cerberus and Elasticsearch containers on the host on which you are running Kraken.
@@ -80,21 +80,21 @@ Instructions on how to setup the config and the options supported can be found a
 ### Kraken scenario pass/fail criteria and report
-It's important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes/OpenShift cluster is healthy as failures in one component can have an adverse impact on other components. Kraken does this by:
+It is important to check that the targeted component recovered from the chaos injection and that the Kubernetes/OpenShift cluster is healthy, as failures in one component can have an adverse impact on other components. Kraken does this by:
 - Having built in checks for pod and node based scenarios to ensure the expected number of replicas and nodes are up. It also supports running custom scripts with the checks.
-- Leveraging [Cerberus](https://github.com/openshift-scale/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail post chaos. It is highly recommended to turn on the Cerberus health check feature avaliable in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation) or can be installed from Kraken using the [instructions](https://github.com/cloud-bulldozer/kraken#setting-up-infrastructure-dependencies). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes go/no-go signal in the Kraken config file. Cerberus can monitor [application routes](https://github.com/cloud-bulldozer/cerberus/blob/master/docs/config.md#watch-routes) during the chaos and fails the run if it encounters downtime as it's a potential downtime in customer, users environment as well. It is especially important during the control plane chaos scenarios including the API server, Etcd, Ingress etc. It can be enabled by setting `check_applicaton_routes: True` in the [Kraken config](https://github.com/cloud-bulldozer/kraken/blob/master/config/config.yaml) provided application routes are being monitored in the [cerberus config](https://github.com/cloud-bulldozer/kraken/blob/master/config/cerberus.yaml)
+- Leveraging [Cerberus](https://github.com/openshift-scale/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail post chaos. It is highly recommended to turn on the Cerberus health check feature available in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation), or it can be installed from Kraken using the [instructions](https://github.com/chaos-kubox/krkn#setting-up-infrastructure-dependencies). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes the go/no-go signal in the Kraken config file. Cerberus can monitor [application routes](https://github.com/chaos-kubox/cerberus/blob/main/docs/config.md#watch-routes) during the chaos and fails the run if it encounters downtime, as it is potential downtime in a customer's or user's environment as well. It is especially important during the control plane chaos scenarios including the API server, Etcd, Ingress etc. It can be enabled by setting `check_applicaton_routes: True` in the [Kraken config](https://github.com/chaos-kubox/krkn/blob/main/config/config.yaml) provided application routes are being monitored in the [cerberus config](https://github.com/chaos-kubox/krkn/blob/main/config/cerberus.yaml).
 - Leveraging [kube-burner](docs/alerts.md) alerting feature to fail the runs in case of critical alerts.
 ### Signaling
-In CI runs or any external job it is useful to stop Kraken once a certain test or state gets reached. We created a way to signal to kraken to pause the chaos or stop it completely using a signal posted to a port of your choice
+In CI runs or any external job it is useful to stop Kraken once a certain test or state gets reached. We created a way to signal to kraken to pause the chaos or stop it completely using a signal posted to a port of your choice.
-For example if we have a test run loading the cluster running and kraken separately running; we want to be able to know when to start/stop the kraken run based on when the test run completes or gets to a certain loaded state
+For example, if we have a test run loading the cluster and kraken running separately, we want to be able to know when to start/stop the kraken run based on when the test run completes or gets to a certain loaded state.
 More detailed information on enabling and leveraging this feature can be found [here](docs/signal.md).
 ### Performance monitoring
-Monitoring the Kubernetes/OpenShift cluster to observe the impact of Kraken chaos scenarios on various components is key to find out the bottlenecks as it's important to make sure the cluster is healthy in terms if both recovery as well as performance during/after the failure has been injected. Instructions on enabling it can be found [here](docs/performance_dashboards.md).
+Monitoring the Kubernetes/OpenShift cluster to observe the impact of Kraken chaos scenarios on various components is key to finding the bottlenecks, as it is important to make sure the cluster is healthy in terms of both recovery and performance during/after the failure injection. Instructions on enabling it can be found [here](docs/performance_dashboards.md).
 ### Scraping and storing metrics long term
@@ -113,11 +113,11 @@ In addition to checking the recovery and health of the cluster and components un
 ### Roadmap
 Following is a list of enhancements that we are planning to work on adding support in Kraken. Of course any help/contributions are greatly appreciated.
-- [Ability to visualize the metrics that are being captured by Kraken and stored in Elasticsearch](https://github.com/cloud-bulldozer/kraken/issues/124)
-- Ability to shape the ingress network similar to how Kraken supports [egress traffic shaping](https://github.com/cloud-bulldozer/kraken/blob/master/docs/network_chaos.md) today
-- Continue to improve [Chaos Testing Guide](https://cloud-bulldozer.github.io/kraken/) in terms of adding more best practices, test environment recommendations and scenarios to make sure OpenShift platform, as well the applications running on top it, are resilient and performant under chaotic conditions
-- Support for running Kraken on Kubernetes distribution - see https://github.com/cloud-bulldozer/kraken/issues/185, https://github.com/cloud-bulldozer/kraken/issues/186
-- Sweet logo for Kraken - see https://github.com/cloud-bulldozer/kraken/issues/195
+- [Ability to visualize the metrics that are being captured by Kraken and stored in Elasticsearch](https://github.com/chaos-kubox/krkn/issues/124)
+- Ability to shape the ingress network similar to how Kraken supports [egress traffic shaping](https://github.com/chaos-kubox/krkn/blob/main/docs/network_chaos.md) today.
+- Continue to improve the [Chaos Testing Guide](https://chaos-kubox.github.io/krkn/) in terms of adding best practices, test environment recommendations and scenarios to make sure the OpenShift platform, as well as the applications running on top of it, are resilient and performant under chaotic conditions.
+- Support for running Kraken on Kubernetes distributions - see https://github.com/chaos-kubox/krkn/issues/185, https://github.com/chaos-kubox/krkn/issues/186
+- Sweet logo for Kraken - see https://github.com/chaos-kubox/krkn/issues/195
 ### Contributions
 We are always looking for more enhancements, fixes to make it better, any contributions are most welcome.
 [More information on how to Contribute](docs/contribute.md)
-If adding a new scenario or tweaking the main config, be sure to add in updates into the CI to be sure the CI is up to date
-Please read [this file]((CI/README.md#adding-a-test-case)) for more information on updates
+If adding a new scenario or tweaking the main config, be sure to add updates to the CI so that it stays up to date.
+Please read [this file](CI/README.md#adding-a-test-case) for more information on updates.
 ### Community
-Key Members(slack_usernames/full name): paigerube14/Paige Rubendall, mffiedler/Mike Fiedler, ravielluri/Naga Ravi Chaitanya Elluri
+Key Members (slack_usernames/full name): paigerube14/Paige Rubendall, mffiedler/Mike Fiedler, ravielluri/Naga Ravi Chaitanya Elluri.
 * [**#sig-scalability on Kubernetes Slack**](https://kubernetes.slack.com)
 * [**#forum-chaos on CoreOS Slack internal to Red Hat**](https://coreos.slack.com)
diff --git a/containers/README.md b/containers/README.md
index fd50ff4c..8d9e4abc 100644
--- a/containers/README.md
+++ b/containers/README.md
@@ -3,17 +3,17 @@
 Container image gets automatically built by quay.io at [Kraken image](https://quay.io/chaos-kubox/krkn).
 ### Run containerized version
-Refer [instructions](https://github.com/cloud-bulldozer/kraken/blob/master/docs/installation.md#run-containerized-version) for information on how to run the containerized version of kraken.
+Refer to the [instructions](https://github.com/chaos-kubox/krkn/blob/main/docs/installation.md#run-containerized-version) for information on how to run the containerized version of kraken.
 ### Run Custom Kraken Image
-Refer to [instructions](https://github.com/cloud-bulldozer/kraken/blob/master/containers/build_own_image-README.md) for information on how to run a custom containerized version of kraken using podman
+Refer to the [instructions](https://github.com/chaos-kubox/krkn/blob/main/containers/build_own_image-README.md) for information on how to run a custom containerized version of kraken using podman.
 ### Kraken as a KubeApp
 To run containerized Kraken as a Kubernetes/OpenShift Deployment, follow these steps:
-1. Configure the [config.yaml](https://github.com/openshift-scale/kraken/tree/master/config/config.yaml) file according to your requirements.
+1. Configure the [config.yaml](https://github.com/chaos-kubox/krkn/blob/main/config/config.yaml) file according to your requirements.
 2. Create a namespace under which you want to run the kraken pod using `kubectl create ns <namespace>`.
 3. Switch to `<namespace>` namespace:
    - In Kubernetes, use `kubectl config set-context --current --namespace=<namespace>`
@@ -21,7 +21,7 @@ To run containerized Kraken as a Kubernetes/OpenShift Deployment, follow these s
 4. Create a ConfigMap named kube-config using `kubectl create configmap kube-config --from-file=<path-to-kubeconfig>`
 5. Create a ConfigMap named kraken-config using `kubectl create configmap kraken-config --from-file=<path-to-kraken-config>`
 6. Create a ConfigMap named scenarios-config using `kubectl create configmap scenarios-config --from-file=<path-to-scenarios-folder>`
-7. Create a serviceaccount to run the kraken pod `kubectl create serviceaccount useroot`.
+7. Create a service account to run the kraken pod `kubectl create serviceaccount useroot`.
 8. In Openshift, add privileges to service account and execute `oc adm policy add-scc-to-user privileged -z useroot`.
 9. Create a Job using `kubectl apply -f kraken.yml` and monitor the status using `oc get jobs` and `oc get pods`.
diff --git a/docs/alerts.md b/docs/alerts.md
index d62ef0f9..8429730c 100644
--- a/docs/alerts.md
+++ b/docs/alerts.md
@@ -1,18 +1,18 @@
 ## Alerts
-Pass/fail based on metrics captured from the cluster is important in addition to checking the health status and recovery. Kraken supports alerting based on the queries defined by the user and modifies the return code of the run to determine pass/fail. It's especially useful in case of automated runs in CI where user won't be able to monitor the system. It uses [Kube-burner](https://kube-burner.readthedocs.io/en/latest/) under the hood. This feature can be enabled in the [config](https://github.com/cloud-bulldozer/kraken/blob/master/config/config.yaml) by setting the following:
+Pass/fail based on metrics captured from the cluster is important in addition to checking the health status and recovery. Kraken supports alerting based on the queries defined by the user and modifies the return code of the run to determine pass/fail. It's especially useful in case of automated runs in CI where the user won't be able to monitor the system. It uses [Kube-burner](https://kube-burner.readthedocs.io/en/latest/) under the hood. This feature can be enabled in the [config](https://github.com/chaos-kubox/krkn/blob/main/config/config.yaml) by setting the following:
 ```
 performance_monitoring:
     kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz"
     prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
     prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
-    enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
-    alert_profile: config/alerts # Path to alert profile with the prometheus queries
+    enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error.
+    alert_profile: config/alerts # Path to alert profile with the prometheus queries.
 ```
 ### Alert profile
-A couple of [alert profiles](https://github.com/cloud-bulldozer/kraken/tree/master/config) ( [alerts](https://github.com/cloud-bulldozer/kraken/tree/master/config/alerts) are shipped by default and they can be tweaked to add more queries to alert on. Following are a couple of alerts for example:
+A couple of [alert profiles](https://github.com/chaos-kubox/krkn/tree/main/config) ([alerts](https://github.com/chaos-kubox/krkn/blob/main/config/alerts)) are shipped by default and can be tweaked to add more queries to alert on. The following are a few example alerts:
 ```
 - expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[5m:]) > 0.01
diff --git a/docs/application_outages.md b/docs/application_outages.md
index f2ad9348..7a56c6da 100644
--- a/docs/application_outages.md
+++ b/docs/application_outages.md
@@ -1,5 +1,5 @@
 ### Application outages
-Scenario to block the traffic ( Ingress/Egress ) of an application matching the labels for the specified duration of time to understand the behavior of the service/other services which depend it during the downtime. This helps with the planning the requirements accordingly be it improving the timeouts or tweaking the alerts etc.
+Scenario to block the traffic ( Ingress/Egress ) of an application matching the labels for the specified duration of time to understand the behavior of the service/other services which depend on it during downtime. This helps with planning the requirements accordingly, be it improving the timeouts or tweaking the alerts etc.
 ##### Sample scenario config
 ```
diff --git a/docs/cloud_setup.md b/docs/cloud_setup.md
index 91afd914..c6ed1491 100644
--- a/docs/cloud_setup.md
+++ b/docs/cloud_setup.md
@@ -8,7 +8,7 @@ Supported Cloud Providers:
 ## AWS
-**NOTE**: For clusters with AWS make sure [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) is installed and properly [configured](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html) using an AWS account
+**NOTE**: For clusters with AWS make sure the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) is installed and properly [configured](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html) using an AWS account.
 ## GCP
 **NOTE**: For clusters with GCP make sure [GCP CLI](https://cloud.google.com/sdk/docs/install#linux) is installed.
@@ -17,7 +17,7 @@ A google service account is required to give proper authentication to GCP for no
 **NOTE**: A user with 'resourcemanager.projects.setIamPolicy' permission is required to grant project-level permissions to the service account.
-After creating the service account you'll need to enable the account using the following: ```export GOOGLE_APPLICATION_CREDENTIALS="<serviceaccount.json>"```
+After creating the service account you will need to enable the account using the following: ```export GOOGLE_APPLICATION_CREDENTIALS="<serviceaccount.json>"```
 ## Openstack
@@ -25,13 +25,13 @@ After creating the service account you'll need to enable the account using the f
 ## Azure
-**NOTE**: For Azure node killing scenarios, make sure [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) is installed
+**NOTE**: For Azure node killing scenarios, make sure the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) is installed.
-You will also need to create a service principal and give it the correct access, see [here](https://docs.openshift.com/container-platform/4.5/installing/installing_azure/installing-azure-account.html) for creating the service principal and setting the proper permissions
+You will also need to create a service principal and give it the correct access; see [here](https://docs.openshift.com/container-platform/4.5/installing/installing_azure/installing-azure-account.html) for creating the service principal and setting the proper permissions.
-To properly run the service principal requires “Azure Active Directory Graph/Application.ReadWrite.OwnedBy” api permission granted and “User Access Administrator”
+To run properly, the service principal requires the “Azure Active Directory Graph/Application.ReadWrite.OwnedBy” API permission and “User Access Administrator”.
-Before running you'll need to set the following:
+Before running you will need to set the following:
 1. Login using ```az login```
 2. ```export AZURE_TENANT_ID=<tenant_id>```
@@ -42,7 +42,7 @@ Before running you'll need to set the following:
 ## Alibaba
-See the [Installation guide](https://www.alibabacloud.com/help/en/doc-detail/121988.html?spm=a2c63.p38356.0.0.13f868799CwZPL) to install alicloud cli
+See the [Installation guide](https://www.alibabacloud.com/help/en/alibaba-cloud-cli/latest/installation-guide) to install the alicloud cli.
 1. ```export ALIBABA_ID=<access_key_id>```
 2. ```export ALIBABA_SECRET=<access_key_secret>```
 3. ```export ALIBABA_REGION_ID=<region_id>```
-Refer to [region and zone page](https://www.alibabacloud.com/help/en/doc-detail/188196.htm?spm=a2c63.p38356.0.0.440c5aa4G3MxVt#concept-2459516) to get the region id for the region your running on
+Refer to the [region and zone page](https://www.alibabacloud.com/help/en/elastic-compute-service/latest/regions-and-zones#concept-2459516) to get the region id for the region you are running on.
-Set cloud_type to either alibaba or alicloud in your node scenario yaml file
+Set cloud_type to either alibaba or alicloud in your node scenario yaml file.
diff --git a/docs/cluster_shut_down_scenarios.md b/docs/cluster_shut_down_scenarios.md
index 0fde759f..2ab8bb4c 100644
--- a/docs/cluster_shut_down_scenarios.md
+++ b/docs/cluster_shut_down_scenarios.md
@@ -1,7 +1,7 @@
 #### Kubernetes/OpenShift cluster shut down scenario
-Scenario to shut down all the nodes including the masters and restart them after specified duration. Cluster shut down scenario can be injected by placing the shut_down config file under cluster_shut_down_scenario option in the kraken config. Refer to [cluster_shut_down_scenario](https://github.com/openshift-scale/kraken/blob/master/scenarios/cluster_shut_down_scenario.yml) config file.
+Scenario to shut down all the nodes including the masters and restart them after a specified duration. The cluster shut down scenario can be injected by placing the shut_down config file under the cluster_shut_down_scenario option in the kraken config. Refer to the [cluster_shut_down_scenario](https://github.com/chaos-kubox/krkn/blob/main/scenarios/cluster_shut_down_scenario.yml) config file.
-Refer to [cloud setup](cloud_setup.md) to configure your cli properly for the cloud provider of the cluster you want to shut down
+Refer to [cloud setup](cloud_setup.md) to configure your cli properly for the cloud provider of the cluster you want to shut down.
 Current accepted cloud types:
 * [Azure](cloud_setup.md#azure)
@@ -11,8 +11,8 @@ Current accepted cloud types:
 ```
-cluster_shut_down_scenario: # Scenario to stop all the nodes for specified duration and restart the nodes
-  runs: 1 # Number of times to execute the cluster_shut_down scenario
-  shut_down_duration: 120 # duration in seconds to shut down the cluster
-  cloud_type: aws # cloud type on which Kubernetes/OpenShift runs
+cluster_shut_down_scenario: # Scenario to stop all the nodes for a specified duration and restart the nodes.
+  runs: 1 # Number of times to execute the cluster_shut_down scenario.
+  shut_down_duration: 120 # Duration in seconds to shut down the cluster.
+  cloud_type: aws # Cloud type on which Kubernetes/OpenShift runs.
 ```
diff --git a/docs/config.md b/docs/config.md
index ff5d7577..958e933d 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -1,4 +1,4 @@
 ### Config
-Set the scenarios to inject and the tunings like duration to wait between each scenario in the config file located at [config/config.yaml](https://github.com/cloud-bulldozer/kraken/blob/master/config/config.yaml).
+Set the scenarios to inject and the tunings, like the duration to wait between each scenario, in the config file located at [config/config.yaml](https://github.com/chaos-kubox/krkn/blob/main/config/config.yaml).
-**NOTE**: [config](https://github.com/cloud-bulldozer/kraken/tree/master/config/config_performance.yaml) can be used if leveraging the [automated way](https://github.com/cloud-bulldozer/kraken#setting-up-infrastructure-dependencies) to install the infrastruture pieces.
+**NOTE**: [config](https://github.com/chaos-kubox/krkn/blob/main/config/config_performance.yaml) can be used if leveraging the [automated way](https://github.com/chaos-kubox/krkn#setting-up-infrastructure-dependencies) to install the infrastructure pieces.
diff --git a/docs/container_scenarios.md b/docs/container_scenarios.md
index 0922cf81..6fe471a6 100644
--- a/docs/container_scenarios.md
+++ b/docs/container_scenarios.md
@@ -1,7 +1,7 @@
 ### Container Scenarios
 Kraken uses the `oc exec` command to `kill` specific containers in a pod. This can be based on the pods namespace or labels. If you know the exact object you want to kill, you can also specify the specific container name or pod name in the scenario yaml file.
-These scenarios are in a simple yaml format that you can manipulate to run your specific tests or use the pre-existing scenarios to see how it works
+These scenarios are in a simple yaml format that you can manipulate to run your specific tests, or use the pre-existing scenarios to see how they work.
 #### Example Config
 The following are the components of Kubernetes/OpenShift for which a basic chaos scenario config exists today.
@@ -18,20 +18,20 @@ scenarios:
 ```
 #### Post Action
-In all scenarios we do a post chaos check to wait and verify the specific component
+In all scenarios we do a post chaos check to wait and verify the specific component.
 Here there are two options:
-1. Pass a custom script in the main config scenario list, that will run before the chaos and verify the output matches post chaos scenario
+1. Pass a custom script in the main config scenario list; it will run before the chaos, and its output is verified to match post chaos.
-See [scenarios/post_action_etcd_container.py](https://github.com/cloud-bulldozer/kraken/tree/master/scenarios/post_action_etcd_container.py) for an example
+See [scenarios/post_action_etcd_container.py](https://github.com/chaos-kubox/krkn/blob/main/scenarios/post_action_etcd_container.py) for an example.
 ```
-- container_scenarios: # List of chaos pod scenarios to load
+- container_scenarios: # List of chaos pod scenarios to load.
     - - scenarios/container_etcd.yml
      - scenarios/post_action_etcd_container.py
 ```
-2. Allow kraken to wait and check the killed containers become ready again. Kraken keeps a list of the specific containers that were killed as well as the namespaces and pods to verify all containers that were affected recover properly
+2. Allow kraken to wait and check the killed containers until they become ready again. Kraken keeps a list of the specific containers that were killed, as well as the namespaces and pods, to verify that all affected containers recover properly.
 ```
 retry_wait:
diff --git a/docs/contribute.md b/docs/contribute.md
index c3126fa2..3e882441 100644
--- a/docs/contribute.md
+++ b/docs/contribute.md
@@ -10,7 +10,7 @@ How to:
 ## Pull request
-In order to submit a change or a PR, please fork the project and follow instructions:
+In order to submit a change or a PR, please fork the project and follow these instructions:
 ```bash
 $ git clone http://github.com/<username>/krkn
 $ cd krkn
@@ -25,18 +25,18 @@ $ git push
 ## Fix Formatting
 Kraken uses [pre-commit](https://pre-commit.com) framework to maintain the code linting and python code styling.
 The CI would run the pre-commit check on each pull request.
-We encourage our contributors to follow the same pattern, while contributing to the code.
+We encourage our contributors to follow the same pattern while contributing to the code.
-The pre-commit configuration file is present in the repository `.pre-commit-config.yaml`
-It contains the different code styling and linting guide which we use for the application.
+The pre-commit configuration file is present in the repository `.pre-commit-config.yaml`.
+It contains the different code styling and linting guides which we use for the application.
-Following command can be used to run the pre-commit:
+The following command can be used to run the pre-commit:
 `pre-commit run --all-files`
-If pre-commit is not installed in your system, it can be install with : `pip install pre-commit`
+If pre-commit is not installed in your system, it can be installed with `pip install pre-commit`.
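For quick reference, the standard pre-commit workflow that the section above describes looks like this (upstream pre-commit CLI commands, shown here for convenience; not part of the diff):

```bash
pip install pre-commit      # install the framework
pre-commit install          # register the git hook so the checks run on every commit
pre-commit run --all-files  # run all configured checks against the whole tree
```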
 ## Squash Commits
-If there are mutliple commits, please rebase/squash multiple commits
+If there are multiple commits, please rebase/squash them
 before creating the PR by following:
 ```bash
 $ git rebase -i HEAD~<number_of_commits>
 $ git rebase -i <commit_id>
 ```
-In the interactive rebase screen, set the first commit to `pick` and all others to `squash` (or whatever else you may need to do).
+In the interactive rebase screen, set the first commit to `pick`, and all others to `squash`, or whatever else you may need to do.
 Push your rebased commits (you may need to force), then issue your PR.
 ```
 $ git push origin --force
 ```
@@ -58,22 +58,22 @@
 ## Rebase with Upstream
 If changes go into the main repository while you're working on your code it is best to rebase your code with the
-  upstream so you stay up to date with all changes and fix any conflicting code changes
+ upstream, so you stay up to date with all changes and fix any conflicting code changes.
-If not already configured, set the upstream url for kraken
+If not already configured, set the upstream URL for kraken.
 ```
 git remote add upstream https://github.com/cloud-bulldozer/kraken.git
 ```
-Rebase to upstream master branch
+Rebase onto the upstream master branch.
 ```
 git fetch upstream
 git rebase upstream/master
 git push origin --force
 ```
-If any errors occur, it’ll list off any files that have merge issues
-Edit the files with the code you want to keep, see below for detailed help from Git
+If any errors occur, Git will list the files that have merge issues.
+Edit the files with the code you want to keep. See below for detailed help from Git.
 1. Vi
 2. Resolving-a-merge-conflict-using-the-command-line
 3. git add
@@ -86,7 +86,7 @@ Edit the files with the code you want to keep
 Merge Conflicts Example
 ```
 1. git rebase upstream/kraken
-2. vi run_kraken.py [edit at the required places, get rid of arrowed lines and dashes apply correct changes]
+2. vi run_kraken.py [edit at the indicated places, get rid of arrowed lines and dashes, and apply correct changes]
 3. git add run_kraken.py
 4. git rebase --continue
 5. repeat 2-4 until done
 ```
diff --git a/docs/getting_started.md b/docs/getting_started.md
index c3bfbed5..82c49482 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -1,8 +1,8 @@
 ## Getting Started Running Chaos Scenarios
 #### Adding New Scenarios
-Adding a new scenario is as simple as adding a new config file under [scenarios directory](https://github.com/cloud-bulldozer/kraken/tree/master/scenarios) and defining it in the main kraken [config](https://github.com/cloud-bulldozer/kraken/blob/master/config/config.yaml#L8).
-You can either copy an existing yaml file and make it your own or fill in one of the templates below to suit your needs
+Adding a new scenario is as simple as adding a new config file under the [scenarios directory](https://github.com/chaos-kubox/krkn/tree/main/scenarios) and defining it in the main kraken [config](https://github.com/chaos-kubox/krkn/blob/main/config/config.yaml#L8).
+You can either copy an existing yaml file and make it your own, or fill in one of the templates below to suit your needs.
 ### Templates
 #### Pod Scenario Yaml Template
@@ -21,7 +21,7 @@ scenarios:
       matches:
         - labels:
             namespace: ""
-            selector: "" #this can be left blank
+            selector: "" # This can be left blank.
       filters:
         - randomSample:
             size:
@@ -33,11 +33,11 @@ scenarios:
       matches:
        - labels:
            namespace: ""
-           selector: "" #this can be left blank
+           selector: "" # This can be left blank.
       retries:
         retriesTimeout:
-          # Amount of time to wait with retrying, before failing if pod count doesn't match expected
-          timeout: 180
+          # Amount of time to wait with retrying, before failing if pod count does not match expected.
+          timeout: 180
       actions:
         - checkPodCount:
             count:
@@ -51,12 +51,12 @@ More information on specific items that you can add to the pod killing scenarios
 ```
 node_scenarios:
-  - actions: # node chaos scenarios to be injected
-    - <action>
-    node_name: <node-name> # can be left blank
+  - actions: # Node chaos scenarios to be injected.
+    - <action>
+    node_name: <node-name> # Can be left blank.
     label_selector: <label>
-    instance_kill_count: <number>
+    instance_kill_count: <number>
     timeout: <timeout>
     cloud_type: <cloud-type>
 ```
@@ -72,10 +72,10 @@ time_scenarios:
 ### Common Scenario Edits
-If you just want to make small changes to pre-existing scenarios, feel free to edit the scenario file itself
+If you just want to make small changes to pre-existing scenarios, feel free to edit the scenario file itself.
 #### Example of Quick Pod Scenario Edit:
-If you want to kill 2 pods instead of 1 in any of the pre-existing scenarios, you can either edit the number located at filters -> randomSample -> size or the runs under the config -> runStrategy section
+If you want to kill 2 pods instead of 1 in any of the pre-existing scenarios, you can either edit the number located at filters -> randomSample -> size, or the runs under the config -> runStrategy section.
 #### Example of Quick Nodes Scenario Edit:
-If your cluster is build on GCP instead of AWS, just change the cloud type in the node_scenarios_example.yml file
+If your cluster is built on GCP instead of AWS, just change the cloud type in the node_scenarios_example.yml file.
diff --git a/docs/index.md b/docs/index.md
index 07e681cf..62263d90 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -25,32 +25,32 @@ The network is homogeneous.
 Consistent resource usage with no spikes.
 All shared resources are available from all places.
-The assumptions led to a number of outages in production environments in the past. The services suffered from poor performance or were inaccessible to the customers, leading to missing Service Level Agreement uptime promises, revenue loss, and a degradation in the perceived reliability of said services..
+Various assumptions led to a number of outages in production environments in the past. The services suffered from poor performance or were inaccessible to the customers, leading to missing Service Level Agreement uptime promises, revenue loss, and a degradation in the perceived reliability of said services.
-How can we best avoid this from happening? This is exactly where Chaos testing can add value
+How can we best avoid this from happening? This is where Chaos testing can add value.
 ### Test Strategies and Methodology
 Failures in production are costly. To help mitigate risk to service health, consider the following strategies and approaches to service testing:
-- Be proactive vs reactive. We have different types of test suites in place - unit, integration and end-to-end - that help expose bugs in code in a controlled environment. Through implementation of a chaos engineering strategy,we can help discover potential for service degradation that were unknown. We need to understand the systems behavior under unpredictable conditions in order to find the areas to harden, and use the performance data points to size the clusters to handle the failures in order to keep the downtime to as minimal as possible.
+- Be proactive vs reactive. We have different types of test suites in place - unit, integration and end-to-end - that help expose bugs in code in a controlled environment. Through implementation of a chaos engineering strategy, we can discover potential causes of service degradation. We need to understand the systems' behavior under unpredictable conditions in order to find the areas to harden, and use performance data points to size the clusters to handle failures in order to keep downtime to a minimum.
 - Test the resiliency of a system under turbulent conditions by running tests that are designed to disrupt while monitoring the systems adaptability and performance:
-  - Establish and define your steady state and metrics - understand the behavior and performance under stable conditions and define the metrics that will be used to evaluate the system’s behavior. Then decide on acceptable outcomes before injecting chaos.
+  - Establish and define your steady state and metrics - understand the behavior and performance under stable conditions and define the metrics that will be used to evaluate the system’s behavior. Then decide on acceptable outcomes before injecting chaos.
   - Analyze the statuses and metrics of all components during the chaos test runs.
   - Improve the areas that are not resilient and performant by comparing the key metrics and Service Level Objectives (SLOs) to the stable conditions before the chaos.
-    For example: evaluating the API server latency or application uptime to see if the key performance indicators and service level indicators are still within the acceptable limits.
+    For example: evaluating the API server latency or application uptime to see if the key performance indicators and service level indicators are still within acceptable limits.
 ### Best Practices
-Now that we understand the test methodology, let’s take a look at the best practices for an OpenShift cluster. On that platform there are user applications and cluster workloads that need to be designed for stability and to provide the best user experience possible:
+Now that we understand the test methodology, let us take a look at the best practices for an OpenShift cluster. On that platform there are user applications and cluster workloads that need to be designed for stability and to provide the best user experience possible:
-- Alerts with appropriate severity should get fired
-  - Alerts are key to identify when a component starts degrading, and can help focus the investigation effort on affected system components.
-  - Alerts should have proper severity,description, notification policy, escalation policy, and SOPin order to reduce MTTR for responding SRE or Ops resources.
+- Alerts with appropriate severity should get fired.
+  - Alerts are key to identify when a component starts degrading, and can help focus the investigation effort on affected system components.
+  - Alerts should have proper severity, description, notification policy, escalation policy, and SOP in order to reduce MTTR for responding SRE or Ops resources.
   - Detailed information on the alerts consistency can be found [here](https://github.com/openshift/enhancements/blob/master/enhancements/monitoring/alerting-consistency.md).
 - Minimal performance impact - Network, CPU, Memory, Disk, Throughput etc.
   We want to look at this in terms of CPU, Memory, Disk, Throughput, Network etc.
-  We want to look at this in terms of CPU, Memory, Disk, Throughput, Network etc.
-- Appropriate CPU/Memory limits set to avoid performance throttling and OOM kills
-  - There might be rogue applications hogging resources ( CPU/Memory ) on the nodes which might lead to applications underperforming or worse getting OOM killed. It’s important to ensure that the applications and system components have reserved resources for the kube-scheduler to take into consideration in order to keep them performing at the expected levels.
+- Appropriate CPU/Memory limits set to avoid performance throttling and OOM kills.
+  - There might be rogue applications hogging resources ( CPU/Memory ) on the nodes which might lead to applications underperforming or, worse, getting OOM killed. It is important to ensure that applications and system components have reserved resources for the kube-scheduler to take into consideration in order to keep them performing at the expected levels.
-- Services dependent on the system under test need to handle the failure gracefully to avoid performance degradation and downtime - appropriate timeouts
-  - In a distributed system, services deployed coordinate with each other and might have external dependencies. Each of the services deployed as a deployment, pod or container need to handle the downtime of other dependent services gracefully instead of crashing due to not having appropriate timeouts, fallback logic etc.
+- Services dependent on the system under test need to handle the failure gracefully to avoid performance degradation and downtime - appropriate timeouts.
+  - In a distributed system, the deployed services coordinate with each other and might have external dependencies. Each service deployed as a deployment, pod, or container needs to handle the downtime of other dependent services gracefully instead of crashing due to not having appropriate timeouts, fallback logic etc.
 - Proper node sizing to avoid cascading failures and ensure cluster stability especially when the cluster is large and dense
   - The platform needs to be sized taking into account the resource usage spikes that might occur during chaotic events. For example, if one of the main nodes goes down, the other two main nodes need to have enough resources to handle the load. The resource usage depends on the load or number of objects that are running being managed by the Control Plane ( Api Server, Etcd, Controller and Scheduler ). As such, it’s critical to test such conditions, understand the behavior, and leverage the data to size the platform appropriately. This can help keep the applications stable during unplanned events without the control plane undergoing cascading failures which can potentially bring down the entire cluster.
-- Proper node sizing to avoid application failures and maintain stability
-  - An application pod might use more resources during reinitialization after a crash, so it’s important to take that into account for sizing the nodes in the cluster to accommodate it. For example, monitoring solutions like Prometheus need high amounts of memory to replay the write ahead log ( WAL ) when it restarts. As such, it’s critical to test such conditions, understand the behavior, and leverage the data to size the platform appropriately. This can help keep the application stable during unplanned events without undergoing degradation in performance or even worse hog the resources on the node which can impact other applications and system pods.
+- Proper node sizing to avoid application failures and maintain stability.
+  - An application pod might use more resources during reinitialization after a crash, so it is important to take that into account for sizing the nodes in the cluster to accommodate it. For example, monitoring solutions like Prometheus need high amounts of memory to replay the write ahead log ( WAL ) when it restarts. As such, it’s critical to test such conditions, understand the behavior, and leverage the data to size the platform appropriately. This can help keep the application stable during unplanned events without undergoing degradation in performance or, even worse, hogging the resources on the node, which can impact other applications and system pods.
-- Minimal initialization time and fast recovery logic
-  - The controller watching the component should recognize a failure as soon as possible. The component needs to have minimal initialization time to avoid extended downtime or overloading the replicas if it is a highly available configuration. The cause of failure can be because of issues with the infrastructure on top of which it’s running, application failures or because of service failures that it depends on.
+- Minimal initialization time and fast recovery logic.
+  - The controller watching the component should recognize a failure as soon as possible. The component needs to have minimal initialization time to avoid extended downtime or overloading the replicas if it is a highly available configuration. The cause of failure can be issues with the infrastructure on top of which it is running, application failures, or failures of services that it depends on.
-- High Availability deployment strategy
+- High Availability deployment strategy.
   - There should be multiple replicas ( both OpenShift and application control planes ) running preferably in different availability zones to survive outages while still serving the user/system requests. Avoid single points of failure.
 - Backed by persistent storage
-  - It’s important to have the system/application backed by persistent storage. This is especially important in cases where the application is a database or a stateful application given that a node, pod or container failure will wipe off the data.
+  - It is important to have the system/application backed by persistent storage. This is especially important in cases where the application is a database or a stateful application, given that a node, pod, or container failure will wipe out the data.
 - There should be fallback routes to the backend in case of using CDN, for example, Akamai in case of console.redhat.com - a managed service deployed on top of OpenShift dedicated:
   - Content delivery networks (CDNs) are commonly used to host resources such as images, JavaScript files, and CSS. The average web page is nearly 2 MB in size, and offloading heavy resources to third-parties is extremely effective for reducing backend server traffic and latency. However, this makes each CDN an additional point of failure for every site that relies on it. If the CDN fails, its customers could also fail.
   - To test how the application reacts to failures, drop all network traffic between the system and CDN. The application should still serve the content to the user irrespective of the failure.
-- Appropriate caching and Content Delivery Network should be enabled to be performant and usable when there’s a latency on the client side:
-  - Not every user or machine has access to unlimited bandwidth, there might be a delay on the user side ( client ) to access the API’s due to limited bandwidth, throttling or latency depending on the geographic location. It’s important to inject latency between the client and API calls to understand the behavior and optimize things including caching wherever possible, using CDN’s or opting for different protocols like HTTP/2 or HTTP/3 vs HTTP.
+- Appropriate caching and Content Delivery Network should be enabled to be performant and usable when there is latency on the client side.
+  - Not every user or machine has access to unlimited bandwidth; there might be a delay on the user side ( client ) to access the APIs due to limited bandwidth, throttling or latency depending on the geographic location. It is important to inject latency between the client and API calls to understand the behavior and optimize things, including caching wherever possible, using CDNs or opting for different protocols like HTTP/2 or HTTP/3 vs HTTP.
 ### Tooling
-Now that we looked at the best practices, In this section, we will go through how [Kraken](https://github.com/cloud-bulldozer/kraken) - a chaos testing framework can help test the resilience of OpenShift and make sure the applications and services are following the best practices.
+Now that we have looked at the best practices, in this section we will go through how [Kraken](https://github.com/chaos-kubox/krkn) - a chaos testing framework - can help test the resilience of OpenShift and make sure the applications and services are following the best practices.
 #### Workflow
-Let’s start by understanding the workflow of kraken: the user will start by running kraken by pointing to a specific OpenShift cluster using kubeconfig to be able to talk to the platform on top of which OpenShift cluster is hosted. This can be done by either the oc/kubectl API or the cloud API. Based on the configuration of kraken, it will inject specific chaos scenarios as shown below, talks to [Cerberus](https://github.com/cloud-bulldozer/cerberus) to get the go/no-go signal representing the overall health of the cluster ( optional - can be turned off ), scrapes metrics from in-cluster prometheus given a metrics profile with the promql queries and stores them long term in Elasticsearch configured ( optional - can be turned off ), evaluates the promql expressions specified in the alerts profile ( optional - can be turned off ) and aggregated everything to set the pass/fail i.e exits 0 or 1. More about the metrics collection, cerberus and metrics evaluation can be found in the next section.
+Let us start by understanding the workflow of kraken: the user starts by running kraken pointed at a specific OpenShift cluster using kubeconfig, to be able to talk to the platform on top of which the OpenShift cluster is hosted. This can be done by either the oc/kubectl API or the cloud API. Based on the configuration of kraken, it will inject specific chaos scenarios as shown below, talk to [Cerberus](https://github.com/chaos-kubox/cerberus) to get the go/no-go signal representing the overall health of the cluster ( optional - can be turned off ), scrape metrics from the in-cluster prometheus given a metrics profile with the promql queries and store them long term in the configured Elasticsearch ( optional - can be turned off ), evaluate the promql expressions specified in the alerts profile ( optional - can be turned off ) and aggregate everything to set the pass/fail, i.e. exit 0 or 1. More about the metrics collection, cerberus and metrics evaluation can be found in the next section.
 ![Kraken workflow](../media/kraken-workflow.png)
 #### Cluster recovery checks, metrics evaluation and pass/fail criteria
-- Most of the scenarios have built in checks to verify if the targeted component recovered from the failure after the specified duration of time but there might be cases where other components might have an impact because of a certain failure and it’s extremely important to make sure that the system/application is healthy as a whole post chaos. This is exactly where [Cerberus](https://github.com/cloud-bulldozer/cerberus) comes to the rescue.
+- Most of the scenarios have built-in checks to verify if the targeted component recovered from the failure after the specified duration of time, but there might be cases where other components are impacted by a certain failure, and it’s extremely important to make sure that the system/application is healthy as a whole post chaos. This is exactly where [Cerberus](https://github.com/chaos-kubox/cerberus) comes to the rescue.
 If the monitoring tool, cerberus is enabled it will consume the signal and continue running chaos or not based on that signal.
-- Apart from checking the recovery and cluster health status, it’s equally important to evaluate the performance metrics like latency, resource usage spikes, throughput, etcd health like disk fsync, leader elections etc. To help with this, Kraken has a way to evaluate promql expressions from the incluster prometheus and set the exit status to 0 or 1 based on the severity set for each of the query. Details on how to use this feature can be found [here](https://github.com/cloud-bulldozer/kraken#alerts).
+- Apart from checking the recovery and cluster health status, it’s equally important to evaluate the performance metrics like latency, resource usage spikes, throughput, and etcd health like disk fsync, leader elections etc. To help with this, Kraken has a way to evaluate promql expressions from the in-cluster prometheus and set the exit status to 0 or 1 based on the severity set for each query. Details on how to use this feature can be found [here](https://github.com/chaos-kubox/krkn#alerts).
 - The overall pass or fail of kraken is based on the recovery of the specific component (within a certain amount of time), the cerberus health signal which tracks the health of the entire cluster and metrics evaluation from incluster prometheus.
 ### Scenarios
-Let’s take a look at how to run the chaos scenarios on your OpenShift clusters using Kraken-hub - a lightweight wrapper around Kraken to ease the runs by providing the ability to run them by just running container images using podman with parameters set as environment variables. This eliminates the need to carry around and edit configuration files and makes it easy for any CI framework integration. Here are the scenarios supported:
This eliminates the need to carry around and edit configuration files and makes it easy for any CI framework integration. Here are the scenarios supported:
+Let us take a look at how to run the chaos scenarios on your OpenShift clusters using Kraken-hub - a lightweight wrapper around Kraken that eases the runs by letting you launch them as container images using podman, with parameters set as environment variables. This eliminates the need to carry around and edit configuration files and makes it easy for any CI framework integration. Here are the scenarios supported:
 
-- Pod Scenarios ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/pod-scenarios.md))
+- Pod Scenarios ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/pod-scenarios.md))
   - Disrupts OpenShift/Kubernetes and applications deployed as pods:
     - Helps understand the availability of the application, the initialization timing and recovery status.
   - [Demo](https://asciinema.org/a/452351?speed=3&theme=solarized-dark)
 
-- Container Scenarios ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/container-scenarios.md))
+- Container Scenarios ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/container-scenarios.md))
   - Disrupts OpenShift/Kubernetes and applications deployed as containers running as part of a pod(s) using a specified kill signal to mimic failures:
-    - Helps understand the impact and recovery timing when the program/process running in the containers is disrupted - hangs, paused, killed etc. using various kill signals i.e SIGHUP, SIGTERM, SIGKILL etc.
+    - Helps understand the impact and recovery timing when the program/process running in the containers is disrupted - hung, paused, killed etc. - using various kill signals, e.g. SIGHUP, SIGTERM, SIGKILL.
   - [Demo](https://asciinema.org/a/BXqs9JSGDSEKcydTIJ5LpPZBM?speed=3&theme=solarized-dark)
 
-- Node Scenarios ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/node-scenarios.md))
-  - Disrupts nodes as part of the cluster infrastructure by talking to the cloud API. AWS, Azure, GCP, OpenStack and Baremetal are the supported platforms as of now. Some of the possible disruptions include:
+- Node Scenarios ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/node-scenarios.md))
+  - Disrupts nodes as part of the cluster infrastructure by talking to the cloud API. AWS, Azure, GCP, OpenStack and Baremetal are the supported platforms as of now. Possible disruptions include:
     - Terminate nodes
     - Fork bomb inside the node
     - Stop the node
@@ -131,44 +131,44 @@ Let’s take a look at how to run the chaos scenarios on your OpenShift clusters
     - etc.
   - [Demo](https://asciinema.org/a/ANZY7HhPdWTNaWt4xMFanF6Q5)
 
-- Zone Outages ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/zone-outages.md))
-  - Creates outage of availability zone(s) in a targeted region in the public cloud where the OpenShift cluster is running by tweaking the network acl of the zone to simulate the failure and that in turn will stop both ingress and egress traffic from all the nodes in a particular zone for the specified duration and reverts it back to the previous state
-  - Helps understand the impact on both Kubernetes/OpenShift control plane as well as applications, services running on the worker nodes in that zone.
-  - Currently only set up for AWS cloud platform: 1 VPC and multiples subnets within the VPC can be specified
+- Zone Outages ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/zone-outages.md))
+  - Creates an outage of availability zone(s) in a targeted region in the public cloud where the OpenShift cluster is running by tweaking the network acl of the zone to simulate the failure; that in turn stops both ingress and egress traffic from all nodes in a particular zone for the specified duration and then reverts it back to the previous state.
+  - Helps understand the impact on both Kubernetes/OpenShift control plane as well as applications and services running on the worker nodes in that zone.
+  - Currently, only set up for the AWS cloud platform: 1 VPC and multiple subnets within the VPC can be specified.
   - [Demo](https://asciinema.org/a/452672?speed=3&theme=solarized-dark)
 
-- Application outages ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/application-outages.md))
-  - Scenario to block the traffic ( Ingress/Egress ) of an application matching the labels for the specified duration of time to understand the behavior of the service/other services which depend on it during the downtime
-  - Helps understand how the dependent services react to the unavailability
+- Application Outages ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/application-outages.md))
+  - Scenario to block the traffic ( Ingress/Egress ) of an application matching the labels for the specified duration of time to understand the behavior of the service/other services which depend on it during the downtime.
+  - Helps understand how the dependent services react to the unavailability.
   - [Demo](https://asciinema.org/a/452403?speed=3&theme=solarized-dark)
 
-- Power Outages ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/power-outages.md))
-  - This scenario imitates a power outage by shutting down of the entire cluster for a specified duration of time, restarts all the nodes after the specified time and checks the health of the cluster.
-  - There are various use cases in the customer environments, for example, some of the clusters are shutdown in cases where the applications are not needed to run in a particular time/season in order to save costs
-  - The nodes are stopped in parallel to mimic a power outage i.e pulling off the plug
+- Power Outages ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/power-outages.md))
+  - This scenario imitates a power outage by shutting down the entire cluster for a specified duration of time, then restarts all the nodes after the specified time and checks the health of the cluster.
+  - There are various use cases in the customer environments. For example, some clusters are shut down when the applications are not needed to run at a particular time/season in order to save costs.
+  - The nodes are stopped in parallel to mimic a power outage, i.e., pulling the plug.
  - [Demo](https://asciinema.org/a/r0zLbh70XK7gnc4s5v0ZzSXGo)
 
-- Resource hog
+- Resource Hog
   - Hogs CPU, Memory and IO on the targeted nodes
-  - Helps understand if the application/system components have reserved resources to not get disrupted because of rogue applications or get performance throttled.
-    - CPU Hog ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/node-cpu-hog.md), [Demo](https://asciinema.org/a/452762))
-    - Memory Hog ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/node-memory-hog.md), [Demo](https://asciinema.org/a/452742?speed=3&theme=solarized-dark))
-    - IO Hog ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/node-io-hog.md))
+  - Helps understand if the application/system components have reserved enough resources to not get disrupted or performance throttled because of rogue applications.
+    - CPU Hog ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/node-cpu-hog.md), [Demo](https://asciinema.org/a/452762))
+    - Memory Hog ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/node-memory-hog.md), [Demo](https://asciinema.org/a/452742?speed=3&theme=solarized-dark))
+    - IO Hog ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/node-io-hog.md))
 
-- Time Skewing ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/time-scenarios.md))
-  - Manipulate the system time and/or date of specific pods/nodes
-  - verify scheduling of objects continue to work
-  - Verify time gets reset properly
+- Time Skewing ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/time-scenarios.md))
+  - Manipulate the system time and/or date of specific pods/nodes.
+  - Verify that scheduling of objects continues to work.
+  - Verify time gets reset properly.
 
-- Namespace failures ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/namespace-scenarios.md))
-  - Delete namespaces for the specified duration
-  - Helps understand the impact on other components and test/improve recovery time of the components in the targeted namespace
+- Namespace Failures ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/namespace-scenarios.md))
+  - Delete namespaces for the specified duration.
+  - Helps understand the impact on other components and tests/improves recovery time of the components in the targeted namespace.
 
-- Persistent volume fill ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/pvc-scenarios.md))
-  - Fills up the persistent volumes, up to a given percentage, used by the pod for the specified duration
-  - Helps understand how an application deals when it’s no longer able to write data to the disk. For example kafka’s behavior when it’s not able to commit data to the disk.
+- Persistent Volume Fill ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/pvc-scenarios.md))
+  - Fills up the persistent volumes used by the pod, up to a given percentage, for the specified duration.
+  - Helps understand how an application copes when it is no longer able to write data to the disk. For example, kafka’s behavior when it is not able to commit data to the disk.
-- Network Chaos ([Documentation](https://github.com/cloud-bulldozer/kraken-hub/blob/main/docs/network-chaos.md))
+- Network Chaos ([Documentation](https://github.com/chaos-kubox/krkn-hub/blob/main/docs/network-chaos.md))
   - Scenarios supported includes:
     - Network latency
     - Packet loss
@@ -182,47 +182,47 @@ Let’s take a look at how to run the chaos scenarios on your OpenShift clusters
 
 ### Test Environment Recommendations - how and where to run chaos tests
 
-Let’s take a look at few recommendations on how and where to run the chaos tests:
+Let us take a look at a few recommendations on how and where to run the chaos tests:
 
-- Run the chaos tests continuously in your test pipelines
-  - Software, systems, and infrastructure does change – and the condition/health of each can change pretty rapidly. A good place to run the tests is in your CI/CD pipeline running on a regular cadence.
+- Run the chaos tests continuously in your test pipelines:
  - Software, systems, and infrastructure do change – and the condition/health of each can change pretty rapidly. A good place to run tests is in your CI/CD pipeline running on a regular cadence.
 
-- Run the chaos tests manually to learn from the system
-  - When running a Chaos scenario or a Fault tests, it’s more important to understand how the system respond and reacts rather than mark the execution as pass or failed.
-  - It’s important to define the scope of the test before the execution to avoid some issues from masking others.
+- Run the chaos tests manually to learn from the system:
+  - When running a Chaos scenario or Fault tests, it is more important to understand how the system responds and reacts than to mark the execution as pass or fail.
+  - It is important to define the scope of the test before the execution to avoid some issues from masking others.
 
 - Run the chaos tests in production environments or mimic the load in staging environments:
   - As scary as a thought about testing in production is, production is the environment that users are in and traffic spikes/load are real. To fully test the robustness/resilience of a production system, running Chaos Engineering experiments in a production environment will provide needed insights. A couple of things to keep in mind:
     - Minimize blast radius and have a backup plan in place to make sure the users and customers do not undergo downtime.
-    - Mimic the load in a staging environment in case Service Level Agreements are tight to cover any downtime.
+    - Mimic the load in a staging environment in case Service Level Agreements are too tight to cover any downtime.
 
 - Enable Observability:
-  - Chaos Engineering Without Observability ... Is Just Chaos
-  - Make sure to have logging and monitoring installed on the cluster to help with understanding the behaviour as to why it’s happening. In case of running the tests in the CI where it’s not humanly possible to monitor the cluster all the time, it’s recommended to leverage Cerberus to capture the state during the runs and metrics collection in Kraken to store metrics long term even after the cluster is gone.
+  - Chaos Engineering Without Observability ... Is Just Chaos.
+  - Make sure to have logging and monitoring installed on the cluster to help understand the behaviour and why it is happening.
When running the tests in CI, where it is not humanly possible to monitor the cluster all the time, it is recommended to leverage Cerberus to capture the state during the runs, and metrics collection in Kraken to store metrics long term even after the cluster is gone.
   - Kraken ships with dashboards that will help understand API, Etcd and OpenShift cluster level stats and performance metrics.
   - Pay attention to Prometheus alerts. Check if they are firing as expected.
 
-- Run multiple chaos tests at once to mimic the production outages
+- Run multiple chaos tests at once to mimic the production outages:
   - For example, hogging both IO and Network at the same time instead of running them separately to observe the impact.
-  - You might have existing test cases, be it related to Performance, Scalability or QE, run the chaos in the background during the test runs to observe the impact. Signaling feature in Kraken can help with coordinating the chaos runs i.e start, stop, pause the scenarios based on the state of the other test jobs.
+  - You might have existing test cases, be it related to Performance, Scalability or QE. Run the chaos in the background during the test runs to observe the impact. The signaling feature in Kraken can help coordinate the chaos runs, i.e., start, stop or pause the scenarios based on the state of the other test jobs.
 
 #### Chaos testing in Practice within the OpenShift Organization
 
-Within the OpenShift organization we use kraken to perform chaos testing throughout a release before the code is available to customers
+Within the OpenShift organization we use kraken to perform chaos testing throughout a release before the code is available to customers.
 
-  1. We execute kraken during our regression test suite
+  1. We execute kraken during our regression test suite.
 
-      i. We cover each of the chaos scenarios across different clouds
+      i. We cover each of the chaos scenarios across different clouds.
 
-          a. Our testing is predominantly done on AWS, Azure and GCP
+          a. Our testing is predominantly done on AWS, Azure and GCP.
 
-  2. We run the chaos scenarios during a long running reliability test
+  2. We run the chaos scenarios during a long running reliability test.
 
-      i. During this test we perform different types of tasks by different users on the cluster
+      i. During this test we perform different types of tasks by different users on the cluster.
 
-      ii. We have added the execution of kraken to perform a certain times throughout the long running test and monitor the health of the cluster
+      ii. We have added the execution of kraken at certain times throughout the long running test, and we monitor the health of the cluster.
 
      iii. This test can be seen here: https://github.com/openshift/svt/tree/master/reliability-v2
 
-  3. We are starting to add in test cases that perform chaos testing during an upgrade (not many iterations of this have been completed)
+  3. We are starting to add in test cases that perform chaos testing during an upgrade (not many iterations of this have been completed).
 
diff --git a/docs/installation.md b/docs/installation.md
index 94f2be64..7550660f 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -1,15 +1,15 @@
 ## Installation
 
-Following ways are supported to run Kraken:
+The following ways are supported to run Kraken:
 
-- Standalone python program through Git
-- Containerized version using either Podman or Docker as the runtime
-- Kubernetes or OpenShift deployment
+- Standalone python program through Git.
+- Containerized version using either Podman or Docker as the runtime.
+- Kubernetes or OpenShift deployment.
 
 **NOTE**: It is recommended to run Kraken external to the cluster ( Standalone or Containerized ) hitting the Kubernetes/OpenShift API as running it internal to the cluster might be disruptive to itself and also might not report back the results if the chaos leads to cluster's API server instability.
 
 **NOTE**: To run Kraken on Power (ppc64le) architecture, build and run a containerized version by following the
-         instructions given [here](https://github.com/cloud-bulldozer/krkn/blob/master/containers/build_own_image-README.md).
+         instructions given [here](https://github.com/chaos-kubox/krkn/blob/main/containers/build_own_image-README.md).
 
 ### Git
 
@@ -50,8 +50,8 @@ $ podman run --name=kraken --net=host -v :/root/.kube/config
 $ podman logs -f kraken
 ```
 
-If you want to build your own kraken image see [here](https://github.com/cloud-bulldozer/krkn/blob/master/containers/build_own_image-README.md)
+If you want to build your own kraken image, see [here](https://github.com/chaos-kubox/krkn/blob/main/containers/build_own_image-README.md).
 
 ### Run Kraken as a Kubernetes deployment
 
-Refer [Instructions](https://github.com/openshift-scale/kraken/blob/master/containers/README.md) on how to deploy and run Kraken as a Kubernetes/OpenShift deployment.
+Refer to the [instructions](https://github.com/chaos-kubox/krkn/blob/main/containers/README.md) on how to deploy and run Kraken as a Kubernetes/OpenShift deployment.
diff --git a/docs/litmus_scenarios.md b/docs/litmus_scenarios.md
index be0b719d..e94db8d0 100644
--- a/docs/litmus_scenarios.md
+++ b/docs/litmus_scenarios.md
@@ -1,41 +1,41 @@
 ### Litmus Scenarios
 Kraken consumes [Litmus](https://github.com/litmuschaos/litmus) under the hood for some scenarios
-Official Litmus documentation and to read more information on specifics of Litmus resources can be found [here](https://docs.litmuschaos.io/docs/next/getstarted/)
+Official Litmus documentation and specifics of Litmus resources can be found [here](https://docs.litmuschaos.io/docs/next/getstarted/).
 
 #### Litmus Chaos Custom Resources
 There are 3 custom resources that are created during each Litmus scenario. Below is a description of the resources:
-* ChaosEngine: A resource to link a Kubernetes application or Kubernetes node to a ChaosExperiment. ChaosEngine is watched by Litmus' Chaos-Operator which then invokes Chaos-Experiments
+* ChaosEngine: A resource to link a Kubernetes application or Kubernetes node to a ChaosExperiment. ChaosEngine is watched by Litmus' Chaos-Operator which then invokes Chaos-Experiments.
 * ChaosExperiment: A resource to group the configuration parameters of a chaos experiment. ChaosExperiment CRs are created by the operator when experiments are invoked by ChaosEngine.
 * ChaosResult : A resource to hold the results of a chaos-experiment. The Chaos-exporter reads the results and exports the metrics into a configured Prometheus server.
 
 ### Understanding Litmus Scenarios
-To run Litmus scenarios we need to apply 3 different resources/yaml files to our cluster
-1. **Chaos experiments** contain the actual chaos details of a scenario
+To run Litmus scenarios we need to apply 3 different resources/yaml files to our cluster.
+1. **Chaos experiments** contain the actual chaos details of a scenario.
 
-    i. This is installed automatically by Kraken (does not need to be specified in kraken scenario configuration)
+    i.
This is installed automatically by Kraken (does not need to be specified in the kraken scenario configuration).
 
-2. **Service Account**: should be created to allow chaosengine to run experiments in your application namespace. Usually sets just enough permissions to a specific namespace to be able to run the experiment properly
+2. **Service Account**: should be created to allow chaosengine to run experiments in your application namespace. Usually it sets just enough permissions to a specific namespace to be able to run the experiment properly.
 
-    i. This can be defined using either a link to a yaml file or a downloaded file in the scenarios folder
+    i. This can be defined using either a link to a yaml file or a downloaded file in the scenarios folder.
 
-3. **Chaos Engine** connects the application instance to a Chaos Experiment. This is where you define the specifics of your scenario; ie: the node or pod name you want to cause chaos within
+3. **Chaos Engine** connects the application instance to a Chaos Experiment. This is where you define the specifics of your scenario, e.g. the node or pod name you want to cause chaos within.
 
-    i. This is a downloaded yaml file in the scenarios folder, full list of scenarios can be found [here](https://hub.litmuschaos.io/)
+    i. This is a downloaded yaml file in the scenarios folder. A full list of scenarios can be found [here](https://hub.litmuschaos.io/).
 
-**NOTE**: By default all chaos experiments will be installed based on the version you give in the config file.
+**NOTE**: By default, all chaos experiments will be installed based on the version you give in the config file.
 
 Adding a new Litmus based scenario is as simple as adding references to 2 new yaml files (the Service Account and Chaos engine files for your scenario ) in the Kraken config.
 
### Supported scenarios
-Following are the start of scenarios for which a chaos scenario config exists today.
+The following are the initial scenarios for which a chaos scenario config exists today.
 
-Scenario | Description | Working
------------------------- | ---------------------------------------------------------------------------------------------------| ------------------------- |
-[Node CPU Hog](https://github.com/cloud-bulldozer/kraken/blob/master/scenarios/node_cpu_hog_engine.yaml) | Chaos scenario that hogs up the CPU on a defined node for a specific amount of time | :heavy_check_mark: |
-[Node Memory Hog](https://github.com/cloud-bulldozer/kraken/blob/master/scenarios/node_mem_engine.yaml) | Chaos scenario that hogs up the memory on a defined node for a specific amount of time | :heavy_check_mark: |
-[Node IO Hog](https://github.com/cloud-bulldozer/kraken/blob/master/scenarios/node_io_engine.yaml) | Chaos scenario that hogs up the IO on a defined node for a specific amount of time | :heavy_check_mark: |
+Scenario | Description | Working
+------------------------ |-----------------------------------------------------------------------------------------| ------------------------- |
+[Node CPU Hog](https://github.com/chaos-kubox/krkn/blob/main/scenarios/node_cpu_hog_engine.yaml) | Chaos scenario that hogs up the CPU on a defined node for a specific amount of time. | :heavy_check_mark: |
+[Node Memory Hog](https://github.com/chaos-kubox/krkn/blob/main/scenarios/node_mem_engine.yaml) | Chaos scenario that hogs up the memory on a defined node for a specific amount of time.
| :heavy_check_mark: | +[Node IO Hog](https://github.com/chaos-kubox/krkn/blob/main/scenarios/node_io_engine.yaml) | Chaos scenario that hogs up the IO on a defined node for a specific amount of time. | :heavy_check_mark: | diff --git a/docs/metrics.md b/docs/metrics.md index 1dc6f1f7..89090c5e 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -1,22 +1,22 @@ ## Scraping and storing metrics for the run -There are cases where the state of the cluster and metrics on the cluster during the chaos test run need to be stored long term to review after the cluster is terminated, for example CI and automation test runs. To help with this, Kraken supports capturing metrics for the duration of the scenarios defined in the config and indexes them into Elasticsearch. The indexed metrics can be visualized with the help of Grafana. +There are cases where the state of the cluster and metrics on the cluster during the chaos test run need to be stored long term to review after the cluster is terminated, for example CI and automation test runs. To help with this, Kraken supports capturing metrics for the duration of the scenarios defined in the config and indexes them into Elasticsearch. The indexed metrics can be visualized with the help of Grafana. -It uses [Kube-burner](https://github.com/cloud-bulldozer/kube-burner) under the hood. The metrics to capture need to be defined in a metrics profile which Kraken consumes to query prometheus ( installed by default in OpenShift ) with the start and end timestamp of the run. Each run has a unique identifier ( uuid ) and all the metrics/documents in Elasticsearch will be associated with it. The uuid is generated automatially if not set in the config. This feature can be enabled in the [config](https://github.com/cloud-bulldozer/kraken/blob/master/config/config.yaml) by setting the following: +It uses [Kube-burner](https://github.com/cloud-bulldozer/kube-burner) under the hood. The metrics to capture need to be defined in a metrics profile which Kraken consumes to query prometheus ( installed by default in OpenShift ) with the start and end timestamp of the run. Each run has a unique identifier ( uuid ) and all the metrics/documents in Elasticsearch will be associated with it. The uuid is generated automatically if not set in the config. This feature can be enabled in the [config](https://github.com/chaos-kubox/krkn/blob/main/config/config.yaml) by setting the following: ``` performance_monitoring: kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" capture_metrics: True - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config + config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config. metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. - uuid: # uuid for the run is generated by default if not set + uuid: # uuid for the run is generated by default if not set. 
```
 
 ### Metrics profile
 
-A couple of [metric profiles](https://github.com/cloud-bulldozer/kraken/tree/master/config) ( [metrics.yaml](https://github.com/cloud-bulldozer/kraken/tree/master/config/metrics.yaml) and [metrics-aggregated.yaml](https://github.com/cloud-bulldozer/kraken/tree/master/config/metrics-aggregated.yaml) are shipped by default and they can be tweaked to add more metrics to capture during the run. Following are the API server metrics for example:
+A couple of [metric profiles](https://github.com/chaos-kubox/krkn/tree/main/config), [metrics.yaml](https://github.com/chaos-kubox/krkn/blob/main/config/metrics.yaml) and [metrics-aggregated.yaml](https://github.com/chaos-kubox/krkn/blob/main/config/metrics-aggregated.yaml), are shipped by default and can be tweaked to add more metrics to capture during the run. The following are the API server metrics, for example:
 
 ```
 metrics:
diff --git a/docs/namespace_scenarios.md b/docs/namespace_scenarios.md
index 3d6bbc80..f78fd3eb 100644
--- a/docs/namespace_scenarios.md
+++ b/docs/namespace_scenarios.md
@@ -1,22 +1,22 @@
 ### Delete Namespace Scenarios
 
-Using this type of scenario configuration, one is able to delete specific namespace or namespace matching a certain regex string
+Using this type of scenario configuration one is able to delete a specific namespace, or a namespace matching a certain regex string.
 
 Configuration Options:
 
-**action:** default is `delete`
+**action:** Default is `delete`.
 
-**namespace:** specific namespace or regex style namespace of what you want to delete, gets all namespaces if not specified; set to "" if you want to use the label_selector field
+**namespace:** Specific namespace or regex style namespace of what you want to delete. Gets all namespaces if not specified; set to "" if you want to use the label_selector field.
 
-Set to '^.*$' and label_selector to "" to randomly select any namespace in your cluster
+Set to '^.*$' and label_selector to "" to randomly select any namespace in your cluster.
 
-**label_selector:** label on the namespace you want to delete, set to "" if you are using the namespace variable
+**label_selector:** Label on the namespace you want to delete. Set to "" if you are using the namespace variable.
 
-**delete_count:** number of namespaces to kill in each run, based on matching namespace and label specified, default is 1
+**delete_count:** Number of namespaces to kill in each run, based on the matching namespace and label specified. Default is 1.
 
-**runs:** number of runs/iterations to kill namespaces, default is 1
+**runs:** Number of runs/iterations to kill namespaces. Default is 1.
 
-**sleep:** number of seconds to wait between each iteration/count of killing namespaces. Defaults to 10 seconds if not set
+**sleep:** Number of seconds to wait between each iteration/count of killing namespaces. Defaults to 10 seconds if not set.
 
 Refer to [namespace_scenarios_example](https://github.com/chaos-kubox/krkn/blob/main/scenarios/regex_namespace.yaml) config file.
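Putting the options above together, a minimal delete-namespace scenario could look like the following sketch. The regex and counts are illustrative values rather than defaults; see the referenced example config for the canonical layout:

```
scenarios:
- action: delete          # Delete the matching namespace(s).
  namespace: "^.*$"       # Regex style namespace; '^.*$' matches any namespace.
  label_selector: ""      # Unset because the namespace regex is used instead.
  delete_count: 1         # Number of matching namespaces to kill per run.
  runs: 1                 # Number of iterations.
  sleep: 15               # Seconds to wait between iterations.
```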
@@ -31,16 +31,16 @@ scenarios:
       sleep: 15
 ```
 
-**NOTE:** Many openshift namespaces have finalizers built that protect the namespace from being fully deleted: see documentation [here](https://kubernetes.io/blog/2021/05/14/using-finalizers-to-control-deletion/)
-The namespaces that do have finalizers enabled will be in left in a terminating state but all the pods running on that namespace will get deleted
+**NOTE:** Many openshift namespaces have finalizers built in that protect the namespace from being fully deleted: see the documentation [here](https://kubernetes.io/blog/2021/05/14/using-finalizers-to-control-deletion/).
+The namespaces that do have finalizers enabled will be left in a terminating state, but all the pods running in that namespace will get deleted.
 
 #### Post Action
 
-In all scenarios we do a post chaos check to wait and verify the specific component
+In all scenarios we do a post chaos check to wait for and verify that the specific component recovers.
 
 Here there are two options:
 
-1. Pass a custom script in the main config scenario list, that will run before the chaos and verify the output matches post chaos scenario
+1. Pass a custom script in the main config scenario list; it will run before the chaos, and its output is checked to match after the chaos scenario.
 
 See [scenarios/post_action_namespace.py](https://github.com/cloud-bulldozer/krak for an example
 
@@ -52,7 +52,7 @@ See [scenarios/post_action_namespace.py](https://github.com/cloud-bulldozer/krak
 
 2. Allow kraken to wait and check the killed namespaces become 'Active' again. Kraken keeps a list of the specific
-namespaces that were killed to verify all that were affected recover properly
+namespaces that were killed to verify all that were affected recover properly.
 
 ```
 wait_time:
diff --git a/docs/network_chaos.md b/docs/network_chaos.md
index edebf9ed..0de1f6e4 100644
--- a/docs/network_chaos.md
+++ b/docs/network_chaos.md
@@ -1,16 +1,16 @@
 ### Network chaos
 
-Scenario to introduce network latency, packet loss, bandwidth restriction in the Node's hostnework interface. The purpose of this scenario is to observe faults caused by random variations in the network.
+Scenario to introduce network latency, packet loss, and bandwidth restriction in the Node's host network interface. The purpose of this scenario is to observe faults caused by random variations in the network.
 
 ##### Sample scenario config
 ```
 network_chaos:                                    # Scenario to create an outage by simulating random variations in the network.
-  duration: 300                                   # in seconds - during with network chaos will be applied.
-  node_name:                                      # comma separated node names on which scenario has to be injected.
-  label_selector: node-role.kubernetes.io/master  # when node_name is not specified, a node with matching label_selector is selected for running the scenario.
-  instance_count: 1                               # Number of nodes to execute network chaos in.
+  duration: 300                                   # In seconds - the duration for which network chaos will be applied.
+  node_name:                                      # Comma separated node names on which the scenario has to be injected.
+  label_selector: node-role.kubernetes.io/master  # When node_name is not specified, a node with matching label_selector is selected for running the scenario.
+  instance_count: 1                               # Number of nodes on which to execute network chaos.
   interfaces:                                     # List of interface on which to apply the network restriction.
   - "ens5"                                        # Interface name would be the Kernel host network interface name.
-  execution: serial|parallel                      # Execute each of the egress option as a single scenario(parallel) or as separate scenario(serial).
+  execution: serial|parallel                      # Execute each of the egress options as a single scenario (parallel) or as separate scenarios (serial).
   egress:
     latency: 50ms
     loss: 0.02                                    # percentage
@@ -18,8 +18,8 @@ network_chaos:                                    # Scenario to create an outage
 ```
 
 ##### Steps
- - Pick the nodes to introduce the network anomly either from node_name or label_selector.
- - Verify interface list in one of the node or use the interface with default route, as test interface, if no interface is specified by the user.
+ - Pick the nodes to introduce the network anomaly on, either from node_name or label_selector.
+ - Verify the interface list on one of the nodes, or use the interface with the default route as the test interface if none is specified by the user.
  - Set traffic shaping config on node's interface using tc and netem.
  - Wait for the duration time.
  - Remove traffic shaping config on node's interface.
diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md
index d4d890f0..9ef1b066 100644
--- a/docs/node_scenarios.md
+++ b/docs/node_scenarios.md
@@ -1,29 +1,29 @@
 ### Node Scenarios
 
-Following node chaos scenarios are supported:
+The following node chaos scenarios are supported:
 
-1. **node_start_scenario**: scenario to stop the node instance.
-2. **node_stop_scenario**: scenario to stop the node instance.
-3. **node_stop_start_scenario**: scenario to stop and then start the node instance.
-4. **node_termination_scenario**: scenario to terminate the node instance.
-5. **node_reboot_scenario**: scenario to reboot the node instance.
-6. **stop_kubelet_scenario**: scenario to stop the kubelet of the node instance.
-7. **stop_start_kubelet_scenario**: scenario to stop and start the kubelet of the node instance.
-8. **node_crash_scenario**: scenario to crash the node instance.
-9. **stop_start_helper_node_scenario**: scenario to stop and start the helper node and check service status.
+1. **node_start_scenario**: Scenario to start the node instance.
+2. **node_stop_scenario**: Scenario to stop the node instance.
+3. **node_stop_start_scenario**: Scenario to stop and then start the node instance.
+4. **node_termination_scenario**: Scenario to terminate the node instance.
+5. **node_reboot_scenario**: Scenario to reboot the node instance.
+6. **stop_kubelet_scenario**: Scenario to stop the kubelet of the node instance.
+7. **stop_start_kubelet_scenario**: Scenario to stop and start the kubelet of the node instance.
+8. **node_crash_scenario**: Scenario to crash the node instance.
+9. **stop_start_helper_node_scenario**: Scenario to stop and start the helper node and check service status.
 
-**NOTE**: If the node doesn't recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
+**NOTE**: If the node does not recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
 
 **NOTE**: node_start_scenario, node_stop_scenario, node_stop_start_scenario, node_termination_scenario , node_reboot_scenario and stop_start_kubelet_scenario are supported only on AWS, Azure, OpenStack, BareMetal, GCP , and Alibaba as of now.
 
-**NOTE**: Node scenarios are supported only when running the standalone version of Kraken until https://github.com/cloud-bulldozer/kraken/issues/106 gets fixed.
+**NOTE**: Node scenarios are supported only when running the standalone version of Kraken until https://github.com/chaos-kubox/krkn/issues/106 gets fixed.
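Before the per-cloud setup notes that follow, here is a minimal sketch of how one of the actions listed above is wired into the kraken config. The values are illustrative only; the full example config is shown later in this file:

```
node_scenarios:
  - actions:                                        # One or more of the node chaos scenarios listed above.
    - node_reboot_scenario
    label_selector: node-role.kubernetes.io/worker  # Used to pick a node when node_name is not set.
    instance_count: 1                               # Number of matching nodes to act on.
    timeout: 120                                    # Seconds to wait for the scenario injection to complete.
    cloud_type: aws                                 # Cloud platform hosting the nodes.
```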
#### AWS -How to set up AWS cli to run node scenarios is defined [here](cloud_setup.md#aws) +How to set up AWS cli to run node scenarios is defined [here](cloud_setup.md#aws). #### Baremetal **NOTE**: Baremetal requires setting the IPMI user and password to power on, off, and reboot nodes, using the config options `bm_user` and `bm_password`. It can either be set in the root of the entry in the scenarios config, or it can be set per machine. @@ -31,34 +31,34 @@ How to set up AWS cli to run node scenarios is defined [here](cloud_setup.md#aws If no per-machine addresses are specified, kraken attempts to use the BMC value in the BareMetalHost object. To list them, you can do 'oc get bmh -o wide --all-namespaces'. If the BMC values are blank, you must specify them per-machine using the config option 'bmc_addr' as specified below. For per-machine settings, add a "bmc_info" section to the entry in the scenarios config. Inside there, add a configuration section using the node name. In that, add per-machine settings. Valid settings are 'bmc_user', 'bmc_password', and 'bmc_addr'. -For examples, see the example node scenario or the example below. +See the example node scenario or the example below. **NOTE**: Baremetal requires oc (openshift client) be installed on the machine running Kraken. **NOTE**: Baremetal machines are fragile. Some node actions can occasionally corrupt the filesystem if it does not shut down properly, and sometimes the kubelet does not start properly. #### GCP -How to set up GCP cli to run node scenarios is defined [here](cloud_setup.md#gcp) +How to set up GCP cli to run node scenarios is defined [here](cloud_setup.md#gcp). #### Openstack -How to set up Openstack cli to run node scenarios is defined [here](cloud_setup.md#openstack) +How to set up Openstack cli to run node scenarios is defined [here](cloud_setup.md#openstack). The supported node level chaos scenarios on an OPENSTACK cloud are `node_stop_start_scenario`, `stop_start_kubelet_scenario` and `node_reboot_scenario`. -**NOTE**: For `stop_start_helper_node_scenario`, visit [here](https://github.com/RedHatOfficial/ocp4-helpernode) to learn more about the helper node and its usage. +**NOTE**: For `stop_start_helper_node_scenario`, visit [here](https://github.com/redhat-cop/ocp4-helpernode) to learn more about the helper node and its usage. To execute the scenario, ensure the value for `ssh_private_key` in the node scenarios config file is set with the correct private key file path for ssh connection to the helper node. Ensure passwordless ssh is configured on the host running Kraken and the helper node to avoid connection errors. #### Azure -How to set up Azure cli to run node scenarios is defined [here](cloud_setup.md#azure) +How to set up Azure cli to run node scenarios is defined [here](cloud_setup.md#azure). #### Alibaba -How to set up Alibaba cli to run node scenarios is defined [here](cloud_setup.md#alibaba) +How to set up Alibaba cli to run node scenarios is defined [here](cloud_setup.md#alibaba). **NOTE**: There is no "terminating" idea in Alibaba, so any scenario with terminating will "release" the node . Releasing a node is 2 steps, stopping the node and then releasing it. @@ -68,23 +68,23 @@ How to set up Alibaba cli to run node scenarios is defined [here](cloud_setup.md **NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenario is supported independent of the cloud platform. 
-Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the current supported cloud types
+Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the currently supported cloud types.
 
-Node scenarios can be injected by placing the node scenarios config files under node_scenarios option in the kraken config. Refer to [node_scenarios_example](https://github.com/openshift-scale/kraken/blob/master/scenarios/node_scenarios_example.yml) config file.
+Node scenarios can be injected by placing the node scenarios config files under the node_scenarios option in the kraken config. Refer to the [node_scenarios_example](https://github.com/chaos-kubox/krkn/blob/main/scenarios/node_scenarios_example.yml) config file.
 
 ```
 node_scenarios:
-  - actions:                                        # node chaos scenarios to be injected
+  - actions:                                        # Node chaos scenarios to be injected.
     - node_stop_start_scenario
     - stop_start_kubelet_scenario
     - node_crash_scenario
-    node_name:                                      # node on which scenario has to be injected
-    label_selector: node-role.kubernetes.io/worker  # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection
-    instance_count: 1                               # Number of nodes to perform action/select that match the label selector
-    runs: 1                                         # number of times to inject each scenario under actions (will perform on same node each time)
-    timeout: 120                                    # duration to wait for completion of node scenario injection
-    cloud_type: aws                                 # cloud type on which Kubernetes/OpenShift runs
+    node_name:                                      # Node on which the scenario has to be injected.
+    label_selector: node-role.kubernetes.io/worker  # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection.
+    instance_count: 1                               # Number of nodes to perform the action on/select that match the label selector.
+    runs: 1                                         # Number of times to inject each scenario under actions (will perform on same node each time).
+    timeout: 120                                    # Duration to wait for completion of node scenario injection.
+    cloud_type: aws                                 # Cloud type on which Kubernetes/OpenShift runs.
   - actions:
     - node_reboot_scenario
     node_name:
@@ -99,15 +99,15 @@ node_scenarios:
     instance_count: 1
     timeout: 120
   - actions:
-    - stop_start_helper_node_scenario               # node chaos scenario for helper node
+    - stop_start_helper_node_scenario               # Node chaos scenario for the helper node.
     instance_count: 1
     timeout: 120
-    helper_node_ip:                                 # ip address of the helper node
-    service:                                        # check status of the services on the helper node
+    helper_node_ip:                                 # IP address of the helper node.
+    service:                                        # Check the status of the services on the helper node.
     - haproxy
     - dhcpd
     - named
-    ssh_private_key: /root/.ssh/id_rsa              # ssh key to access the helper node
+    ssh_private_key: /root/.ssh/id_rsa              # ssh key to access the helper node.
     cloud_type: openstack
   - actions:
     - node_stop_start_scenario
@@ -120,7 +120,7 @@ node_scenarios:
     bmc_password: defaultpass                       # For baremetal (bm) cloud type. The default IPMI password. Optional if specified for all machines.
     bmc_info:                                       # This section is here to specify baremetal per-machine info, so it is optional if there is no per-machine info.
       node-1:                                       # The node name for the baremetal machine
-        bmc_addr: mgmt-machine1.example.com         # Optional. For baremetal nodes with the IPMI BMC address missing from 'oc get bmh'
+        bmc_addr: mgmt-machine1.example.com         # Optional. For baremetal nodes with the IPMI BMC address missing from 'oc get bmh'.
node-2: bmc_addr: mgmt-machine2.example.com bmc_user: user # The baremetal IPMI user. Overrides the default IPMI user specified above. Optional if the default is set. diff --git a/docs/pod_scenarios.md b/docs/pod_scenarios.md index 6b963d61..0433be3a 100644 --- a/docs/pod_scenarios.md +++ b/docs/pod_scenarios.md @@ -1,14 +1,14 @@ ### Pod Scenarios Kraken consumes [Powerfulseal](https://github.com/powerfulseal/powerfulseal) under the hood to run the pod scenarios. -These scenarios are in a simple yaml format that you can manipulate to run your specific tests or use the pre-existing scenarios to see how it works +These scenarios are in a simple yaml format that you can manipulate to run your specific tests or use the pre-existing scenarios to see how it works. -#### Pod chaos scenarios +#### Pod Chaos Scenarios The following are the components of Kubernetes/OpenShift for which a basic chaos scenario config exists today. -Component | Description | Working ------------------------- | ---------------------------------------------------------------------------------------------------| ------------------------- | -[Etcd](https://github.com/cloud-bulldozer/kraken/blob/master/scenarios/etcd.yml) | Kills a single/multiple etcd replicas for the specified number of times in a loop | :heavy_check_mark: | -[Kube ApiServer](https://github.com/cloud-bulldozer/kraken/blob/master/scenarios/openshift-kube-apiserver.yml) | Kills a single/multiple kube-apiserver replicas for the specified number of times in a loop | :heavy_check_mark: | -[ApiServer](https://github.com/cloud-bulldozer/kraken/blob/master/scenarios/openshift-apiserver.yml) | Kills a single/multiple apiserver replicas for the specified number of times in a loop | :heavy_check_mark: | -[Prometheus](https://github.com/cloud-bulldozer/kraken/blob/master/scenarios/prometheus.yml) | Kills a single/multiple prometheus replicas for the specified number of times in a loop | :heavy_check_mark: | -[OpenShift System Pods](https://github.com/cloud-bulldozer/kraken/blob/master/scenarios/regex_openshift_pod_kill.yml) | Kills random pods running in the OpenShift system namespaces | :heavy_check_mark: | +Component | Description | Working +------------------------ |----------------------------------------------------------------------------------------------| ------------------------- | +[Etcd](https://github.com/chaos-kubox/krkn/blob/main/scenarios/etcd.yml) | Kills a single/multiple etcd replicas for the specified number of times in a loop. | :heavy_check_mark: | +[Kube ApiServer](https://github.com/chaos-kubox/krkn/blob/main/scenarios/openshift-kube-apiserver.yml) | Kills a single/multiple kube-apiserver replicas for the specified number of times in a loop. | :heavy_check_mark: | +[ApiServer](https://github.com/chaos-kubox/krkn/blob/main/scenarios/openshift-apiserver.yml) | Kills a single/multiple apiserver replicas for the specified number of times in a loop. | :heavy_check_mark: | +[Prometheus](https://github.com/chaos-kubox/krkn/blob/main/scenarios/prometheus.yml) | Kills a single/multiple prometheus replicas for the specified number of times in a loop. | :heavy_check_mark: | +[OpenShift System Pods](https://github.com/chaos-kubox/krkn/blob/main/scenarios/regex_openshift_pod_kill.yml) | Kills random pods running in the OpenShift system namespaces. 
| :heavy_check_mark: |
diff --git a/docs/pvc_scenario.md b/docs/pvc_scenario.md
index 351bbec2..6ff9e9d3 100644
--- a/docs/pvc_scenario.md
+++ b/docs/pvc_scenario.md
@@ -1,26 +1,26 @@
 ### PVC scenario
 
-Scenario to fill up a given PersistenVolumeClaim by creating a temp file on the PVC from a pod associated with it. The purpose of this scenario is to fill up a volume to understand faults cause by the application using this volume.
+Scenario to fill up a given PersistentVolumeClaim by creating a temp file on the PVC from a pod associated with it. The purpose of this scenario is to fill up a volume to understand faults caused by the application using this volume.
 
 ##### Sample scenario config
 ```
 pvc_scenario:
-  pvc_name:                # Name of the target PVC
-  pod_name:                # Name of the pod where the PVC is mounted, it will be ignored if the pvc_name is defined
-  namespace:               # Namespace where the PVC is
-  fill_percentage: 50      # Target percentage to fill up the cluster, value must be higher than current percentage, valid values are between 0 and 99
-  duration: 60             # Duration in seconds for the fault
+  pvc_name:                # Name of the target PVC.
+  pod_name:                # Name of the pod where the PVC is mounted. It will be ignored if the pvc_name is defined.
+  namespace:               # Namespace where the PVC is.
+  fill_percentage: 50      # Target percentage to fill up the volume. Value must be higher than the current percentage. Valid values are between 0 and 99.
+  duration: 60             # Duration in seconds for the fault.
 ```
 
 ##### Steps
- - Get the pod name where the PVC is mounted
- - Get the volume name mounted in the container pod
- - Get the container name where the PVC is mounted
- - Get the mount path where the PVC is mounted in the pod
- - Get the PVC capacity and current used capacity
- - Calculate file size to fill the PVC to the target fill_percentage
- - Connect to the pod
+ - Get the pod name where the PVC is mounted.
+ - Get the volume name mounted in the container pod.
+ - Get the container name where the PVC is mounted.
+ - Get the mount path where the PVC is mounted in the pod.
+ - Get the PVC capacity and current used capacity.
+ - Calculate the file size to fill the PVC to the target fill_percentage.
+ - Connect to the pod.
 - Create a temp file `kraken.tmp` with random data on the mount path:
    - `dd if=/dev/urandom of=/mount_path/kraken.tmp bs=1024 count=$file_size`
-  - Wait for the duration time
+ - Wait for the duration time.
 - Remove the temp file created:
    - `rm kraken.tmp`
diff --git a/docs/signal.md b/docs/signal.md
index 7ae7afa1..f9411e55 100644
--- a/docs/signal.md
+++ b/docs/signal.md
@@ -1,30 +1,30 @@
 ### Signaling to Kraken
-This functionality allows a user to be able to pause or stop the kraken run at any time no matter the number of iterations or daemon_mode set in the config
+This functionality allows a user to pause or stop the kraken run at any time, no matter the number of iterations or daemon_mode set in the config.
 
-If publish_kraken_status is set to True in the config, kraken will start up a connection to a url at a certain port to decide if it should continue running
+If publish_kraken_status is set to True in the config, kraken will start up a connection to a url at a certain port to decide if it should continue running.
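For example, an external watcher can poll the published status to gate its own work. A sketch, assuming the default address mentioned below and that the endpoint serves the signal as plain text (an assumption - verify the response format in your setup):

```
$ curl http://0.0.0.0:8081/    # Expected to return the current signal: RUN, PAUSE or STOP
```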
-By default it will get posted to http://0.0.0.0:8081/
+By default, it will get posted to http://0.0.0.0:8081/
 
-An example use case for this feature would be coordinating kraken runs based on the status of the service installation or load on the cluster
+An example use case for this feature would be coordinating kraken runs based on the status of the service installation or load on the cluster.
 
 #### States
-There are 3 states in the kraken status
+There are 3 states in the kraken status:
 
-```PAUSE```: When the Kraken signal is 'PAUSE', this will pause the kraken test and wait for the wait_duration until the signal returns to RUN
+```PAUSE```: When the Kraken signal is 'PAUSE', this will pause the kraken test and wait for the wait_duration until the signal returns to RUN.
 
-```STOP```: When the Kraken signal is 'STOP', end the kraken run and print out report
+```STOP```: When the Kraken signal is 'STOP', end the kraken run and print out the report.
 
-```RUN```: When the Kraken signal is 'RUN', continue kraken run based on iterations
+```RUN```: When the Kraken signal is 'RUN', continue the kraken run based on iterations.
 
 #### Configuration
-In the config you need to set these parameters to tell kraken which port to post the kraken run status to
-As well if you want to publish and stop running based on the kraken status or not
-The signal is set to `RUN` by default meaning it will continue to run the scenarios, it can set to `PAUSE` for Kraken to act as listener and wait until set to `RUN` before injecting chaos
+In the config you need to set these parameters to tell kraken which port to post the kraken run status to,
+as well as whether you want to publish and stop running based on the kraken status or not.
+The signal is set to `RUN` by default, meaning it will continue to run the scenarios. It can be set to `PAUSE` for Kraken to act as a listener and wait until it is set to `RUN` before injecting chaos.
 ```
 port: 8081
 publish_kraken_status: True
@@ -34,9 +34,9 @@ The signal is set to `RUN` by default meaning it will continue to run the scenar
 
 #### Setting Signal
 
-See [set_stop_signal.py](https://github.com/cloud-bulldozer/kraken/tree/master/set_stop_signal.py) for an example of how to reset the kraken status during kraken execution
+See [set_stop_signal.py](https://github.com/chaos-kubox/krkn/blob/main/set_stop_signal.py) for an example of how to reset the kraken status during kraken execution.
 
-Make sure to set the correct port number in your set_stop_signal script
+Make sure to set the correct port number in your set_stop_signal script.
diff --git a/docs/test_your_changes.md b/docs/test_your_changes.md
index dcfbc5db..d72d89de 100644
--- a/docs/test_your_changes.md
+++ b/docs/test_your_changes.md
@@ -15,30 +15,30 @@ Scenario Types:
 * application-outages
 
 ## Adding a New Scenario
-1. Create folder under [kraken/kraken](../kraken) with name pertinent to your scenario name
+1. Create a folder under [kraken/kraken](../kraken) with a name pertinent to your scenario.
 
-2. Create a python file that will have a generic run function to be the base of your scenario
+2. Create a python file that will have a generic run function to be the base of your scenario.
 
-    a. See [shut_down.py](../kraken/shut_down/common_shut_down_func.py) for example
+    a. See [shut_down.py](../kraken/shut_down/common_shut_down_func.py) for an example.
 
-3. Add in a scenario yaml file to run your specific scenario under [scenarios](../scenarios)
+3. Add in a scenario yaml file to run your specific scenario under [scenarios](../scenarios).
 
-    a.
Try to add as many parameters as possible and be sure to give them default values in your run function
+    a. Try to add as many parameters as possible, and be sure to give them default values in your run function.
 
-4. Add all functionality and helper functions in file you made above (Step 2)
+4. Add all functionality and helper functions in the file you made above (Step 2).
 
-5. Add in caller to new scenario type in [run_kraken.py](../run_kraken.py) (around line 154)
+5. Add in a caller to the new scenario type in [run_kraken.py](../run_kraken.py) (around line 154).
 
-    a. This will also require you to add the new scenario python script to your imports
+    a. This will also require you to add the new scenario python script to your imports.
 
-6. Add scenario type and scenario yaml to the scenario list in [config](../config/config.yaml) and [config_performance](../config/config_performance.yaml)
+6. Add the scenario type and scenario yaml to the scenario list in [config](../config/config.yaml) and [config_performance](../config/config_performance.yaml).
 
-7. Update this doc and main README with new scenario type
+7. Update this doc and the main README with the new scenario type.
 
-8. Add CI test for new scenario
+8. Add a CI test for the new scenario.
 
-    a. Refer to test [Readme](../CI/README.md#adding-a-test-case) for more details
+    a. Refer to the test [Readme](../CI/README.md#adding-a-test-case) for more details.
 
 ## Follow Contribute guide
-Once all you're happy with your changes, follow the [contribution](#docs/contribute.md) guide on how to create your own branch and squash your commits
+Once you are happy with your changes, follow the [contribution](#docs/contribute.md) guide on how to create your own branch and squash your commits.
diff --git a/docs/time_scenarios.md b/docs/time_scenarios.md
index 15ea0408..6c3a8ae7 100644
--- a/docs/time_scenarios.md
+++ b/docs/time_scenarios.md
@@ -1,22 +1,22 @@
 ### Time/Date Skew Scenarios
 
-Using this type of scenario configuration, one is able to change the time and/or date of the system for pods or nodes
+Using this type of scenario configuration, one is able to change the time and/or date of the system for pods or nodes.
 
 Configuration Options:
 
-**action:** skew_time or skew_date
+**action:** skew_time or skew_date.
 
-**object_type:** pod or node
+**object_type:** pod or node.
 
-**namespace:** namespace of the pods you want to skew, need to be set if setting a specific pod name
+**namespace:** Namespace of the pods you want to skew. Needs to be set if setting a specific pod name.
 
-**label_selector:** label on the nodes or pods you want to skew
+**label_selector:** Label on the nodes or pods you want to skew.
 
-**container_name:** container name in pod you want to reset time on, if left blank it will randomly select one
+**container_name:** Container name in the pod you want to reset the time on. If left blank it will randomly select one.
 
-**object_name:** list of the names of pods or nodes you want to skew
+**object_name:** List of the names of pods or nodes you want to skew.
 
-Refer to [time_scenarios_example](https://github.com/openshift-scale/kraken/blob/master/scenarios/time_scenarios_example.yml) config file.
+Refer to [time_scenarios_example](https://github.com/chaos-kubox/krkn/blob/main/scenarios/time_scenarios_example.yml) config file.
diff --git a/docs/time_scenarios.md b/docs/time_scenarios.md
index 15ea0408..6c3a8ae7 100644
--- a/docs/time_scenarios.md
+++ b/docs/time_scenarios.md
@@ -1,22 +1,22 @@
 ### Time/Date Skew Scenarios
-Using this type of scenario configuration, one is able to change the time and/or date of the system for pods or nodes
+Using this type of scenario configuration, one is able to change the time and/or date of the system for pods or nodes.
 
 Configuration Options:
 
-**action:** skew_time or skew_date
+**action:** skew_time or skew_date.
 
-**object_type:** pod or node
+**object_type:** pod or node.
 
-**namespace:** namespace of the pods you want to skew, need to be set if setting a specific pod name
+**namespace:** Namespace of the pods you want to skew. Needs to be set if setting a specific pod name.
 
-**label_selector:** label on the nodes or pods you want to skew
+**label_selector:** Label on the nodes or pods you want to skew.
 
-**container_name:** container name in pod you want to reset time on, if left blank it will randomly select one
+**container_name:** Container name in the pod you want to reset the time on. If left blank, one will be selected randomly.
 
-**object_name:** list of the names of pods or nodes you want to skew
+**object_name:** List of the names of pods or nodes you want to skew.
 
-Refer to [time_scenarios_example](https://github.com/openshift-scale/kraken/blob/master/scenarios/time_scenarios_example.yml) config file.
+Refer to the [time_scenarios_example](https://github.com/chaos-kubox/krkn/blob/main/scenarios/time_scenarios_example.yml) config file.
 
 ```
 time_scenarios:
diff --git a/docs/zone_outage.md b/docs/zone_outage.md
index 7ea35428..8a2317cf 100644
--- a/docs/zone_outage.md
+++ b/docs/zone_outage.md
@@ -1,18 +1,18 @@
 ### Zone outage scenario
-Scenario to create outage in a targeted zone in the public cloud to understand the impact on both Kubernetes/OpenShift control plane as well as applications running on the worker nodes in that zone. It tweaks the network acl of the zone to simulate the failure and that in turn will stop both ingress and egress traffic from all the nodes in a particualar zone for the specified duration and reverts it back to the previous state. Zone outage can be injected by placing the zone_outage config file under zone_outages option in the [kraken config](https://github.com/cloud-bulldozer/kraken/blob/master/config/config.yaml). Refer to [zone_outage_scenario](https://github.com/openshift-scale/kraken/blob/master/scenarios/zone_outage.yaml) config file for the parameters that need to be defined.
+Scenario to create an outage in a targeted zone in the public cloud to understand the impact on both the Kubernetes/OpenShift control plane and the applications running on the worker nodes in that zone. It tweaks the network ACL of the zone to simulate the failure; this in turn stops both ingress and egress traffic from all the nodes in the particular zone for the specified duration, after which the ACL is reverted to its previous state. A zone outage can be injected by placing the zone_outage config file under the zone_outages option in the [kraken config](https://github.com/chaos-kubox/krkn/blob/main/config/config.yaml). Refer to the [zone_outage_scenario](https://github.com/chaos-kubox/krkn/blob/main/scenarios/zone_outage.yaml) config file for the parameters that need to be defined.
 
-Refer to [cloud setup](cloud_setup.md) to configure your cli properly for the cloud provider of the cluster you want to shut down
+Refer to [cloud setup](cloud_setup.md) to configure your CLI properly for the cloud provider of the cluster you want to shut down.
 
 ##### Current accepted cloud types:
 * [AWS](cloud_setup.md#aws)
 
 ##### Sample scenario config
 ```
-zone_outage:                                     # Scenario to create an outage of a zone by tweaking network ACL
-  cloud_type: aws                                # cloud type on which Kubernetes/OpenShift runs. aws is only platform supported currently for this scenario.
-  duration: 600                                  # duration in seconds after which the zone will be back online
-  vpc_id:                                        # cluster virtual private network to target
-  subnet_id: [subnet1, subnet2]                  # List of subnet-id's to deny both ingress and egress traffic
+zone_outage:                                     # Scenario to create an outage of a zone by tweaking network ACL.
+  cloud_type: aws                                # Cloud type on which Kubernetes/OpenShift runs. aws is the only platform currently supported for this scenario.
+  duration: 600                                  # Duration in seconds after which the zone will be back online.
+  vpc_id:                                        # Cluster virtual private network to target.
+  subnet_id: [subnet1, subnet2]                  # List of subnet-ids to deny both ingress and egress traffic on.
 ```
 
 **NOTE**: vpc_id and subnet_id can be obtained from the cloud web console by selecting one of the instances in the targeted zone ( us-west-2a for example ).
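+
+The same ids can also be looked up programmatically, as in the minimal sketch below, which assumes boto3 and configured AWS credentials (the region and zone are examples):
+
+```
+import boto3
+
+ec2 = boto3.client("ec2", region_name="us-west-2")
+subnets = ec2.describe_subnets(
+    Filters=[{"Name": "availability-zone", "Values": ["us-west-2a"]}]
+)["Subnets"]
+for subnet in subnets:
+    print(subnet["VpcId"], subnet["SubnetId"])
+```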
@@ -20,7 +20,7 @@ zone_outage:                                     # Scenario to create an out
 ##### Debugging steps in case of failures
 In case of failures during the steps which revert back the network acl to allow traffic and bring back the cluster nodes in the zone, the nodes in the particular zone will be in `NotReady` condition. Here is how to fix it:
-- OpenShift by default deploys the nodes in different zones for fault tolerance, for example us-west-2a, us-west-2b, us-west-2c. The cluster is associated with a virtual private network and each zone has it's own subnet with a network acl which defines the ingress and egress traffic rules at the zone level unlike security groups which are at an instance level.
+- OpenShift by default deploys the nodes in different zones for fault tolerance, for example us-west-2a, us-west-2b, us-west-2c. The cluster is associated with a virtual private network, and each zone has its own subnet with a network ACL which defines the ingress and egress traffic rules at the zone level, unlike security groups, which apply at the instance level.
 - From the cloud web console, select one of the instances in the zone which is down and go to the subnet_id specified in the config.
-- Look at the network acl associtated with the subnet and you will see both ingress and egress traffic being denied which is expected as Kraken deliberately injects it.
+- Look at the network ACL associated with the subnet and you will see both ingress and egress traffic being denied, which is expected, as Kraken deliberately injects it.
-- Kraken just switches the network acl while still keeping the original or default network acl around, switching to the default network acl from the drop down menu will get back the nodes in the targeted zone into Ready state.
+- Kraken just switches the network ACL while keeping the original or default network ACL around; switching to the default network ACL from the drop-down menu will bring the nodes in the targeted zone back into `Ready` state.
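+
+For reference, the drop-down switch can also be reproduced programmatically. A sketch assuming boto3 and configured AWS credentials; the region and ids are hypothetical placeholders:
+
+```
+import boto3
+
+ec2 = boto3.client("ec2", region_name="us-west-2")
+vpc_id = "vpc-xxxxxxxx"        # placeholder
+subnet_id = "subnet-xxxxxxxx"  # placeholder
+
+# The default network ACL of the VPC, which Kraken keeps around.
+default_acl = ec2.describe_network_acls(
+    Filters=[
+        {"Name": "vpc-id", "Values": [vpc_id]},
+        {"Name": "default", "Values": ["true"]},
+    ]
+)["NetworkAcls"][0]
+
+# Find the subnet's current association and point it back at the default ACL.
+acl = ec2.describe_network_acls(
+    Filters=[{"Name": "association.subnet-id", "Values": [subnet_id]}]
+)["NetworkAcls"][0]
+for assoc in acl["Associations"]:
+    if assoc["SubnetId"] == subnet_id:
+        ec2.replace_network_acl_association(
+            AssociationId=assoc["NetworkAclAssociationId"],
+            NetworkAclId=default_acl["NetworkAclId"],
+        )
+```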