mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-02-19 20:40:33 +00:00
Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
64cfd2ca4d | ||
|
|
9cb701a616 | ||
|
|
0372013b67 | ||
|
|
4fea1a354d | ||
|
|
667798d588 | ||
|
|
0c30d89a1b | ||
|
|
2ba20fa483 | ||
|
|
97035a765c | ||
|
|
10ba53574e | ||
|
|
0ecba41082 | ||
|
|
491f59d152 | ||
|
|
2549c9a146 | ||
|
|
949f1f09e0 | ||
|
|
959766254d | ||
|
|
0e68dedb12 |
2
.github/workflows/docker-image.yml
vendored
2
.github/workflows/docker-image.yml
vendored
@@ -13,6 +13,7 @@ jobs:
|
||||
- name: Build the Docker images
|
||||
if: startsWith(github.ref, 'refs/tags')
|
||||
run: |
|
||||
./containers/compile_dockerfile.sh
|
||||
docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg TAG=${GITHUB_REF#refs/tags/}
|
||||
docker tag quay.io/krkn-chaos/krkn quay.io/redhat-chaos/krkn
|
||||
docker tag quay.io/krkn-chaos/krkn quay.io/krkn-chaos/krkn:${GITHUB_REF#refs/tags/}
|
||||
@@ -21,6 +22,7 @@ jobs:
|
||||
- name: Test Build the Docker images
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
run: |
|
||||
./containers/compile_dockerfile.sh
|
||||
docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg PR_NUMBER=${{ github.event.pull_request.number }}
|
||||
- name: Login in quay
|
||||
if: startsWith(github.ref, 'refs/tags')
|
||||
|
||||
11
ROADMAP.md
11
ROADMAP.md
@@ -6,10 +6,11 @@ Following are a list of enhancements that we are planning to work on adding supp
|
||||
- [x] [Centralized storage for chaos experiments artifacts](https://github.com/krkn-chaos/krkn/issues/423)
|
||||
- [ ] [Support for causing DNS outages](https://github.com/krkn-chaos/krkn/issues/394)
|
||||
- [x] [Chaos recommender](https://github.com/krkn-chaos/krkn/tree/main/utils/chaos-recommender) to suggest scenarios having probability of impacting the service under test using profiling results
|
||||
- [ ] Chaos AI integration to improve and automate test coverage
|
||||
- [ ] Chaos AI integration to improve test coverage while reducing fault space to save costs and execution time
|
||||
- [x] [Support for pod level network traffic shaping](https://github.com/krkn-chaos/krkn/issues/393)
|
||||
- [ ] [Ability to visualize the metrics that are being captured by Kraken and stored in Elasticsearch](https://github.com/krkn-chaos/krkn/issues/124)
|
||||
- [ ] Support for running all the scenarios of Kraken on Kubernetes distribution - see https://github.com/krkn-chaos/krkn/issues/185, https://github.com/redhat-chaos/krkn/issues/186
|
||||
- [ ] Continue to improve [Chaos Testing Guide](https://krkn-chaos.github.io/krkn) in terms of adding best practices, test environment recommendations and scenarios to make sure the OpenShift platform, as well the applications running on top it, are resilient and performant under chaotic conditions.
|
||||
- [ ] [Switch documentation references to Kubernetes](https://github.com/krkn-chaos/krkn/issues/495)
|
||||
- [ ] [OCP and Kubernetes functionalities segregation](https://github.com/krkn-chaos/krkn/issues/497)
|
||||
- [x] Support for running all the scenarios of Kraken on Kubernetes distribution - see https://github.com/krkn-chaos/krkn/issues/185, https://github.com/redhat-chaos/krkn/issues/186
|
||||
- [x] Continue to improve [Chaos Testing Guide](https://krkn-chaos.github.io/krkn) in terms of adding best practices, test environment recommendations and scenarios to make sure the OpenShift platform, as well as the applications running on top of it, are resilient and performant under chaotic conditions.
|
||||
- [x] [Switch documentation references to Kubernetes](https://github.com/krkn-chaos/krkn/issues/495)
|
||||
- [x] [OCP and Kubernetes functionalities segregation](https://github.com/krkn-chaos/krkn/issues/497)
|
||||
- [x] [Krknctl - client for running Krkn scenarios with ease](https://github.com/krkn-chaos/krknctl)
|
||||
|
||||
@@ -49,6 +49,11 @@ RUN python3.9 -m ensurepip
|
||||
RUN pip3.9 install -r requirements.txt
|
||||
RUN pip3.9 install jsonschema
|
||||
|
||||
LABEL krknctl.title.global="Krkn Base Image"
|
||||
LABEL krknctl.description.global="This is the krkn base image."
|
||||
LABEL krknctl.input_fields.global='$KRKNCTL_INPUT'
|
||||
|
||||
|
||||
RUN chown -R krkn:krkn /home/krkn && chmod 755 /home/krkn
|
||||
USER krkn
|
||||
ENTRYPOINT ["python3.9", "run_kraken.py"]
|
||||
5
containers/compile_dockerfile.sh
Executable file
5
containers/compile_dockerfile.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/usr/bin/env bash
# Render Dockerfile from Dockerfile.template by substituting ${KRKNCTL_INPUT}
# with the contents of krknctl-input.json collapsed onto a single line.
# Both input files and the generated Dockerfile live next to this script.

# Abort on the first failing command, unset variable or broken pipeline so a
# missing input file cannot silently produce an incomplete Dockerfile.
set -euo pipefail

# BASH_SOURCE requires bash, hence the explicit shebang above.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Assign first, export second: `export VAR=$(cmd)` would report the exit
# status of `export` and hide a failure to read the JSON file.
KRKNCTL_INPUT="$(tr -d '\n' < krknctl-input.json)"
export KRKNCTL_INPUT

# The single quotes are intentional: envsubst receives the literal variable
# list, so only ${KRKNCTL_INPUT} is replaced and every other $VAR in the
# template is left untouched.
envsubst '${KRKNCTL_INPUT}' < Dockerfile.template > Dockerfile
|
||||
396
containers/krknctl-input.json
Normal file
396
containers/krknctl-input.json
Normal file
@@ -0,0 +1,396 @@
|
||||
[
|
||||
{
|
||||
"name": "cerberus-enabled",
|
||||
"short_description": "Enable Cerberus",
|
||||
"description": "Enables Cerberus Support",
|
||||
"variable": "CERBERUS_ENABLED",
|
||||
"type": "enum",
|
||||
"default": "False",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "cerberus-url",
|
||||
"short_description": "Cerberus URL",
|
||||
"description": "Cerberus http url",
|
||||
"variable": "CERBERUS_URL",
|
||||
"type": "string",
|
||||
"default": "http://0.0.0.0:8080",
|
||||
"validator": "^(http|https):\/\/.*",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "distribution",
|
||||
"short_description": "Orchestrator distribution",
|
||||
"description": "Selects the orchestrator distribution",
|
||||
"variable": "DISTRIBUTION",
|
||||
"type": "enum",
|
||||
"default": "openshift",
|
||||
"allowed_values": "openshift,kubernetes",
|
||||
"separator": ",",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "krkn-kubeconfig",
|
||||
"short_description": "Krkn kubeconfig path",
|
||||
"description": "Sets the path where krkn will search for kubeconfig (in container)",
|
||||
"variable": "KRKN_KUBE_CONFIG",
|
||||
"type": "string",
|
||||
"default": "/home/krkn/.kube/config",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "wait-duration",
|
||||
"short_description": "Post chaos wait duration",
|
||||
"description": "waits for a certain amount of time after the scenario",
|
||||
"variable": "WAIT_DURATION",
|
||||
"type": "number",
|
||||
"default": "1"
|
||||
},
|
||||
{
|
||||
"name": "iterations",
|
||||
"short_description": "Chaos scenario iterations",
|
||||
"description": "number of times the same chaos scenario will be executed",
|
||||
"variable": "ITERATIONS",
|
||||
"type": "number",
|
||||
"default": "1"
|
||||
},
|
||||
{
|
||||
"name": "daemon-mode",
|
||||
"short_description": "Sets krkn daemon mode",
|
||||
"description": "if set the scenario will execute forever",
|
||||
"variable": "DAEMON_MODE",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "uuid",
|
||||
"short_description": "Sets krkn run uuid",
|
||||
"description": "sets krkn run uuid instead of generating it",
|
||||
"variable": "UUID",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "capture-metrics",
|
||||
"short_description": "Enables metrics capture",
|
||||
"description": "Enables metrics capture",
|
||||
"variable": "CAPTURE_METRICS",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "enable-alerts",
|
||||
"short_description": "Enables cluster alerts check",
|
||||
"description": "Enables cluster alerts check",
|
||||
"variable": "ENABLE_ALERTS",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "alerts-path",
|
||||
"short_description": "Cluster alerts path file (in container)",
|
||||
"description": "Path to the alerts file used by the cluster alerts check (in container)",
|
||||
"variable": "ALERTS_PATH",
|
||||
"type": "string",
|
||||
"default": "config/alerts.yaml",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "enable-es",
|
||||
"short_description": "Enables elastic search data collection",
|
||||
"description": "Enables elastic search data collection",
|
||||
"variable": "ENABLE_ES",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-server",
|
||||
"short_description": "Elasticsearch instance URL",
|
||||
"description": "Elasticsearch instance URL",
|
||||
"variable": "ES_SERVER",
|
||||
"type": "string",
|
||||
"default": "http://0.0.0.0",
|
||||
"validator": "^(http|https):\/\/.*",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-port",
|
||||
"short_description": "Elasticsearch instance port",
|
||||
"description": "Elasticsearch instance port",
|
||||
"variable": "ES_PORT",
|
||||
"type": "number",
|
||||
"default": "443",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-username",
|
||||
"short_description": "Elasticsearch instance username",
|
||||
"description": "Elasticsearch instance username",
|
||||
"variable": "ES_USERNAME",
|
||||
"type": "string",
|
||||
"default": "elastic",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-password",
|
||||
"short_description": "Elasticsearch instance password",
|
||||
"description": "Elasticsearch instance password",
|
||||
"variable": "ES_PASSWORD",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-verify-certs",
|
||||
"short_description": "Enables elasticsearch TLS certificate verification",
|
||||
"description": "Enables elasticsearch TLS certificate verification",
|
||||
"variable": "ES_VERIFY_CERTS",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-collect-metrics",
|
||||
"short_description": "Enables metrics collection on elastic search",
|
||||
"description": "Enables metrics collection on elastic search",
|
||||
"variable": "ES_COLLECT_METRICS",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-collect-alerts",
|
||||
"short_description": "Enables alerts collection on elastic search",
|
||||
"description": "Enables alerts collection on elastic search",
|
||||
"variable": "ES_COLLECT_ALERTS",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-metrics-index",
|
||||
"short_description": "Elasticsearch metrics index",
|
||||
"description": "Index name for metrics in Elasticsearch",
|
||||
"variable": "ES_METRICS_INDEX",
|
||||
"type": "string",
|
||||
"default": "krkn-metrics",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-alerts-index",
|
||||
"short_description": "Elasticsearch alerts index",
|
||||
"description": "Index name for alerts in Elasticsearch",
|
||||
"variable": "ES_ALERTS_INDEX",
|
||||
"type": "string",
|
||||
"default": "krkn-alerts",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-telemetry-index",
|
||||
"short_description": "Elasticsearch telemetry index",
|
||||
"description": "Index name for telemetry in Elasticsearch",
|
||||
"variable": "ES_TELEMETRY_INDEX",
|
||||
"type": "string",
|
||||
"default": "krkn-telemetry",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "check-critical-alerts",
|
||||
"short_description": "Check critical alerts",
|
||||
"description": "Enables checking for critical alerts",
|
||||
"variable": "CHECK_CRITICAL_ALERTS",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-enabled",
|
||||
"short_description": "Enable telemetry",
|
||||
"description": "Enables telemetry support",
|
||||
"variable": "TELEMETRY_ENABLED",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-api-url",
|
||||
"short_description": "Telemetry API URL",
|
||||
"description": "API endpoint for telemetry data",
|
||||
"variable": "TELEMETRY_API_URL",
|
||||
"type": "string",
|
||||
"default": "https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production",
|
||||
"validator": "^(http|https):\/\/.*",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-username",
|
||||
"short_description": "Telemetry username",
|
||||
"description": "Username for telemetry authentication",
|
||||
"variable": "TELEMETRY_USERNAME",
|
||||
"type": "string",
|
||||
"default": "redhat-chaos",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-password",
|
||||
"short_description": "Telemetry password",
|
||||
"description": "Password for telemetry authentication",
|
||||
"variable": "TELEMETRY_PASSWORD",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-prometheus-backup",
|
||||
"short_description": "Prometheus backup for telemetry",
|
||||
"description": "Enables Prometheus backup for telemetry",
|
||||
"variable": "TELEMETRY_PROMETHEUS_BACKUP",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "True",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-full-prometheus-backup",
|
||||
"short_description": "Full Prometheus backup",
|
||||
"description": "Enables full Prometheus backup for telemetry",
|
||||
"variable": "TELEMETRY_FULL_PROMETHEUS_BACKUP",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-backup-threads",
|
||||
"short_description": "Telemetry backup threads",
|
||||
"description": "Number of threads for telemetry backup",
|
||||
"variable": "TELEMETRY_BACKUP_THREADS",
|
||||
"type": "number",
|
||||
"default": "5",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-archive-path",
|
||||
"short_description": "Telemetry archive path",
|
||||
"description": "Path to save telemetry archive",
|
||||
"variable": "TELEMETRY_ARCHIVE_PATH",
|
||||
"type": "string",
|
||||
"default": "/tmp",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-max-retries",
|
||||
"short_description": "Telemetry max retries",
|
||||
"description": "Maximum retries for telemetry operations",
|
||||
"variable": "TELEMETRY_MAX_RETRIES",
|
||||
"type": "number",
|
||||
"default": "0",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-run-tag",
|
||||
"short_description": "Telemetry run tag",
|
||||
"description": "Tag for telemetry run",
|
||||
"variable": "TELEMETRY_RUN_TAG",
|
||||
"type": "string",
|
||||
"default": "chaos",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-group",
|
||||
"short_description": "Telemetry group",
|
||||
"description": "Group name for telemetry data",
|
||||
"variable": "TELEMETRY_GROUP",
|
||||
"type": "string",
|
||||
"default": "default",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-archive-size",
|
||||
"short_description": "Telemetry archive size",
|
||||
"description": "Maximum size for telemetry archives",
|
||||
"variable": "TELEMETRY_ARCHIVE_SIZE",
|
||||
"type": "number",
|
||||
"default": "1000",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-logs-backup",
|
||||
"short_description": "Telemetry logs backup",
|
||||
"description": "Enables logs backup for telemetry",
|
||||
"variable": "TELEMETRY_LOGS_BACKUP",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-filter-pattern",
|
||||
"short_description": "Telemetry filter pattern",
|
||||
"description": "Filter pattern for telemetry logs",
|
||||
"variable": "TELEMETRY_FILTER_PATTERN",
|
||||
"type": "string",
|
||||
"default": "[\"(\\\\w{3}\\\\s\\\\d{1,2}\\\\s\\\\d{2}:\\\\d{2}:\\\\d{2}\\\\.\\\\d+).+\",\"kinit (\\\\d+/\\\\d+/\\\\d+\\\\s\\\\d{2}:\\\\d{2}:\\\\d{2})\\\\s+\",\"(\\\\d{4}-\\\\d{2}-\\\\d{2}T\\\\d{2}:\\\\d{2}:\\\\d{2}\\\\.\\\\d+Z).+\"]",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-cli-path",
|
||||
"short_description": "Telemetry CLI path (oc)",
|
||||
"description": "Path to telemetry CLI tool (oc)",
|
||||
"variable": "TELEMETRY_CLI_PATH",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "telemetry-events-backup",
|
||||
"short_description": "Telemetry events backup",
|
||||
"description": "Enables events backup for telemetry",
|
||||
"variable": "TELEMETRY_EVENTS_BACKUP",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "True",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "krkn-debug",
|
||||
"short_description": "Krkn debug mode",
|
||||
"description": "Enables debug mode for Krkn",
|
||||
"variable": "KRKN_DEBUG",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
}
|
||||
]
|
||||
|
||||
@@ -13,13 +13,26 @@ Supported Cloud Providers:
|
||||
**NOTE**: For clusters with AWS make sure [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) is installed and properly [configured](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html) using an AWS account
|
||||
|
||||
## GCP
|
||||
**NOTE**: For clusters with GCP make sure [GCP CLI](https://cloud.google.com/sdk/docs/install#linux) is installed.
|
||||
|
||||
A google service account is required to give proper authentication to GCP for node actions. See [here](https://cloud.google.com/docs/authentication/getting-started) for how to create a service account.
|
||||
In order to set up Application Default Credentials (ADC) for use by Cloud Client Libraries, you can provide either service account credentials or the credentials associated with your user account:
|
||||
|
||||
**NOTE**: A user with 'resourcemanager.projects.setIamPolicy' permission is required to grant project-level permissions to the service account.
|
||||
- Using service account credentials:
|
||||
|
||||
After creating the service account you will need to enable the account using the following: ```export GOOGLE_APPLICATION_CREDENTIALS="<serviceaccount.json>"```
|
||||
A google service account is required to give proper authentication to GCP for node actions. See [here](https://cloud.google.com/docs/authentication/getting-started) for how to create a service account.
|
||||
|
||||
**NOTE**: A user with 'resourcemanager.projects.setIamPolicy' permission is required to grant project-level permissions to the service account.
|
||||
|
||||
After creating the service account you will need to enable the account using the following: ```export GOOGLE_APPLICATION_CREDENTIALS="<serviceaccount.json>"```
|
||||
|
||||
- Using the credentials associated with your user account:
|
||||
|
||||
1. Make sure that the [GCP CLI](https://cloud.google.com/sdk/docs/install#linux) is installed and [initialized](https://cloud.google.com/sdk/docs/initializing) by running:
|
||||
|
||||
```gcloud init```
|
||||
|
||||
2. Create local authentication credentials for your user account:
|
||||
|
||||
```gcloud auth application-default login```
|
||||
|
||||
## Openstack
|
||||
|
||||
@@ -32,6 +45,7 @@ After creating the service account you will need to enable the account using the
|
||||
To run properly, the service principal requires the “Azure Active Directory Graph/Application.ReadWrite.OwnedBy” API permission and “User Access Administrator” to be granted.
|
||||
|
||||
Before running you will need to set the following:
|
||||
|
||||
1. ```export AZURE_SUBSCRIPTION_ID=<subscription_id>```
|
||||
|
||||
2. ```export AZURE_TENANT_ID=<tenant_id>```
|
||||
@@ -66,9 +80,10 @@ Set the following environment variables
|
||||
|
||||
These are the credentials that you would normally use to access the vSphere client.
|
||||
|
||||
|
||||
## IBMCloud
|
||||
If no api key is set up with proper VPC resource permissions, use the following to create:
|
||||
|
||||
If no API key is set up with proper VPC resource permissions, use the following to create it:
|
||||
|
||||
* Access group
|
||||
* Service id with the following access
|
||||
* With policy **VPC Infrastructure Services**
|
||||
|
||||
@@ -8,6 +8,7 @@ Current accepted cloud types:
|
||||
* [GCP](cloud_setup.md#gcp)
|
||||
* [AWS](cloud_setup.md#aws)
|
||||
* [Openstack](cloud_setup.md#openstack)
|
||||
* [IBMCloud](cloud_setup.md#ibmcloud)
|
||||
|
||||
|
||||
```
|
||||
|
||||
@@ -18,7 +18,7 @@ network_chaos: # Scenario to create an outage
|
||||
```
|
||||
|
||||
##### Sample scenario config for ingress traffic shaping (using a plugin)
|
||||
'''
|
||||
```
|
||||
- id: network_chaos
|
||||
config:
|
||||
node_interface_name: # Dictionary with key as node name(s) and value as a list of its interfaces to test
|
||||
@@ -35,7 +35,7 @@ network_chaos: # Scenario to create an outage
|
||||
bandwidth: 10mbit
|
||||
wait_duration: 120
|
||||
test_duration: 60
|
||||
'''
|
||||
```
|
||||
|
||||
Note: For ingress traffic shaping, ensure that your node doesn't have any [IFB](https://wiki.linuxfoundation.org/networking/ifb) interfaces already present. The scenario relies on creating IFBs to do the shaping, and they are deleted at the end of the scenario.
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ The following node chaos scenarios are supported:
|
||||
|
||||
1. **node_start_scenario**: Scenario to start the node instance.
|
||||
2. **node_stop_scenario**: Scenario to stop the node instance.
|
||||
3. **node_stop_start_scenario**: Scenario to stop and then start the node instance. Not supported on VMware.
|
||||
3. **node_stop_start_scenario**: Scenario to stop the node instance for specified duration and then start the node instance. Not supported on VMware.
|
||||
4. **node_termination_scenario**: Scenario to terminate the node instance.
|
||||
5. **node_reboot_scenario**: Scenario to reboot the node instance.
|
||||
6. **stop_kubelet_scenario**: Scenario to stop the kubelet of the node instance.
|
||||
@@ -12,6 +12,7 @@ The following node chaos scenarios are supported:
|
||||
8. **restart_kubelet_scenario**: Scenario to restart the kubelet of the node instance.
|
||||
9. **node_crash_scenario**: Scenario to crash the node instance.
|
||||
10. **stop_start_helper_node_scenario**: Scenario to stop and start the helper node and check service status.
|
||||
11. **node_disk_detach_attach_scenario**: Scenario to detach node disk for specified duration.
|
||||
|
||||
|
||||
**NOTE**: If the node does not recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
|
||||
@@ -20,6 +21,8 @@ The following node chaos scenarios are supported:
|
||||
, node_reboot_scenario and stop_start_kubelet_scenario are supported on AWS, Azure, OpenStack, BareMetal, GCP
|
||||
, VMware and Alibaba.
|
||||
|
||||
**NOTE**: node_disk_detach_attach_scenario is supported only on AWS and cannot detach root disk.
|
||||
|
||||
|
||||
#### AWS
|
||||
|
||||
@@ -57,6 +60,8 @@ kind was primarily designed for testing Kubernetes itself, but may be used for l
|
||||
#### GCP
|
||||
Cloud setup instructions can be found [here](cloud_setup.md#gcp). Sample scenario config can be found [here](https://github.com/krkn-chaos/krkn/blob/main/scenarios/openshift/gcp_node_scenarios.yml).
|
||||
|
||||
NOTE: The parallel option is not available for GCP because the API doesn't perform the operations at the same time.
|
||||
|
||||
|
||||
#### Openstack
|
||||
|
||||
|
||||
@@ -13,10 +13,12 @@ zone_outage: # Scenario to create an out
|
||||
duration: 600 # Duration in seconds after which the zone will be back online.
|
||||
vpc_id: # Cluster virtual private network to target.
|
||||
subnet_id: [subnet1, subnet2] # List of subnet-id's to deny both ingress and egress traffic.
|
||||
default_acl_id: acl-xxxxxxxx # (Optional) ID of an existing network ACL to use instead of creating a new one. If provided, this ACL will not be deleted after the scenario.
|
||||
```
|
||||
|
||||
**NOTE**: vpc_id and subnet_id can be obtained from the cloud web console by selecting one of the instances in the targeted zone ( us-west-2a for example ).
|
||||
**NOTE**: Multiple zones will experience downtime in case of targeting multiple subnets which might have an impact on the cluster health especially if the zones have control plane components deployed.
|
||||
**NOTE**: default_acl_id can be obtained from the AWS VPC Console by selecting "Network ACLs" from the left sidebar ( the ID will be in the format 'acl-xxxxxxxx' ). Make sure the selected ACL has the desired ingress/egress rules for your outage scenario ( i.e., deny all ).
|
||||
|
||||
##### Debugging steps in case of failures
|
||||
In case of failures during the steps which revert back the network acl to allow traffic and bring back the cluster nodes in the zone, the nodes in the particular zone will be in `NotReady` condition. Here is how to fix it:
|
||||
|
||||
@@ -29,9 +29,9 @@ def calculate_zscores(data):
|
||||
|
||||
|
||||
def identify_outliers(data, threshold):
    """Return the services whose CPU, Memory or Network value exceeds ``threshold``.

    Args:
        data: Table with "Service", "CPU", "Memory" and "Network" columns.
            Assumes a pandas DataFrame (boolean-mask indexing and
            ``.tolist()`` are used) — TODO confirm against the caller.
        threshold: Cut-off value; anything ``float()`` accepts, so a numeric
            string is handled as well as a number.

    Returns:
        Tuple ``(outliers_cpu, outliers_memory, outliers_network)``; each item
        is the list of "Service" values whose column is strictly greater than
        the threshold.

    Raises:
        ValueError: If ``threshold`` cannot be converted to ``float``.
    """
    # Coerce once, up front: comparing a numeric column against a string
    # threshold fails, and converting per comparison is repeated work.
    limit = float(threshold)

    outliers_cpu = data[data["CPU"] > limit]["Service"].tolist()
    outliers_memory = data[data["Memory"] > limit]["Service"].tolist()
    outliers_network = data[data["Network"] > limit]["Service"].tolist()

    return outliers_cpu, outliers_memory, outliers_network
|
||||
|
||||
@@ -39,13 +39,13 @@ def identify_outliers(data, threshold):
|
||||
def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold):
|
||||
# Filter the DataFrame based on CPU_HEATMAP and MEM_HEATMAP thresholds
|
||||
filtered_df = dataframe[
|
||||
((dataframe["CPU"] / dataframe["CPU_LIMITS"]) > cpu_threshold)
|
||||
((dataframe["CPU"] / dataframe["CPU_LIMITS"]) > float(cpu_threshold))
|
||||
]
|
||||
# Get the lists of services
|
||||
cpu_services = filtered_df["service"].tolist()
|
||||
|
||||
filtered_df = dataframe[
|
||||
((dataframe["MEM"] / dataframe["MEM_LIMITS"]) > mem_threshold)
|
||||
((dataframe["MEM"] / dataframe["MEM_LIMITS"]) > float(mem_threshold))
|
||||
]
|
||||
mem_services = filtered_df["service"].tolist()
|
||||
|
||||
|
||||
@@ -34,7 +34,16 @@ class IbmCloud:
|
||||
self.service.set_service_url(service_url)
|
||||
except Exception as e:
|
||||
logging.error("error authenticating" + str(e))
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# Get the instance ID of the node
|
||||
def get_instance_id(self, node_name):
    """Look up the instance id of the node called ``node_name``.

    Scans the entries returned by ``self.list_instances()`` and returns the
    "vpc_id" of the first entry whose "vpc_name" equals ``node_name``.
    When no entry matches, an error is logged and the process exits with
    status 1.
    """
    not_found = object()  # sentinel, so a falsy id is still returned as-is
    instance_id = next(
        (
            instance["vpc_id"]
            for instance in self.list_instances()
            if node_name == instance["vpc_name"]
        ),
        not_found,
    )
    if instance_id is not not_found:
        return instance_id

    logging.error(
        f"Couldn't find node with name {node_name!s}, you could try another region"
    )
    sys.exit(1)
|
||||
|
||||
def delete_instance(self, instance_id):
|
||||
"""
|
||||
|
||||
@@ -42,19 +42,13 @@ class NetworkChaosScenarioPlugin(AbstractScenarioPlugin):
|
||||
test_egress = get_yaml_item_value(
|
||||
test_dict, "egress", {"bandwidth": "100mbit"}
|
||||
)
|
||||
|
||||
if test_node:
|
||||
node_name_list = test_node.split(",")
|
||||
nodelst = common_node_functions.get_node_by_name(node_name_list, lib_telemetry.get_lib_kubernetes())
|
||||
else:
|
||||
node_name_list = [test_node]
|
||||
nodelst = []
|
||||
for single_node_name in node_name_list:
|
||||
nodelst.extend(
|
||||
common_node_functions.get_node(
|
||||
single_node_name,
|
||||
test_node_label,
|
||||
test_instance_count,
|
||||
lib_telemetry.get_lib_kubernetes(),
|
||||
)
|
||||
nodelst = common_node_functions.get_node(
|
||||
test_node_label, test_instance_count, lib_telemetry.get_lib_kubernetes()
|
||||
)
|
||||
file_loader = FileSystemLoader(
|
||||
os.path.abspath(os.path.dirname(__file__))
|
||||
@@ -149,7 +143,10 @@ class NetworkChaosScenarioPlugin(AbstractScenarioPlugin):
|
||||
finally:
|
||||
logging.info("Deleting jobs")
|
||||
self.delete_job(joblst[:], lib_telemetry.get_lib_kubernetes())
|
||||
except (RuntimeError, Exception):
|
||||
except (RuntimeError, Exception) as e:
|
||||
logging.error(
|
||||
"NetworkChaosScenarioPlugin exiting due to Exception %s" % e
|
||||
)
|
||||
scenario_telemetry.exit_status = 1
|
||||
return 1
|
||||
else:
|
||||
|
||||
@@ -36,6 +36,20 @@ class abstract_node_scenarios:
|
||||
self.helper_node_start_scenario(instance_kill_count, node, timeout)
|
||||
logging.info("helper_node_stop_start_scenario has been successfully injected!")
|
||||
|
||||
# Node scenario to detach and attach the disk
|
||||
def node_disk_detach_attach_scenario(self, instance_kill_count, node, timeout, duration):
    """Detach the node's disks, wait ``duration`` seconds, then attach them again.

    The attachment details are fetched first via
    ``self.get_disk_attachment_info``; when that returns nothing, two errors
    are logged and neither the detach nor the attach step runs.
    """
    logging.info("Starting disk_detach_attach_scenario injection")
    attachments = self.get_disk_attachment_info(instance_kill_count, node)

    if not attachments:
        # No attachment details were reported for this node: give up early.
        logging.error("Node %s has only root disk attached" % (node))
        logging.error("node_disk_detach_attach_scenario failed!")
        return

    self.disk_detach_scenario(instance_kill_count, node, timeout)
    logging.info("Waiting for %s seconds before attaching the disk" % (duration))
    time.sleep(duration)
    # Re-attach using the details captured before the detach.
    self.disk_attach_scenario(instance_kill_count, attachments, timeout)
    logging.info("node_disk_detach_attach_scenario has been successfully injected!")
|
||||
|
||||
# Node scenario to terminate the node
|
||||
def node_termination_scenario(self, instance_kill_count, node, timeout):
    # No-op: this implementation ignores its arguments and does nothing.
    # NOTE(review): presumably a placeholder that provider-specific
    # subclasses override — confirm against the subclasses.
    pass
|
||||
|
||||
@@ -12,7 +12,8 @@ from krkn_lib.k8s import KrknKubernetes
|
||||
class AWS:
|
||||
def __init__(self):
    """Create the boto3 EC2 client and resource handles used by this class."""
    self.boto_client = boto3.client("ec2")
    # Build the EC2 resource once and reuse it, rather than creating a second
    # throw-away resource just to obtain the Instance handle.
    self.boto_resource = boto3.resource("ec2")
    # NOTE(review): "id" looks like a placeholder instance id — confirm how
    # boto_instance is used elsewhere in the class.
    self.boto_instance = self.boto_resource.Instance("id")
|
||||
|
||||
# Get the instance ID of the node
|
||||
def get_instance_id(self, node):
|
||||
@@ -179,6 +180,72 @@ class AWS:
|
||||
|
||||
raise RuntimeError()
|
||||
|
||||
# Detach volume
|
||||
def detach_volumes(self, volumes_ids: list):
    """Force-detach every volume in ``volumes_ids``.

    Each volume is detached with ``Force=True``. A failure on one volume is
    logged and does not stop the remaining volumes from being processed;
    nothing is raised to the caller.
    """
    for volume_id in volumes_ids:
        try:
            self.boto_client.detach_volume(Force=True, VolumeId=volume_id)
        except Exception as error:
            failure = "Detaching volume %s failed with exception: %s" % (
                volume_id,
                error,
            )
            logging.error(failure)
|
||||
|
||||
# Attach volume
|
||||
def attach_volume(self, attachment: dict):
    """Attach the volume described by ``attachment`` back to its instance.

    ``attachment`` is read for the keys "VolumeId", "InstanceId" and
    "Device". When the volume state is already "in-use" the call only logs
    that fact and returns.

    Raises:
        RuntimeError: If looking up the state or attaching the volume fails;
            the underlying exception is logged first.
    """
    try:
        volume_id = attachment["VolumeId"]
        if self.get_volume_state(volume_id) == "in-use":
            # Nothing to do: the volume is already attached somewhere.
            logging.info("Volume %s is already in use." % volume_id)
            return

        instance_id = attachment["InstanceId"]
        logging.info(
            "Attaching the %s volumes to instance %s." % (volume_id, instance_id)
        )
        self.boto_client.attach_volume(
            InstanceId=instance_id,
            Device=attachment["Device"],
            VolumeId=volume_id,
        )
    except Exception as error:
        logging.error(
            "Failed attaching disk %s to the %s instance. "
            "Encountered following exception: %s"
            % (attachment["VolumeId"], attachment["InstanceId"], error)
        )
        raise RuntimeError()
|
||||
|
||||
# Get IDs of node volumes
|
||||
def get_volumes_ids(self, instance_id: list):
    """Return the volume IDs of every non-root block device of an instance.

    ``instance_id`` is passed unchanged as ``InstanceIds`` to
    ``describe_instances``; only the first instance of the first
    reservation in the response is inspected. A device is skipped when
    its ``DeviceName`` equals the value returned by
    ``get_root_volume_id``.
    """
    described = self.boto_client.describe_instances(InstanceIds=instance_id)
    block_devices = described["Reservations"][0]["Instances"][0][
        "BlockDeviceMappings"
    ]
    root_device_name = self.get_root_volume_id(instance_id)
    return [
        mapping["Ebs"]["VolumeId"]
        for mapping in block_devices
        if mapping["DeviceName"] != root_device_name
    ]
|
||||
|
||||
# Get volumes attachment details
|
||||
def get_volume_attachment_details(self, volume_ids: list):
    """Return the ``"Volumes"`` entry of ``describe_volumes`` for ``volume_ids``."""
    return self.boto_client.describe_volumes(VolumeIds=volume_ids)["Volumes"]
|
||||
|
||||
# Get root volume
|
||||
def get_root_volume_id(self, instance_id):
    """Return ``root_device_name`` of the first instance in ``instance_id``.

    ``instance_id`` is indexed with ``[0]``, so it is expected to be a
    sequence of instance IDs. Despite the method name, the value returned
    is the instance resource's root *device name*, not a volume ID.
    """
    first_instance = self.boto_resource.Instance(instance_id[0])
    return first_instance.root_device_name
|
||||
|
||||
# Get volume state
|
||||
def get_volume_state(self, volume_id: str):
    """Return the ``state`` attribute of the volume resource ``volume_id``."""
    return self.boto_resource.Volume(volume_id).state
|
||||
|
||||
# krkn_lib
|
||||
class aws_node_scenarios(abstract_node_scenarios):
|
||||
@@ -290,3 +357,49 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
|
||||
raise RuntimeError()
|
||||
|
||||
# Get volume attachment info
|
||||
def get_disk_attachment_info(self, instance_kill_count, node):
    """Return attachment details for the non-root volumes of ``node``.

    Args:
        instance_kill_count: when smaller than 1 no lookup is performed
            and ``None`` is returned. Any larger value triggers exactly
            one lookup, because the result of the first lookup is
            returned immediately.
        node: node name resolved to an instance ID via
            ``self.aws.get_instance_id``.

    Returns:
        The value of ``self.aws.get_volume_attachment_details`` for the
        node's volumes, or ``None`` when the node has no volumes besides
        the ones ``get_volumes_ids`` filters out.

    Raises:
        RuntimeError: if any lookup fails; the original exception is
            logged and attached as ``__cause__``.
    """
    if instance_kill_count < 1:
        return None
    try:
        logging.info("Obtaining disk attachment information")
        # get_volumes_ids expects a list of instance IDs; split() wraps
        # the single ID string into a one-element list.
        instance_id = (self.aws.get_instance_id(node)).split()
        volumes_ids = self.aws.get_volumes_ids(instance_id)
        if not volumes_ids:
            return None
        return self.aws.get_volume_attachment_details(volumes_ids)
    except Exception as e:
        logging.error(
            "Failed to obtain disk attachment information of %s node. "
            "Encountered following exception: %s." % (node, e)
        )
        raise RuntimeError() from e
|
||||
|
||||
# Node scenario to detach the volume
|
||||
def disk_detach_scenario(self, instance_kill_count, node, timeout):
    """Node scenario to detach the non-root volumes of ``node``.

    Args:
        instance_kill_count: number of times the detach sequence runs.
        node: node name resolved to an instance ID via
            ``self.aws.get_instance_id``.
        timeout: accepted for signature parity with the other scenarios;
            not used by this method.

    Raises:
        RuntimeError: if resolving the instance, listing its volumes or
            the detach call fails; the original exception is logged and
            attached as ``__cause__``.
    """
    for _ in range(instance_kill_count):
        try:
            logging.info("Starting disk_detach_scenario injection")
            # get_volumes_ids expects a list of instance IDs; split()
            # wraps the single ID string into a one-element list.
            instance_id = (self.aws.get_instance_id(node)).split()
            volumes_ids = self.aws.get_volumes_ids(instance_id)
            logging.info(
                "Detaching the %s volumes from instance %s "
                % (volumes_ids, node)
            )
            self.aws.detach_volumes(volumes_ids)
        except Exception as e:
            # The trailing space after "following" matters: the two
            # literals are concatenated into one message.
            logging.error(
                "Failed to detach disk from %s node. Encountered following "
                "exception: %s." % (node, e)
            )
            raise RuntimeError() from e
|
||||
|
||||
# Node scenario to attach the volume
|
||||
def disk_attach_scenario(self, instance_kill_count, attachment_details, timeout):
    """Node scenario to re-attach volumes from recorded attachment details.

    For each of ``instance_kill_count`` rounds, every entry of
    ``attachment_details`` has the first element of its ``"Attachments"``
    list handed to ``self.aws.attach_volume``. ``timeout`` is accepted but
    not used here.
    """
    for _round in range(instance_kill_count):
        for volume_details in attachment_details:
            first_attachment = volume_details["Attachments"][0]
            self.aws.attach_volume(first_attachment)
|
||||
|
||||
@@ -8,19 +8,28 @@ from krkn_lib.k8s import KrknKubernetes
|
||||
node_general = False
|
||||
|
||||
|
||||
def get_node_by_name(node_name_list, kubecli: KrknKubernetes):
    """Validate that every requested node is currently killable.

    Args:
        node_name_list: node names requested by the scenario.
        kubecli: client whose ``list_killable_nodes()`` supplies the set
            of nodes that may be acted on.

    Returns:
        ``node_name_list`` unchanged when every name is killable;
        ``None`` as soon as one name is not (an info message naming the
        offending node is logged first).
    """
    # Build the lookup once so each membership test is O(1).
    killable_nodes = set(kubecli.list_killable_nodes())
    for node_name in node_name_list:
        if node_name not in killable_nodes:
            logging.info(
                f"Node with provided {node_name} does not exist or the node might "
                "be in NotReady state."
            )
            return None
    return node_name_list
|
||||
|
||||
|
||||
# Pick a random node with specified label selector
|
||||
def get_node(node_name, label_selector, instance_kill_count, kubecli: KrknKubernetes):
|
||||
if node_name in kubecli.list_killable_nodes():
|
||||
return [node_name]
|
||||
elif node_name:
|
||||
logging.info(
|
||||
"Node with provided node_name does not exist or the node might "
|
||||
"be in NotReady state."
|
||||
)
|
||||
nodes = kubecli.list_killable_nodes(label_selector)
|
||||
def get_node(label_selector, instance_kill_count, kubecli: KrknKubernetes):
|
||||
|
||||
label_selector_list = label_selector.split(",")
|
||||
nodes = []
|
||||
for label_selector in label_selector_list:
|
||||
nodes.extend(kubecli.list_killable_nodes(label_selector))
|
||||
if not nodes:
|
||||
raise Exception("Ready nodes with the provided label selector do not exist")
|
||||
logging.info("Ready nodes with the label selector %s: %s" % (label_selector, nodes))
|
||||
logging.info("Ready nodes with the label selector %s: %s" % (label_selector_list, nodes))
|
||||
number_of_nodes = len(nodes)
|
||||
if instance_kill_count == number_of_nodes:
|
||||
return nodes
|
||||
@@ -35,22 +44,19 @@ def get_node(node_name, label_selector, instance_kill_count, kubecli: KrknKubern
|
||||
# krkn_lib
|
||||
# Wait until the node status becomes Ready
|
||||
def wait_for_ready_status(node, timeout, kubecli: KrknKubernetes):
|
||||
resource_version = kubecli.get_node_resource_version(node)
|
||||
kubecli.watch_node_status(node, "True", timeout, resource_version)
|
||||
kubecli.watch_node_status(node, "True", timeout)
|
||||
|
||||
|
||||
# krkn_lib
|
||||
# Wait until the node status becomes Not Ready
|
||||
def wait_for_not_ready_status(node, timeout, kubecli: KrknKubernetes):
|
||||
resource_version = kubecli.get_node_resource_version(node)
|
||||
kubecli.watch_node_status(node, "False", timeout, resource_version)
|
||||
kubecli.watch_node_status(node, "False", timeout)
|
||||
|
||||
|
||||
# krkn_lib
|
||||
# Wait until the node status becomes Unknown
|
||||
def wait_for_unknown_status(node, timeout, kubecli: KrknKubernetes):
|
||||
resource_version = kubecli.get_node_resource_version(node)
|
||||
kubecli.watch_node_status(node, "Unknown", timeout, resource_version)
|
||||
kubecli.watch_node_status(node, "Unknown", timeout)
|
||||
|
||||
|
||||
# Get the ip of the cluster node
|
||||
|
||||
@@ -1,66 +1,78 @@
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
import google.auth
|
||||
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
|
||||
from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
|
||||
abstract_node_scenarios,
|
||||
)
|
||||
from googleapiclient import discovery
|
||||
from oauth2client.client import GoogleCredentials
|
||||
from google.cloud import compute_v1
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
|
||||
|
||||
class GCP:
|
||||
def __init__(self):
|
||||
try:
|
||||
gapp_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
|
||||
with open(gapp_creds, "r") as f:
|
||||
f_str = f.read()
|
||||
self.project = json.loads(f_str)["project_id"]
|
||||
# self.project = runcommand.invoke("gcloud config get-value project").split("/n")[0].strip()
|
||||
logging.info("project " + str(self.project) + "!")
|
||||
credentials = GoogleCredentials.get_application_default()
|
||||
self.client = discovery.build(
|
||||
"compute", "v1", credentials=credentials, cache_discovery=False
|
||||
)
|
||||
|
||||
_, self.project_id = google.auth.default()
|
||||
self.instance_client = compute_v1.InstancesClient()
|
||||
except Exception as e:
|
||||
logging.error("Error on setting up GCP connection: " + str(e))
|
||||
|
||||
raise e
|
||||
|
||||
# Get the instance ID of the node
|
||||
def get_instance_id(self, node):
|
||||
zone_request = self.client.zones().list(project=self.project)
|
||||
while zone_request is not None:
|
||||
zone_response = zone_request.execute()
|
||||
for zone in zone_response["items"]:
|
||||
instances_request = self.client.instances().list(
|
||||
project=self.project, zone=zone["name"]
|
||||
)
|
||||
while instances_request is not None:
|
||||
instance_response = instances_request.execute()
|
||||
if "items" in instance_response.keys():
|
||||
for instance in instance_response["items"]:
|
||||
if instance["name"] in node:
|
||||
return instance["name"], zone["name"]
|
||||
instances_request = self.client.zones().list_next(
|
||||
previous_request=instances_request,
|
||||
previous_response=instance_response,
|
||||
)
|
||||
zone_request = self.client.zones().list_next(
|
||||
previous_request=zone_request, previous_response=zone_response
|
||||
# Get the instance of the node
|
||||
def get_node_instance(self, node):
|
||||
try:
|
||||
request = compute_v1.AggregatedListInstancesRequest(
|
||||
project = self.project_id
|
||||
)
|
||||
logging.info("no instances ")
|
||||
agg_list = self.instance_client.aggregated_list(request=request)
|
||||
for _, response in agg_list:
|
||||
if response.instances:
|
||||
for instance in response.instances:
|
||||
if instance.name in node:
|
||||
return instance
|
||||
logging.info("no instances ")
|
||||
except Exception as e:
|
||||
logging.error("Error getting the instance of the node: " + str(e))
|
||||
|
||||
raise e
|
||||
|
||||
# Get the instance name
|
||||
def get_instance_name(self, instance):
    """Return ``instance.name``, or ``None`` when that name is empty/falsy."""
    name = instance.name
    if not name:
        return None
    return name
|
||||
|
||||
# Get the instance zone
|
||||
def get_instance_zone(self, instance):
    """Return the last ``/``-separated segment of ``instance.zone``.

    Returns ``None`` when ``instance.zone`` is empty/falsy.
    """
    zone_path = instance.zone
    if not zone_path:
        return None
    # Everything after the final "/" (the whole string when there is none).
    return zone_path.rpartition("/")[2]
|
||||
|
||||
# Get the instance zone of the node
|
||||
def get_node_instance_zone(self, node):
    """Return the zone of the instance found for ``node``.

    Looks the instance up with ``get_node_instance`` and delegates to
    ``get_instance_zone``; returns ``None`` when no instance is found.
    """
    found = self.get_node_instance(node)
    if not found:
        return None
    return self.get_instance_zone(found)
|
||||
|
||||
# Get the instance name of the node
|
||||
def get_node_instance_name(self, node):
    """Return the name of the instance found for ``node``.

    Looks the instance up with ``get_node_instance`` and delegates to
    ``get_instance_name``; returns ``None`` when no instance is found.
    """
    found = self.get_node_instance(node)
    if not found:
        return None
    return self.get_instance_name(found)
|
||||
|
||||
# Get the instance name of the node
|
||||
def get_instance_id(self, node):
    """Return the instance identifier for ``node``.

    This is whatever ``get_node_instance_name`` returns for the node, so
    the instance name doubles as its ID here.
    """
    instance_name = self.get_node_instance_name(node)
    return instance_name
|
||||
|
||||
# Start the node instance
|
||||
def start_instances(self, zone, instance_id):
|
||||
def start_instances(self, instance_id):
|
||||
try:
|
||||
self.client.instances().start(
|
||||
project=self.project, zone=zone, instance=instance_id
|
||||
).execute()
|
||||
logging.info("vm name " + str(instance_id) + " started")
|
||||
request = compute_v1.StartInstanceRequest(
|
||||
instance=instance_id,
|
||||
project=self.project_id,
|
||||
zone=self.get_node_instance_zone(instance_id),
|
||||
)
|
||||
self.instance_client.start(request=request)
|
||||
logging.info("Instance: " + str(instance_id) + " started")
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to start node instance %s. Encountered following "
|
||||
@@ -70,12 +82,15 @@ class GCP:
|
||||
raise RuntimeError()
|
||||
|
||||
# Stop the node instance
|
||||
def stop_instances(self, zone, instance_id):
|
||||
def stop_instances(self, instance_id):
|
||||
try:
|
||||
self.client.instances().stop(
|
||||
project=self.project, zone=zone, instance=instance_id
|
||||
).execute()
|
||||
logging.info("vm name " + str(instance_id) + " stopped")
|
||||
request = compute_v1.StopInstanceRequest(
|
||||
instance=instance_id,
|
||||
project=self.project_id,
|
||||
zone=self.get_node_instance_zone(instance_id),
|
||||
)
|
||||
self.instance_client.stop(request=request)
|
||||
logging.info("Instance: " + str(instance_id) + " stopped")
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to stop node instance %s. Encountered following "
|
||||
@@ -84,13 +99,16 @@ class GCP:
|
||||
|
||||
raise RuntimeError()
|
||||
|
||||
# Start the node instance
|
||||
def suspend_instances(self, zone, instance_id):
|
||||
# Suspend the node instance
|
||||
def suspend_instances(self, instance_id):
|
||||
try:
|
||||
self.client.instances().suspend(
|
||||
project=self.project, zone=zone, instance=instance_id
|
||||
).execute()
|
||||
logging.info("vm name " + str(instance_id) + " suspended")
|
||||
request = compute_v1.SuspendInstanceRequest(
|
||||
instance=instance_id,
|
||||
project=self.project_id,
|
||||
zone=self.get_node_instance_zone(instance_id),
|
||||
)
|
||||
self.instance_client.suspend(request=request)
|
||||
logging.info("Instance: " + str(instance_id) + " suspended")
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to suspend node instance %s. Encountered following "
|
||||
@@ -100,49 +118,65 @@ class GCP:
|
||||
raise RuntimeError()
|
||||
|
||||
# Terminate the node instance
|
||||
def terminate_instances(self, zone, instance_id):
|
||||
def terminate_instances(self, instance_id):
|
||||
try:
|
||||
self.client.instances().delete(
|
||||
project=self.project, zone=zone, instance=instance_id
|
||||
).execute()
|
||||
logging.info("vm name " + str(instance_id) + " terminated")
|
||||
request = compute_v1.DeleteInstanceRequest(
|
||||
instance=instance_id,
|
||||
project=self.project_id,
|
||||
zone=self.get_node_instance_zone(instance_id),
|
||||
)
|
||||
self.instance_client.delete(request=request)
|
||||
logging.info("Instance: " + str(instance_id) + " terminated")
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to start node instance %s. Encountered following "
|
||||
"Failed to terminate node instance %s. Encountered following "
|
||||
"exception: %s." % (instance_id, e)
|
||||
)
|
||||
|
||||
raise RuntimeError()
|
||||
|
||||
# Reboot the node instance
|
||||
def reboot_instances(self, zone, instance_id):
|
||||
def reboot_instances(self, instance_id):
|
||||
try:
|
||||
self.client.instances().reset(
|
||||
project=self.project, zone=zone, instance=instance_id
|
||||
).execute()
|
||||
logging.info("vm name " + str(instance_id) + " rebooted")
|
||||
request = compute_v1.ResetInstanceRequest(
|
||||
instance=instance_id,
|
||||
project=self.project_id,
|
||||
zone=self.get_node_instance_zone(instance_id),
|
||||
)
|
||||
self.instance_client.reset(request=request)
|
||||
logging.info("Instance: " + str(instance_id) + " rebooted")
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to start node instance %s. Encountered following "
|
||||
"Failed to reboot node instance %s. Encountered following "
|
||||
"exception: %s." % (instance_id, e)
|
||||
)
|
||||
|
||||
raise RuntimeError()
|
||||
|
||||
# Get instance status
|
||||
def get_instance_status(self, zone, instance_id, expected_status, timeout):
|
||||
# statuses: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING,
|
||||
def get_instance_status(self, instance_id, expected_status, timeout):
|
||||
# states: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING,
|
||||
# and TERMINATED.
|
||||
i = 0
|
||||
sleeper = 5
|
||||
while i <= timeout:
|
||||
instStatus = (
|
||||
self.client.instances()
|
||||
.get(project=self.project, zone=zone, instance=instance_id)
|
||||
.execute()
|
||||
)
|
||||
logging.info("Status of vm " + str(instStatus["status"]))
|
||||
if instStatus["status"] == expected_status:
|
||||
try:
|
||||
request = compute_v1.GetInstanceRequest(
|
||||
instance=instance_id,
|
||||
project=self.project_id,
|
||||
zone=self.get_node_instance_zone(instance_id),
|
||||
)
|
||||
instance_status = self.instance_client.get(request=request).status
|
||||
logging.info("Status of instance " + str(instance_id) + ": " + instance_status)
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to get status of instance %s. Encountered following "
|
||||
"exception: %s." % (instance_id, e)
|
||||
)
|
||||
|
||||
raise RuntimeError()
|
||||
|
||||
if instance_status == expected_status:
|
||||
return True
|
||||
time.sleep(sleeper)
|
||||
i += sleeper
|
||||
@@ -153,33 +187,21 @@ class GCP:
|
||||
return False
|
||||
|
||||
# Wait until the node instance is suspended
|
||||
def wait_until_suspended(self, zone, instance_id, timeout):
|
||||
return self.get_instance_status(zone, instance_id, "SUSPENDED", timeout)
|
||||
def wait_until_suspended(self, instance_id, timeout):
|
||||
return self.get_instance_status(instance_id, "SUSPENDED", timeout)
|
||||
|
||||
# Wait until the node instance is running
|
||||
def wait_until_running(self, zone, instance_id, timeout):
|
||||
return self.get_instance_status(zone, instance_id, "RUNNING", timeout)
|
||||
def wait_until_running(self, instance_id, timeout):
|
||||
return self.get_instance_status(instance_id, "RUNNING", timeout)
|
||||
|
||||
# Wait until the node instance is stopped
|
||||
def wait_until_stopped(self, zone, instance_id, timeout):
|
||||
return self.get_instance_status(zone, instance_id, "TERMINATED", timeout)
|
||||
def wait_until_stopped(self, instance_id, timeout):
|
||||
# In GCP, the next state after STOPPING is TERMINATED
|
||||
return self.get_instance_status(instance_id, "TERMINATED", timeout)
|
||||
|
||||
# Wait until the node instance is terminated
|
||||
def wait_until_terminated(self, zone, instance_id, timeout):
|
||||
try:
|
||||
i = 0
|
||||
sleeper = 5
|
||||
while i <= timeout:
|
||||
instStatus = (
|
||||
self.client.instances()
|
||||
.get(project=self.project, zone=zone, instance=instance_id)
|
||||
.execute()
|
||||
)
|
||||
logging.info("Status of vm " + str(instStatus["status"]))
|
||||
time.sleep(sleeper)
|
||||
except Exception as e:
|
||||
logging.info("here " + str(e))
|
||||
return True
|
||||
def wait_until_terminated(self, instance_id, timeout):
|
||||
return self.get_instance_status(instance_id, "TERMINATED", timeout)
|
||||
|
||||
|
||||
# krkn_lib
|
||||
@@ -193,12 +215,13 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
for _ in range(instance_kill_count):
|
||||
try:
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
instance_id, zone = self.gcp.get_instance_id(node)
|
||||
instance = self.gcp.get_node_instance(node)
|
||||
instance_id = self.gcp.get_instance_name(instance)
|
||||
logging.info(
|
||||
"Starting the node %s with instance ID: %s " % (node, instance_id)
|
||||
)
|
||||
self.gcp.start_instances(zone, instance_id)
|
||||
self.gcp.wait_until_running(zone, instance_id, timeout)
|
||||
self.gcp.start_instances(instance_id)
|
||||
self.gcp.wait_until_running(instance_id, timeout)
|
||||
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
|
||||
logging.info(
|
||||
"Node with instance ID: %s is in running state" % instance_id
|
||||
@@ -215,16 +238,16 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
|
||||
# Node scenario to stop the node
|
||||
def node_stop_scenario(self, instance_kill_count, node, timeout):
|
||||
logging.info("stop scenario")
|
||||
for _ in range(instance_kill_count):
|
||||
try:
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
instance_id, zone = self.gcp.get_instance_id(node)
|
||||
instance = self.gcp.get_node_instance(node)
|
||||
instance_id = self.gcp.get_instance_name(instance)
|
||||
logging.info(
|
||||
"Stopping the node %s with instance ID: %s " % (node, instance_id)
|
||||
)
|
||||
self.gcp.stop_instances(zone, instance_id)
|
||||
self.gcp.wait_until_stopped(zone, instance_id, timeout)
|
||||
self.gcp.stop_instances(instance_id)
|
||||
self.gcp.wait_until_stopped(instance_id, timeout)
|
||||
logging.info(
|
||||
"Node with instance ID: %s is in stopped state" % instance_id
|
||||
)
|
||||
@@ -243,13 +266,14 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
for _ in range(instance_kill_count):
|
||||
try:
|
||||
logging.info("Starting node_termination_scenario injection")
|
||||
instance_id, zone = self.gcp.get_instance_id(node)
|
||||
instance = self.gcp.get_node_instance(node)
|
||||
instance_id = self.gcp.get_instance_name(instance)
|
||||
logging.info(
|
||||
"Terminating the node %s with instance ID: %s "
|
||||
% (node, instance_id)
|
||||
)
|
||||
self.gcp.terminate_instances(zone, instance_id)
|
||||
self.gcp.wait_until_terminated(zone, instance_id, timeout)
|
||||
self.gcp.terminate_instances(instance_id)
|
||||
self.gcp.wait_until_terminated(instance_id, timeout)
|
||||
for _ in range(timeout):
|
||||
if node not in self.kubecli.list_nodes():
|
||||
break
|
||||
@@ -267,19 +291,20 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
)
|
||||
logging.error("node_termination_scenario injection failed!")
|
||||
|
||||
|
||||
raise e
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to reboot the node
|
||||
def node_reboot_scenario(self, instance_kill_count, node, timeout):
|
||||
for _ in range(instance_kill_count):
|
||||
try:
|
||||
logging.info("Starting node_reboot_scenario injection")
|
||||
instance_id, zone = self.gcp.get_instance_id(node)
|
||||
instance = self.gcp.get_node_instance(node)
|
||||
instance_id = self.gcp.get_instance_name(instance)
|
||||
logging.info(
|
||||
"Rebooting the node %s with instance ID: %s " % (node, instance_id)
|
||||
)
|
||||
self.gcp.reboot_instances(zone, instance_id)
|
||||
self.gcp.reboot_instances(instance_id)
|
||||
self.gcp.wait_until_running(instance_id, timeout)
|
||||
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
|
||||
logging.info(
|
||||
"Node with instance ID: %s has been rebooted" % instance_id
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import logging
|
||||
import time
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from itertools import repeat
|
||||
|
||||
import yaml
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
@@ -64,23 +66,23 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
global node_general
|
||||
node_general = True
|
||||
return general_node_scenarios(kubecli)
|
||||
if node_scenario["cloud_type"] == "aws":
|
||||
if node_scenario["cloud_type"].lower() == "aws":
|
||||
return aws_node_scenarios(kubecli)
|
||||
elif node_scenario["cloud_type"] == "gcp":
|
||||
elif node_scenario["cloud_type"].lower() == "gcp":
|
||||
return gcp_node_scenarios(kubecli)
|
||||
elif node_scenario["cloud_type"] == "openstack":
|
||||
elif node_scenario["cloud_type"].lower() == "openstack":
|
||||
from krkn.scenario_plugins.node_actions.openstack_node_scenarios import (
|
||||
openstack_node_scenarios,
|
||||
)
|
||||
|
||||
return openstack_node_scenarios(kubecli)
|
||||
elif (
|
||||
node_scenario["cloud_type"] == "azure"
|
||||
node_scenario["cloud_type"].lower() == "azure"
|
||||
or node_scenario["cloud_type"] == "az"
|
||||
):
|
||||
return azure_node_scenarios(kubecli)
|
||||
elif (
|
||||
node_scenario["cloud_type"] == "alibaba"
|
||||
node_scenario["cloud_type"].lower() == "alibaba"
|
||||
or node_scenario["cloud_type"] == "alicloud"
|
||||
):
|
||||
from krkn.scenario_plugins.node_actions.alibaba_node_scenarios import (
|
||||
@@ -88,7 +90,7 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
)
|
||||
|
||||
return alibaba_node_scenarios(kubecli)
|
||||
elif node_scenario["cloud_type"] == "bm":
|
||||
elif node_scenario["cloud_type"].lower() == "bm":
|
||||
from krkn.scenario_plugins.node_actions.bm_node_scenarios import (
|
||||
bm_node_scenarios,
|
||||
)
|
||||
@@ -99,7 +101,7 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
node_scenario.get("bmc_password", None),
|
||||
kubecli,
|
||||
)
|
||||
elif node_scenario["cloud_type"] == "docker":
|
||||
elif node_scenario["cloud_type"].lower() == "docker":
|
||||
return docker_node_scenarios(kubecli)
|
||||
else:
|
||||
logging.error(
|
||||
@@ -120,100 +122,131 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
def inject_node_scenario(
|
||||
self, action, node_scenario, node_scenario_object, kubecli: KrknKubernetes
|
||||
):
|
||||
generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")
|
||||
# Get the node scenario configurations
|
||||
run_kill_count = get_yaml_item_value(node_scenario, "runs", 1)
|
||||
|
||||
# Get the node scenario configurations for setting nodes
|
||||
|
||||
instance_kill_count = get_yaml_item_value(node_scenario, "instance_count", 1)
|
||||
node_name = get_yaml_item_value(node_scenario, "node_name", "")
|
||||
label_selector = get_yaml_item_value(node_scenario, "label_selector", "")
|
||||
if action == "node_stop_start_scenario":
|
||||
parallel_nodes = get_yaml_item_value(node_scenario, "parallel", False)
|
||||
|
||||
# Get the node to apply the scenario
|
||||
if node_name:
|
||||
node_name_list = node_name.split(",")
|
||||
nodes = common_node_functions.get_node_by_name(node_name_list, kubecli)
|
||||
else:
|
||||
nodes = common_node_functions.get_node(
|
||||
label_selector, instance_kill_count, kubecli
|
||||
)
|
||||
|
||||
# GCP api doesn't support multiprocessing calls, will only actually run 1
|
||||
if parallel_nodes and node_scenario['cloud_type'].lower() != "gcp":
|
||||
self.multiprocess_nodes(nodes, node_scenario_object, action, node_scenario)
|
||||
else:
|
||||
for single_node in nodes:
|
||||
self.run_node(single_node, node_scenario_object, action, node_scenario)
|
||||
|
||||
def multiprocess_nodes(self, nodes, node_scenario_object, action, node_scenario):
    """Run ``self.run_node`` for every node concurrently, one thread per node.

    Args:
        nodes: nodes to act on; the pool gets ``len(nodes)`` workers.
        node_scenario_object: passed unchanged to every ``run_node`` call.
        action: passed unchanged to every ``run_node`` call.
        node_scenario: passed unchanged to every ``run_node`` call.

    Any exception (from creating the pool or from a ``run_node`` call) is
    logged and swallowed; the method always returns ``None``.
    """
    try:
        logging.info("parallely call to nodes")
        # pool object with number of element
        pool = ThreadPool(processes=len(nodes))
        try:
            pool.starmap(
                self.run_node,
                zip(
                    nodes,
                    repeat(node_scenario_object),
                    repeat(action),
                    repeat(node_scenario),
                ),
            )
        finally:
            # Always release the worker threads, even when a run_node
            # call raised, so a failed scenario does not leak threads.
            pool.close()
            pool.join()
    except Exception as e:
        logging.info("Error on pool multiprocessing: " + str(e))
|
||||
|
||||
|
||||
def run_node(self, single_node, node_scenario_object, action, node_scenario):
|
||||
logging.info("action" + str(action))
|
||||
# Get the scenario specifics for running action nodes
|
||||
run_kill_count = get_yaml_item_value(node_scenario, "runs", 1)
|
||||
if action in ("node_stop_start_scenario", "node_disk_detach_attach_scenario"):
|
||||
duration = get_yaml_item_value(node_scenario, "duration", 120)
|
||||
|
||||
timeout = get_yaml_item_value(node_scenario, "timeout", 120)
|
||||
service = get_yaml_item_value(node_scenario, "service", "")
|
||||
ssh_private_key = get_yaml_item_value(
|
||||
node_scenario, "ssh_private_key", "~/.ssh/id_rsa"
|
||||
)
|
||||
# Get the node to apply the scenario
|
||||
if node_name:
|
||||
node_name_list = node_name.split(",")
|
||||
else:
|
||||
node_name_list = [node_name]
|
||||
for single_node_name in node_name_list:
|
||||
nodes = common_node_functions.get_node(
|
||||
single_node_name, label_selector, instance_kill_count, kubecli
|
||||
generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")
|
||||
|
||||
if node_general and action not in generic_cloud_scenarios:
|
||||
logging.info(
|
||||
"Scenario: "
|
||||
+ action
|
||||
+ " is not set up for generic cloud type, skipping action"
|
||||
)
|
||||
for single_node in nodes:
|
||||
if node_general and action not in generic_cloud_scenarios:
|
||||
logging.info(
|
||||
"Scenario: "
|
||||
+ action
|
||||
+ " is not set up for generic cloud type, skipping action"
|
||||
else:
|
||||
if action == "node_start_scenario":
|
||||
node_scenario_object.node_start_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "node_stop_scenario":
|
||||
node_scenario_object.node_stop_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "node_stop_start_scenario":
|
||||
node_scenario_object.node_stop_start_scenario(
|
||||
run_kill_count, single_node, timeout, duration
|
||||
)
|
||||
elif action == "node_termination_scenario":
|
||||
node_scenario_object.node_termination_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "node_reboot_scenario":
|
||||
node_scenario_object.node_reboot_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "node_disk_detach_attach_scenario":
|
||||
node_scenario_object.node_disk_detach_attach_scenario(
|
||||
run_kill_count, single_node, timeout, duration)
|
||||
elif action == "stop_start_kubelet_scenario":
|
||||
node_scenario_object.stop_start_kubelet_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "restart_kubelet_scenario":
|
||||
node_scenario_object.restart_kubelet_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "stop_kubelet_scenario":
|
||||
node_scenario_object.stop_kubelet_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "node_crash_scenario":
|
||||
node_scenario_object.node_crash_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "stop_start_helper_node_scenario":
|
||||
if node_scenario["cloud_type"] != "openstack":
|
||||
logging.error(
|
||||
"Scenario: " + action + " is not supported for "
|
||||
"cloud type "
|
||||
+ node_scenario["cloud_type"]
|
||||
+ ", skipping action"
|
||||
)
|
||||
else:
|
||||
if action == "node_start_scenario":
|
||||
node_scenario_object.node_start_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "node_stop_scenario":
|
||||
node_scenario_object.node_stop_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "node_stop_start_scenario":
|
||||
node_scenario_object.node_stop_start_scenario(
|
||||
run_kill_count, single_node, timeout, duration
|
||||
)
|
||||
elif action == "node_termination_scenario":
|
||||
node_scenario_object.node_termination_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "node_reboot_scenario":
|
||||
node_scenario_object.node_reboot_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "stop_start_kubelet_scenario":
|
||||
node_scenario_object.stop_start_kubelet_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "restart_kubelet_scenario":
|
||||
node_scenario_object.restart_kubelet_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "stop_kubelet_scenario":
|
||||
node_scenario_object.stop_kubelet_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "node_crash_scenario":
|
||||
node_scenario_object.node_crash_scenario(
|
||||
run_kill_count, single_node, timeout
|
||||
)
|
||||
elif action == "stop_start_helper_node_scenario":
|
||||
if node_scenario["cloud_type"] != "openstack":
|
||||
logging.error(
|
||||
"Scenario: " + action + " is not supported for "
|
||||
"cloud type "
|
||||
+ node_scenario["cloud_type"]
|
||||
+ ", skipping action"
|
||||
)
|
||||
else:
|
||||
if not node_scenario["helper_node_ip"]:
|
||||
logging.error("Helper node IP address is not provided")
|
||||
raise Exception(
|
||||
"Helper node IP address is not provided"
|
||||
)
|
||||
node_scenario_object.helper_node_stop_start_scenario(
|
||||
run_kill_count, node_scenario["helper_node_ip"], timeout
|
||||
)
|
||||
node_scenario_object.helper_node_service_status(
|
||||
node_scenario["helper_node_ip"],
|
||||
service,
|
||||
ssh_private_key,
|
||||
timeout,
|
||||
)
|
||||
else:
|
||||
logging.info(
|
||||
"There is no node action that matches %s, skipping scenario"
|
||||
% action
|
||||
if not node_scenario["helper_node_ip"]:
|
||||
logging.error("Helper node IP address is not provided")
|
||||
raise Exception(
|
||||
"Helper node IP address is not provided"
|
||||
)
|
||||
node_scenario_object.helper_node_stop_start_scenario(
|
||||
run_kill_count, node_scenario["helper_node_ip"], timeout
|
||||
)
|
||||
node_scenario_object.helper_node_service_status(
|
||||
node_scenario["helper_node_ip"],
|
||||
service,
|
||||
ssh_private_key,
|
||||
timeout,
|
||||
)
|
||||
else:
|
||||
logging.info(
|
||||
"There is no node action that matches %s, skipping scenario"
|
||||
% action
|
||||
)
|
||||
|
||||
def get_scenario_types(self) -> list[str]:
    """Return the scenario type names this plugin declares: ``node_scenarios``."""
    scenario_types = ["node_scenarios"]
    return scenario_types
|
||||
|
||||
@@ -13,6 +13,7 @@ from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
|
||||
from krkn.scenario_plugins.node_actions.az_node_scenarios import Azure
|
||||
from krkn.scenario_plugins.node_actions.gcp_node_scenarios import GCP
|
||||
from krkn.scenario_plugins.node_actions.openstack_node_scenarios import OPENSTACKCLOUD
|
||||
from krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin import IbmCloud
|
||||
|
||||
|
||||
class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
@@ -86,6 +87,8 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
cloud_object = OPENSTACKCLOUD()
|
||||
elif cloud_type.lower() in ["azure", "az"]:
|
||||
cloud_object = Azure()
|
||||
elif cloud_type.lower() in ["ibm", "ibmcloud"]:
|
||||
cloud_object = IbmCloud()
|
||||
else:
|
||||
logging.error(
|
||||
"Cloud type %s is not currently supported for cluster shut down"
|
||||
|
||||
@@ -29,6 +29,8 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
subnet_ids = scenario_config["subnet_id"]
|
||||
duration = scenario_config["duration"]
|
||||
cloud_type = scenario_config["cloud_type"]
|
||||
# Add support for user-provided default network ACL
|
||||
default_acl_id = scenario_config.get("default_acl_id")
|
||||
ids = {}
|
||||
acl_ids_created = []
|
||||
|
||||
@@ -58,7 +60,20 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
"Network association ids associated with "
|
||||
"the subnet %s: %s" % (subnet_id, network_association_ids)
|
||||
)
|
||||
acl_id = cloud_object.create_default_network_acl(vpc_id)
|
||||
|
||||
# Use provided default ACL if available, otherwise create a new one
|
||||
if default_acl_id:
|
||||
acl_id = default_acl_id
|
||||
logging.info(
|
||||
"Using provided default ACL ID %s - this ACL will not be deleted after the scenario",
|
||||
default_acl_id
|
||||
)
|
||||
# Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup
|
||||
else:
|
||||
acl_id = cloud_object.create_default_network_acl(vpc_id)
|
||||
logging.info("Created new default ACL %s", acl_id)
|
||||
acl_ids_created.append(acl_id)
|
||||
|
||||
new_association_id = cloud_object.replace_network_acl_association(
|
||||
network_association_ids[0], acl_id
|
||||
)
|
||||
@@ -66,7 +81,6 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
# capture the orginal_acl_id, created_acl_id and
|
||||
# new association_id to use during the recovery
|
||||
ids[new_association_id] = original_acl_id
|
||||
acl_ids_created.append(acl_id)
|
||||
|
||||
# wait for the specified duration
|
||||
logging.info(
|
||||
|
||||
@@ -11,15 +11,15 @@ coverage==7.4.1
|
||||
datetime==5.4
|
||||
docker==7.0.0
|
||||
gitpython==3.1.41
|
||||
google-api-python-client==2.116.0
|
||||
google-auth==2.37.0
|
||||
google-cloud-compute==1.22.0
|
||||
ibm_cloud_sdk_core==3.18.0
|
||||
ibm_vpc==0.20.0
|
||||
jinja2==3.1.4
|
||||
krkn-lib==4.0.3
|
||||
jinja2==3.1.5
|
||||
krkn-lib==4.0.4
|
||||
lxml==5.1.0
|
||||
kubernetes==28.1.0
|
||||
numpy==1.26.4
|
||||
oauth2client==4.1.3
|
||||
pandas==2.2.0
|
||||
openshift-client==1.0.21
|
||||
paramiko==3.4.0
|
||||
@@ -32,7 +32,7 @@ requests==2.32.2
|
||||
service_identity==24.1.0
|
||||
PyYAML==6.0.1
|
||||
setuptools==70.0.0
|
||||
werkzeug==3.0.3
|
||||
werkzeug==3.0.6
|
||||
wheel==0.42.0
|
||||
zope.interface==5.4.0
|
||||
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
node_scenarios:
|
||||
- actions: # node chaos scenarios to be injected
|
||||
- actions: # node chaos scenarios to be injected
|
||||
- node_stop_start_scenario
|
||||
node_name: # node on which scenario has to be injected; can set multiple names separated by comma
|
||||
label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection
|
||||
instance_count: 1 # Number of nodes to perform action/select that match the label selector
|
||||
runs: 1 # number of times to inject each scenario under actions (will perform on same node each time)
|
||||
timeout: 360 # duration to wait for completion of node scenario injection
|
||||
duration: 120 # duration to stop the node before running the start action
|
||||
cloud_type: aws # cloud type on which Kubernetes/OpenShift runs
|
||||
node_name: # node on which scenario has to be injected; can set multiple names separated by comma
|
||||
label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection; can specify multiple by a comma separated list
|
||||
instance_count: 2 # Number of nodes to perform action/select that match the label selector
|
||||
runs: 1 # number of times to inject each scenario under actions (will perform on same node each time)
|
||||
timeout: 360 # duration to wait for completion of node scenario injection
|
||||
duration: 20 # duration to stop the node before running the start action
|
||||
cloud_type: aws # cloud type on which Kubernetes/OpenShift runs
|
||||
parallel: true # Run action on label or node name in parallel or sequential, defaults to sequential
|
||||
- actions:
|
||||
- node_reboot_scenario
|
||||
node_name:
|
||||
@@ -15,3 +16,10 @@ node_scenarios:
|
||||
instance_count: 1
|
||||
timeout: 120
|
||||
cloud_type: aws
|
||||
- actions:
|
||||
- node_disk_detach_attach_scenario
|
||||
node_name:
|
||||
label_selector:
|
||||
instance_count: 1
|
||||
timeout: 120
|
||||
cloud_type: aws
|
||||
@@ -3,3 +3,4 @@ zone_outage: # Scenario to create an out
|
||||
duration: 600 # duration in seconds after which the zone will be back online
|
||||
vpc_id: # cluster virtual private network to target
|
||||
subnet_id: [subnet1, subnet2] # List of subnet-id's to deny both ingress and egress traffic
|
||||
default_acl_id: acl-xxxxxxxx # (Optional) ID of an existing network ACL to use instead of creating a new one. If provided, this ACL will not be deleted after the scenario.
|
||||
|
||||
@@ -112,12 +112,12 @@ def parse_arguments(parser):
|
||||
default=[],
|
||||
help="Memory related chaos tests (space separated list)",
|
||||
)
|
||||
parser.add_argument("--threshold", action="store", default="", help="Threshold")
|
||||
parser.add_argument("--threshold", action="store", help="Threshold")
|
||||
parser.add_argument(
|
||||
"--cpu-threshold", action="store", default="", help="CPU threshold"
|
||||
"--cpu-threshold", action="store", help="CPU threshold"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mem-threshold", action="store", default="", help="Memory threshold"
|
||||
"--mem-threshold", action="store", help="Memory threshold"
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
@@ -141,9 +141,9 @@ def read_configuration(config_file_path):
|
||||
prometheus_endpoint = config.get("prometheus_endpoint")
|
||||
auth_token = config.get("auth_token")
|
||||
scrape_duration = get_yaml_item_value(config, "scrape_duration", "10m")
|
||||
threshold = get_yaml_item_value(config, "threshold", ".7")
|
||||
heatmap_cpu_threshold = get_yaml_item_value(config, "cpu_threshold", ".5")
|
||||
heatmap_mem_threshold = get_yaml_item_value(config, "mem_threshold", ".3")
|
||||
threshold = get_yaml_item_value(config, "threshold")
|
||||
heatmap_cpu_threshold = get_yaml_item_value(config, "cpu_threshold")
|
||||
heatmap_mem_threshold = get_yaml_item_value(config, "mem_threshold")
|
||||
output_file = config.get("json_output_file", False)
|
||||
if output_file is True:
|
||||
output_path = config.get("json_output_folder_path")
|
||||
|
||||
Reference in New Issue
Block a user